// https://computing.llnl.gov/tutorials/openMP/
// g++ -fopenmp -Wall -Wextra -pedantic -std=c++11  pi.cpp 
#include <chrono>
#include <iomanip>
#include <iostream>
#include <vector>

#include <omp.h> // OMP functions

// Number of midpoint-rule rectangles used to approximate pi = integral of 4/(1+x^2) over [0,1].
static long num_steps = 100000;
// NOTE(review): this global is shadowed by the local `step` in main() and is
// never read or written here — presumably a leftover; confirm before removing.
double step;

int main ()
{
    double sum = 0.0;
    // Width of one integration slice; pi is approximated by the midpoint rule:
    // sum over i of 4/(1+x_i^2) * step, with x_i the midpoint of slice i.
    double step = 1.0/static_cast<double>(num_steps);

    ////////////////////////////////////////////////////////////////////////////////
    // single threaded reference run; `duration` is the baseline every later
    // d2/d (or d3/d) ratio is measured against.
    std::cout << "Single threaded\n";
    auto begin = std::chrono::high_resolution_clock::now();
    for (int i=0; i<num_steps; ++i) {
        double x = (i+0.5)*step; // mid point
        sum = sum + 4.0/(1.0+x*x);
    }
    auto end = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end-begin).count();
    std::cout << std::setw(9) << duration << " ns: " << std::endl;
    std::cout << "Sum = " << sum/num_steps << std::endl;

    ////////////////////////////////////////////////////////////////////////////////
    // parallel INCORRECT (deliberate demo) - `sum` is shared by default for the
    // parallel-for, and every thread updates it without synchronization: a data race.
    std::cout << "Multi threaded - incorrect\n";
    sum = 0.0;
    step = 1.0/static_cast<double>(num_steps);
    begin = std::chrono::high_resolution_clock::now();
#pragma omp parallel for
    for (int i=0; i<num_steps; ++i) {
        double x = (i+0.5)*step; // mid point
        sum = sum + 4.0/(1.0+x*x);
    }
    end = std::chrono::high_resolution_clock::now();
    {
        auto duration2 = std::chrono::duration_cast<std::chrono::nanoseconds>(end-begin).count();
        std::cout << std::setw(9) << duration2 << " ns: " << std::endl;
        std::cout << "d2/d = " << std::setw(9) << static_cast<double>(duration2)/duration << std::endl;
        std::cout << "Sum = " << sum/num_steps << std::endl;
    }

    ////////////////////////////////////////////////////////////////////////////////
    // parallel INCORRECT (deliberate demo) - declaring shared(sum) explicitly
    // changes nothing: the unsynchronized update is still a data race.
    std::cout << "Multi threaded - incorrect\n";
    sum = 0.0;
    step = 1.0/static_cast<double>(num_steps);
    begin = std::chrono::high_resolution_clock::now();
#pragma omp parallel for shared(sum)
    for (int i=0; i<num_steps; ++i) {
        double x = (i+0.5)*step; // mid point
        sum = sum + 4.0/(1.0+x*x);
    }
    end = std::chrono::high_resolution_clock::now();
    {
        auto duration2 = std::chrono::duration_cast<std::chrono::nanoseconds>(end-begin).count();
        std::cout << std::setw(9) << duration2 << " ns: " << std::endl;
        std::cout << "d2/d = " << std::setw(9) << static_cast<double>(duration2)/duration << std::endl;
        std::cout << "Sum = " << sum/num_steps << std::endl;
    }

    ////////////////////////////////////////////////////////////////////////////////
    // parallel correct but very slow: the critical section serializes every
    // single accumulation, so the threads mostly wait on each other.
    std::cout << "Multi threaded - correct, slow (critical)\n";
    sum = 0.0;
    step = 1.0/static_cast<double>(num_steps);
    begin = std::chrono::high_resolution_clock::now();
#pragma omp parallel for shared(sum)
    for (int i=0; i<num_steps; ++i) {
        double x = (i+0.5)*step; // mid point
        double y = 4.0/(1.0+x*x);
#pragma omp critical
        sum = sum + y;
    }
    end = std::chrono::high_resolution_clock::now();
    auto duration3 = std::chrono::duration_cast<std::chrono::nanoseconds>(end-begin).count();
    std::cout << std::setw(9) << duration3 << " ns: " << std::endl;
    std::cout << "d3/d = " << std::setw(9) << static_cast<double>(duration3)/duration << std::endl;
    std::cout << "Sum = " << sum/num_steps << std::endl;

    ////////////////////////////////////////////////////////////////////////////////
    // parallel correct, still inefficient but better: atomic update instead of a
    // full critical section, yet still one synchronized write per iteration.
    std::cout << "Multi threaded - correct, faster (atomic)\n";
    sum = 0.0;
    step = 1.0/static_cast<double>(num_steps);
    begin = std::chrono::high_resolution_clock::now();
#pragma omp parallel for shared(sum)
    for (int i=0; i<num_steps; ++i) {
        double x = (i+0.5)*step; // mid point
        double y = 4.0/(1.0+x*x);
#pragma omp atomic
        sum = sum + y;
    }
    end = std::chrono::high_resolution_clock::now();
    {
        auto duration2 = std::chrono::duration_cast<std::chrono::nanoseconds>(end-begin).count();
        std::cout << std::setw(9) << duration2 << " ns: " << std::endl;
        std::cout << "d2/d = " << std::setw(9) << static_cast<double>(duration2)/duration << std::endl;
        std::cout << "Sum = " << sum/num_steps << std::endl;
    }

    ////////////////////////////////////////////////////////////////////////////////
    // parallel correct, more efficient - one partial-sum slot per thread, but the
    // slots are adjacent doubles: the threads thrash the same cache line
    // ("false sharing").
    // http://www.drdobbs.com/parallel/eliminate-false-sharing/217500206#
    std::cout << "Multi threaded - correct, even faster (array of sub results)\n";
    std::vector<double> sum_a(200, 0.0); // 200 > num threads; RAII, no leak
    step = 1.0/static_cast<double>(num_steps);
    begin = std::chrono::high_resolution_clock::now();
    int nthreads = 1; // team size actually used; written by thread 0 below
#pragma omp parallel
    {
        int id = omp_get_thread_num();
        int num_threads = omp_get_num_threads();
        if ( id == 0 ) {
            nthreads = num_threads;
        }
        sum_a[id] = 0.0;
        // cyclic distribution: iterations go 0 1 2 3 0 1 2 3 ... across threads
        for ( long i=id; i<num_steps; i+=num_threads ) {
            double x = (i+0.5)*step; // mid point
            sum_a[id] += 4.0/(1.0+x*x);
        }
    } // implicit barrier: nthreads and all slots are visible after the region
    sum = 0.0;
    for (int i=0; i<nthreads; ++i) {
        sum += sum_a[i];
    }
    end = std::chrono::high_resolution_clock::now();
    {
        auto duration2 = std::chrono::duration_cast<std::chrono::nanoseconds>(end-begin).count();
        std::cout << std::setw(9) << duration2 << " ns: " << std::endl;
        std::cout << "d2/d = " << std::setw(9) << static_cast<double>(duration2)/duration << std::endl;
        std::cout << "Sum = " << sum/num_steps << std::endl;
    }

    ////////////////////////////////////////////////////////////////////////////////
    // parallel correct, faster still - pad the partial-sum array so each
    // thread's slot sits on its own cache line (removes the false sharing).
    std::cout << "Multi threaded - correct, even faster (PADDED array of sub results)\n";
    int cacheline = 64; // cache line size in bytes
    // stride in ELEMENTS: the slots are doubles, so divide by sizeof(double)
    // (the original divided by sizeof(int), which silently doubled the stride).
    int pad_offset = cacheline/static_cast<int>(sizeof(double));
    std::vector<double> sum_a2(200*pad_offset, 0.0); // each slot on its own cache line
    step = 1.0/static_cast<double>(num_steps);
    begin = std::chrono::high_resolution_clock::now();

#pragma omp parallel
    {
        int id = omp_get_thread_num();
        int num_threads = omp_get_num_threads();
#pragma omp master // only thread 0 records the team size (same as `if (id == 0)`)
        {
            nthreads = num_threads;
        }
#pragma omp barrier // nthreads is written before any thread leaves this point
        sum_a2[id*pad_offset] = 0.0;
        for ( long i=id; i<num_steps; i+=num_threads ) { // cyclic: 0 1 2 3 0 1 2 3 ...
            double x = (i+0.5)*step; // mid point
            sum_a2[id*pad_offset] += 4.0/(1.0+x*x);
        }
    }
    sum = 0.0;
    for (int i=0; i<nthreads; ++i) {
        sum += sum_a2[i*pad_offset];
    }
    end = std::chrono::high_resolution_clock::now();
    {
        auto duration2 = std::chrono::duration_cast<std::chrono::nanoseconds>(end-begin).count();
        std::cout << std::setw(9) << duration2 << " ns: " << std::endl;
        std::cout << "d2/d = " << std::setw(9) << static_cast<double>(duration2)/duration << std::endl;
        std::cout << "Sum = " << sum/num_steps << std::endl;
    }

    ////////////////////////////////////////////////////////////////////////////////
    // parallel correct - one temp slot per POINT (size = num_steps): no sharing
    // at all during the parallel phase, then a serial reduction over the array.
    std::cout << "Multi threaded - correct, huge temp array\n";
    std::vector<double> sum_a4(num_steps);
    step = 1.0/static_cast<double>(num_steps);
    begin = std::chrono::high_resolution_clock::now();

#pragma omp parallel for
    for ( int i=0; i<num_steps; ++i ) {
        double x = (i+0.5)*step; // mid point
        sum_a4[i] = 4.0/(1.0+x*x);
    }
    sum = 0.0;
    for (int i=0; i<num_steps; ++i) { // serial reduction; could be parallelized
        sum += sum_a4[i];
    }
    end = std::chrono::high_resolution_clock::now();
    {
        auto duration2 = std::chrono::duration_cast<std::chrono::nanoseconds>(end-begin).count();
        std::cout << std::setw(9) << duration2 << " ns: " << std::endl;
        std::cout << "d2/d = " << std::setw(9) << static_cast<double>(duration2)/duration << std::endl;
        std::cout << "Sum = " << sum/num_steps << std::endl;
    }

    ////////////////////////////////////////////////////////////////////////////////
    // parallel correct - each thread works on one contiguous block of indices
    // (000111222...). The last thread also takes the remainder iterations when
    // num_steps is not divisible by nthreads (the original dropped them).
    std::cout << "Multi threaded - correct, contiguous blocks\n";
    std::vector<double> sum_a3(200, 0.0);
    step = 1.0/static_cast<double>(num_steps);
    begin = std::chrono::high_resolution_clock::now();

#pragma omp parallel
    {
        int id = omp_get_thread_num();
        if ( id == 0 ) {
            nthreads = omp_get_num_threads();
        }
#pragma omp barrier // make sure all threads see nthreads
        long block_size = num_steps/nthreads;
        long lo = id*block_size;
        long hi = (id == nthreads-1) ? num_steps : lo + block_size;
        sum_a3[id] = 0.0;
        // NB: no `#pragma omp for` here - the index range is partitioned by hand
        for ( long i=lo; i<hi; ++i ) { // 000111222 ....
            double x = (i+0.5)*step; // mid point
            sum_a3[id] += 4.0/(1.0+x*x);
        }
    }
    sum = 0.0;
    for (int i=0; i<nthreads; ++i) {
        sum += sum_a3[i];
    }
    end = std::chrono::high_resolution_clock::now();
    {
        auto duration2 = std::chrono::duration_cast<std::chrono::nanoseconds>(end-begin).count();
        std::cout << std::setw(9) << duration2 << " ns: " << std::endl;
        std::cout << "d2/d = " << std::setw(9) << static_cast<double>(duration2)/duration << std::endl;
        std::cout << "Sum = " << sum/num_steps << std::endl;
    }

    ////////////////////////////////////////////////////////////////////////////////
    // parallel correct - thread-private partial sums (no padding needed: `ps`
    // lives on each thread's own stack), merged once per thread under a critical.
    std::cout << "Multi threaded - correct, using local vars instead of padding\n";
    sum = 0.0;
    step = 1.0/static_cast<double>(num_steps);
    begin = std::chrono::high_resolution_clock::now();

#pragma omp parallel shared(sum)
    {
        double ps = 0; // per-thread local sum
#pragma omp for
        for (int i=0; i<num_steps; ++i) {
            double x = (i+0.5)*step; // mid point
            ps += 4.0/(1.0+x*x);
        }
#pragma omp critical // entered once per thread, not once per iteration
        sum += ps;
    } // end parallel
    end = std::chrono::high_resolution_clock::now();
    {
        auto duration2 = std::chrono::duration_cast<std::chrono::nanoseconds>(end-begin).count();
        std::cout << std::setw(9) << duration2 << " ns: " << std::endl;
        std::cout << "d2/d = " << std::setw(9) << static_cast<double>(duration2)/duration << std::endl;
        std::cout << "Sum = " << sum/num_steps << std::endl;
    }

    ////////////////////////////////////////////////////////////////////////////////
    // parallel correct using reduction - the idiomatic OpenMP form; the runtime
    // does the private-copy-then-combine dance of the previous version for us.
    std::cout << "Multi threaded - correct, even faster (reduction)\n";
    sum = 0.0;
    step = 1.0/static_cast<double>(num_steps);
    begin = std::chrono::high_resolution_clock::now();
#pragma omp parallel for reduction (+:sum)
    for (int i=0; i<num_steps; ++i) {
        double x = (i+0.5)*step; // mid point
        sum = sum + 4.0/(1.0+x*x);
    }
    end = std::chrono::high_resolution_clock::now();
    {
        auto duration2 = std::chrono::duration_cast<std::chrono::nanoseconds>(end-begin).count();
        std::cout << std::setw(9) << duration2 << " ns: " << std::endl;
        std::cout << "d2/d = " << std::setw(9) << static_cast<double>(duration2)/duration << std::endl;
        std::cout << "Sum = " << sum/num_steps << std::endl;
    }

    return 0;
}
//Single threaded
//  1515040 ns: 
//Sum = 3.14159
//Multi threaded - incorrect
// 19904503 ns: 
//d2/d =   13.1379
//Sum = 0.900074
//Multi threaded - incorrect
//  4357139 ns: 
//d2/d =   2.87592
//Sum = 0.857616
//Multi threaded - correct, slow (critical)
//117996236 ns: 
//d3/d =   77.8832
//Sum = 3.14159
//Multi threaded - correct, faster (atomic)
// 82150783 ns: 
//d2/d =   54.2235
//Sum = 3.14159
//Multi threaded - correct, even faster (array of sub results)
//  4479841 ns: 
//d2/d =   2.95691
//Sum = 3.14159
//Multi threaded - correct, even faster (PADDED array of sub results)
//   141900 ns: 
//d2/d = 0.0936609
//Sum = 3.14159
//Multi threaded - correct, huge temp array
//   649404 ns: 
//d2/d =  0.428638
//Sum = 3.14159
//Multi threaded - correct, contiguous blocks
//  1125972 ns: 
//d2/d =  0.743196
//Sum = 3.14159
//Multi threaded - correct, using local vars instead of padding
//   115062 ns: 
//d2/d = 0.0759465
//Sum = 3.14159
//Multi threaded - correct, even faster (reduction)
//   101567 ns: 
//d2/d = 0.0670392
//Sum = 3.14159