// https://computing.llnl.gov/tutorials/openMP/
// g++ -fopenmp -Wall -Wextra -pedantic -std=c++11 pi.cpp
#include <chrono>
#include <iomanip>
#include <iostream>
#include <vector>
#include <omp.h> // OMP functions
static long num_steps = 100000; // number of midpoint-rule slices over [0,1]
double step; // NOTE(review): never used — main() declares a local 'step' that shadows this; candidate for removal
// Computes pi by midpoint integration of 4/(1+x*x) over [0,1] and compares a
// serial baseline against several OpenMP strategies.  Two variants are
// DELIBERATELY incorrect (unsynchronised updates to 'sum') and are kept as
// teaching examples of data races; the rest show progressively better
// synchronisation: critical < atomic < per-thread slots < padded slots <
// thread-local accumulation < reduction.
// Each section prints its runtime and its runtime relative to the serial run.
int main ()
{
  double sum = 0.0;
  // Integration step width; shadows the (unused) file-scope 'step'.
  double step = 1.0/static_cast<double>(num_steps);

  // --- Serial reference run; 'duration' is the baseline for all ratios. ---
  std::cout << "Single threaded\n";
  auto begin = std::chrono::high_resolution_clock::now();
  for (long i=0; i<num_steps; ++i) {
    double x = (i+0.5)*step; // mid point of slice i
    sum = sum + 4.0/(1.0+x*x);
  }
  auto end = std::chrono::high_resolution_clock::now();
  auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end-begin).count();
  std::cout << std::setw(9) << duration << " ns: " << std::endl;
  std::cout << "Sum = " << sum/num_steps << std::endl;

  ////////////////////////////////////////////////////////////////////////////////
  // Parallel, INCORRECT - "forgot" to declare 'sum' shared.  Note that in
  // C/C++ 'sum' is shared by default anyway, so this races exactly like the
  // next version; kept to mirror the tutorial's progression.
  std::cout << "Multi threaded - incorrect\n";
  sum = 0.0;
  step = 1.0/static_cast<double>(num_steps);
  begin = std::chrono::high_resolution_clock::now();
  #pragma omp parallel for
  for (long i=0; i<num_steps; ++i) {
    double x = (i+0.5)*step; // mid point
    sum = sum + 4.0/(1.0+x*x); // DATA RACE: unsynchronised read-modify-write
  }
  end = std::chrono::high_resolution_clock::now();
  {
    auto duration2 = std::chrono::duration_cast<std::chrono::nanoseconds>(end-begin).count();
    std::cout << std::setw(9) << duration2 << " ns: " << std::endl;
    std::cout << "d2/d = " << std::setw(9) << static_cast<double>(duration2)/duration << std::endl;
    std::cout << "Sum = " << sum/num_steps << std::endl;
  }

  ////////////////////////////////////////////////////////////////////////////////
  // Parallel, INCORRECT - 'shared(sum)' makes the sharing explicit but adds
  // no synchronisation, so the data race remains.
  std::cout << "Multi threaded - incorrect\n";
  sum = 0.0;
  step = 1.0/static_cast<double>(num_steps);
  begin = std::chrono::high_resolution_clock::now();
  #pragma omp parallel for shared(sum)
  for (long i=0; i<num_steps; ++i) {
    double x = (i+0.5)*step; // mid point
    sum = sum + 4.0/(1.0+x*x); // DATA RACE: still unsynchronised
  }
  end = std::chrono::high_resolution_clock::now();
  {
    auto duration2 = std::chrono::duration_cast<std::chrono::nanoseconds>(end-begin).count();
    std::cout << std::setw(9) << duration2 << " ns: " << std::endl;
    std::cout << "d2/d = " << std::setw(9) << static_cast<double>(duration2)/duration << std::endl;
    std::cout << "Sum = " << sum/num_steps << std::endl;
  }

  ////////////////////////////////////////////////////////////////////////////////
  // Parallel, correct but inefficient: a critical section serialises every
  // single accumulation.
  std::cout << "Multi threaded - correct, slow (critical)\n";
  sum = 0.0;
  step = 1.0/static_cast<double>(num_steps);
  begin = std::chrono::high_resolution_clock::now();
  #pragma omp parallel for shared(sum)
  for (long i=0; i<num_steps; ++i) {
    double x = (i+0.5)*step; // mid point
    double y = 4.0/(1.0+x*x);
    #pragma omp critical
    sum = sum + y; // one thread at a time — correct, but heavy lock traffic
  }
  end = std::chrono::high_resolution_clock::now();
  auto duration3 = std::chrono::duration_cast<std::chrono::nanoseconds>(end-begin).count();
  std::cout << std::setw(9) << duration3 << " ns: " << std::endl;
  std::cout << "d3/d = " << std::setw(9) << static_cast<double>(duration3)/duration << std::endl;
  std::cout << "Sum = " << sum/num_steps << std::endl;

  ////////////////////////////////////////////////////////////////////////////////
  // Parallel, correct and somewhat faster: atomic update instead of a full
  // critical section, but still one synchronised operation per iteration.
  std::cout << "Multi threaded - correct, faster (atomic)\n";
  sum = 0.0;
  step = 1.0/static_cast<double>(num_steps);
  begin = std::chrono::high_resolution_clock::now();
  #pragma omp parallel for shared(sum)
  for (long i=0; i<num_steps; ++i) {
    double x = (i+0.5)*step; // mid point
    double y = 4.0/(1.0+x*x);
    #pragma omp atomic
    sum = sum + y;
  }
  end = std::chrono::high_resolution_clock::now();
  {
    auto duration2 = std::chrono::duration_cast<std::chrono::nanoseconds>(end-begin).count();
    std::cout << std::setw(9) << duration2 << " ns: " << std::endl;
    std::cout << "d2/d = " << std::setw(9) << static_cast<double>(duration2)/duration << std::endl;
    std::cout << "Sum = " << sum/num_steps << std::endl;
  }

  ////////////////////////////////////////////////////////////////////////////////
  // Parallel, correct, more efficient: one accumulator slot per thread.
  // Adjacent slots share cache lines, so this suffers from "false sharing":
  // http://www.drdobbs.com/parallel/eliminate-false-sharing/217500206#
  std::cout << "Multi threaded - correct, even faster (array of sub results)\n";
  std::vector<double> sum_a(200, 0.0); // 200 > expected number of threads
  step = 1.0/static_cast<double>(num_steps);
  begin = std::chrono::high_resolution_clock::now();
  int nthreads = 1; // actual team size; written by thread 0 inside each region
  #pragma omp parallel
  {
    int id = omp_get_thread_num();
    int num_threads = omp_get_num_threads();
    if ( id == 0 ) {
      nthreads = num_threads; // safe to read after the region's implicit barrier
    }
    sum_a[id] = 0.0;
    // Cyclic distribution of iterations: 0 1 2 3 0 1 2 3 ...
    for (long i=id; i<num_steps; i+=num_threads) {
      double x = (i+0.5)*step; // mid point
      sum_a[id] += 4.0/(1.0+x*x);
    }
  }
  sum = 0.0;
  for (int i=0; i<nthreads; ++i) {
    sum += sum_a[i];
  }
  end = std::chrono::high_resolution_clock::now();
  {
    auto duration2 = std::chrono::duration_cast<std::chrono::nanoseconds>(end-begin).count();
    std::cout << std::setw(9) << duration2 << " ns: " << std::endl;
    std::cout << "d2/d = " << std::setw(9) << static_cast<double>(duration2)/duration << std::endl;
    std::cout << "Sum = " << sum/num_steps << std::endl;
  }

  ////////////////////////////////////////////////////////////////////////////////
  // Parallel, correct, even more efficient: pad the per-thread slots so each
  // lives on its own cache line, eliminating false sharing.
  // (Fix: the original computed the stride as cacheline/sizeof(int) for an
  //  array of doubles; it only avoided false sharing by over-padding.)
  std::cout << "Multi threaded - correct, even faster (PADDED array of sub results)\n";
  const int cacheline = 64; // assumed cache line size in bytes
  const int stride = cacheline/static_cast<int>(sizeof(double)); // doubles per cache line
  std::vector<double> sum_a2(static_cast<std::size_t>(200)*stride, 0.0); // one slot per line
  step = 1.0/static_cast<double>(num_steps);
  begin = std::chrono::high_resolution_clock::now();
  #pragma omp parallel
  {
    int id = omp_get_thread_num();
    int num_threads = omp_get_num_threads();
    // if ( id == 0 )
    #pragma omp master // same as above, but only one thread executes it
    {
      nthreads = num_threads;
    }
    #pragma omp barrier // 'master' has no implied barrier; publish nthreads before threads can finish
    sum_a2[id*stride] = 0.0;
    for (long i=id; i<num_steps; i+=num_threads) { // cyclic: 0 1 2 3 0 1 2 3 ...
      double x = (i+0.5)*step; // mid point
      sum_a2[id*stride] += 4.0/(1.0+x*x);
    }
  }
  sum = 0.0;
  for (int i=0; i<nthreads; ++i) {
    sum += sum_a2[i*stride];
  }
  end = std::chrono::high_resolution_clock::now();
  {
    auto duration2 = std::chrono::duration_cast<std::chrono::nanoseconds>(end-begin).count();
    std::cout << std::setw(9) << duration2 << " ns: " << std::endl;
    std::cout << "d2/d = " << std::setw(9) << static_cast<double>(duration2)/duration << std::endl;
    std::cout << "Sum = " << sum/num_steps << std::endl;
  }

  ////////////////////////////////////////////////////////////////////////////////
  // Parallel, correct: one result slot per POINT (size = num_steps), so no
  // thread ever touches another's slot; the reduction is then done serially.
  std::cout << "Multi threaded - correct, huge temp array\n";
  std::vector<double> sum_a4(static_cast<std::size_t>(num_steps));
  step = 1.0/static_cast<double>(num_steps);
  begin = std::chrono::high_resolution_clock::now();
  #pragma omp parallel for
  for (long i=0; i<num_steps; ++i) {
    double x = (i+0.5)*step; // mid point
    sum_a4[i] = 4.0/(1.0+x*x); // each iteration owns its own slot: no race
  }
  sum = 0.0;
  for (long i=0; i<num_steps; ++i) { // serial reduction — could itself be parallelized
    sum += sum_a4[i];
  }
  end = std::chrono::high_resolution_clock::now();
  {
    auto duration2 = std::chrono::duration_cast<std::chrono::nanoseconds>(end-begin).count();
    std::cout << std::setw(9) << duration2 << " ns: " << std::endl;
    std::cout << "d2/d = " << std::setw(9) << static_cast<double>(duration2)/duration << std::endl;
    std::cout << "Sum = " << sum/num_steps << std::endl;
  }

  ////////////////////////////////////////////////////////////////////////////////
  // Parallel, correct: hand-rolled contiguous blocks, one per thread.
  // (Fix: the original dropped the remainder iterations whenever nthreads did
  //  not divide num_steps; the last thread now absorbs the remainder.)
  std::cout << "Multi threaded - correct, contiguous blocks\n";
  std::vector<double> sum_a3(200, 0.0);
  step = 1.0/static_cast<double>(num_steps);
  begin = std::chrono::high_resolution_clock::now();
  #pragma omp parallel
  {
    int id = omp_get_thread_num();
    if ( id == 0 ) {
      nthreads = omp_get_num_threads();
    }
    #pragma omp barrier // every thread reads nthreads below
    long block_size = num_steps/nthreads;
    long first = id*block_size;
    long last = (id == nthreads-1) ? num_steps : first+block_size;
    sum_a3[id] = 0.0;
    // NOTE: adding '#pragma omp for' here would be wrong — the iteration
    // space is already split by hand.
    for (long i=first; i<last; ++i) { // contiguous: 000111222 ...
      double x = (i+0.5)*step; // mid point
      sum_a3[id] += 4.0/(1.0+x*x);
    }
  }
  sum = 0.0;
  for (int i=0; i<nthreads; ++i) {
    sum += sum_a3[i];
  }
  end = std::chrono::high_resolution_clock::now();
  {
    auto duration2 = std::chrono::duration_cast<std::chrono::nanoseconds>(end-begin).count();
    std::cout << std::setw(9) << duration2 << " ns: " << std::endl;
    std::cout << "d2/d = " << std::setw(9) << static_cast<double>(duration2)/duration << std::endl;
    std::cout << "Sum = " << sum/num_steps << std::endl;
  }

  ////////////////////////////////////////////////////////////////////////////////
  // Parallel, correct: accumulate into a thread-local variable, then merge
  // once per thread under a critical section — no padding needed.
  std::cout << "Multi threaded - correct, using local vars instead of padding\n";
  sum = 0.0;
  step = 1.0/static_cast<double>(num_steps);
  begin = std::chrono::high_resolution_clock::now();
  #pragma omp parallel shared(sum)
  {
    double partial = 0; // per-thread partial sum (stack-local: no sharing)
    #pragma omp for
    for (long i=0; i<num_steps; ++i) {
      double x = (i+0.5)*step; // mid point
      partial += 4.0/(1.0+x*x);
    }
    #pragma omp critical // entered once per thread, not once per iteration
    sum += partial;
  } // end parallel
  end = std::chrono::high_resolution_clock::now();
  {
    auto duration2 = std::chrono::duration_cast<std::chrono::nanoseconds>(end-begin).count();
    std::cout << std::setw(9) << duration2 << " ns: " << std::endl;
    std::cout << "d2/d = " << std::setw(9) << static_cast<double>(duration2)/duration << std::endl;
    std::cout << "Sum = " << sum/num_steps << std::endl;
  }

  ////////////////////////////////////////////////////////////////////////////////
  // Parallel, correct and idiomatic: let OpenMP do the per-thread
  // accumulation and merge via the reduction clause.
  std::cout << "Multi threaded - correct, even faster (reduction)\n";
  sum = 0.0;
  step = 1.0/static_cast<double>(num_steps);
  begin = std::chrono::high_resolution_clock::now();
  #pragma omp parallel for reduction (+:sum)
  for (long i=0; i<num_steps; ++i) {
    double x = (i+0.5)*step; // mid point
    sum = sum + 4.0/(1.0+x*x);
  }
  end = std::chrono::high_resolution_clock::now();
  {
    auto duration2 = std::chrono::duration_cast<std::chrono::nanoseconds>(end-begin).count();
    std::cout << std::setw(9) << duration2 << " ns: " << std::endl;
    std::cout << "d2/d = " << std::setw(9) << static_cast<double>(duration2)/duration << std::endl;
    std::cout << "Sum = " << sum/num_steps << std::endl;
  }
}
//Single threaded
// 1515040 ns:
//Sum = 3.14159
//Multi threaded - incorrect
// 19904503 ns:
//d2/d = 13.1379
//Sum = 0.900074
//Multi threaded - incorrect
// 4357139 ns:
//d2/d = 2.87592
//Sum = 0.857616
//Multi threaded - correct, slow (critical)
//117996236 ns:
//d3/d = 77.8832
//Sum = 3.14159
//Multi threaded - correct, faster (atomic)
// 82150783 ns:
//d2/d = 54.2235
//Sum = 3.14159
//Multi threaded - correct, even faster (array of sub results)
// 4479841 ns:
//d2/d = 2.95691
//Sum = 3.14159
//Multi threaded - correct, even faster (PADDED array of sub results)
// 141900 ns:
//d2/d = 0.0936609
//Sum = 3.14159
//Multi threaded - correct, huge temp array
// 649404 ns:
//d2/d = 0.428638
//Sum = 3.14159
//Multi threaded - correct, contiguous blocks
// 1125972 ns:
//d2/d = 0.743196
//Sum = 3.14159
//Multi threaded - correct, using local vars instead of padding
// 115062 ns:
//d2/d = 0.0759465
//Sum = 3.14159
//Multi threaded - correct, even faster (reduction)
// 101567 ns:
//d2/d = 0.0670392
//Sum = 3.14159