diff --git a/tests/performance/CMakeLists.txt b/tests/performance/CMakeLists.txt index 84d2147d2..5856af2b0 100644 --- a/tests/performance/CMakeLists.txt +++ b/tests/performance/CMakeLists.txt @@ -33,11 +33,16 @@ add_grb_executables( fma fma.cpp $ ADDITIONAL_LINK_LIBRARIES "rt" ) -add_grb_executables( fma-openmp fma.cpp $ +add_grb_executables( fma-blocking fma.cpp $ BACKENDS reference_omp NO_BACKEND_NAME ADDITIONAL_LINK_LIBRARIES OpenMP::OpenMP_CXX "rt" ) +add_grb_executables( fma-nonblocking fma.cpp $ + BACKENDS nonblocking NO_BACKEND_NAME + ADDITIONAL_LINK_LIBRARIES OpenMP::OpenMP_CXX "rt" +) + add_grb_executables( reduce reduce.cpp $ BACKENDS reference NO_BACKEND_NAME ) diff --git a/tests/performance/bench_kernels.c b/tests/performance/bench_kernels.c index 98523c0cd..105f55a8d 100644 --- a/tests/performance/bench_kernels.c +++ b/tests/performance/bench_kernels.c @@ -30,9 +30,26 @@ void bench_kernels_axpy( assert( a != x ); assert( a != y ); assert( x != y ); - #pragma omp parallel for schedule(static) - for( size_t i = 0; i < n; ++i ) { - a[ i ] = alpha * x[ i ] + y[ i ]; + #pragma omp parallel + { + const size_t P = omp_get_num_threads(); + const size_t s = omp_get_thread_num(); + const size_t chunk = (n % P == 0) ? 
(n/P) : (n/P) + 1; + size_t start = chunk * s; + // half-open range [start, end): clamp to n so the last element is covered + if( start > n ) { + start = n; + } + size_t end = start + chunk; + if( end > n ) { + end = n; + } + assert( start <= end ); + if( start != end ) { + for( size_t i = start; i < end; ++i ) { + a[ i ] = alpha * x[ i ] + y[ i ]; + } + } } } diff --git a/tests/performance/fma.cpp b/tests/performance/fma.cpp index a9ce4d845..73cda5cb1 100644 --- a/tests/performance/fma.cpp +++ b/tests/performance/fma.cpp @@ -87,13 +87,15 @@ void test( const struct Input &in, struct Output &out ) { // set constant multiplicant to x const double alpha = 2.0; + // WARNING: ALP incurs performance loss unless compiled using the nonblocking + // backend if( mode == TEMPLATED ) { double ttime = timer.time(); // get cache `hot' - out.error = grb::eWiseMulAdd< grb::descriptors::dense >( zv, alpha, xv, yv, - reals ); + out.error = grb::set< grb::descriptors::dense >( zv, yv ); + out.error = out.error != SUCCESS ? out.error : grb::eWiseMul< grb::descriptors::dense >( zv, alpha, xv, reals ); if( out.error != SUCCESS ) { - std::cerr << "grb::eWiseMulAdd returns non-SUCCESS exit code " + std::cerr << "grb::eWiseMul returns non-SUCCESS exit code " << grb::toString( out.error ) << "." 
<< std::endl; std::cout << "Test FAILED\n" << std::endl; return; @@ -108,24 +110,7 @@ void test( const struct Input &in, struct Output &out ) { } else { out.reps_used = in.rep; } - out.times.preamble = timer.time(); - timer.reset(); - // benchmark templated axpy - for( size_t i = 0; i < out.reps_used; ++i ) { - out.error = grb::set( zv, 0 ); - if( out.error != grb::SUCCESS ) { - std::cerr << "Error during clearing of zv " - << grb::toString( out.error ) << std::endl; - std::cout << "Test FAILED\n" << std::endl; - return; - } - (void) grb::eWiseMulAdd< grb::descriptors::dense >( zv, alpha, xv, yv, - reals ); - } - out.times.useful = timer.time() / static_cast< double >( out.reps_used ); - - // postamble - timer.reset(); + // verify double checksum = 0; for( size_t i = 0; i < in.n; ++i ) { checksum += z[ i ]; @@ -138,7 +123,19 @@ void test( const struct Input &in, struct Output &out ) { } } std::cout << "Checksum: " << checksum << std::endl; - out.times.postamble = timer.time(); + out.times.preamble = timer.time(); + + // benchmark ALP axpy + timer.reset(); + for( size_t i = 0; i < out.reps_used; ++i ) { + // zv[ i ] = alpha * xv[ i ] + yv[ i ] + (void) grb::set< grb::descriptors::dense >( zv, yv ); + (void) grb::eWiseMul< grb::descriptors::dense >( zv, alpha, xv, reals ); + } + out.times.useful = timer.time() / static_cast< double >( out.reps_used ); + + // postamble + out.times.postamble = 0; } if( mode == LAMBDA ) { @@ -146,6 +143,7 @@ void test( const struct Input &in, struct Output &out ) { // get cache `hot' out.error = grb::eWiseLambda( [ &zv, &alpha, &xv, &yv, &reals ]( const size_t i ) { + // zv[ i ] = alpha * xv[ i ] + yv[ i ] (void) grb::apply( zv[ i ], alpha, xv[ i ], reals.getMultiplicativeOperator() ); (void) grb::foldl( zv[ i ], yv[ i ], reals.getAdditiveOperator() ); @@ -311,8 +309,8 @@ int main( int argc, char ** argv ) { grb::Benchmarker< AUTOMATIC > bench; // start functional test - std::cout << "\nBenchmark label: grb::eWiseApply (axpy) of size " 
<< in.n - << std::endl; + std::cout << "\nBenchmark label: grb::set + grb::eWiseMul (axpy) of size " + << in.n << std::endl; out.error = SUCCESS; grb::RC rc = bench.exec( &(test< TEMPLATED >), in, out, 1, outer, true ); if( rc != SUCCESS || out.error != SUCCESS ) { @@ -320,17 +318,17 @@ int main( int argc, char ** argv ) { << "Benchmarker reports: " << grb::toString( rc ) << "; test reports:" << grb::toString( out.error ) << "." << std::endl; std::cout << "Test FAILED\n" << std::endl; - return EXIT_FAILURE; + return 40; } std::cout << "\nBenchmark label: grb::eWiseLambda (axpy) of size " << in.n << std::endl; rc = bench.exec( &(test< LAMBDA >), in, out, 1, outer, true ); - if( rc != SUCCESS || out.error != SUCCESS ) { + if( rc != SUCCESS || out.error != SUCCESS ) { std::cerr << "Functional test exits with nonzero exit code. " << "Benchmarker reports: " << grb::toString( rc ) << "; test reports:" << grb::toString( out.error ) << "." << std::endl; std::cout << "Test FAILED\n" << std::endl; - return EXIT_FAILURE; + return 50; } std::cout << "\nBenchmark label: compiler-optimised axpy of size " << in.n @@ -341,7 +339,7 @@ int main( int argc, char ** argv ) { << "Benchmarker reports: " << grb::toString( rc ) << "; test reports:" << grb::toString( out.error ) << "." 
<< std::endl; std::cout << "Test FAILED\n" << std::endl; - return EXIT_FAILURE; + return 60; } std::cout << "NOTE: please check the above performance figures manually-- " diff --git a/tests/performance/performancetests.sh b/tests/performance/performancetests.sh index 576b20fc5..268045b88 100755 --- a/tests/performance/performancetests.sh +++ b/tests/performance/performancetests.sh @@ -174,10 +174,18 @@ if [[ -z $DATASETTORUN && ( -z "$EXPTYPE" || "$EXPTYPE" == "KERNEL" ) ]]; then echo ">>> [ ] [x] Testing semiring axpy versus hardcoded axpy over" echo " 10 000 000 doubles, using the OpenMP reference backend" echo " " - ${TEST_BIN_DIR}/fma-openmp &> ${TEST_OUT_DIR}/fma-openmp 10000000 0 - head -1 ${TEST_OUT_DIR}/fma-openmp - tail -2 ${TEST_OUT_DIR}/fma-openmp - egrep 'label|Overall timings|0,' ${TEST_OUT_DIR}/fma-openmp | grep -v Outer >> ${TEST_OUT_DIR}/benchmarks + ${TEST_BIN_DIR}/fma-blocking &> ${TEST_OUT_DIR}/fma-blocking 10000000 0 + head -1 ${TEST_OUT_DIR}/fma-blocking + tail -2 ${TEST_OUT_DIR}/fma-blocking + egrep 'label|Overall timings|0,' ${TEST_OUT_DIR}/fma-blocking | grep -v Outer >> ${TEST_OUT_DIR}/benchmarks + + echo ">>> [ ] [x] Testing semiring axpy versus hardcoded axpy over" + echo " 10 000 000 doubles, using the nonblocking backend" + echo " " + ${TEST_BIN_DIR}/fma-nonblocking &> ${TEST_OUT_DIR}/fma-nonblocking 10000000 0 + head -1 ${TEST_OUT_DIR}/fma-nonblocking + tail -2 ${TEST_OUT_DIR}/fma-nonblocking + egrep 'label|Overall timings|0,' ${TEST_OUT_DIR}/fma-nonblocking | grep -v Outer >> ${TEST_OUT_DIR}/benchmarks echo ">>> [ ] [x] Testing monoid reduce versus hardcoded reduce over" echo " 10 000 000 doubles, using the OpenMP reference backend"