Skip to content

Commit

Permalink
WIP commit, apologies. Some errors in openmp variants with this version
Browse files Browse the repository at this point in the history
  • Loading branch information
anyzelman committed Jan 8, 2025
1 parent a5f7121 commit f55a89c
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 36 deletions.
7 changes: 6 additions & 1 deletion tests/performance/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,16 @@ add_grb_executables( fma fma.cpp $<TARGET_OBJECTS:bench_kernels>
ADDITIONAL_LINK_LIBRARIES "rt"
)

add_grb_executables( fma-openmp fma.cpp $<TARGET_OBJECTS:bench_kernels_omp>
add_grb_executables( fma-blocking fma.cpp $<TARGET_OBJECTS:bench_kernels_omp>
BACKENDS reference_omp NO_BACKEND_NAME
ADDITIONAL_LINK_LIBRARIES OpenMP::OpenMP_CXX "rt"
)

add_grb_executables( fma-nonblocking fma.cpp $<TARGET_OBJECTS:bench_kernels_omp>
BACKENDS nonblocking NO_BACKEND_NAME
ADDITIONAL_LINK_LIBRARIES OpenMP::OpenMP_CXX "rt"
)

add_grb_executables( reduce reduce.cpp $<TARGET_OBJECTS:bench_kernels>
BACKENDS reference NO_BACKEND_NAME
)
Expand Down
22 changes: 19 additions & 3 deletions tests/performance/bench_kernels.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,25 @@ void bench_kernels_axpy(
assert( a != x );
assert( a != y );
assert( x != y );
#pragma omp parallel for schedule(static)
for( size_t i = 0; i < n; ++i ) {
a[ i ] = alpha * x[ i ] + y[ i ];
#pragma omp parallel
{
const size_t P = omp_get_num_threads();
const size_t s = omp_get_thread_num();
const size_t chunk = (n % P == 0) ? (n/P) : (n/P) + 1;
size_t start = chunk * s;
if( start > n - 1 ) {
start = n - 1;
}
size_t end = start + chunk;
if( end > n - 1 ) {
end = n - 1;
}
assert( start <= end );
if( start != end ) {
for( size_t i = start; i < end; ++i ) {
a[ i ] = alpha * x[ i ] + y[ i ];
}
}
}
}

Expand Down
54 changes: 26 additions & 28 deletions tests/performance/fma.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,13 +87,15 @@ void test( const struct Input &in, struct Output &out ) {
// set the constant multiplicand for x
const double alpha = 2.0;

// WARNING: ALP incurs a performance loss unless compiled using the nonblocking
// backend
if( mode == TEMPLATED ) {
double ttime = timer.time();
// get cache `hot'
out.error = grb::eWiseMulAdd< grb::descriptors::dense >( zv, alpha, xv, yv,
reals );
out.error = grb::set< grb::descriptors::dense >( zv, yv );
out.error = grb::eWiseMul< grb::descriptors::dense >( zv, alpha, xv, reals );
if( out.error != SUCCESS ) {
std::cerr << "grb::eWiseMulAdd returns non-SUCCESS exit code "
std::cerr << "grb::eWiseMul returns non-SUCCESS exit code "
<< grb::toString( out.error ) << "." << std::endl;
std::cout << "Test FAILED\n" << std::endl;
return;
Expand All @@ -108,24 +110,7 @@ void test( const struct Input &in, struct Output &out ) {
} else {
out.reps_used = in.rep;
}
out.times.preamble = timer.time();
timer.reset();
// benchmark templated axpy
for( size_t i = 0; i < out.reps_used; ++i ) {
out.error = grb::set( zv, 0 );
if( out.error != grb::SUCCESS ) {
std::cerr << "Error during clearing of zv "
<< grb::toString( out.error ) << std::endl;
std::cout << "Test FAILED\n" << std::endl;
return;
}
(void) grb::eWiseMulAdd< grb::descriptors::dense >( zv, alpha, xv, yv,
reals );
}
out.times.useful = timer.time() / static_cast< double >( out.reps_used );

// postamble
timer.reset();
// verify
double checksum = 0;
for( size_t i = 0; i < in.n; ++i ) {
checksum += z[ i ];
Expand All @@ -138,14 +123,27 @@ void test( const struct Input &in, struct Output &out ) {
}
}
std::cout << "Checksum: " << checksum << std::endl;
out.times.postamble = timer.time();
out.times.preamble = timer.time();

// benchmark ALP axpy
timer.reset();
for( size_t i = 0; i < out.reps_used; ++i ) {
// zv[ i ] = alpha * xv[ i ] + yv[ i ]
(void) grb::set< grb::descriptors::dense >( zv, yv );
(void) grb::eWiseMul< grb::descriptors::dense >( zv, alpha, xv, reals );
}
out.times.useful = timer.time() / static_cast< double >( out.reps_used );

// postamble
out.times.postamble = 0;
}

if( mode == LAMBDA ) {
double ltime = timer.time();
// get cache `hot'
out.error = grb::eWiseLambda(
[ &zv, &alpha, &xv, &yv, &reals ]( const size_t i ) {
// zv[ i ] = alpha * xv[ i ] + yv[ i ]
(void) grb::apply( zv[ i ], alpha, xv[ i ],
reals.getMultiplicativeOperator() );
(void) grb::foldl( zv[ i ], yv[ i ], reals.getAdditiveOperator() );
Expand Down Expand Up @@ -311,26 +309,26 @@ int main( int argc, char ** argv ) {
grb::Benchmarker< AUTOMATIC > bench;

// start functional test
std::cout << "\nBenchmark label: grb::eWiseApply (axpy) of size " << in.n
<< std::endl;
std::cout << "\nBenchmark label: grb::set + grb::eWiseMul (axpy) of size "
<< in.n << std::endl;
out.error = SUCCESS;
grb::RC rc = bench.exec( &(test< TEMPLATED >), in, out, 1, outer, true );
if( rc != SUCCESS || out.error != SUCCESS ) {
std::cerr << "Functional test exits with nonzero exit code. "
<< "Benchmarker reports: " << grb::toString( rc )
<< "; test reports:" << grb::toString( out.error ) << "." << std::endl;
std::cout << "Test FAILED\n" << std::endl;
return EXIT_FAILURE;
return 40;
}
std::cout << "\nBenchmark label: grb::eWiseLambda (axpy) of size " << in.n
<< std::endl;
rc = bench.exec( &(test< LAMBDA >), in, out, 1, outer, true );
if( rc != SUCCESS || out.error != SUCCESS ) {
if( rc != SUCCESS || out.error != SUCCESS ) {
std::cerr << "Functional test exits with nonzero exit code. "
<< "Benchmarker reports: " << grb::toString( rc )
<< "; test reports:" << grb::toString( out.error ) << "." << std::endl;
std::cout << "Test FAILED\n" << std::endl;
return EXIT_FAILURE;
return 50;
}

std::cout << "\nBenchmark label: compiler-optimised axpy of size " << in.n
Expand All @@ -341,7 +339,7 @@ int main( int argc, char ** argv ) {
<< "Benchmarker reports: " << grb::toString( rc )
<< "; test reports:" << grb::toString( out.error ) << "." << std::endl;
std::cout << "Test FAILED\n" << std::endl;
return EXIT_FAILURE;
return 60;
}

std::cout << "NOTE: please check the above performance figures manually-- "
Expand Down
16 changes: 12 additions & 4 deletions tests/performance/performancetests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -174,10 +174,18 @@ if [[ -z $DATASETTORUN && ( -z "$EXPTYPE" || "$EXPTYPE" == "KERNEL" ) ]]; then
echo ">>> [ ] [x] Testing semiring axpy versus hardcoded axpy over"
echo " 10 000 000 doubles, using the OpenMP reference backend"
echo " "
${TEST_BIN_DIR}/fma-openmp &> ${TEST_OUT_DIR}/fma-openmp 10000000 0
head -1 ${TEST_OUT_DIR}/fma-openmp
tail -2 ${TEST_OUT_DIR}/fma-openmp
egrep 'label|Overall timings|0,' ${TEST_OUT_DIR}/fma-openmp | grep -v Outer >> ${TEST_OUT_DIR}/benchmarks
${TEST_BIN_DIR}/fma-blocking &> ${TEST_OUT_DIR}/fma-blocking 10000000 0
head -1 ${TEST_OUT_DIR}/fma-blocking
tail -2 ${TEST_OUT_DIR}/fma-blocking
egrep 'label|Overall timings|0,' ${TEST_OUT_DIR}/fma-blocking | grep -v Outer >> ${TEST_OUT_DIR}/benchmarks

echo ">>> [ ] [x] Testing semiring axpy versus hardcoded axpy over"
echo " 10 000 000 doubles, using the nonblocking backend"
echo " "
${TEST_BIN_DIR}/fma-nonblocking &> ${TEST_OUT_DIR}/fma-nonblocking 10000000 0
head -1 ${TEST_OUT_DIR}/fma-nonblocking
tail -2 ${TEST_OUT_DIR}/fma-nonblocking
egrep 'label|Overall timings|0,' ${TEST_OUT_DIR}/fma-nonblocking | grep -v Outer >> ${TEST_OUT_DIR}/benchmarks

echo ">>> [ ] [x] Testing monoid reduce versus hardcoded reduce over"
echo " 10 000 000 doubles, using the OpenMP reference backend"
Expand Down

0 comments on commit f55a89c

Please sign in to comment.