diff --git a/tests/performance/CMakeLists.txt b/tests/performance/CMakeLists.txt
index 84d2147d2..5856af2b0 100644
--- a/tests/performance/CMakeLists.txt
+++ b/tests/performance/CMakeLists.txt
@@ -33,11 +33,16 @@ add_grb_executables( fma fma.cpp $<TARGET_OBJECTS:bench_kernels>
 	ADDITIONAL_LINK_LIBRARIES "rt"
 )
 
-add_grb_executables( fma-openmp fma.cpp $<TARGET_OBJECTS:bench_kernels_omp>
+add_grb_executables( fma-blocking fma.cpp $<TARGET_OBJECTS:bench_kernels_omp>
 	BACKENDS reference_omp NO_BACKEND_NAME
 	ADDITIONAL_LINK_LIBRARIES OpenMP::OpenMP_CXX "rt"
 )
 
+add_grb_executables( fma-nonblocking fma.cpp $<TARGET_OBJECTS:bench_kernels_omp>
+	BACKENDS nonblocking NO_BACKEND_NAME
+	ADDITIONAL_LINK_LIBRARIES OpenMP::OpenMP_CXX "rt"
+)
+
 add_grb_executables( reduce reduce.cpp $<TARGET_OBJECTS:bench_kernels>
 	BACKENDS reference NO_BACKEND_NAME
 )
diff --git a/tests/performance/bench_kernels.c b/tests/performance/bench_kernels.c
index 98523c0cd..105f55a8d 100644
--- a/tests/performance/bench_kernels.c
+++ b/tests/performance/bench_kernels.c
@@ -30,9 +30,25 @@ void bench_kernels_axpy(
 	assert( a != x );
 	assert( a != y );
 	assert( x != y );
-	#pragma omp parallel for schedule(static)
-	for( size_t i = 0; i < n; ++i ) {
-		a[ i ] = alpha * x[ i ] + y[ i ];
+	#pragma omp parallel
+	{
+		const size_t P = omp_get_num_threads();
+		const size_t s = omp_get_thread_num();
+		const size_t chunk = (n % P == 0) ? (n/P) : (n/P) + 1;
+		// thread s handles the half-open range [start, end); both bounds are
+		// clamped to n (not n - 1) so that the last element is not skipped
+		size_t start = chunk * s;
+		if( start > n ) {
+			start = n;
+		}
+		size_t end = start + chunk;
+		if( end > n ) {
+			end = n;
+		}
+		assert( start <= end );
+		for( size_t i = start; i < end; ++i ) {
+			a[ i ] = alpha * x[ i ] + y[ i ];
+		}
 	}
 }
 
diff --git a/tests/performance/fma.cpp b/tests/performance/fma.cpp
index a9ce4d845..73cda5cb1 100644
--- a/tests/performance/fma.cpp
+++ b/tests/performance/fma.cpp
@@ -87,13 +87,15 @@ void test( const struct Input &in, struct Output &out ) {
 	// set constant multiplicant to x
 	const double alpha = 2.0;
 
+	// WARNING: ALP incurs performance loss unless compiled using the nonblocking
+	//          backend
 	if( mode == TEMPLATED ) {
 		double ttime = timer.time();
 		// get cache `hot'
-		out.error = grb::eWiseMulAdd< grb::descriptors::dense >( zv, alpha, xv, yv,
-			reals );
+		out.error = grb::set< grb::descriptors::dense >( zv, yv );
+		out.error = out.error != SUCCESS ? out.error : grb::eWiseMul< grb::descriptors::dense >( zv, alpha, xv, reals );
 		if( out.error != SUCCESS ) {
-			std::cerr << "grb::eWiseMulAdd returns non-SUCCESS exit code "
+			std::cerr << "grb::set or grb::eWiseMul returns non-SUCCESS exit code "
 				<< grb::toString( out.error ) << "." << std::endl;
 			std::cout << "Test FAILED\n" << std::endl;
 			return;
@@ -108,24 +110,7 @@ void test( const struct Input &in, struct Output &out ) {
 		} else {
 			out.reps_used = in.rep;
 		}
-		out.times.preamble = timer.time();
-		timer.reset();
-		// benchmark templated axpy
-		for( size_t i = 0; i < out.reps_used; ++i ) {
-			out.error = grb::set( zv, 0 );
-			if( out.error != grb::SUCCESS ) {
-				std::cerr << "Error during clearing of zv "
-					<< grb::toString( out.error ) << std::endl;
-				std::cout << "Test FAILED\n" << std::endl;
-				return;
-			}
-			(void) grb::eWiseMulAdd< grb::descriptors::dense >( zv, alpha, xv, yv,
-				reals );
-		}
-		out.times.useful = timer.time() / static_cast< double >( out.reps_used );
-
-		// postamble
-		timer.reset();
+		// verify
 		double checksum = 0;
 		for( size_t i = 0; i < in.n; ++i ) {
 			checksum += z[ i ];
@@ -138,7 +123,19 @@ void test( const struct Input &in, struct Output &out ) {
 			}
 		}
 		std::cout << "Checksum: " << checksum << std::endl;
-		out.times.postamble = timer.time();
+		out.times.preamble = timer.time();
+
+		// benchmark ALP axpy
+		timer.reset();
+		for( size_t i = 0; i < out.reps_used; ++i ) {
+			// zv[ i ] = alpha * xv[ i ] + yv[ i ]
+			(void) grb::set< grb::descriptors::dense >( zv, yv );
+			(void) grb::eWiseMul< grb::descriptors::dense >( zv, alpha, xv, reals );
+		}
+		out.times.useful = timer.time() / static_cast< double >( out.reps_used );
+
+		// postamble
+		out.times.postamble = 0;
 	}
 
 	if( mode == LAMBDA ) {
@@ -146,6 +143,7 @@ void test( const struct Input &in, struct Output &out ) {
 		// get cache `hot'
 		out.error = grb::eWiseLambda(
 			[ &zv, &alpha, &xv, &yv, &reals ]( const size_t i ) {
+				// zv[ i ] = alpha * xv[ i ] + yv[ i ]
 				(void) grb::apply( zv[ i ], alpha, xv[ i ],
 					reals.getMultiplicativeOperator() );
 				(void) grb::foldl( zv[ i ], yv[ i ], reals.getAdditiveOperator() );
@@ -311,8 +309,8 @@ int main( int argc, char ** argv ) {
 	grb::Benchmarker< AUTOMATIC > bench;
 
 	// start functional test
-	std::cout << "\nBenchmark label: grb::eWiseApply (axpy) of size " << in.n
-		<< std::endl;
+	std::cout << "\nBenchmark label: grb::set + grb::eWiseMul (axpy) of size "
+		<< in.n << std::endl;
 	out.error = SUCCESS;
 	grb::RC rc = bench.exec( &(test< TEMPLATED >), in, out, 1, outer, true );
 	if( rc != SUCCESS || out.error != SUCCESS ) {
@@ -320,17 +318,17 @@ int main( int argc, char ** argv ) {
 			<< "Benchmarker reports: " << grb::toString( rc )
 			<< "; test reports:"  << grb::toString( out.error ) << "." << std::endl;
 		std::cout << "Test FAILED\n" << std::endl;
-		return EXIT_FAILURE;
+		return 40;
 	}
 	std::cout << "\nBenchmark label: grb::eWiseLambda (axpy) of size " << in.n
 		<< std::endl;
 	rc = bench.exec( &(test< LAMBDA >), in, out, 1, outer, true );
-		if( rc != SUCCESS || out.error != SUCCESS ) {
+	if( rc != SUCCESS || out.error != SUCCESS ) {
 		std::cerr << "Functional test exits with nonzero exit code. "
 			<< "Benchmarker reports: " << grb::toString( rc )
 			<< "; test reports:"  << grb::toString( out.error ) << "." << std::endl;
 		std::cout << "Test FAILED\n" << std::endl;
-		return EXIT_FAILURE;
+		return 50;
 	}
 
 	std::cout << "\nBenchmark label: compiler-optimised axpy of size " << in.n
@@ -341,7 +339,7 @@ int main( int argc, char ** argv ) {
 			<< "Benchmarker reports: " << grb::toString( rc )
 			<< "; test reports:"  << grb::toString( out.error ) << "." << std::endl;
 		std::cout << "Test FAILED\n" << std::endl;
-		return EXIT_FAILURE;
+		return 60;
 	}
 
 	std::cout << "NOTE: please check the above performance figures manually-- "
diff --git a/tests/performance/performancetests.sh b/tests/performance/performancetests.sh
index 576b20fc5..268045b88 100755
--- a/tests/performance/performancetests.sh
+++ b/tests/performance/performancetests.sh
@@ -174,10 +174,18 @@ if [[ -z $DATASETTORUN && ( -z "$EXPTYPE" || "$EXPTYPE" == "KERNEL" ) ]]; then
 	echo ">>>      [ ]           [x]       Testing semiring axpy versus hardcoded axpy over"
 	echo "                                 10 000 000 doubles, using the OpenMP reference backend"
 	echo " "
-	${TEST_BIN_DIR}/fma-openmp &> ${TEST_OUT_DIR}/fma-openmp 10000000 0
-	head -1 ${TEST_OUT_DIR}/fma-openmp
-	tail -2 ${TEST_OUT_DIR}/fma-openmp
-	egrep 'label|Overall timings|0,' ${TEST_OUT_DIR}/fma-openmp | grep -v Outer >> ${TEST_OUT_DIR}/benchmarks
+	${TEST_BIN_DIR}/fma-blocking &> ${TEST_OUT_DIR}/fma-blocking 10000000 0
+	head -1 ${TEST_OUT_DIR}/fma-blocking
+	tail -2 ${TEST_OUT_DIR}/fma-blocking
+	egrep 'label|Overall timings|0,' ${TEST_OUT_DIR}/fma-blocking | grep -v Outer >> ${TEST_OUT_DIR}/benchmarks
+
+	echo ">>>      [ ]           [x]       Testing semiring axpy versus hardcoded axpy over"
+	echo "                                 10 000 000 doubles, using the nonblocking backend"
+	echo " "
+	${TEST_BIN_DIR}/fma-nonblocking &> ${TEST_OUT_DIR}/fma-nonblocking 10000000 0
+	head -1 ${TEST_OUT_DIR}/fma-nonblocking
+	tail -2 ${TEST_OUT_DIR}/fma-nonblocking
+	egrep 'label|Overall timings|0,' ${TEST_OUT_DIR}/fma-nonblocking | grep -v Outer >> ${TEST_OUT_DIR}/benchmarks
 
 	echo ">>>      [ ]           [x]       Testing monoid reduce versus hardcoded reduce over"
 	echo "                                 10 000 000 doubles, using the OpenMP reference backend"