diff --git a/.github/workflows/dbg_smoke.yml b/.github/workflows/dbg_smoke.yml
new file mode 100644
index 000000000..8fb2795ce
--- /dev/null
+++ b/.github/workflows/dbg_smoke.yml
@@ -0,0 +1,42 @@
+
+name: debug-smoke-tests
+
+on: [push]
+
+env:
+ BUILD_TYPE: Debug
+
+jobs:
+ build:
+ runs-on: ubuntu-latest
+
+ steps:
+ - uses: actions/checkout@v3
+
+ - name: Install required packages
+ run: sudo apt-get install -y libnuma-dev
+
+ - name: Configure
+ run: mkdir build && cd build && ../bootstrap.sh --prefix=../install --debug-build
+
+ - name: Build
+ working-directory: ${{github.workspace}}/build
+ run: make -j4
+
+ - name: Install
+ working-directory: ${{github.workspace}}/build
+ run: make -j4 install
+
+ - name: Test
+ working-directory: ${{github.workspace}}/build
+ run: make -j4 smoketests &> smoketests.log
+
+ - name: Check
+ working-directory: ${{github.workspace}}/build
+ run: ../tests/summarise.sh smoketests.log
+
+ - name: DumpLogOnFailure
+ if: failure()
+ working-directory: ${{github.workspace}}/build
+ run: cat smoketests.log
+
diff --git a/.github/workflows/smoke.yml b/.github/workflows/smoke.yml
index c302ebdb2..884b2f74f 100644
--- a/.github/workflows/smoke.yml
+++ b/.github/workflows/smoke.yml
@@ -1,5 +1,5 @@
-name: smoke-tests
+name: release-smoke-tests
on: [push]
@@ -35,3 +35,8 @@ jobs:
working-directory: ${{github.workspace}}/build
run: ../tests/summarise.sh smoketests.log
+ - name: DumpLogOnFailure
+ if: failure()
+ working-directory: ${{github.workspace}}/build
+ run: cat smoketests.log
+
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index c6a81c040..60b3410d6 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -30,7 +30,7 @@ stages:
# exclude:
# - build/**/*.o
# - build/**/*.o.d
-# expire_in: 30 minutes
+# expire_in: 80 minutes
#build_debug_centos_8:
@@ -122,13 +122,16 @@ build_test:
- apt update && apt -y install make cmake libnuma-dev coreutils
script:
- mkdir -p install build && cd ./build && ../bootstrap.sh --prefix=../install && make -j$(nproc) build_tests_all
+ - strip -s $(find tests/unit/ -type f -executable -print) $(find tests/smoke/ -type f -executable -print) $(find tests/performance/ -type f -executable -print)
artifacts:
paths:
- build/
exclude:
- build/**/*.o
- build/**/*.o.d
- expire_in: 30 minutes
+ - build/**/CMakeFiles
+ - build/**/*.dir
+ expire_in: 80 minutes
build_debug2_tests:
@@ -222,7 +225,7 @@ build_debug:
exclude:
- build/**/*.o
- build/**/*.o.d
- expire_in: 30 minutes
+ expire_in: 43 minutes
test_smoke_debug:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a7dc72dd2..344216e50 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,7 +27,7 @@
cmake_minimum_required( VERSION 3.13 )
set( MAJORVERSION 0 )
-set( MINORVERSION 6 )
+set( MINORVERSION 7 )
set( BUGVERSION 0 )
set( VERSION "${MAJORVERSION}.${MINORVERSION}.${BUGVERSION}" )
@@ -51,6 +51,8 @@ endif()
# to choose backends and dependencies
option( WITH_REFERENCE_BACKEND "With Reference backend" ON )
option( WITH_OMP_BACKEND "With OMP backend" ON )
+option( WITH_HYPERDAGS_BACKEND "With Hyperdags backend" ON )
+option( WITH_NONBLOCKING_BACKEND "With Nonblocking backend" ON )
option( WITH_NUMA "With NUMA support" ON )
option( LPF_INSTALL_PATH "Path to the LPF tools for the BSP1D and Hybrid backends" OFF )
# the following options depend on LPF_INSTALL_PATH being set
@@ -61,6 +63,9 @@ LPF_INSTALL_PATH set)" ON LPF_INSTALL_PATH OFF
cmake_dependent_option( WITH_HYBRID_BACKEND "Also build the Hybrid backend \
(needs LPF_INSTALL_PATH set)" ON LPF_INSTALL_PATH OFF
)
+# other dependent options
+cmake_dependent_option( WITH_HYPERDAGS_BACKEND "Building the Hyperdags backend needs \
+ WITH_HYPERDAGS_USING set" ON WITH_HYPERDAGS_USING OFF )
# to customize build flags for either backends or tests
option( COMMON_COMPILE_DEFINITIONS
"Compilation definitions for BOTH backends and tests; they override the defaults"
@@ -117,6 +122,7 @@ endif()
if( NOT WITH_REFERENCE_BACKEND AND
NOT WITH_OMP_BACKEND AND
+ NOT WITH_NONBLOCKING_BACKEND AND
NOT WITH_BSP1D_BACKEND AND
NOT WITH_HYBRID_BACKEND )
message( FATAL_ERROR "At least one backend should be enabled")
@@ -188,13 +194,18 @@ endif()
# by default no headers are built
set( WITH_REFERENCE_BACKEND_HEADERS OFF )
set( WITH_OMP_BACKEND_HEADERS OFF )
+set( WITH_HYPERDAGS_BACKEND_HEADERS OFF )
# activate headers based on requested backends
-if( WITH_REFERENCE_BACKEND OR WITH_BSP1D_BACKEND )
- # both reference and bsp1d backends need reference headers
+if( WITH_REFERENCE_BACKEND OR WITH_BSP1D_BACKEND OR WITH_NONBLOCKING_BACKEND )
+ # reference, bsp1d and nonblocking backends need reference headers
set( WITH_REFERENCE_BACKEND_HEADERS ON )
endif()
+if( WITH_HYPERDAGS_BACKEND )
+ set( WITH_HYPERDAGS_BACKEND_HEADERS ON )
+endif()
+
if( WITH_OMP_BACKEND OR WITH_HYBRID_BACKEND )
	# both reference_omp and hybrid backends need reference headers
set( WITH_OMP_BACKEND_HEADERS ON )
@@ -218,13 +229,28 @@ add_subdirectory( examples )
### DOXYGEN DOCUMENTATION GENERATION
-set( DOCS_DIR "${PROJECT_SOURCE_DIR}/docs/code" )
+set( DOCS_DIR "${PROJECT_SOURCE_DIR}/docs/developer" )
add_custom_command( OUTPUT "${DOCS_DIR}"
- COMMAND bash -c "if [[ ! -d docs/code ]]; then doxygen docs/doxy.conf &> doxygen.log; fi"
+ COMMAND bash -c "doxygen docs/doxy.conf &> doxygen-developer.log;"
WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}"
DEPENDS "${PROJECT_SOURCE_DIR}/docs/doxy.conf"
COMMENT "producing code documentation in ${DOCS_DIR}"
VERBATIM
#USES_TERMINAL
)
-add_custom_target( docs DEPENDS "${DOCS_DIR}" )
+add_custom_target( devdocs DEPENDS "${DOCS_DIR}" )
+
+set( PUBLIC_DOCS_DIR "${PROJECT_SOURCE_DIR}/docs/user" )
+add_custom_command( OUTPUT "${PUBLIC_DOCS_DIR}"
+ COMMAND bash -c "doxygen docs/user.conf &> doxygen-user.log;"
+ WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}"
+ DEPENDS "${PROJECT_SOURCE_DIR}/docs/user.conf"
+ COMMENT "producing public code documentation in ${PUBLIC_DOCS_DIR}"
+ VERBATIM
+)
+add_custom_target( userdocs DEPENDS "${PUBLIC_DOCS_DIR}" )
+add_custom_target( docs )
+add_dependencies( docs userdocs devdocs )
+
+message( "Compiling with the following backends: ${AVAILABLE_BACKENDS}\n" )
+
diff --git a/NOTICE b/NOTICE
index 3f1bf625d..3c370eca4 100644
--- a/NOTICE
+++ b/NOTICE
@@ -29,6 +29,8 @@ to Huawei Technologies Co., Ltd. or one of its subsidiaries:
- Auke Booij, Huawei Technologies Switzerland AG; 2021.
+ - Anders Hansson, Huawei Technologies Switzerland AG; 2022-2023.
+
The experimental banshee backend has been developed in collaboration with
Prof. Luca Benini at ETH Zuerich and his group. In particular this backend
is with great thanks due to Dan, Paul Scheffler, Fabian Schuiki, and Samuel
diff --git a/README.md b/README.md
index ae65c9547..ff0b89d1e 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,10 @@
- _____ .____ __________ /\ ________ .__ __________.____ _____ _________
- / _ \ | | \______ \ / / / _____/___________ ______ | |__\______ \ | / _ \ / _____/
- / /_\ \| | | ___/ / / / \ __\_ __ \__ \ \____ \| | \| | _/ | / /_\ \ \_____ \
-/ | \ |___| | / / \ \_\ \ | \// __ \| |_> > Y \ | \ |___/ | \/ \
-\____|__ /_______ \____| / / \______ /__| (____ / __/|___| /______ /_______ \____|__ /_______ /
- \/ \/ \/ \/ \/|__| \/ \/ \/ \/ \/
+ _____ .____ __________
+ / _ \ | | \______ \
+ / /_\ \| | | ___/
+/ | \ |___| |
+\____|__ /_______ \____|
+ \/ \/
Copyright 2021 Huawei Technologies Co., Ltd.
@@ -22,6 +22,31 @@ limitations under the License.
+This distribution contains the C++ Algebraic Programming (ALP) framework, and
+provides the ALP/GraphBLAS, ALP/Pregel, and Sparse BLAS programming interfaces.
+Only a subset of Sparse BLAS functionality is supported, at present.
+
+This distribution contains ALP backends that generate:
+ - sequential programs,
+ - shared-memory auto-parallelised programs,
+ - nonblocking shared-memory auto-parallelised programs, and
+ - sequential programs that generate HyperDAG representations of the executed
+ ALP program.
+
+Additional backends may optionally be enabled by providing their dependences.
+Those backends generate:
+ - distributed-memory auto-parallelised programs,
+ - hybrid shared- and distributed-memory auto-parallelised programs, and
+ - sequential programs for the Banshee RISC-V Snitch Core simulator
+ (experimental).
+
+All backends automatically generate vectorised programs, amongst other
+automatically-applied optimisations.
+
+The ALP/GraphBLAS and ALP/Pregel interfaces are enabled for all backends, while
+the standard Sparse BLAS APIs only allow for the efficient support of the
+sequential and shared-memory parallel backends.
+
# Minimal requirements
@@ -31,7 +56,7 @@ libraries and programs, using its `reference` and `reference_omp` backends.
## Compilation
-To compile ALP/GraphBLAS, you need the following tools:
+To compile ALP, you need the following tools:
1. A C++11-capable compiler such as GCC 4.8.2 or higher, with OpenMP support
2. LibNUMA development headers
@@ -40,7 +65,7 @@ To compile ALP/GraphBLAS, you need the following tools:
(CMake's default build tool on UNIX systems) or any other supported build tool.
## Linking and run-time
-The ALP/GraphBLAS libraries link against the following libraries:
+The ALP libraries link against the following libraries:
1. LibNUMA: `-lnuma`
2. Standard math library: `-lm`
@@ -60,15 +85,15 @@ of the LPF core library and its collectives library. The LPF library has its
further dependences, which are all summarised on the LPF project page:
* [Gitee](https://gitee.com/CSL-ALP/lpf);
-* [Github](https://github.com/Algebraic-Programming/LPF).
+* [GitHub](https://github.com/Algebraic-Programming/LPF).
-The dependence on LPF applies to compilation, linking, and run-time. Fulfulling
+The dependence on LPF applies to compilation, linking, and run-time. Fulfilling
the dependence enables the `bsp1d` and `hybrid` ALP/GraphBLAS backends.
## Code documentation
For generating the code documentations:
-* `doyxgen` reads code comments and generates the documentation;
+* `doxygen` reads code comments and generates the documentation;
* `graphviz` generates various diagrams for inheritance, call paths, etc.;
* `pdflatex` is required to build the PDF file out of the Latex generated
documentation.
@@ -76,12 +101,12 @@ For generating the code documentations:
# Very quick start
-Here are example steps to compile and install ALP/GraphBLAS for shared-memory
-machines, without distributed-memory support. The last three commands show-case
-the compilation and execution of the `sp.cpp` example program.
+Here are example steps to compile and install ALP for shared-memory machines
+without distributed-memory support. The last three commands show-case the
+compilation and execution of the `sp.cpp` example program.
```bash
-cd
+cd
mkdir build
cd build
../bootstrap.sh --prefix=../install
@@ -101,35 +126,37 @@ In more detail, the steps to follow are:
that `config::SIMD_SIZE::bytes` defined in that file is set correctly with
respect to the target architecture.
-2. Create an empty directory for building ALP/GraphBLAS and move into it:
+2. Create an empty directory for building ALP and move into it:
`mkdir build && cd build`.
-3. Invoke the `bootstrap.sh` script located inside the ALP/GraphBLAS root directory
- `` to generate the build infrastructure via CMake inside the
- current directory:
+3. Invoke the `bootstrap.sh` script located inside the ALP root directory
+ `` to generate the build infrastructure via CMake inside the
+ current directory:
- `/bootstrap.sh --prefix=`
+ `/bootstrap.sh --prefix=`
- note: add `--with-lpf=/path/to/lpf/install/dir` if you have LPF installed
and would like to use it.
-4. Issue `make -j` to compile the C++11 ALP/GraphBLAS library for the configured
- backends.
+4. Issue `make -j` to compile the C++11 ALP library for the configured backends.
5. (*Optional*) To later run all unit tests, several datasets must be made
- available. Please run the `/tools/downloadDatasets.sh`
+ available. Please run the `/tools/downloadDatasets.sh`
script for
a. an overview of datasets required for the basic tests, as well as
b. the option to automatically download them.
-6. (*Optional*) To make the ALP/GraphBLAS documentation, issue `make docs`. This
+6. (*Optional*) To make the ALP documentation, issue `make userdocs`. This
generates both
- a. a PDF in `/docs/code/latex/refman.pdf`, and
+ a. LaTeX in `/docs/user/latex/refman.tex`, and
+
+ b. HTML in `/docs/user/html/index.html`.
- b. HTML in `/docs/code/html/index.html`.
+ To build a PDF from the LaTeX sources, cd into the directory mentioned, and
+ issue `make`.
7. (*Optional*) Issue `make -j smoketests` to run a quick set of functional
tests. Please scan the output for any failed tests.
@@ -138,20 +165,20 @@ In more detail, the steps to follow are:
the default command lines the tests script uses are likely wrong. In this
case, please edit `tests/parse_env.sh` by searching for the MPI
implementation you used, and uncomment the lines directly below each
- occurance.
+ occurrence.
8. (*Optional*) Issue `make -j unittests` to run an exhaustive set of unit
tests. Please scan the output for any failed tests.
If you do this with LPF enabled, please edit `tests/parse_env.sh` if required
as described in step 5.
-9. Issue `make -j install` to install ALP/GraphBLAS into your
-install directory configured during step 1.
+9. Issue `make -j install` to install ALP into the install directory configured
+ during step 1.
-10. (*Optional*) Issue `source /bin/setenv` to make available the
-`grbcxx` and `grbrun` compiler wrapper and runner.
+10. (*Optional*) Issue `source /bin/setenv` to make
+ available the `grbcxx` and `grbrun` compiler wrapper and runner.
-Congratulations, you are now ready for developing and integrating ALP/GraphBLAS
+Congratulations, you are now ready for developing and integrating ALP
algorithms! Any feedback, questions, or problem reports are most welcome at
@@ -161,10 +188,12 @@ algorithms! Any feedback, question, problem reports are most welcome at
# Additional Contents
-The remainder of this file summarises other build system targets, how to
-integrate ALP algorithms into applications, debugging, development, and,
-finally, acknowledges contributors and lists technical papers.
+The remainder of this file summarises configuration options, additional build
+system targets, how to integrate ALP programs into applications, debugging, and
+contribute to ALP development. Finally, this README acknowledges contributors
+and lists technical papers.
+- [Configuration](#configuration)
- [Overview of the main Makefile targets](#overview-of-the-main-makefile-targets)
- [Automated performance testing](#automated-performance-testing)
- [Integrating ALP with applications](#integrating-alp-with-applications)
@@ -181,7 +210,97 @@ finally, acknowledges contributors and lists technical papers.
- [Debugging](#debugging)
- [Development in ALP](#development-in-alp)
- [Acknowledgements](#acknowledgements)
-- [Citing ALP and ALP/GraphBLAS](#citing-alp-and-alpgraphblas)
+- [Citing ALP, ALP/GraphBLAS, and ALP/Pregel](#citing-alp-alpgraphblas-and-alppregel)
+
+
+# Configuration
+
+ALP employs configuration headers that contain `constexpr` settings that take
+effect every time ALP programs are compiled. Multiple object files that were
+compiled using ALP must all have been compiled using the same configuration
+settings-- linking objects that have been compiled with a mixture of
+configurations is likely to incur undefined behaviour. The recommendation is
+to set a configuration before building and installing ALP, and to keep the
+installation directories read-only so that configurations remain static.
+
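+A minimal sketch of this recommendation, assuming the build flow shown
+elsewhere in this README and using a plain `chmod` to render the installation
+read-only:
+
+```bash
+# 1. fix the configuration by editing the header(s) described below
+# 2. build and install ALP as usual
+mkdir build && cd build
+../bootstrap.sh --prefix=../install
+make -j && make -j install
+# 3. lock the installation so the installed configuration remains static
+chmod -R a-w ../install
+```
+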
+There exists one main configuration file that affects all ALP backends, while
+other configuration files only affect a specific backend or only affect specific
+classes of backends. The main configuration file is found in
+`/include/graphblas/base/config.hpp`, which allows one to set the
+
+1. cache line size, in bytes, within the `CACHE_LINE_SIZE` class;
+2. SIMD width, in bytes, within the `SIMD_SIZE` class;
+3. default number of experiment repetitions during benchmarking, within the
+ `BENCHMARKING` class;
+4. L1 data cache size, in bytes, within `MEMORY::big_memory` class;
+5. from which size onwards memory allocations will be reported, in log-2
+ bytes, within `MEMORY::big_memory`;
+6. index type used for row coordinates, as the `RowIndexType` typedef;
+7. index type used for column coordinates, as the `ColIndexType` typedef;
+8. type used for indexing nonzeroes, as the `NonzeroIndexType` typedef;
+9. index type used for vector coordinates, as the `VectorIndexType` typedef.
+
+Other configuration values in this file are automatically inferred, are fixed
+non-configurable settings, or are presently not used by any ALP backend.
+
+## Reference and reference_omp backends
+
+The file `include/graphblas/reference/config.hpp` contains defaults that pertain
+to the auto-vectorising and sequential `reference` backend, but also to the
+shared-memory auto-parallelising `reference_omp` backend. It allows one to set
+
+1. whether prefetching is enabled in `PREFETCHING::enabled`;
+2. the prefetch distance in `PREFETCHING::distance`;
+3. the default memory allocation strategy for thread-local data in
+ `IMPLEMENTATION::defaultAllocMode()`;
+4. same, but for shared data amongst threads in
+ `IMPLEMENTATION::sharedAllocMode()`.
+
+Modifying any of the above should be done with utmost care as it typically
+affects the defaults across an ALP installation, and *all* programs compiled
+using it. Configuration elements not mentioned here should not be touched by
+users, and rather should concern ALP developers only.
+
+## OpenMP backends
+
+The file `include/graphblas/omp/config.hpp` contains some basic configuration
+parameters that affect any OpenMP-based backend. However, the configuration
+file does not contain any other user-modifiable settings, but rather contains
+a) some utilities that OpenMP-based backends may rely on, and b) defaults
+that are derived from other settings described above. These settings
+should only be overridden with compelling and expert knowledge.
+
+## LPF backends
+
+The file `include/graphblas/bsp/config.hpp` contains some basic configuration
+parameters that affect any LPF-based backend. It includes:
+
+1. an initial maximum of LPF memory slot registrations in `LPF::regs()`;
+2. an initial maximum of LPF messages in `LPF::maxh()`.
+
+If these defaults prove insufficient, the corresponding buffers are resized
+automatically during execution. Setting these defaults large enough will
+therefore chiefly prevent buffer resizes at run-time. Modifying them should
+normally not lead to significant performance differences.
+
+## Utilities
+
+The file `include/graphblas/utils/config.hpp` details configurations of various
+utility functions, including:
+
+1. a buffer size used during reading input files, in `PARSER::bsize()`;
+2. the block size of individual reads in `PARSER::read_bsize()`.
+
+These defaults are usually fine except when reading from SSDs, which would
+benefit from a larger `read_bsize`.
+
+## Others
+
+While there are various other configuration files (find `config.hpp`), the above
+should list all user-modifiable configuration settings of interest. The
+remainder pertain to configurations that are automatically deduced from the
+aforementioned settings, or pertain to settings that describe how to safely
+compose backends, and thus are of interest only to ALP developers.
# Overview of the main Makefile targets
@@ -190,7 +309,8 @@ The following table lists the main build targets of interest:
| Target | Explanation |
|----------------------:|---------------------------------------------------|
-| \[*default*\] | builds the ALP/GraphBLAS libraries and examples |
+| \[*default*\] | builds the ALP libraries and examples, including |
+| | Sparse BLAS libraries generated by ALP |
| `install` | install libraries, headers and some convenience |
| | scripts into the path set via `--prefix=` |
| `unittests` | builds and runs all available unit tests |
@@ -198,7 +318,12 @@ The following table lists the main build targets of interest:
| `perftests` | builds and runs all available performance tests |
| `tests` | builds and runs all available unit, smoke, and |
| | performance tests |
-| `docs` | builds HTML and LaTeX code and API documentation |
+| `userdocs` | builds HTML and LaTeX documentation corresponding |
+| | to the public ALP API |
+| `devdocs` | builds HTML and LaTeX code documentation for |
+| | developers of the ALP internals |
+| `docs` | builds both the user and developer code |
+| | documentation |
For more information about the testing harness, please refer to the
[related documentation](tests/Tests.md).
@@ -209,21 +334,20 @@ refer to the [the related documentation](docs/Build_and_test_infra.md).
# Automated performance testing
-To check in-depth performance of this ALP/GraphBLAS implementation, issue
-`make -j perftests`. This will run several algorithms in several ALP/GraphBLAS
+To check in-depth performance of this ALP implementation, issue
+`make -j perftests`. This will run several algorithms in several ALP
configurations. This generates three main output files:
-1. `/tests/performance/output`, which summarises the
- whole run;
+1. `/tests/performance/output`, which summarises the whole run;
-2. `/tests/performance/output/benchmarks`, which
- summarises the performance of individual algorithms; and
+2. `/tests/performance/output/benchmarks`, which summarises the
+ performance of individual algorithms; and
-3. `/tests/performance/output/scaling`, which
- summarises operator scaling results.
+3. `/tests/performance/output/scaling`, which summarises operator
+ scaling results.
-To ensure that all tests run, please ensure all related datasets are available
-as also described at step 5 of the quick start.
+To ensure that all tests run, please ensure that all related datasets are
+available, as also described at step 5 of the quick start.
With LPF enabled, please note the remark described at steps 3 and 7 of the quick
start guide. If LPF was not configured using MPICH, please review and apply any
@@ -232,24 +356,28 @@ necessary changes to `tests/performance/performancetests.sh`.
# Integrating ALP with applications
-There are several use cases in which ALP can be deployed and utilized, listed
-in the following. These assume that the user has installed ALP/GraphBLAS in a
-dedicated directory via `make install`.
+There are several use cases in which ALP can be deployed and utilised, listed
+in the following. These assume that the user has installed ALP in a dedicated
+directory via `make install`.
## Running ALP programs as standalone executables
### Implementation
The `grb::Launcher< AUTOMATIC >` class abstracts a group of user processes that
-should collaboratively execute any single ALP/GraphBLAS program. The
-ALP/GraphBLAS program of interest must have the following signature:
-`void grb_program( const T& input_data, U& output_data )`.
+should collaboratively execute any single ALP program. The ALP program of
+interest must have the following signature:
+
+```
+void grb_program( const T& input_data, U& output_data )
+```
+
The types `T` and `U` can be any plain-old-data (POD) type, including structs --
these can be used to broadcast input data from the master process to all user
processes (`input_data`) -- and for data to be sent back on exit of the parallel
-ALP/GraphBLAS program.
+ALP program.
-The above sending-and-receiving across processes applies only to ALP/GraphBLAS
+The above sending-and-receiving across processes applies only to ALP
implementations and backends that support or require multiple user processes;
both the sequential `reference` and the shared-memory parallel `reference_omp`
backends, for example, support only one user process.
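+
+As an illustration, a minimal hypothetical program following this signature is
+sketched below; the exact `grb::Launcher::exec` signature, in particular the
+trailing broadcast flag, is an assumption that should be checked against the
+installed headers:
+
+```c++
+#include <graphblas.hpp>
+
+// hypothetical ALP program: reads the broadcast input, writes one output
+void grb_program( const size_t &input, double &output ) {
+	output = static_cast< double >( input ) + 1.0;
+}
+
+int main() {
+	grb::Launcher< grb::AUTOMATIC > launcher;
+	size_t in = 41;
+	double out = 0.0;
+	// the input is broadcast to all user processes, while the output is
+	// retrieved from the user process with ID 0 (see below)
+	if( launcher.exec( &grb_program, in, out, true ) != grb::SUCCESS ) {
+		return 1;
+	}
+	return 0;
+}
+```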
@@ -258,11 +386,11 @@ In case of multiple user processes, the overhead of the broadcasting of input
data is linear in the number of user processes, as well as linear in the byte-
size of `T` which hence should be kept to a minimum. A recommended use of this
mechanism is, e.g., to broadcast input data locations; any additional I/O
-should use the parallel I/O mechanisms that ALP/GraphBLAS exposes to the ALP
-program itself.
+should use the parallel I/O mechanisms that ALP exposes to the ALP program
+itself.
Output data is retrieved only from the user process with ID `0`, even if
-multiple user processes exist. Some implemenations or systems may require
+multiple user processes exist. Some implementations or systems may require
sending back the output data to a calling process, even if there is only
one user process. The data movement cost incurred should hence be considered
linear in the byte size of `U`, and, similar to the input data broadcasting,
@@ -287,60 +415,67 @@ your programs using the ALP installation, the following flags are recommended:
Omitting these flags for brevity, some compilation examples follow.
-When using the LPF-enabled hybrid shared- and distributed-memory backend of
-ALP/GraphBLAS, simply use
+When using the LPF-enabled hybrid shared- and distributed-memory ALP backends,
+use
```bash
grbcxx -b hybrid
```
-as the compiler command. To show all flags that the wrapper passes on, please use
+
+as the compiler command. To show all flags that the wrapper passes on, please
+use
```bash
grbcxx -b hybrid --show
```
+
and append your regular compilation arguments.
-The `hybrid` backend is capable of spawning multiple ALP/GraphBLAS user
-processes. In contrast, compilation using
+The `hybrid` backend is capable of spawning multiple ALP user processes. In
+contrast, compilation using
```bash
grbcxx -b reference
```
+
produces a sequential binary, while
```bash
grbcxx -b reference_omp
```
+
produces a shared-memory parallel binary.
-Note that the ALP/GraphBLAS source code never requires change while switching
-backends.
+Note that the ALP source code never requires change while switching backends.
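+
+For instance, the `sp.cpp` example program from the quick start may be
+compiled once per backend; only the `-b` flag changes (the output names below
+are arbitrary):
+
+```bash
+grbcxx -b reference sp.cpp -o sp_sequential
+grbcxx -b reference_omp sp.cpp -o sp_shared_memory
+grbcxx -b nonblocking sp.cpp -o sp_nonblocking
+```
+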
### Linking
-The executable must be statically linked against an ALP/GraphBLAS library that
-is different depending on the selected backend.
+The executable must be statically linked against an ALP library that is
+different depending on the selected backend.
The compiler wrapper `grbcxx` takes care of all link-time dependencies
automatically.
-When using the LPF-enabled BSP1D backend to ALP/GraphBLAS, for example, simply
-use `grbcxx -b bsp1d` as the compiler/linker command.
+When using the LPF-enabled BSP1D backend to ALP, for example, simply use
+`grbcxx -b bsp1d` as the compiler/linker command.
+
Use
```bash
grbcxx -b bsp1d --show
```
+
to show all flags that the wrapper passes on.
### Running
The resulting program has run-time dependencies that are taken care of by the
-LPF runner `lpfrun` or by the ALP/GraphBLAS runner `grbrun`.
+LPF runner `lpfrun` or by the ALP runner `grbrun`.
+
We recommend using the latter:
```bash
grbrun -b hybrid -np
```
-Here, `P` is the number of requested ALP/GraphBLAS user processes.
+
+Here, `P` is the number of requested ALP user processes.
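+
+For example, a hypothetical launch of four user processes, each executing the
+binary `./sp_hybrid`, would read:
+
+```bash
+grbrun -b hybrid -np 4 ./sp_hybrid
+```
+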
### Threading
@@ -350,18 +485,18 @@ on a single node, the `reference_omp` backend may be selected instead.
In both cases, make sure that during execution the `OMP_NUM_THREADS` and
`OMP_PROC_BIND` environment variables are set appropriately on each node that
-executes ALP/GraphBLAS user process(es).
+executes ALP user process(es).
## Running parallel ALP programs from existing parallel contexts
This, instead of automatically spawning a requested number of user processes,
assumes a number of processes already exist and that we wish those processes to
-jointly execute a single parallel ALP/GraphBLAS program.
+jointly execute a single parallel ALP program.
### Implementation
-The binary that contains the ALP/GraphBLAS program to be executed must define
-the following global symbol with the given value:
+The binary that contains the ALP program to be executed must define the
+following global symbol with the given value:
```c++
const int LPF_MPI_AUTO_INITIALIZE = 0
@@ -377,19 +512,19 @@ grb::Launcher< MANUAL > launcher( s, P, hostname, portname )
```
Here, `P` is the total number of processes that should jointly execute a
-parallel ALP/GraphBLAS program, while `0 <= s < P` is a unique ID of this
-process amongst its `P`-1 siblings.
-The types of `s` and `P` are `size_t`, i.e., unsigned integers.
+parallel ALP program, while `0 <= s < P` is a unique ID of this process amongst
+its `P`-1 siblings. The types of `s` and `P` are `size_t`, i.e., unsigned
+integers.
One of these processes must be selected as a connection broker prior to forming
-a group of ALP/GraphBLAS user processes. The remainder `P-1` processes must
-first connect to the chosen broker using TCP/IP connections. This choice must
-be made outside of ALP/GraphBLAS, prior to setting up the launcher, and
-materialises as the `hostname` and `portname` Launcher constructor arguments.
-The host and port name are strings, and must be equal across all processes.
+a group of ALP user processes. The remainder `P-1` processes must first connect
+to the chosen broker using TCP/IP connections. This choice must be made outside
+of ALP, prior to setting up the launcher, and materialises as the `hostname` and
+`portname` Launcher constructor arguments. The host and port name are strings,
+and must be equal across all processes.
As before, and after the successful construction of a manual launcher instance,
-a parallel ALP/GraphBLAS program is launched via
+a parallel ALP program is launched via
```c++
grb::Launcher< MANUAL >::exec( &grb_program, input, output )
@@ -398,25 +533,24 @@ grb::Launcher< MANUAL >::exec( &grb_program, input, output )
in exactly the same way as described earlier, though with the input and output
arguments now being passed in a one-to-one fashion:
1. The input data is passed on from the original process to exactly one
- corresponding ALP/GraphBLAS user process; i.e., no broadcast occurs. The
- original process and the ALP/GraphBLAS user process are, from an operating
- system point of view, the same process. Therefore, and additionally, input
- no longer needs to be a plain-old-data (POD) type. Pointers, for example,
- are now perfectly valid to pass along, and enable sharing data between the
- original process and the ALP/GraphBLAS algorithm.
- 2. The output data is passed from each ALP/GraphBLAS user process to the
- original process that called `Launcher< MANUAL >::exec`. To share
- ALP/GraphBLAS vector data, it is, for example, legal to return a
- `grb::PinnedVector< T >` as the `exec` output argument type. Doing so is
- akin to returning a pointer to output data, and does not explicitly pack
- nor transmit vector data.
+ corresponding ALP user process; i.e., no broadcast occurs. The original
+ process and the ALP user process are, from an operating system point of
+ view, the same process. Therefore, and additionally, input no longer needs
+ to be a plain-old-data (POD) type. Pointers, for example, are now perfectly
+ valid to pass along, and enable sharing data between the original process
+ and the ALP algorithm.
+ 2. The output data is passed from each ALP user process to the original
+ process that called `Launcher< MANUAL >::exec`. To share ALP vector data,
+ it is, for example, legal to return a `grb::PinnedVector< T >` as the
+ `exec` output argument type. Doing so is akin to returning a pointer to
+ output data, and does not explicitly pack nor transmit vector data.
### Running
The pre-existing process must have been started using an external mechanism.
This mechanism must include run-time dependence information that is normally
-passed by the ALP/GraphBLAS runner whenever a distributed-memory parallel
-backend is selected.
+passed by the ALP runner whenever a distributed-memory parallel backend is
+selected.
If the external mechanism by which the original processes are started allows it,
this is most easily effected by using the standard `grbcxx` launcher while
@@ -444,14 +578,14 @@ to add ALP and ALP/GraphBLAS as a dependence to your project.
# Debugging
-To debug an ALP/GraphBLAS program, please compile it using the sequential
-reference backend and use standard debugging tools such as `valgrind` and `gdb`.
+To debug an ALP program, please compile it using the sequential reference
+backend and use standard debugging tools such as `valgrind` and `gdb`.
Additionally, please ensure to *not* pass the `-DNDEBUG` flag during
compilation.
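+
+A hypothetical debugging flow under these recommendations follows; flags beyond
+`-b reference` are regular compiler arguments that `grbcxx` passes through:
+
+```bash
+grbcxx -b reference -g -O0 my_program.cpp -o my_program
+valgrind ./my_program
+gdb ./my_program
+```
+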
If bugs appear in one backend but not another, it is likely you have found a bug
-in the former backend implementation. Please send a minimum working example that
-demonstrates the bug to the maintainers, either as an issue on or an email to:
+in the former backend. Please send a minimum working example that demonstrates
+the bug to the maintainers, either as an issue on or an email to:
1. [GitHub](https://github.com/Algebraic-Programming/ALP/issues);
2. [Gitee](https://gitee.com/CSL-ALP/graphblas/issues);
3. [Albert-Jan](mailto:albertjan.yzelman@huawei.com).
@@ -459,8 +593,8 @@ demonstrates the bug to the maintainers, either as an issue on or an email to:
# Development in ALP
-Your contributions to ALP/GraphBLAS would be most welcome. Merge or Pull Requests
-(MRs/PRs) can be contributed via Gitee and GitHub. See above for the links.
+Your contributions to ALP would be most welcome. Merge Requests (MRs) can be
+contributed via Gitee and GitHub; see above for the links.
For the complete development documentation, you should start from the
[docs/README file](docs/README.md) and the related
@@ -470,10 +604,10 @@ For the complete development documentation, you should start from the
# Acknowledgements
The LPF communications layer was primarily authored by Wijnand Suijlen, without
-whom the current ALP/GraphBLAS would not be what it is now.
+whom the current ALP would not be what it is now.
-The collectives library and its interface to the ALP/GraphBLAS was primarily
-authored by Jonathan M. Nash.
+The collectives library and its interface to ALP was primarily authored by
+Jonathan M. Nash.
The testing infrastructure that performs smoke, unit, and performance testing of
sequential, shared-memory parallel, and distributed-memory parallel backends was
@@ -485,17 +619,30 @@ Computing Systems Laboratory in Zürich in particular. See the [NOTICE](NOTICE)
file for individual contributors.
-# Citing ALP and ALP/GraphBLAS
+# Citing ALP, ALP/GraphBLAS, and ALP/Pregel
+
+If you use ALP in your work, please consider citing one or more of the following
+papers, as appropriate.
-If you use ALP/GraphBLAS in your work, please consider citing one or more of the
-following papers, as appropriate:
+## ALP and ALP/GraphBLAS
- [A C++ GraphBLAS: specification, implementation, parallelisation, and evaluation](http://albert-jan.yzelman.net/PDFs/yzelman20.pdf)
by A. N. Yzelman, D. Di Nardo, J. M. Nash, and W. J. Suijlen (2020).
Pre-print.
[Bibtex](http://albert-jan.yzelman.net/BIBs/yzelman20.bib).
- - [Nonblocking execution in GraphBLAS](http://albert-jan.yzelman.net/PDFs/mastoras22-pp.pdf)
- by Aristeidis Mastoras, Sotiris Anagnostidis, and A. N. Yzelman (2022).
- Pre-print.
+ - [Nonblocking execution in GraphBLAS](https://ieeexplore.ieee.org/document/9835271)
+ by Aristeidis Mastoras, Sotiris Anagnostidis, and A. N. Yzelman
+ in IEEE International Parallel and Distributed Processing Symposium
+ Workshops, 2022.
[Bibtex](http://albert-jan.yzelman.net/BIBs/mastoras22.bib).
+ - [Design and implementation for nonblocking execution in GraphBLAS: tradeoffs and performance](https://dl.acm.org/doi/10.1145/3561652)
+ by Aristeidis Mastoras, Sotiris Anagnostidis, and A. N. Yzelman
+ in ACM Transactions on Architecture and Code Optimization 20(1), 2023.
+ [Bibtex](http://albert-jan.yzelman.net/BIBs/mastoras22a.bib).
+
+## ALP/Pregel
+
+ - [Humble Heroes](http://albert-jan.yzelman.net/PDFs/yzelman22-pp.pdf)
+ by A. N. Yzelman (2022). Pre-print.
+ [Bibtex](http://albert-jan.yzelman.net/BIBs/yzelman22.bib).
diff --git a/bootstrap.sh b/bootstrap.sh
index 89b865a15..8acfdfa58 100755
--- a/bootstrap.sh
+++ b/bootstrap.sh
@@ -63,7 +63,7 @@ validate_command_result() {
print_help() {
echo "Usage: $0 --prefix= [--with-lpf[=]]\
- [--with-banshee=] [--with-snitch=] [--no-reference] [--debug-build] [--generator=] [--show] [--delete-files]"
+ [--with-banshee=] [--with-snitch=] [--no-reference] [--no-nonblocking] [--debug-build] [--generator=] [--show] [--delete-files]"
echo " "
echo "Required arguments:"
echo " --prefix= "
@@ -74,6 +74,11 @@ the location where LPF is installed"
echo " --with-banshee= - path to the the tools to compile the banshee backend"
echo " --with-snitch= - path to the tools for Snitch support within the banshee backend"
echo " --no-reference - disables the reference and reference_omp backends"
+ echo " --no-hyperdags - disables the hyperdags backend"
+ echo " --with-hyperdags-using= - uses the given backend reference for HyperDAG generation"
+ echo " optional; default value is reference"
+ echo " clashes with --no-hyperdags"
+ echo " --no-nonblocking - disables the nonblocking backend"
echo " --debug-build - build the project with debug options (tests will run much slower!)"
echo " --generator= - set the generator for CMake (otherwise use CMake's default)"
echo " --show - show generation commands instead of running them"
@@ -90,6 +95,9 @@ the location where LPF is installed"
}
reference=yes
+hyperdags=yes
+hyperdags_using=reference
+nonblocking=yes
banshee=no
lpf=no
show=no
@@ -146,6 +154,16 @@ or assume default paths (--with-lpf)"
--no-reference)
reference=no
;;
+ --no-hyperdags)
+ hyperdags=no
+ ;;
+ --with-hyperdags-using=*)
+ hyperdags=yes
+ hyperdags_using="${arg#--with-hyperdags-using=}"
+ ;;
+ --no-nonblocking)
+ nonblocking=no
+ ;;
--debug-build)
debug_build=yes
;;
@@ -202,6 +220,19 @@ if [[ "${reference}" == "yes" || "${lpf}" == "yes" ]]; then
check_cc_cpp_comp
fi
+if [[ "${hyperdags}" == "yes" ]]; then
+ if [[ "${hyperdags_using}" != "reference" ]]; then
+ printf "Hyperdags backend requested using the ${hyperdags_using} backend, "
+ printf "but only the reference backend is supported currently."
+ exit 255
+ fi
+ if [[ "${hyperdags_using}" == "reference" && "${reference}" == "no" ]]; then
+ printf "Hyperdags backend is selected using the reference backend, "
+ printf "but the reference backend was not selected."
+ exit 255
+ fi
+fi
+
if [[ "${lpf}" == "yes" ]]; then
if [[ -z "${LPF_INSTALL_PATH}" ]]; then
check_lpf
@@ -228,7 +259,7 @@ CURRENT_DIR="$(pwd)"
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
# CONFIGURE CMAKE BUILDING INFRASTRUCTURE
-if [[ "${reference}" == "yes" || "${lpf}" == "yes" ]]; then
+if [[ "${reference}" == "yes" || "${lpf}" == "yes" || "${nonblocking}" == "yes" ]]; then
BUILD_DIR="${CURRENT_DIR}"
printf "Checking for cmake..."
@@ -287,6 +318,15 @@ the current directory before invocation or confirm the deletion of its content w
if [[ "${reference}" == "no" ]]; then
CMAKE_OPTS+=" -DWITH_REFERENCE_BACKEND=OFF -DWITH_OMP_BACKEND=OFF"
fi
+ if [[ "${hyperdags}" == "no" ]]; then
+ CMAKE_OPTS+=" -DWITH_HYPERDAGS_BACKEND=OFF"
+ fi
+ if [[ "${hyperdags}" == "yes" ]]; then
+ CMAKE_OPTS+=" -DWITH_HYPERDAGS_USING=${hyperdags_using}"
+ fi
+ if [[ "${nonblocking}" == "no" ]]; then
+ CMAKE_OPTS+=" -DWITH_NONBLOCKING_BACKEND=OFF"
+ fi
if [[ "${lpf}" == "yes" ]]; then
CMAKE_OPTS+=" -DLPF_INSTALL_PATH='${ABSOLUTE_LPF_INSTALL_PATH}'"
fi
diff --git a/changelog.md b/changelog.md
index 72aa3e4d6..3a77e6b5e 100644
--- a/changelog.md
+++ b/changelog.md
@@ -1,4 +1,128 @@
+Version 0.7.0
+=============
+
+This is a summary of changes. For full details, see the publicly available Git
+history prior to the v0.7 tag.
+
+Highlights:
+
+ 1. This release re-implements the nonblocking ALP/GraphBLAS backend by Mastoras
+ et al. (GrAPL/IPDPSW '22, TACO '23) on the latest ALP code base. The use of
+ the nonblocking backend for some algorithms results in multiple-factor
+ speedups versus standard blocking execution as well as versus external
+ industry-standard frameworks. This includes Eigen, which, like nonblocking
+ ALP/GraphBLAS, perform cross-operation fusion. Simply compile your ALP
+ programs using `grbcxx -b nonblocking`, and enjoy the speedups!
+
+ 2. We also introduce a new programming interface to the ALP software stack that
+ allows vertex-centric programming in addition to programming using
+ generalised sparse linear algebra. This new interface, ALP/Pregel,
+ translates vertex-centric programs to standard ALP/GraphBLAS primitives
+ during compilation, and thus benefits from all automatic optimisations
+ included with the ALP software stack.
+
+ 3. Support for software prefetching during `vxm` and `mxv` has been added to
+ the `reference` and `reference_omp` backends. Since optimal prefetch
+ settings and their overall effectiveness rely strongly on 1) the structure
+ of the sparse matrices and graphs considered as well as on 2) the algorithms
+ used on those data, this new feature is turned off by default. To use it,
+ please enable it via `include/graphblas/reference/config.hpp` and tune the
+ there-defined prefetch distances.
+
+ 4. Finally, this release includes another new backend, the `hyperdags` backend.
+ A program compiled with this backend will, after execution, dump a HyperDAG
+ representation of the ALP computations that the program executed.
+
+Changes to the specification:
+
+ 1. Any ALP primitive with ALP container output now takes a Phase argument.
+
+ 2. Clarify that the use of the `dense` descriptor also implies that the output
+ containers on entry must be dense. This applies also for out-of-place
+ primitives.
+
+Algorithms:
+ - [new] a vertex-centric PageRank-like algorithm implemented on top of the new
+ ALP/Pregel has been added;
+ - [new] a vertex-centric algorithm for strongly connected components on
+ undirected graphs implemented on top of ALP/Pregel has been added;
+ - [new] the algebraic k-core decomposition algorithm by Li et al. (HPEC '21)
+ has been added;
+ - [bug] the mpv algorithm performed one too many iterations, while all
+ associated tests used an ALP/GraphBLAS baseline-- v0.7 now instead verifies
+ against external ground truths;
+ - [bug] the label propagation algorithm relied on a bugged implementation of
+ `grb::set`, now fixed, while it now and when possible relies on `std::swap`
+ instead of performing explicit and expensive copies;
+ - [bug] the CG algorithm returned `SUCCESS` even if it failed to converge
+ within the given maximum number of iterations.
+
+Operators:
+ - [new] v0.7 (re-)introduces the four less-than(-or-equal) and
+ greater-than(-or-equal) operators;
+
+All backends:
+ - [bug] fixed the behaviour of ALP containers under copy-assignment and
+ copy-construction;
+ - [bug] all variants of `foldl` and `foldr` previously could erroneously return
+ `ILLEGAL` in the presence of sparse vectors and/or masks;
+ - [bug] several primitives would not return `ILLEGAL` in the presence of the
+ `dense` descriptor when faced with sparse containers;
+ - [bug] all backends missed the implementation of at least one `eWiseMul`
+ variant;
+ - [bug] all backends missed the implementation of at least two `eWiseApply`
+ variants where both inputs are scalar;
+ - [feature] improved `_DEBUG` tracing and code style throughout.
+
+Reference and reference_omp backends:
+ - [bug] overlap detection of the output and output mask was erroneously
+ disabled for the `vxm` and `mxv` primitives, herewith fixed;
+ - [bug] `foldl` and `foldr` previously employed unexpected casting
+ behaviour;
+ - [bug] multiple copy-assignment of the same vector could fail;
+ - [bug] the vector<-scalar<-vector `eWiseApply` using operators was in-place;
+ - [bug] the `eWiseApply` using sparse vector inputs and/or masks could in some
+ rare cases depending on structure and vector lengths generate incorrect
+ output;
+ - [bug] the implementation of the vector `grb::set` where the output container
+ was not already dense was in-place, while out-of-place semantics were
+ defined;
+ - [bug] the output-masked `eWiseMul` was bugged in the case where one of the
+ inputs was scalar;
+ - [bug] matrix containers with initial requested capacity zero could attempt
+ to access uninitialised memory, including even after a successful subsequent
+ `resize`;
+ - [performance] `foldl` and `foldr` using sparse vectors and/or masks were
+ previously not always following asymptotically optimal behaviour;
+ - [performance] `set` previously did not exploit information such as whether
+ the `dense` descriptor was present, whether vectors need only touch
+ coordinate data to generate correct output, or whether it never needs to
+ touch coordinate data;
+ - [performance] `eWiseApply` detects more cases of trivial operations on empty
+ vectors, and completes those faster;
+ - [performance] optimised `eWiseMul` with scalar inputs.
+
+BSP1D and hybrid backends:
+ - [bug] the output-masked `vxm` and various `foldl` and `foldr` were missing;
+ - [bug] copy-assignment operator for vectors was missing.
+
+Testing, development, and documentation:
+ - the unit test suite has been hardened to detect all aforementioned bugs;
+ - outdated documentation was revised-- in particular, all user-facing
+ documentation has been checked and can now be generated via the new make
+ target `make userdocs`;
+ - developer documentation is now built via `make devdocs`, while the older
+ `make docs` target now builds both the user and developer documentation;
+ - new developers can now enjoy an updated developer guide;
+ - the test suite now prints an error when the automatic detection of the number
+ of sockets fails, and then auto-selects one instead of zero (which caused the
+ test scripts to fail);
+ - added performance tests for the sparse matrix--vector, sparse matrix--sparse
+ vector, and sparse matrix--sparse matrix multiplication kernels;
+ - improved both the GitHub and internal CI scripts.
+
+
Version 0.6.0
=============
@@ -8,7 +132,7 @@ history prior to the v0.6 tag.
Highlights and changes to the specification:
- Deprecated `grb::init` and `grb::finalize` in favour of grb::Launcher.
Existing code should migrate to using the Launcher as any later release may
- remove the now-deprecated primtives.
+ remove the now-deprecated primitives.
- If you wish to rely on ALP/GraphBLAS for more standard sparse linear
algebra but if you cannot, or do not wish to, adapt your existing sources
to the C++ ALP/GraphBLAS API, then v0.6 onwards generates libraries that
@@ -70,7 +194,7 @@ Reference and reference_omp backends:
properly updated.
- Bugfix: the OpenMP `schedule( static, chunk_size )` has a dynamic (run-time)
component that was not intended.
- - Bugifx: some OpenMP `schedule( dynamic, chunk_size )` operate on regular
+ - Bugfix: some OpenMP `schedule( dynamic, chunk_size )` operate on regular
loops and should employ a static schedule instead.
BSP1D backend:
@@ -198,7 +322,7 @@ BSP1D and hybrid backends:
declared as part of BSP1D friend declarations. Curiously, many compilers
accepted the previous erroneous code.
- Bugfix: empty BSP1D containers could previously leave process-local matrices
- unitialised.
+ uninitialised.
Reference and reference_omp backends:
- Bugfix: matrix construction did not use the `alloc.hpp` mechanisms. This
@@ -207,7 +331,7 @@ Reference and reference_omp backends:
All backends:
- Bugfix: `grb::Launcher` (as well as the benchmarker) did not always properly
- finalize the ALP/GraphBLAS context after exec completed. This caused some
+ finalise the ALP/GraphBLAS context after exec completed. This caused some
memory to not be properly freed on program exits.
- Bugfix: the out-of-place versions of `grb::operators::{argmin,argmax}` were
incorrect. All code within the repository was unaffected by this bug. The
@@ -224,7 +348,7 @@ Version 0.4.1
- The CG algorithm assumed out-of-place behaviour of grb::dot, while the
specification since v0.1 defines it to be in-place. Implementations of
grb::dot were erroneously out-of-place until v0.4, but the CG algorithm
- was errouneously not updated. This hotfix rectifies this.
+ was erroneously not updated. This hotfix rectifies this.
Version 0.4.0
@@ -276,36 +400,46 @@ Version 0.3.0
=============
Reference and reference_omp backends:
- - Fixed issue where grb::set, grb::vxm, and grb::mxv could fail for more exotic data types.
- - Fixed issue that prevented std::move on matrices, both from assignment and construction.
+ - Fixed issue where grb::set, grb::vxm, and grb::mxv could fail for more
+ exotic data types.
+ - Fixed issue that prevented std::move on matrices, both from assignment and
+ construction.
- Optimised masked grb::set to now reach optimal complexity in all cases.
- Optimised grb::eWiseLambda over matrices to avoid atomics.
BSP1D backend:
- - Fixed issue where iterating over empty matrices could fail in the BSP1D backend.
- - Fixed issue in BSP1D backend that caused dynamic allocations where they were not allowed.
- - Fixed issue where the automatic-mode launcher and benchmarker could, in rare cases, fail.
+ - Fixed issue where iterating over empty matrices could fail in the BSP1D
+ backend.
+ - Fixed issue in BSP1D backend that caused dynamic allocations where they were
+ not allowed.
+ - Fixed issue where the automatic-mode launcher and benchmarker could, in rare
+ cases, fail.
- Fixed issue where, under rare conditions, the stack-based combine could fail.
- - Fixed performance bug in the BSP1D backend causing spurious calls to lpf_sync.
+ - Fixed performance bug in the BSP1D backend causing spurious calls to
+ lpf_sync.
Level-3 functionality, all backends:
- Fixed issue where a masked set-to-value on matrices would fail.
- - Fixed issue where mxm could work with unitialised values when more exotic semirings are used.
- - Fixed issue that prevented std::move on matrices, both from assignment and construction.
+ - Fixed issue where mxm could work with uninitialised values when more exotic
+ semirings are used.
+ - Fixed issue that prevented std::move on matrices, both from assignment and
+ construction.
- New level-3 function: eWiseApply.
(Note that the interface of level-3 functionality remains experimental.)
Algorithms and utilities:
- - Fixed issue where MatrixFileReader would store unitialised values when reading pattern matrices.
+ - Fixed issue where MatrixFileReader would store uninitialised values when
+ reading pattern matrices.
- Updated the sparse neural network inference algorithm.
- New algorithm added: spy.
Others:
- Fixed issue where a `make clean` would miss some object files.
- - Added new unit and performance tests, including those for detecting the above-described bug
- fixes and added functionality.
- - Documentation update in line with the upcoming revision of the C++ GraphBLAS paper.
+ - Added new unit and performance tests, including those for detecting the
+ above-described bug fixes and added functionality.
+ - Documentation update in line with the upcoming revision of the C++ GraphBLAS
+ paper.
- Added some missing documentation.
- Code style fixes and some dead code removal.
@@ -313,7 +447,8 @@ Others:
Version 0.2.0
=============
-Fix some issues in the Banshee backend that appeared after refactoring for the 0.1.0 release.
+Fix some issues in the Banshee backend that appeared after refactoring for the
+0.1.0 release.
Removes --deps option from ./configure as it was no longer used.
diff --git a/cmake/AddGRBInstall.cmake b/cmake/AddGRBInstall.cmake
index f4b254b8f..94bd58f31 100644
--- a/cmake/AddGRBInstall.cmake
+++ b/cmake/AddGRBInstall.cmake
@@ -18,8 +18,8 @@
# defines variables for the creation of wrapper scripts and the installation
#
-assert_defined_variables( WITH_REFERENCE_BACKEND WITH_OMP_BACKEND WITH_BSP1D_BACKEND
- WITH_HYBRID_BACKEND WITH_NUMA
+assert_defined_variables( WITH_REFERENCE_BACKEND WITH_OMP_BACKEND WITH_NONBLOCKING_BACKEND
+ WITH_BSP1D_BACKEND WITH_HYBRID_BACKEND WITH_NUMA
)
assert_valid_variables( CMAKE_INSTALL_PREFIX AVAILABLE_BACKENDS CMAKE_CXX_COMPILER )
@@ -44,6 +44,7 @@ install( EXPORT GraphBLASTargets
# paths where to install the binaries of the various backends
set( ALP_UTILS_INSTALL_DIR "${BINARY_LIBRARIES_INSTALL_DIR}" )
set( SHMEM_BACKEND_INSTALL_DIR "${BINARY_LIBRARIES_INSTALL_DIR}/sequential" )
+set( HYPERDAGS_BACKEND_INSTALL_DIR "${BINARY_LIBRARIES_INSTALL_DIR}/hyperdags" )
set( BSP1D_BACKEND_INSTALL_DIR "${BINARY_LIBRARIES_INSTALL_DIR}/spmd" )
set( HYBRID_BACKEND_INSTALL_DIR "${BINARY_LIBRARIES_INSTALL_DIR}/hybrid" )
@@ -112,7 +113,7 @@ endif()
# paths may have spaces, hence wrap them inside single quotes ''
# shared memory backends
-if ( WITH_REFERENCE_BACKEND )
+if( WITH_REFERENCE_BACKEND )
addBackendWrapperGenOptions( "reference"
COMPILE_DEFINITIONS "${REFERENCE_SELECTION_DEFS}"
LINK_FLAGS "'${SHMEM_BACKEND_INSTALL_DIR}/lib${BACKEND_LIBRARY_OUTPUT_NAME}.a'"
@@ -128,6 +129,23 @@ if( WITH_OMP_BACKEND )
)
endif()
+# dependent backends
+if( WITH_HYPERDAGS_BACKEND )
+ addBackendWrapperGenOptions( "hyperdags"
+ COMPILE_DEFINITIONS "${HYPERDAGS_SELECTION_DEFS};${HYPERDAGS_INCLUDE_DEFS}"
+ LINK_FLAGS "'${HYPERDAGS_BACKEND_INSTALL_DIR}/lib${BACKEND_LIBRARY_OUTPUT_NAME}.a'"
+ "'${ALP_UTILS_INSTALL_DIR}/lib${ALP_UTILS_LIBRARY_OUTPUT_NAME}.a'" "${NUMA_LFLAG}"
+ )
+endif()
+
+if( WITH_NONBLOCKING_BACKEND )
+ addBackendWrapperGenOptions( "nonblocking"
+ COMPILE_DEFINITIONS "${NONBLOCKING_SELECTION_DEFS};${NONBLOCKING_INCLUDE_DEFS}"
+ LINK_FLAGS "'${SHMEM_BACKEND_INSTALL_DIR}/lib${BACKEND_LIBRARY_OUTPUT_NAME}.a'"
+ "'${ALP_UTILS_INSTALL_DIR}/lib${ALP_UTILS_LIBRARY_OUTPUT_NAME}.a'" "${NUMA_LFLAG}"
+ )
+endif()
+
# distributed memory backends
if( WITH_BSP1D_BACKEND OR WITH_HYBRID_BACKEND )
assert_valid_variables( LPFRUN LPFCPP )
diff --git a/cmake/AddGRBTests.cmake b/cmake/AddGRBTests.cmake
index d05be44c8..cec04eb68 100644
--- a/cmake/AddGRBTests.cmake
+++ b/cmake/AddGRBTests.cmake
@@ -31,9 +31,6 @@ assert_valid_variables( ALL_BACKENDS AVAILABLE_BACKENDS TEST_CATEGORIES
# create variables to store tests against each backend
foreach( b ${AVAILABLE_BACKENDS} )
- if( NOT TARGET "backend_${b}" )
- message( FATAL_ERROR "Needed target backend_${b} does not exist!" )
- endif()
define_property( GLOBAL PROPERTY tests_backend_${b} BRIEF_DOCS "${b} tests" FULL_DOCS "tests for backend ${b}" )
endforeach()
diff --git a/cmake/AddGRBVars.cmake b/cmake/AddGRBVars.cmake
index 2b1bc012b..fab0f9ac9 100644
--- a/cmake/AddGRBVars.cmake
+++ b/cmake/AddGRBVars.cmake
@@ -21,8 +21,8 @@
# to add a new backend, add your own to each ### SECTION
#
-assert_defined_variables( WITH_REFERENCE_BACKEND WITH_OMP_BACKEND WITH_BSP1D_BACKEND
- WITH_HYBRID_BACKEND WITH_NUMA
+assert_defined_variables( WITH_REFERENCE_BACKEND WITH_OMP_BACKEND WITH_NONBLOCKING_BACKEND
+ WITH_BSP1D_BACKEND WITH_HYBRID_BACKEND
)
### STANDARD TARGET NAMES
@@ -31,18 +31,26 @@ set( REFERENCE_BACKEND_DEFAULT_NAME "backend_reference" )
set( REFERENCE_OMP_BACKEND_DEFAULT_NAME "backend_reference_omp" )
set( BSP1D_BACKEND_DEFAULT_NAME "backend_bsp1d" )
set( HYBRID_BACKEND_DEFAULT_NAME "backend_hybrid" )
-
+set( HYPERDAGS_BACKEND_DEFAULT_NAME "backend_hyperdags" )
+set( NONBLOCKING_BACKEND_DEFAULT_NAME "backend_nonblocking" )
### COMPILER DEFINITIONS FOR HEADERS INCLUSION AND FOR BACKEND SELECTION
# compiler definitions to include backend headers
set( REFERENCE_INCLUDE_DEFS "_GRB_WITH_REFERENCE" )
set( REFERENCE_OMP_INCLUDE_DEFS "_GRB_WITH_OMP" )
+set( HYPERDAGS_INCLUDE_DEFS "_GRB_WITH_HYPERDAGS" )
+set( NONBLOCKING_INCLUDE_DEFS "_GRB_WITH_NONBLOCKING" )
set( LPF_INCLUDE_DEFS "_GRB_WITH_LPF" )
# compiler definitions to select a backend
set( REFERENCE_SELECTION_DEFS "_GRB_BACKEND=reference" )
set( REFERENCE_OMP_SELECTION_DEFS "_GRB_BACKEND=reference_omp" )
+set( HYPERDAGS_SELECTION_DEFS
+ "_GRB_BACKEND=hyperdags"
+ "_GRB_WITH_HYPERDAGS_USING=${WITH_HYPERDAGS_USING}"
+)
+set( NONBLOCKING_SELECTION_DEFS "_GRB_BACKEND=nonblocking" )
set( BSP1D_SELECTION_DEFS
"_GRB_BACKEND=BSP1D"
"_GRB_BSP1D_BACKEND=reference"
@@ -56,8 +64,7 @@ set( HYBRID_SELECTION_DEFS
set( NO_NUMA_DEF "_GRB_NO_LIBNUMA" )
### **ALL** BACKENDS, EVEN IF NOT ENABLED BY USER
-set( ALL_BACKENDS "reference" "reference_omp" "bsp1d" "hybrid" )
-
+set( ALL_BACKENDS "reference" "reference_omp" "hyperdags" "nonblocking" "bsp1d" "hybrid" )
# list of user-enabled backends, for tests and wrapper scripts (do not change!)
set( AVAILABLE_BACKENDS "" )
@@ -66,7 +73,7 @@ set( AVAILABLE_BACKENDS "" )
# backends that are enabled by the user: append as in the following
# shared memory backends
-if ( WITH_REFERENCE_BACKEND )
+if( WITH_REFERENCE_BACKEND )
list( APPEND AVAILABLE_BACKENDS "reference" )
endif()
@@ -74,6 +81,15 @@ if( WITH_OMP_BACKEND )
list( APPEND AVAILABLE_BACKENDS "reference_omp" )
endif()
+# dependent backends
+if( WITH_HYPERDAGS_BACKEND )
+ list( APPEND AVAILABLE_BACKENDS "hyperdags" )
+endif()
+
+if( WITH_NONBLOCKING_BACKEND )
+ list( APPEND AVAILABLE_BACKENDS "nonblocking" )
+endif()
+
# distributed memory backends
if( WITH_BSP1D_BACKEND )
list( APPEND AVAILABLE_BACKENDS "bsp1d" )
diff --git a/docs/Build_and_test_infra.md b/docs/Build_and_test_infra.md
index 98b144fc1..e751cb0bd 100644
--- a/docs/Build_and_test_infra.md
+++ b/docs/Build_and_test_infra.md
@@ -534,7 +534,9 @@ which may be set via a variable like
set( EXAMPLE_BACKEND_INSTALL_DIR "${BINARY_LIBRARIES_INSTALL_DIR}/example" )
```
-used in the following steps.
+used in the following steps. The same binary file may implement multiple
+backends. For example, both the reference and the OMP backend share
+the same binary file, i.e., the one generated for shared memory backends.
For convenience, the macro `addBackendWrapperGenOptions` is provided to
automatically generate the necessary variables according to the internal naming
diff --git a/docs/Development.md b/docs/Development.md
index 5bdb5af28..cfe72d5a1 100644
--- a/docs/Development.md
+++ b/docs/Development.md
@@ -15,30 +15,221 @@ See the License for the specific language governing permissions and
limitations under the License.
-# Development of ALP/GraphBLAS
-This document introduces the reader to the development of ALP/GraphBLAS.
+# ALP Development Style Guide
-ALP/GraphBLAS is written in C++11 and is mainly composed of header files with
-largely templated data structures and operations. This allows both
+This document introduces the reader to the development style of ALP.
+
+ALP is written in C++11 and is mainly composed of header files with largely
+templated data structures and operations. This allows both
1. strict compile-time checking of the data types and of the algebraic
abstractions (typically encoded as template parameters: see the
-[Semiring class](include/graphblas/semiring.hpp) for an example)
-2. specialized code generation, increasing performance
-
-## Code style tools and guidelines
-ALP/GraphBLAS follows certain code style rules in order to ensure readability
-and uniformity.
-
-To apply these rules, the directory `tools` contains the script
-`clang-format-linter.sh` to format (*lint*, in Unix jargon) the code
-accordingly, based on the `clang-format` tool.
-Version 11 or higher is requested for the settings to be applied; if you want to
-use a different version, you can alias it in Bash before invoking
-`tools/clang-format-linter.sh`, which directly calls the command
+[Semiring class](../include/graphblas/semiring.hpp) for an example);
+
+2. specialised code generation, increasing performance.
+
+Common patterns include [SFINAE](https://en.wikipedia.org/wiki/Substitution_failure_is_not_an_error)
+and in particular its combination with (algebraic) type traits, as well as
+copious use of `static_assert` and `constexpr`. The choice of ANSI C++11 is to
+balance the benefits of these more modern C++ constructs with the typical
+reluctance of applying the latest and greatest in software development tooling
+within production codes.
+
+Given that this is a template library, there are both rigid code styles and
+more rigid coding patterns that ensure the overall quality of the template
+library-- these are detailed in their respective sections. This document also
+includes a brief description of the code style tools included with the
+repository, as well as a section on the use of the available build and test
+infrastructure.
+
+First, however, this section concludes with some brief comments on the overall
+code structure.
+
+## Encapsulation
+
+Template code that should not be exposed to ALP programmers (i.e., users of the
+ALP programming interface) should be encapsulated in an internal namespace such
+as, e.g., `grb::internal`. Non-templated code that should not be exposed to ALP
+programmers should be defined within `.cpp` files. Only functionality that is
+called by templated code should be exported during compilation of the ALP
+libraries that ALP programmers would link against. All code that may be used by
+ALP programmers should be documented thoroughly.
+
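+As a minimal sketch of this pattern-- using hypothetical names that do not
+appear in the actual code base:
+
+```c++
+namespace grb {
+
+	namespace internal {
+
+		// a templated helper: it must live in a header, yet the internal
+		// namespace hides it from ALP programmers
+		template< typename T >
+		void zeroise( T &x ) { x = static_cast< T >( 0 ); }
+
+	} // end namespace grb::internal
+
+	/** A public, thoroughly documented primitive that calls internal helpers. */
+	template< typename T >
+	void clearScalar( T &x ) {
+		internal::zeroise( x );
+	}
+
+} // end namespace grb
+```
+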
+## Utilities
+
+Utility functions that could be useful to ALP programmers, and not just to ALP
+developers, should unambiguously be housed in the `include/graphblas/utils`
+directory, with the interfaces made available through the corresponding
+`grb::utils` namespace. These functionalities should therefore ideally *not*
+be placed in an internal namespace.
+
+## Test utilities
+
+Utility functions that are *only* useful for ALP unit, smoke, and/or performance
+tests should unambiguously be housed in the `tests/utils` directory. Such
+functionality should never ship with the template library: neither as a header
+that could be included by ALP programmers, nor within an internal namespace,
+nor within an internal `.cpp` file.
+
+
+# Code style guidelines
+
+ALP follows certain code style rules in order to ensure readability and
+uniformity. An informal summary of the main points follows:
+
+1. alignment uses **spaces** while indentation uses **tabs**;
+
+2. indentation is increased after a line break that does not end with `;` or
+   after a line break with an unterminated `<`, `(`, or `{`, and is decreased
+   after the matching `;`, `>`, `)`, or `}`. Opening and closing delimiters
+   are the last, resp. first, characters on a line-- i.e., the commonly
+   accepted indentation pattern;
+
+3. none of `;`, `<`, `(`, `{` should appear alone on a single line-- and if an
+   opening delimiter like `<` follows a keyword, it should do so immediately,
+   without intermediate spaces;
+
+4. when a closing delimiter is far (in a vertical space sense) from its opening
+ pair, it should be followed by a comment that documents what it closes;
+
+5. keywords that induce indentation include `private:`, `protected:`, and
+ `public:`, which furthermore do not induce intermediate spaces between the
+ keyword and the `:`;
+
+6. indentation of pre-processor code (macros) uses spaces, not tabs, and ignores
+ tab-based indentation;
+
+7. a single line has maximum length of about 80 characters, not including
+ indentation, and never ends with white spaces (space characters or tab
+ characters);
+
+8. use spaces and parentheses liberally to increase code readability and to
+   limit ambiguity, including for if-else blocks or for-loop blocks that
+   consist of only one line (or an otherwise limited number of lines);
+
+9. files always end with an empty line, and include two empty lines before
+   implementation starts (i.e., two empty lines after any comments, macro
+   guards, and includes, before the first line of code);
+
+10. Classes and types use the CamelCase naming format, variables of any kind
+ (static, constexpr, global, or members) use camelCase, while constants of
+ any kind (static const, global const, constexpr const, etc.) use CAMELCASE.
+ Names shall furthermore be both self-descriptive and short. Namespaces are
+ camelcase.
+
+As the saying goes, exceptions prove the rules. For example, rule #3 could be
+viewed as a specific exception to rule #8. Exceptions that are not
+self-contained in the above set include:
+
+1. one long program line under rule #7 may be arbitrarily spread over two lines
+   even if this runs counter to rule #3-- but not if it would spread over more
+   than two lines;
+
+2. OpenMP pragmas and compiler warning suppressions may ignore rule #6-- they
+ may follow regular tab-based indentation instead;
+
+3. the 80-character limit is not strictly enforced. For example, an OpenMP
+   macro of 83 characters on a single line is more readable than one split
+   over two lines;
+
+4. brackets in code bodies that limit the scope of some of the declarations
+   within the body may, contrary to rule #3, appear alone on a single line.
+
+
+## Code style by examples
+
+- `if( ... ) {`, not `if (...) {` or any other variant;
+
+- lines should never end with white space (tab or space characters);
+
+- `if( x == 5 ) {` instead of `if( x==5 ) {`;
+
+- only write `<<` or `>>` when doing bit shifts, never for nested templates;
+
+- the following is correct. It would *not* be correct to put the whole block on
+ a single line, nor would it be correct to write it without any curly brackets;
+
+```c++
+if( ... ) {
+ return SUCCESS;
+}
+```
+
+- the following is correct w.r.t. vertical spacing;
+
+```c++
+/*
+ * copyright info
+ */
+
+/**
+ * @file
+ *
+ * File documentation
+ *
+ * @author Author information
+ * @date Date of initial creation
+ */
+
+#ifndef MACRO_GUARD
+#define MACRO_GUARD
+
+// note that two empty lines follow:
+
+
+namespace alp {
+
+ // ...
+
+}
+
+#endif
+
+// note that one empty line follows:
+
+```
+
+- encapsulation using curly bracket delimiters that each appear alone on a
+  single line:
+
+```c++
+void f( ... ) {
+ // some code block dubbed "A"
+ // ...
+ // end code block A
+ size_t ret;
+ {
+ // some code block with fields and containers that are used *solely*
+ // for computing ret
+ // ...
+ ret = ...;
+ }
+ // some code that uses ret as well as fields, containers, and anything else
+ // that was defined in code block A
+}
+```
+
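+- naming as per rule #10, in a minimal and hypothetical sketch:
+
+```c++
+#include <cstddef>
+
+constexpr const size_t MAX_CHUNK_SIZE = 1024; // constant: all-caps
+
+class SparseAccumulator {                     // class and type names: CamelCase
+
+	public:
+
+		size_t nonzeroCount;          // variables: camelCase
+
+};
+```
+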
+
+# Code style tools
+
+There currently exist two tools to help check developers' code style: the Clang
+linter script `clang-format-linter.sh`, and the `detectSuspiciousSpacing.sh`
+script.
+
+## Clang linter
+
+To automatically-- if only approximately; see the warning below-- check whether
+the code style rules are followed, the directory `tools` contains the script
+`clang-format-linter.sh` that formats (*lints*, in Unix jargon) the source code,
+based on the `clang-format` tool.
+
+Version 11 or higher of the tool is required. If you want to use a different
+version, you can alias it in Bash before invoking
+`tools/clang-format-linter.sh`, which otherwise directly calls the command
`clang-format-11`.
-This tools is available in the standard repositories of the main Linux
+
+This tool is available in the standard repositories of all main Linux
distributions: for example, in Ubuntu you can install it with
`apt-get install clang-format-11`.
@@ -47,7 +238,8 @@ To list the script parameters, simply type
```bash
tools/clang-format-linter.sh -h
```
-For example, to lint the file `tests/add15d.cpp` and see the lint'ed code on the
+
+For example, to lint the file `tests/add15d.cpp` and see the linted code on the
standard output, type
```bash
@@ -66,55 +258,127 @@ Instead, to lint the whole ALP/GraphBLAS code-base in-place, type
tools/clang-format-linter.sh -i --lint-whole-grb
```
-The style rules enforced by the tool are
-
-- [x] lines are max 200 characters long, which means the line size is pretty
-liberal to avoid weird re-flows
-- [x] indents should be *tabs*, not spaces
-- [x] alignment should be done using spaces, not tabs
-- [x] essentially any line that ends in `{`, `(`, or whatever increases the
-current number of indents by one and vice versa
-- [x] argument lists (including template arguments) longer than 80 chars should
-be broken over multiple lines
-- [x] `if( `, not `if (` (also for `for`, etc.)
-- [x] no lines with indents and curly brackets only: put curly brackets on the
-same line as what starts that code block instead (only exception: code blocks
-that are not started by standard C++ key words, but e.g. required pragmas
-instead)
-- [x] no lines ending with spaces
-- [x] `#ifdef`, `#else`, `#endif` etc are never indented.
-- [x] comment blocks are capped at 80 chars per line
-- [x] include lines primarily ordered by
- 1. standard includes
- 2. external libraries
- 3. internal headers/files
-
-The following rules are also mandated, but cannot currently be applied via
-`clang-format`; however, developers should abide by the following guidelines as
-well:
-
-* files should end with an empty line
-* no `if`, `for`, `while`, or any other control structure without curly
-* brackets, even if what follows is a single statement
-* OpenMP pragmas (or any pragma) are indented as regular code
-* nested `ifdef`s etc. in close proximity of one another are indented by spaces
-
-The following guidelines are not strictly requested nor enforced, but are
-suggested to ensure readability and uniformity:
-
-* be gratuitous with spaces and parenthesis: anything that could possibly be
-construed as confusing or ambiguous should be clarified with spaces and
-parentheses if that removes (some of the) possible confusion or ambiguity
-* in particular, whenever it is legal to put one or more spaces, put one
-(e.g., `if( x == 5 )` instead of `if( x==5 )`)
-* in particular, only write `<<` or `>>` when doing bit shifts, not when
-performing template magic
-* when closing a block (either `#endif` or `}`) and the block was long (whatever
-long may be), add a comment on what it is that is being closed
-* all functions should have `doxygen`-friendly documentation
-* minimise the use of pre-processor macros (use C++11 `constexpr` instead)
-
-## Building and Testing infrastructure
+### Warning
+
+This tool is only approximately correct in terms of the code style described
+above(!)
+
+
+## Automated detection of suspicious spacing
+
+Many code reviews have exposed erroneous use of spaces, primarily due to editors
+attempting to be helpful in automatically replicating code styles like
+indentations. Before committing code, a careful submitter may opt to execute
+something like the following:
+
+```bash
+# go into a source directory where you have committed changes
+$ cd include/graphblas/nonblocking
+# **from within that directory** execute the helper script:
+$ ../../../tools/detectSuspiciousSpacing.sh
+```
+
+If all is OK, the above prints the following to the standard output stream
+(which also immediately documents which patterns the script is tailored to
+detect):
+
+```
+Detecting suspicious spacing errors in the current directory, /path/to/source/include/graphblas/nonblocking
+ spaces, followed by end-of-line...
+ tabs, followed by end-of-line...
+ spaces followed by a tab...
+$
+```
+
+Seeing no `grep` output between the noted patterns (or between the last noted
+pattern and the prompt) means that no such patterns have been found within any
+source file in the current directory, including source files in subdirectories
+of the current path.
+
+
+# Coding patterns for general code quality
+
+Some major coding rules for maintaining high code quality include:
+
+1. files always display the copyright and license header, and document the
+   initial author information and the date of file creation;
+
+2. limit the use of macros and, in particular, never leak macro definitions to
+   user code;
+
+3. do not use `using` in a way that leaks to user code-- in particular,
+   never use it in headers;
+
+4. separate includes by their source -- e.g., a group of STL includes followed
+ by a group of internal utility header includes, and so on;
+
+5. code documentation uses [doxygen](https://www.doxygen.nl/) format, and in
+ particular the [Javadoc](https://www.doxygen.nl/manual/docblocks.html#cppblock)
+ style;
+
+6. use `constexpr` fields or functions in favour of any pre-processor macros,
+   and avoid global constants, especially those that leak to user code (see
+   the sketch after this list);
+
+7. performance parameters are never hardcoded but instead embedded (and
+ documented!) into the applicable `config.hpp` file.
+
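+As a minimal sketch of rules #6 and #7-- the names below are hypothetical and
+merely illustrate the pattern of the applicable `config.hpp` files:
+
+```c++
+#include <cstddef>
+
+namespace grb {
+
+	namespace config {
+
+		/**
+		 * A hypothetical, documented performance parameter: the default
+		 * tile size. It is embedded here rather than hardcoded at its
+		 * point of use.
+		 */
+		class TileSize {
+
+			public:
+
+				// a constexpr function instead of a pre-processor macro (rule #6)
+				static constexpr size_t value() { return 512; }
+
+		};
+
+	} // end namespace grb::config
+
+} // end namespace grb
+```
+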
+
+# Building and Testing infrastructure
+
+To use the build and test infrastructure, see the [main README](../README.md).
To modify it, you should refer to the
[dedicated documentation](Build_and_test_infra.md).
+
+
+## Testing before committing
+
+A careful committer may wish to run smoke or unit tests before committing to the
+main repository. Such developers should take note of the script contained
+in the tests directory, `tests/summarise.sh`, which may be used to quickly
+analyse a test log file: it summarises how many tests have passed, how many
+have been skipped, and how many have failed.
+
+Additionally, if at least one test has failed, or if none of the tests have
+succeeded (indicating perhaps a build error), then the entire log will be
+`cat`-ted.
+
+A common use is to, in one terminal, execute:
+
+```bash
+$ cd build
+$ make -j88 smoketests &> smoketests.log
+```
+
+While the above command runs, in another terminal, to execute:
+
+```bash
+$ cd build
+$ watch ../tests/summarise.sh smoketests.log
+```
+
+The second terminal then gives "live" feedback on the progress of the tests.
+
+## Continuous integration
+
+GitHub Actions have been deployed to run smoke tests using both performance and
+debug flags. These tests are run on standard images that do not include the
+datasets that some smoke tests require-- those tests are hence skipped.
+
+A CI internal to the Computing Systems Lab at the Huawei Zurich Research Center
+exists, but can only be triggered by its employees. This CI also performs unit
+tests, in addition to smoke tests. At present, however, it too does *not* employ
+images that have the required datasets embedded or accessible.
+
+The `develop` and `master` branches are tested by the internal CI on a regular
+schedule, in addition to being triggered on every push, and run a more
+comprehensive combination of test suites and compilation (debug/release) flags.
+Release candidate branches (i.e., branches with names that match the wild-card
+expression `*-rc*`) are also subject to the same, more extensive, test suite.
+
+All CI tests at present skip tests that require data sets; developers are
+therefore advised to run local tests manually, at least once, before flagging a
+merge request as ready and requesting a review. Even if at some point the CI
+does provide data sets, the practice of developers self-checking MRs remains
+recommended, as it naturally also induces greater robustness across compilers
+and distributions.
+
diff --git a/docs/Nonblocking_backend.md b/docs/Nonblocking_backend.md
new file mode 100644
index 000000000..f791b36d0
--- /dev/null
+++ b/docs/Nonblocking_backend.md
@@ -0,0 +1,921 @@
+
+
+ Copyright 2021 Huawei Technologies Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+
+
+# Design and implementation of the nonblocking backend
+
+The [C API specification](https://graphblas.org/docs/GraphBLAS_API_C_v1.3.0.pdf) of [GraphBLAS](https://graphblas.org) defines two execution modes: blocking execution and nonblocking execution. In the blocking mode, the invocation of an operation implies that the computation is completed and the result is written to memory when the function returns. The nonblocking execution allows an operation to return although the result has not been computed yet. Therefore, the nonblocking execution may delay the execution of some operations to perform optimisations. Lazy evaluation is the key idea in nonblocking execution, and computations are performed only when they are required for the sound execution of a program.
+
+For the description of the full design and experimental results for nonblocking execution in ALP/GraphBLAS, please read the following publications.
+
+* A. Mastoras, S. Anagnostidis, and A. N. Yzelman, "Design and Implementation for Nonblocking Execution in GraphBLAS: Tradeoffs and Performance," ACM Trans. Archit. Code Optim. 20, 1, Article 6 (March 2023), 23 pages, [https://doi.org/10.1145/3561652](https://doi.org/10.1145/3561652)
+* A. Mastoras, S. Anagnostidis, and A. N. Yzelman, "Nonblocking execution in GraphBLAS," 2022 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW), 2022, pp. 230-233, doi: [10.1109/IPDPSW55747.2022.00051](https://doi.org/10.1109/IPDPSW55747.2022.00051).
+
+ALP/GraphBLAS provides the `nonblocking` backend that performs multi-threaded nonblocking execution on shared-memory systems. The implementation of the `nonblocking` backend relies on that of the `reference` and `reference_omp` backends that perform sequential and multi-threaded blocking execution, respectively.
+
+
+## Overview of the source files
+
+The source files for the `nonblocking` backend are maintained under the `src/graphblas/nonblocking` directory, and the header files are maintained under `include/graphblas/nonblocking`. Most of these files also exist for the `reference` backend, while the `nonblocking` backend uses some additional files. In particular, the full list of the source files for the `nonblocking` backend is the following:
+
+* `analytic_model.cpp`
+* `init.cpp` (relies on `reference/init.cpp`)
+* `io.cpp`
+* `lazy_evaluation.cpp`
+* `pipeline.cpp`
+
+of which `analytic_model.cpp`, `lazy_evaluation.cpp`, and `pipeline.cpp` exist only for the `nonblocking` backend; they are the main source files for the implementation of nonblocking execution. The `init.cpp` file invokes the corresponding functions of the `reference` backend. The header files of the `nonblocking` backend include:
+
+* `alloc.hpp` (delegates to `reference/alloc.hpp`)
+* `analytic_model.hpp`
+* `benchmark.hpp` (delegates to `reference/benchmark.hpp`)
+* `blas1.hpp`
+* `blas2.hpp`
+* `blas3.hpp`
+* `boolean_dispathcer_blas1.hpp`
+* `boolean_dispathcer_blas2.hpp`
+* `boolean_dispathcer_io.hpp`
+* `collectives.hpp` (delegates to `reference/collectives.hpp`)
+* `config.hpp`
+* `coordinates.hpp`
+* `exec.hpp` (delegates to `reference/exec.hpp`)
+* `forward.hpp`
+* `init.hpp`
+* `io.hpp`
+* `lazy_evaluation.hpp`
+* `matrix.hpp`
+* `pinnedVector.hpp`
+* `pipeline.hpp`
+* `properties.hpp`
+* `spmd.hpp` (delegates to `reference/spmd.hpp`)
+* `vector.hpp` (relies on `reference/vector.hpp`)
+* `vector_wrapper.hpp`
+
+of which `analytic_model.hpp`, `boolean_dispathcer_blas1.hpp`, `boolean_dispathcer_blas2.hpp`, `boolean_dispathcer_io.hpp`, `lazy_evaluation.hpp`, `pipeline.hpp`, and `vector_wrapper.hpp` are used only by the `nonblocking` backend.
+The current implementation supports nonblocking execution only for level-1 and level-2 operations defined in the following files:
+
+* `nonblocking/io.hpp`
+* `nonblocking/blas1.hpp`
+* `nonblocking/blas2.hpp`
+
+and thus most of the code for the nonblocking execution is found in these three files. The level-3 operations defined in `blas3.hpp` and some defined in `blas2.hpp` incur blocking behaviour. If a program invokes these primitives while compiled using the nonblocking backend, a warning will be emitted to the standard error stream. Please check regularly for future releases that enable native nonblocking execution for these remaining primitives.
+
+
+## Lazy evaluation
+
+Lazy evaluation enables the loop fusion and loop tiling optimisations in a pure library implementation such as is required by ALP/GraphBLAS. Dynamic data dependence analysis identifies operations that share data, and these operations are added as stages of the same pipeline. Operations grouped into the same pipeline may be executed in parallel and reuse data in cache. The design for nonblocking execution is fully dynamic, since the optimisations are performed at run-time and pipelines may include operations under arbitrary control flow. The nonblocking execution is also fully automatic, since the performance parameters, i.e., the number of threads and the tile size, are selected based on an analytic model (defined in `analytic_model.cpp`).
+
+To illustrate lazy evaluation for the nonblocking backend, we use the `grb::set` operation that initialises all the elements of the output vector `x` with the value of an input scalar `val`. The code below shows the implementation of `grb::set` for the `reference` and `reference_omp` backends found in `reference/io.hpp`.
+
+```cpp
+template<
+ Descriptor descr = descriptors::no_operation,
+ typename DataType, typename T,
+ typename Coords
+>
+RC set(
+ Vector< DataType, reference, Coords > &x,
+ const T val,
+ ...
+) {
+ ...
+
+ const size_t n = size( x );
+ if( (descr & descriptors::dense) && nnz( x ) < n ) {
+ return ILLEGAL;
+ }
+
+ const DataType toCopy = static_cast< DataType >( val );
+
+ if( !(descr & descriptors::dense) ) {
+ internal::getCoordinates( x ).assignAll();
+ }
+ DataType * const raw = internal::getRaw( x );
+
+#ifdef _H_GRB_REFERENCE_OMP_IO
+ #pragma omp parallel
+ {
+ size_t start, end;
+ config::OMP::localRange( start, end, 0, n );
+#else
+ const size_t start = 0;
+ const size_t end = n;
+#endif
+ for( size_t i = start; i < end; ++ i ) {
+ raw[ i ] = internal::template ValueOrIndex< descr, DataType, DataType >::getFromScalar( toCopy, i );
+ }
+#ifdef _H_GRB_REFERENCE_OMP_IO
+ }
+#endif
+
+ assert( internal::getCoordinates( x ).nonzeroes() ==
+ internal::getCoordinates( x ).size() );
+
+ return SUCCESS;
+}
+```
+
+A typical operation of ALP/GraphBLAS includes a main for loop that iterates over all the elements (or only the nonzeroes) of the containers to perform the required computation. One additional step is to check that the `dense` descriptor is correctly used, i.e., that none of the input and output vectors are sparse; otherwise, the error code `grb::ILLEGAL` is returned. It is also necessary to properly assign the coordinates of the output vector. In the case of the `grb::set` operation, the raw data of the output vector are initialised with the value of the input scalar within the body of the main loop. The check for the correct usage of the `dense` descriptor is performed before the main loop, and all the coordinates of the output vector are assigned by invoking `assignAll`. That is, the initialisation of the coordinates is performed in one step, since the output vector will be dense after the completion of this operation. If the `dense` descriptor is given by the user, the vector is assumed to be already dense, and thus the invocation of `assignAll` is omitted.
+
+To implement lazy evaluation in the ALP/GraphBLAS library implementation, the code of an operation is not necessarily executed when the corresponding function is invoked. Instead, the loop is added into a lambda function that corresponds to a stage of a pipeline, and the lambda function is stored and executed later. Lambda functions are an implementation decision that meshes well with template-based programming in ALP/GraphBLAS. The code below shows the implementation of the `grb::set` operation discussed above for the corresponding nonblocking implementation defined in `nonblocking/io.hpp`.
+
+```cpp
+template<
+ Descriptor descr = descriptors::no_operation,
+ typename DataType, typename T,
+ typename Coords
+>
+RC set(
+ Vector< DataType, nonblocking, Coords > &x, const T val,
+ ...
+) {
+ ...
+
+ RC ret = SUCCESS;
+
+ const DataType toCopy = static_cast< DataType >( val );
+ DataType * const raw = internal::getRaw( x );
+ const size_t n = internal::getCoordinates( x ).size();
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func = [&x, toCopy, raw] (
+ internal::Pipeline &pipeline, size_t active_chunk_id, size_t max_num_chunks, size_t lower_bound, size_t upper_bound
+ ) {
+ (void) active_chunk_id;
+ (void) max_num_chunks;
+
+ const bool already_dense_vectors = dense_descr || pipeline.allAlreadyDenseVectors();
+
+ if( !already_dense_vectors ) {
+ bool already_dense_output = pipeline.containsAlreadyDenseContainer( &internal::getCoordinates( x ) );
+ if( !already_dense_output ) {
+ Coords local_x = internal::getCoordinates( x ).asyncSubset( active_chunk_id, max_num_chunks, lower_bound, upper_bound );
+
+ local_x.local_assignAllNotAlreadyAssigned();
+ assert( local_x.nonzeroes() == local_x.size() );
+
+ internal::getCoordinates( x ).asyncJoinSubset( local_x, active_chunk_id, max_num_chunks );
+ }
+ }
+
+ for( size_t i = lower_bound; i < upper_bound; i++ ) {
+ raw[ i ] = internal::template ValueOrIndex< descr, DataType, DataType >::getFromScalar( toCopy, i );
+ }
+
+ return SUCCESS;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ), internal::Opcode::IO_SET_SCALAR,
+ n, sizeof( DataType ), dense_descr, true,
+ &x, nullptr,
+ &internal::getCoordinates( x ), nullptr,
+ nullptr, nullptr, nullptr, nullptr,
+ nullptr, nullptr, nullptr, nullptr
+ );
+
+ return ret;
+}
+```
+
+The implementation of `grb::set` for the `nonblocking` backend is very similar to that of the `reference` and `reference_omp` backends. In particular, a lambda function is defined for the execution of a subset of consecutive iterations of the initial loop determined by the `lower_bound` and `upper_bound` parameters. Therefore, the main loop iterates from `lower_bound` to `upper_bound` to initialise the raw data of the output vector. The main difference between the `nonblocking` backend and the `reference` backend is the way the coordinates are handled. First, it is impossible to check if the `dense` descriptor is correctly given in the beginning of an operation, because the computation may not be completed yet due to lazy evaluation and the number of nonzeroes of a vector may not be up to date. Therefore, the check for the `dense` descriptor must be moved into the lambda function. However, the coordinates used by the `nonblocking` backend require a different mechanism than that used by the `reference` backend. The design of the coordinates mechanism for the `nonblocking` backend is presented in the next section.
+
+
+## Handling sparse vectors
+
+Vectors in ALP/GraphBLAS may be either sparse or dense. In the case of dense vectors, each operation accesses all the elements as shown above with the example of `grb::set`. However, to efficiently handle sparsity, it is necessary to maintain the coordinates of the nonzeroes, such that ALP/GraphBLAS operations access only the nonzeroes. Hence, each vector includes a so-called Sparse Accumulator (SPA), consisting of the following data to handle sparsity:
+
+* an unsigned integer `_cap` that stores the size of the vector;
+* an unsigned integer `_n` that stores the number of nonzeroes in the vector;
+* a boolean array, `_assigned`, of size `_cap`, that indicates whether the element at a given coordinate is a nonzero; and
+* an unsigned integer array, `_stack`, that represents a stack and stores the coordinates of the assigned elements.
+
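+Schematically-- this is an illustration, not the actual class layout:
+
+```cpp
+#include <cstddef>
+#include <vector>
+
+// a sketch of the SPA fields described above
+struct SPA {
+	size_t _cap;                   // size of the vector
+	size_t _n;                     // number of nonzeroes
+	std::vector< bool > _assigned; // _assigned[ i ]: is element i a nonzero?
+	std::vector< size_t > _stack;  // coordinates of the assigned elements
+};
+```
+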
+A vector is dense when the number of nonzeroes is equal to the size of the vector, i.e., `_n = _cap`.
+The stack and the `_assigned` array are used only when accessing a sparse vector.
+For an empty vector, `_n = 0`, all the elements of `_assigned` are initialised to `false`, and the stack is empty.
+The assignment of the i-th element of a vector implies that:
+```cpp
+_stack[_n] = i;
+_assigned[i] = true;
+_n++;
+```
+Therefore, the coordinates of the nonzeroes are not sorted; they are pushed to the stack in an arbitrary order. Iterating over the nonzeroes of a sparse vector is done via the stack, and thus access to the elements may happen in any order.
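+
+For illustration, a hypothetical fragment that visits every nonzero via the
+stack (assuming the fields above and a raw value array `raw`):
+
+```cpp
+for( size_t k = 0; k < _n; ++k ) {
+	const size_t i = _stack[ k ]; // coordinates appear in arbitrary order
+	process( raw[ i ] );          // hypothetical per-nonzero computation
+}
+```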
+
+The internal representation of a vector is sufficient to correctly and efficiently handle sparse vectors for sequential execution. However, this is not the case for multi-threaded execution, since simultaneous assignments of vector elements may cause data races. Protecting the stack and the counter of nonzeroes with a global lock is a trivial solution that leads to significant performance degradation. Therefore, it is necessary to design a different mechanism that is tailored to the needs of the nonblocking execution and exploits any information about accesses of elements by different threads.
+
+
+## Local coordinates mechanism
+
+The local coordinates mechanism is used for efficient handling of sparse vectors in parallel nonblocking execution and is implemented in `coordinates.hpp`. The local coordinates mechanism consists of a set of local views of the coordinates stored in the global stack. Each local view includes the coordinates of the nonzeroes for a tile of iterations; each thread accesses its own local coordinates, and any update to the sparsity structure of a vector is performed in the local view. The local coordinates mechanism requires initialisation of the local views before the execution of the pipeline, and an update of the global stack with the new nonzeroes after the execution of the pipeline.
+
+The local coordinates mechanism requires some additional data for each tile of a vector:
+
+* an unsigned integer array that stores the number of nonzeroes for each local view, which are read from the global stack during initialisation;
+* an unsigned integer array that stores the number of nonzeroes that were assigned to each local view during the execution of a pipeline;
+* a set of unsigned integer arrays that represent local stacks and store the local coordinates, i.e., each array corresponds to a different local view.
+
+The local coordinates mechanism relies on five main functions defined in `nonblocking/coordinates.hpp`. The local views are initialised via `asyncSubsetInit`. Each operation reads the state of the local view with `asyncSubset`, and it updates the state with `asyncJoinSubset` once the computation is completed. The invocation of `joinSubset` pushes the local coordinates to the global stack. None of these functions uses locks; to avoid data races, `joinSubset` updates the global stack based on a prefix sum over the per-tile counts of new nonzeroes, as computed by `prefixSumComputation`.
+
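+The following self-contained sketch-- deliberately simplified, and not the
+actual ALP code-- illustrates why an exclusive prefix sum over the per-tile
+counts of new nonzeroes lets each tile write a disjoint slice of the global
+stack without locks:
+
+```cpp
+#include <algorithm>
+#include <cstddef>
+#include <vector>
+
+// merge the per-tile local stacks into the global stack without locks
+void joinAllTiles(
+	std::vector< size_t > &globalStack, size_t &n,
+	const std::vector< std::vector< size_t > > &localStacks
+) {
+	// exclusive prefix sum over the per-tile counts of new nonzeroes
+	std::vector< size_t > offset( localStacks.size() + 1, 0 );
+	for( size_t t = 0; t < localStacks.size(); ++t ) {
+		offset[ t + 1 ] = offset[ t ] + localStacks[ t ].size();
+	}
+	globalStack.resize( n + offset.back() );
+	// each tile owns a disjoint slice, so these copies are free of data
+	// races and may execute in parallel
+	for( size_t t = 0; t < localStacks.size(); ++t ) {
+		std::copy( localStacks[ t ].begin(), localStacks[ t ].end(),
+			globalStack.begin() + n + offset[ t ] );
+	}
+	n += offset.back();
+}
+```
+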
+To illustrate the usage of the local coordinates mechanism in the `nonblocking` backend, we use the in-place `grb::foldl` operation shown below, which receives one output vector, one input vector and an operator.
+
+```cpp
+template<
+ Descriptor descr = descriptors::no_operation, class OP,
+ typename IOType, typename InputType, typename Coords
+>
+RC foldl(
+ Vector< IOType, nonblocking, Coords > &x,
+ const Vector< InputType, nonblocking, Coords > &y,
+ const OP &op = OP(),
+ ...
+) {
+ const size_t n = size( x );
+
+ ...
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func = [&x, &y, &op, phase] (
+ internal::Pipeline &pipeline,
+ const size_t active_chunk_id, const size_t max_num_chunks,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+ RC rc = SUCCESS;
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+ const Coords * const local_null_mask = nullptr;
+
+ Coords local_x, local_y;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_x_nz, local_y_nz;
+ bool sparse = false;
+
+ const bool already_dense_vectors = dense_descr || pipeline.allAlreadyDenseVectors();
+
+ bool already_dense_output = true;
+ bool already_dense_input = true;
+
+ if( !already_dense_vectors ) {
+ already_dense_output = pipeline.containsAlreadyDenseContainer( &internal::getCoordinates( x ) );
+ if( !already_dense_output ) {
+ local_x = internal::getCoordinates( x ).asyncSubset( active_chunk_id, max_num_chunks, lower_bound, upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( local_x_nz < local_n ) {
+ sparse = true;
+ }
+ }
+
+ already_dense_input = pipeline.containsAlreadyDenseContainer( &internal::getCoordinates( y ) );
+ if( !already_dense_input ) {
+ local_y = internal::getCoordinates( y ).asyncSubset( active_chunk_id, max_num_chunks, lower_bound, upper_bound );
+ local_y_nz = local_y.nonzeroes();
+ if( local_y_nz < local_n ) {
+ sparse = true;
+ }
+ }
+ }
+
+ if( sparse ) {
+ // performs the computation for the sparse case
+ ...
+ } else {
+ // performs the computation for the dense case
+ ...
+ }
+
+ if( !already_dense_output ) {
+ internal::getCoordinates( x ).asyncJoinSubset( local_x, active_chunk_id, max_num_chunks );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ), internal::Opcode::BLAS1_FOLD_VECTOR_VECTOR_GENERIC,
+ n, sizeof( IOType ), dense_descr, true,
+ &x, nullptr,
+ &internal::getCoordinates( x ), nullptr,
+ &y, nullptr, nullptr, nullptr,
+ &internal::getCoordinates( y ), nullptr, nullptr, nullptr
+ );
+
+ return ret;
+}
+```
+
+The state of the local view is read for each vector accessed in an operation by invoking `asyncSubset`. The sparsity structure may be updated only for the output vector, and thus `asyncJoinSubset` is invoked only for the output vector to update the number of new nonzeroes. Operations consider the dense and the sparse case, and the executed path is determined at run-time based on the sparsity structure of the local coordinates. To avoid the overhead of initialising the local views, the `nonblocking` backend performs compile-time and runtime optimisations discussed in the next section. Therefore, `asyncSubset` and `asyncJoinSubset` are conditionally invoked depending on whether the corresponding vectors are already dense.
+
+
+## Optimisations for dense vectors
+
+To improve the performance of nonblocking execution, it is crucial to avoid the usage of the local views when the vectors are dense. It is possible to determine whether a vector is dense based on compile-time information from descriptors and on runtime analysis. The former implies zero runtime overhead, but requires that the descriptors be provided by the user.
+
+There exist two main differences between the compile-time information from descriptors and the runtime analysis.
+First, descriptors may apply to all vectors of an operation, whereas the runtime analysis applies to each individual vector of an operation. Second, descriptors refer to the vectors of a specific operation, whereas the runtime analysis refers to the state of a vector before the execution of a pipeline.
+
+### Compile-time descriptors
+
+The ALP/GraphBLAS implementation provides a set of descriptors defined in `include/graphblas/descriptors.hpp`, and they may be combined using bit-wise operators.
+A descriptor is passed to an operation and indicates some information about some or all of the output and input containers, e.g., vectors and matrices.
+Three of these descriptors are the following:
+
+* `dense` to indicate that all input and output vectors are structurally dense before the invocation;
+* `structural` that ignores the values of the mask and uses only its structure, i.e., the i-th element evaluates to true if any value is assigned to it; and
+* `invert_mask` that inverts the mask.
+
+The `dense` and `structural` descriptors may affect both correctness and performance, while `invert_mask` affects only the correctness of an operation. These three descriptors may be used to perform optimisations for the local coordinates mechanism. In particular, if the `dense` descriptor is provided, it implies that all the vectors accessed in an operation are dense before the invocation. Therefore, an operation can safely iterate over all the elements of the vectors without using either the global or the local coordinates.
+
+One exception is an out-of-place operation that receives a mask, since the `dense` descriptor itself does not guarantee that all the elements of a dense mask evaluate to true. Therefore, a dense output vector may become sparse once the computation is completed. That is, the output vector becomes empty at the beginning of the operation, and then each of its coordinates may be assigned depending on whether the corresponding element of the mask evaluates to true or not. Reading the elements of a mask does not require usage of the local coordinates when the `dense` descriptor is given. However, to avoid the usage of the local coordinates for the output vector of an out-of-place operation that receives a mask, the `structural` descriptor should be given, and the `invert_mask` descriptor should *not* be given, in addition to the `dense` descriptor (cf. the `dense_mask` condition in the masked `grb::eWiseApply` example below).
+
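+To make the above concrete, descriptors combine via bit-wise OR at compile
+time. A hypothetical fragment-- assuming `z`, `mask`, `alpha`, `y`, and
+`monoid` are set up elsewhere-- that promises dense operands and a structural,
+non-inverted mask could read:
+
+```cpp
+constexpr grb::Descriptor desc =
+	grb::descriptors::dense | grb::descriptors::structural;
+
+grb::RC rc = grb::eWiseApply< desc >( z, mask, alpha, y, monoid );
+```
+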
+### Runtime analysis
+
+The runtime analysis for dense vectors relies on a simple property of ALP/GraphBLAS. A vector that is already dense before the execution of a pipeline cannot become sparse during the execution of the pipeline unless the pipeline contains an out-of-place operation, i.e., `grb::set`, `grb::eWiseApply`, or `grb::clear` that makes the vector empty. The current design for nonblocking execution in ALP/GraphBLAS allows pipelines that include an out-of-place operation but does not allow pipelines that include the `grb::clear` operation.
+
+The nonblocking execution relies on the runtime analysis to determine whether a vector is already dense before the execution of a pipeline, only when the `dense` descriptor is not given by the user. For each already dense vector of a pipeline, neither the global nor the local coordinates are used unless the vector is the output of an out-of-place operation. Therefore, the overhead of the local coordinates mechanism is completely avoided.
+
+### Implementation of the optimisation
+
+To illustrate the implementation of the compile-time and runtime optimisations for dense vectors, we use one example of an in-place and one example of an out-of-place operation.
+The runtime analysis relies on the `allAlreadyDenseVectors` function that returns `true` when all the vectors accessed in a pipeline are already dense, and on `containsAlreadyDenseContainer` that returns `true` when a specific vector accessed in a pipeline is already dense.
+
+#### In-place operations
+
+In the case of an in-place operation, we use the example of the `grb::foldl` operation discussed earlier.
+The code below is included in the lambda function of `grb::foldl`.
+
+```cpp
+const bool already_dense_vectors = dense_descr || pipeline.allAlreadyDenseVectors();
+
+bool already_dense_output = true;
+bool already_dense_input = true;
+
+if( !already_dense_vectors ) {
+ already_dense_output = pipeline.containsAlreadyDenseContainer( &internal::getCoordinates( x ) );
+ if( !already_dense_output ) {
+ local_x = internal::getCoordinates( x ).asyncSubset( active_chunk_id, max_num_chunks, lower_bound, upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( local_x_nz < local_n ) {
+ sparse = true;
+ }
+ }
+
+ already_dense_input = pipeline.containsAlreadyDenseContainer( &internal::getCoordinates( y ) );
+ if( !already_dense_input ) {
+ local_y = internal::getCoordinates( y ).asyncSubset( active_chunk_id, max_num_chunks, lower_bound, upper_bound );
+ local_y_nz = local_y.nonzeroes();
+ if( local_y_nz < local_n ) {
+ sparse = true;
+ }
+ }
+}
+
+...
+
+if( !already_dense_output ) {
+ internal::getCoordinates( x ).asyncJoinSubset( local_x, active_chunk_id, max_num_chunks );
+}
+```
+
+The variable `already_dense_vectors` indicates whether all the vectors accessed in this operation are already dense based on compile-time or runtime information.
+In addition, one variable is declared for each vector to indicate whether a vector is already dense, i.e., the variables `already_dense_output` and `already_dense_input` are initialised to `true`, assuming that the vectors are already dense.
+If `already_dense_vectors` evaluates to `true`, the state of the local views is not read, and the assumption that the vectors are already dense holds.
+Otherwise, it is necessary to check whether each vector accessed in the operation is already dense; if this is not the case, the state of the local view is read by invoking `asyncSubset`.
+The state of the local view is updated via `asyncJoinSubset` once the computation is completed, and only when the output vector is not already dense.
+
+#### Out-of-place operations
+
+For the implementation of the optimisation for dense vectors of an out-of-place operation, we use the example of the `grb::eWiseApply` operation defined in `blas1.hpp`.
+There exist four main scenarios we need to consider, depending on whether the output vector for a tile needs to become empty, dense, or both empty and dense, and whether the operation receives a mask.
+
+##### Out-of-place operation with a potentially sparse output vector
+
+In the case that the input consists of three vectors, the output vector will have an a-priori unknown sparsity structure.
+Therefore, unless all vectors are already dense, it is necessary to initialise the state of the output vector via `asyncSubset` and clear the coordinates of each local view by invoking `local_clear`.
+In contrast to an in-place operation, the decision about reading and updating the state of the output vector does not depend on whether the output vector is already dense,
+since an already dense output vector may become sparse depending on the sparsity structure of the input vectors.
+
+Since the current design for nonblocking execution does not allow the number of nonzeroes to decrease, it is necessary to reset the global counter of nonzeroes by invoking `reset_global_nnz_counter`.
+The `local_clear` function properly updates the number of new nonzeroes that should later be written to the global stack by `joinSubset`, i.e., all the nonzeroes of the local view are considered new.
+In addition, the output vector is marked as potentially sparse by invoking `markMaybeSparseContainer`.
+Both `reset_global_nnz_counter` and `markMaybeSparseContainer` are invoked only by the thread that executes the first tile, i.e., when `lower_bound = 0`.
+
+```cpp
+template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+>
+RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ ...
+) {
+ const size_t n = internal::getCoordinates( z ).size();
+
+ ...
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func = [&z, &x, &y, &monoid, phase] (
+ internal::Pipeline &pipeline,
+ const size_t active_chunk_id, const size_t max_num_chunks,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+ RC rc = SUCCESS;
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+ const Coords * const local_null_mask = nullptr;
+
+ Coords local_x, local_y, local_z;
+
+ const bool already_dense_vectors = dense_descr || pipeline.allAlreadyDenseVectors();
+
+ bool already_dense_input_x = true;
+ bool already_dense_input_y = true;
+
+ if( !already_dense_vectors ) {
+ local_z = internal::getCoordinates( z ).asyncSubset( active_chunk_id, max_num_chunks, lower_bound, upper_bound );
+
+ already_dense_input_x = pipeline.containsAlreadyDenseContainer( &internal::getCoordinates( x ) );
+ if( !already_dense_input_x ) {
+ local_x = internal::getCoordinates( x ).asyncSubset( active_chunk_id, max_num_chunks, lower_bound, upper_bound );
+ }
+
+ already_dense_input_y = pipeline.containsAlreadyDenseContainer( &internal::getCoordinates( y ) );
+ if( !already_dense_input_y ) {
+ local_y = internal::getCoordinates( y ).asyncSubset( active_chunk_id, max_num_chunks, lower_bound, upper_bound );
+ }
+ }
+
+ const internal::Wrapper< false, InputType1, Coords > x_wrapper( x );
+ const internal::Wrapper< false, InputType2, Coords > y_wrapper( y );
+
+ const auto op = monoid.getOperator();
+
+ if( !already_dense_vectors ) {
+ local_z.local_clear();
+ if( lower_bound == 0 ) {
+ internal::getCoordinates( z ).reset_global_nnz_counter();
+ pipeline.markMaybeSparseContainer( &internal::getCoordinates( z ) );
+ }
+ }
+
+ // performs the computation
+ ...
+
+ if( !already_dense_vectors ) {
+ internal::getCoordinates( z ).asyncJoinSubset( local_z, active_chunk_id, max_num_chunks );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ), internal::Opcode::BLAS1_EWISEAPPLY,
+ n, sizeof( OutputType ), dense_descr, true,
+ &z, nullptr,
+ &internal::getCoordinates( z ), nullptr,
+ &x, &y, nullptr, nullptr,
+ &internal::getCoordinates( x ), &internal::getCoordinates( y ), nullptr, nullptr
+ );
+
+ return ret;
+}
+```
+
+##### Out-of-place operation with a dense output vector
+
+In the case that the input consists of a scalar and a monoid, it is guaranteed that the output vector will be dense.
+Therefore, the only criterion to avoid the usage of the local views is whether the output vector is already dense.
+If the output vector is not already dense, then the state of the local view is read, all not-yet-assigned coordinates are assigned by invoking `local_assignAllNotAlreadyAssigned`, and the state is updated via `asyncJoinSubset`.
+
+```cpp
+
+template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+>
+RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const InputType1 alpha,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ ...
+) {
+ const size_t n = internal::getCoordinates( z ).size();
+
+ ...
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func = [&z, alpha, &y, &monoid] (
+ internal::Pipeline &pipeline,
+ const size_t active_chunk_id, const size_t max_num_chunks,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+ RC rc = SUCCESS;
+
+ Coords local_x, local_y, local_z;
+
+ const bool already_dense_vectors = dense_descr || pipeline.allAlreadyDenseVectors();
+
+ bool already_dense_output = true;
+ bool already_dense_input_y = true;
+
+ already_dense_output = pipeline.containsAlreadyDenseContainer( &internal::getCoordinates( z ) );
+ if( !already_dense_output ) {
+ local_z = internal::getCoordinates( z ).asyncSubset( active_chunk_id, max_num_chunks, lower_bound, upper_bound );
+ }
+
+ if( !already_dense_vectors ) {
+ already_dense_input_y = pipeline.containsAlreadyDenseContainer( &internal::getCoordinates( y ) );
+ if( !already_dense_input_y ) {
+ local_y = internal::getCoordinates( y ).asyncSubset( active_chunk_id, max_num_chunks, lower_bound, upper_bound );
+ }
+ }
+
+ const internal::Wrapper< true, InputType1, Coords > x_wrapper( alpha );
+ const internal::Wrapper< false, InputType2, Coords > y_wrapper( y );
+
+ const auto &op = monoid.getOperator();
+
+ if( !already_dense_output ) {
+ local_z.local_assignAllNotAlreadyAssigned();
+ }
+
+ // performs the computation
+ ...
+
+ if( !already_dense_output ) {
+ internal::getCoordinates( z ).asyncJoinSubset( local_z, active_chunk_id, max_num_chunks );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ), internal::Opcode::BLAS1_EWISEAPPLY,
+ n, sizeof( OutputType ), dense_descr, true,
+ &z, nullptr,
+ &internal::getCoordinates( z ), nullptr,
+ &y, nullptr, nullptr, nullptr,
+ &internal::getCoordinates( y ), nullptr, nullptr, nullptr
+ );
+
+ return ret;
+}
+```
+
+##### Out-of-place operation with an output vector that consists of some potentially sparse tiles and some dense tiles
+
+In the case that the input consists of an operator instead of a monoid, the output vector may become sparse after the computation unless all vectors are already dense.
+Therefore, the global counter of nonzeroes is reset, and the decision about clearing the local coordinates or assigning all of them is made separately for each local view.
+The vector is marked as potentially sparse when the local coordinates are cleared for at least one of the tiles.
+
+```cpp
+template<
+ Descriptor descr = descriptors::no_operation, class OP,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+>
+RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const InputType1 alpha,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const OP &op = OP(),
+ ...
+) {
+ const size_t n = internal::getCoordinates( z ).size();
+
+ ...
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func = [&z, alpha, &y, &op] (
+ internal::Pipeline &pipeline,
+ const size_t active_chunk_id, const size_t max_num_chunks,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+ RC rc = SUCCESS;
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+ const Coords * const local_null_mask = nullptr;
+
+ Coords local_mask, local_x, local_y, local_z;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_y_nz = local_n;
+
+ const bool already_dense_vectors = dense_descr || pipeline.allAlreadyDenseVectors();
+
+ bool already_dense_input_y = true;
+
+ if( !already_dense_vectors ) {
+ local_z = internal::getCoordinates( z ).asyncSubset( active_chunk_id, max_num_chunks, lower_bound, upper_bound );
+
+ already_dense_input_y = pipeline.containsAlreadyDenseContainer( &internal::getCoordinates( y ) );
+ if( !already_dense_input_y ) {
+ local_y = internal::getCoordinates( y ).asyncSubset( active_chunk_id, max_num_chunks, lower_bound, upper_bound );
+ local_y_nz = local_y.nonzeroes();
+ }
+ }
+
+ const internal::Wrapper< true, InputType1, Coords > x_wrapper( alpha );
+ const internal::Wrapper< false, InputType2, Coords > y_wrapper( y );
+
+ if( !already_dense_vectors ) {
+ if( lower_bound == 0 ) {
+ internal::getCoordinates( z ).reset_global_nnz_counter();
+ }
+ }
+
+ if( (descr & descriptors::dense) || local_y_nz == local_n ) {
+ if( !already_dense_vectors ) {
+ local_z.local_assignAll( );
+ }
+
+ // performs the computation for the dense case
+ ...
+ } else {
+ if( !already_dense_vectors ) {
+ local_z.local_clear();
+ pipeline.markMaybeSparseContainer( &internal::getCoordinates( z ) );
+ }
+
+ // performs the computation for the sparse case
+ ...
+ }
+
+ if( !already_dense_vectors ) {
+ internal::getCoordinates( z ).asyncJoinSubset( local_z, active_chunk_id, max_num_chunks );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ), internal::Opcode::BLAS1_EWISEAPPLY,
+ n, sizeof( OutputType ), dense_descr, true,
+ &z, nullptr,
+ &internal::getCoordinates( z ), nullptr,
+ &y, nullptr, nullptr, nullptr,
+ &internal::getCoordinates( y ), nullptr, nullptr, nullptr
+ );
+
+ return ret;
+}
+```
+
+##### Out-of-place operation that receives a mask
+
+In the case that an out-of-place operation receives a mask, a second variable, `mask_is_dense`, is used to indicate whether the mask is dense based on compile-time information from descriptors or the runtime analysis for already dense vectors.
+Then, all the decisions about the output vector are made based on this variable.
+In addition, the function `markMaybeSparseDenseDescriptorVerification` is invoked to mark the output vector as potentially sparse when the `dense` descriptor is provided and the elements of the mask may evaluate to `false`, as explained in the section about the dense descriptor verification.
+
+```cpp
+template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+>
+RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const InputType1 alpha,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ ...
+) {
+ const size_t n = internal::getCoordinates( z ).size();
+
+ ...
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+ constexpr const bool dense_mask = dense_descr && (descr & descriptors::structural) && !(descr & descriptors::invert_mask);
+
+ internal::Pipeline::stage_type func = [&z, &mask, alpha, &y, &monoid] (
+ internal::Pipeline &pipeline,
+ const size_t active_chunk_id, const size_t max_num_chunks,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+ RC rc = SUCCESS;
+
+ Coords local_mask, local_x, local_y, local_z;
+ const size_t local_n = upper_bound - lower_bound;
+
+ const bool already_dense_vectors = dense_descr || pipeline.allAlreadyDenseVectors();
+
+ const bool mask_is_dense = (descr & descriptors::structural) &&
+ !(descr & descriptors::invert_mask) && already_dense_vectors;
+
+ bool already_dense_mask = true;
+ bool already_dense_input_y = true;
+
+ if( !mask_is_dense ) {
+ local_z = internal::getCoordinates( z ).asyncSubset( active_chunk_id, max_num_chunks, lower_bound, upper_bound );
+ if( dense_descr && local_z.nonzeroes() < local_n ) {
+ return ILLEGAL;
+ }
+ }
+
+ if( !already_dense_vectors ) {
+ already_dense_mask = pipeline.containsAlreadyDenseContainer( &internal::getCoordinates( mask ) );
+ if( !already_dense_mask ) {
+ local_mask = internal::getCoordinates( mask ).asyncSubset( active_chunk_id, max_num_chunks, lower_bound, upper_bound );
+ }
+
+ already_dense_input_y = pipeline.containsAlreadyDenseContainer( &internal::getCoordinates( y ) );
+ if( !already_dense_input_y ) {
+ local_y = internal::getCoordinates( y ).asyncSubset( active_chunk_id, max_num_chunks, lower_bound, upper_bound );
+ }
+ }
+
+ const internal::Wrapper< true, InputType1, Coords > x_wrapper( alpha );
+ const internal::Wrapper< false, InputType2, Coords > y_wrapper( y );
+
+ const InputType2 right_identity = monoid.template getIdentity< InputType2 >();
+ const auto &op = monoid.getOperator();
+
+ if( !mask_is_dense ) {
+ local_z.local_clear();
+ if( lower_bound == 0 ) {
+ internal::getCoordinates( z ).reset_global_nnz_counter();
+ pipeline.markMaybeSparseContainer( &internal::getCoordinates( z ) );
+ if( dense_descr ) {
+ pipeline.markMaybeSparseDenseDescriptorVerification( &internal::getCoordinates( z ) );
+ }
+ }
+ }
+
+ // performs the computation
+ ...
+
+ if( !mask_is_dense ) {
+ internal::getCoordinates( z ).asyncJoinSubset( local_z, active_chunk_id, max_num_chunks );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ), internal::Opcode::BLAS1_MASKED_EWISEAPPLY,
+ n, sizeof( OutputType ), dense_descr, dense_mask,
+ &z, nullptr,
+ &internal::getCoordinates( z ), nullptr,
+ &y, &mask, nullptr, nullptr,
+ &internal::getCoordinates( y ), &internal::getCoordinates( mask ), nullptr, nullptr
+ );
+
+ return ret;
+}
+```
+
+
+## Pipeline execution
+
+The nonblocking execution in ALP/GraphBLAS expresses operations as a linear sequence of stages that form a pipeline. A pipeline is executed only when its results are required for the sound execution of the program. Opaqueness guarantees that lazy evaluation is safe whenever the output of an operation is a container, i.e., a vector or a matrix. The current version of ALP/GraphBLAS follows [version 1.3.0](https://graphblas.org/docs/GraphBLAS_API_C_v1.3.0.pdf) of the C API specification and therefore does not implement scalars as opaque data types. Opaque scalars were introduced later, in [version 2.0.0](https://graphblas.org/docs/GraphBLAS_API_C_v2.0.0.pdf), and may further improve the performance of nonblocking execution.
+
+A pipeline must be executed in the following cases:
+
+* the user explicitly extracts data from a container by using the ALP/GraphBLAS API, e.g., when reading the elements of a vector by using iterators;
+
+* the user invokes the constructor of a container;
+
+* memory is deallocated due to a destructor invocation;
+
+* the invoked operation returns a scalar, e.g., the `grb::dot` operation; in this case, the operation is first added to the pipeline, and the pipeline is then executed immediately before the scalar is returned;
+
+* a sparse matrix–vector multiplication (SpMV) operation is added to a pipeline together with another operation that overwrites the input vector of the SpMV;
+
+* the user explicitly forces the execution of a pipeline via a call to `grb::wait`.
+
+Although level-3 operations are not yet implemented for nonblocking execution, a sparse matrix–sparse matrix multiplication (SpMSpM) operation implies the same constraint as SpMV, i.e., an SpMSpM operation cannot be fused with another operation that overwrites any of its input matrices.
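+
+As an illustration of these triggers, consider the following minimal sketch; it assumes the nonblocking backend is selected, a vector size `n`, and a suitable semiring `ring` and binary operator `plus` in scope:
+
+```cpp
+grb::Vector< double > x( n ), y( n ), z( n );
+
+grb::set( x, 1.5 );               // added to a pipeline, not yet executed
+grb::set( y, 2.5 );               // added to the same pipeline
+grb::eWiseApply( z, x, y, plus ); // still only recorded
+
+double alpha = 0.0;
+grb::dot( alpha, x, y, ring );    // returns a scalar: the pipeline executes
+                                  // immediately before alpha is returned
+
+for( auto it = z.cbegin(); it != z.cend(); ++it ) {
+	// reading z via iterators would likewise have forced execution
+}
+```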
+
+When a new stage is added to a pipeline, any pipeline execution this triggers is performed within the `addStage` function of `lazy_evaluation.cpp`, which implements the dynamic data dependence analysis and identifies any data shared between operations. Pipeline executions caused by the explicit use of iterators, by constructor invocations, or by memory deallocation are performed in `vector.hpp`. The execution of a pipeline caused by `grb::wait` is implemented in `io.hpp`.
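+
+A minimal sketch of the kind of check such a dependence analysis performs is given below; the function name and the flat set-based bookkeeping are illustrative only and do not mirror the actual implementation:
+
+```cpp
+#include <set>
+
+// hypothetical: a pipeline remembers (pointers to) the containers that its
+// recorded stages read or write; a new stage depends on the pipeline exactly
+// when their container sets intersect
+bool sharesData(
+	const std::set< const void * > &pipeline_containers,
+	const std::set< const void * > &stage_containers
+) {
+	for( const void * const c : stage_containers ) {
+		if( pipeline_containers.count( c ) > 0 ) {
+			return true;
+		}
+	}
+	return false;
+}
+```
+
+Conceptually, a stage that shares data with an existing pipeline is appended to it, while the unsafe cases listed above instead force the existing pipeline to execute first.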
+
+The code for the pipeline execution is found in the `execution` method of `pipeline.cpp`. The execution proceeds in four main steps, three of which may be omitted when the pipeline does not include any out-of-place operation and all accessed vectors are dense. Simplified code for these four steps is shown below.
+
+```cpp
+bool initialized_coordinates = false;
+
+#pragma omp parallel for private(vt, pt) schedule(dynamic) num_threads(nthreads)
+for( size_t tile_id = 0; tile_id < tiles; ++tile_id ) {
+ ...
+ for( vt = vbegin(); vt != vend(); ++vt ) {
+ ...
+ (**vt).asyncSubsetInit( tile_id, tiles, lower_bound, upper_bound );
+ initialized_coordinates = true;
+ }
+}
+
+#pragma omp parallel for private(vt, pt) schedule(dynamic) num_threads(nthreads)
+for( size_t tile_id = 0; tile_id < tiles; ++tile_id ) {
+ ...
+ RC local_ret = SUCCESS;
+ for( pt = pbegin(); pt != pend(); ++pt ) {
+ local_ret = local_ret ? local_ret : (*pt)( *this, tile_id, tiles, lower_bound, upper_bound );
+ }
+ if( local_ret != SUCCESS ) {
+ ret = local_ret;
+ }
+}
+
+if( initialized_coordinates ) {
+ bool new_nnz = false;
+
+ for( vt = vbegin(); vt != vend(); ++vt ) {
+ ...
+ if( (**vt).newNonZeroes( tiles ) ) {
+ new_nnz = true;
+ (**vt).prefixSumComputation( tiles );
+ }
+ }
+
+ if( new_nnz ) {
+ #pragma omp parallel for private(vt) schedule(dynamic) num_threads(nthreads)
+ for( size_t tile_id = 0; tile_id < tiles; ++tile_id ) {
+ ...
+ for( vt = vbegin(); vt != vend(); ++vt ) {
+ ...
+ if( (**vt).newNonZeroes( tiles ) ) {
+ (**vt).joinSubset( tile_id, tiles, lower_bound, upper_bound );
+ }
+ }
+ }
+ }
+}
+```
+The local views of each vector accessed in the pipeline are initialised via `asyncSubsetInit`, and then the pipeline stages are executed. Once the execution completes, the local views may contain new nonzeroes that must be pushed to the global stack by `joinSubset`. Before this step, a prefix-sum computation over the number of new nonzeroes of each local view is performed by `prefixSumComputation`. As the OpenMP directives show, all of these steps may be executed in parallel over different tiles of the vectors, except for the prefix-sum computation, which is parallelised internally. The OpenMP scheduling policy is dynamic to handle load imbalance, and the performance parameters, i.e., the number of threads and the tile size used in the lambda functions, are selected automatically by the analytic model (see `analytic_model.cpp`).
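+
+The following self-contained sketch illustrates the join mechanism described above; the type and function names are hypothetical and do not mirror the actual `Coordinates` API:
+
+```cpp
+#include <algorithm>
+#include <cstddef>
+#include <vector>
+
+struct TileView {
+	std::vector< std::size_t > new_nonzeroes; // indices found by this tile
+};
+
+void joinTiles(
+	std::vector< std::size_t > &global_stack,
+	const std::vector< TileView > &tiles
+) {
+	// a prefix sum over the per-tile counts yields each tile's write offset
+	// (the role of prefixSumComputation)
+	std::vector< std::size_t > offsets( tiles.size() + 1, global_stack.size() );
+	for( std::size_t t = 0; t < tiles.size(); ++t ) {
+		offsets[ t + 1 ] = offsets[ t ] + tiles[ t ].new_nonzeroes.size();
+	}
+	global_stack.resize( offsets.back() );
+	// each tile appends at its own offset (the role of joinSubset); the
+	// writes are conflict-free, so this loop may run in parallel
+	for( std::size_t t = 0; t < tiles.size(); ++t ) {
+		std::copy(
+			tiles[ t ].new_nonzeroes.begin(), tiles[ t ].new_nonzeroes.end(),
+			global_stack.begin() + offsets[ t ]
+		);
+	}
+}
+```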
+
+
+## Analytic performance model
+
+The analytic performance model used for nonblocking execution consists of the `getPerformanceParameters` function defined in `analytic_model.cpp`, which is invoked before the pipeline execution within the `execution` method of `pipeline.cpp`. The analytic model estimates the number of threads and the tile size that lead to good performance for a given pipeline, based on parameters such as the number of vectors accessed in the pipeline, the data types of the vectors, and their sizes. Two parameters of special importance are the size of the L1 cache and the number of cores available in the system, since the selected tile size must allow the accessed data to fit in the L1 cache, and there should be sufficient work to utilise as many cores as possible.
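+
+The following is a rough sketch of this kind of estimate; the formula and the function name are illustrative and are not the ones implemented in `analytic_model.cpp`:
+
+```cpp
+#include <algorithm>
+#include <cstddef>
+
+// hypothetical: choose a tile small enough that one tile of every accessed
+// vector fits in L1, then use as many threads as there are tiles, capped by
+// the number of available cores
+void estimatePerformanceParameters(
+	std::size_t &nthreads, std::size_t &tile_size,
+	const std::size_t vector_size, const std::size_t num_vectors,
+	const std::size_t elem_size, const std::size_t l1_size,
+	const std::size_t num_cores
+) {
+	tile_size = std::max< std::size_t >(
+		1, l1_size / ( num_vectors * elem_size ) );
+	const std::size_t tiles = ( vector_size + tile_size - 1 ) / tile_size;
+	nthreads = std::min( num_cores, tiles );
+}
+```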
+
+The analytic model relies on two environment variables:
+
+* `OMP_NUM_THREADS`
+* `GRB_NONBLOCKING_TILE_SIZE`
+
+for the number of threads used by OpenMP and the tile size used by the nonblocking backend, respectively. The number of threads given by `OMP_NUM_THREADS` is an upper bound on the number of threads the analytic model may select. If `GRB_NONBLOCKING_TILE_SIZE` is set, the given tile size is used, fixed, for all executed pipelines, e.g., invoking a program as `GRB_NONBLOCKING_TILE_SIZE=512 ./program` fixes the tile size to 512 for every pipeline. Otherwise, the analytic model automatically selects a suitable tile size depending on the parameters of the executed pipeline.
+
+The initialisation for the number of threads used by OpenMP and the manual tile size is performed in `init.cpp`, and the data of the analytic model are handled by the `ANALYTIC_MODEL` and `IMPLEMENTATION` classes of `config.hpp`.
+
+
+## Dense descriptor verification
+
+For blocking execution, the correct usage of the `dense` descriptor is checked at the beginning of each ALP/GraphBLAS operation.
+If at least one input or output vector is not dense, the `grb::ILLEGAL` error code is returned, as shown in the example below.
+
+```cpp
+const size_t n = size( x );
+if( (descr & descriptors::dense) && nnz( x ) < n ) {
+ return ILLEGAL;
+}
+```
+
+For the nonblocking execution, checking the correct usage of the `dense` descriptor requires a different process, since the number of nonzeroes in the vectors may not be up to date due to lazy evaluation.
+In particular, the check is moved into the lambda function defined for each operation, and the check of the sparsity structure is based on the local views.
+However, the optimisation employed by the nonblocking execution for already dense vectors implies that the local views are not always available.
+Therefore, it is not always possible to perform the check for correct usage of the `dense` descriptor within the lambda function of an operation.
+
+The verification process for correct usage of the `dense` descriptor relies on the following property:
+
+*A vector that should be dense when an operation is invoked should remain dense after the execution of the pipeline, unless this vector is the output of an out-of-place operation that receives a mask with elements that may evaluate to `false`*.
+
+Therefore, the `nonblocking` backend delays the check and performs the verification for correct usage of the `dense` descriptor after the pipeline execution.
+To keep track of the vectors that should be dense after the execution of the pipeline, the addition of a lambda function as a pipeline stage is accompanied by a boolean variable, called `dense_descr`, that indicates whether the `dense` descriptor is given for this operation.
+In the case of an out-of-place operation that receives a mask, e.g., the `grb::eWiseApply` discussed earlier, the output vector may be marked as potentially sparse when the `dense` descriptor is provided, by invoking `markMaybeSparseDenseDescriptorVerification` as shown in the example of `grb::eWiseApply` above.
+In that case, the dense descriptor verification is disabled for the output vector of this specific operation.
+
+This solution is efficient and catches most cases of an illegal `dense` descriptor.
+However, it cannot catch illegal usage of the `dense` descriptor by an operation that receives a sparse vector that only becomes dense during the execution of the pipeline, since it is then impossible to detect that the vector was not dense earlier.
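+
+A minimal sketch of such an undetectable case, assuming the nonblocking backend, a vector size `n`, and a binary operator `plus` in scope:
+
+```cpp
+grb::Vector< double > x( n ), y( n ), z( n );
+grb::set( y, 1.0 );  // y becomes dense
+
+// x has no nonzeroes at this point
+grb::set( x, 2.0 );  // lazily recorded: will make x dense
+
+// illegal: x is sparse when this call is made, yet the dense descriptor is
+// given; by the time the fused pipeline executes, the local views of x are
+// dense, so the delayed verification cannot flag the violation
+grb::eWiseApply< grb::descriptors::dense >( z, x, y, plus );
+```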
+
diff --git a/docs/Suppressions.md b/docs/Suppressions.md
index 1915147b5..630b044ab 100644
--- a/docs/Suppressions.md
+++ b/docs/Suppressions.md
@@ -48,41 +48,17 @@ if( masked ) {
```
4. `include/graphblas/base/internalops.hpp`, multiple sources:
-- mul::apply, add::apply, add::foldl, equal::apply, not_equal::apply.
+- mul::apply, add::apply, add::foldl, equal::apply, not_equal::apply, and
+ logical_and::foldl.
These are indirectly caused by the following calls:
- `include/graphblas/blas0.hpp`, apply;
- `include/graphblas/reference/blas1.hpp`, dot_generic, masked_apply_generic,
- and sparse_apply_generic.
+ sparse_apply_generic, and fold_from_vector_to_scalar_generic.
These are all OK to suppress since the reads are masked.
-5. `include/graphblas/reference/blas1.hpp`, fold_from_vector_to_scalar_generic:
-```
-GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // the below code ensures to set local
-IOType local; // whenever our local block is
-GRB_UTIL_RESTORE_WARNINGS // non-empty
-if( end > 0 ) {
- if( i < end ) {
- local = static_cast< IOType >( internal::getRaw( to_fold )[ i ] );
- } else {
- local = static_cast< IOType >( internal::getRaw( to_fold )[ 0 ] );
- }
-}
-```
-and
-```
-if( root == s ) {
- // then I should be non-empty
- assert( !empty );
- // set global value to locally computed value
- GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // one is only root if the local
- global = local; // chunk is non-empty, in which case
- GRB_UTIL_RESTORE_WARNINGS // local will be initialised (above)
- }
-```
-
-6. `include/graphblas/reference/blas1.hpp`, masked_apply_generic:
+5. `include/graphblas/reference/blas1.hpp`, masked_apply_generic:
```
if( mask_b[ t ] ) {
// ...
@@ -91,3 +67,18 @@ if( mask_b[ t ] ) {
GRB_UTIL_RESTORE_WARNINGS // if mask_b is true
```
+6. `include/graphblas/nonblocking/blas1.hpp`, masked_apply_generic:
+```
+for( size_t k = 0; k < block_size; ++k ) {
+ const size_t index = i + k;
+ assert( index < local_n + lower_bound );
+ if( mask_b[ k ] ) {
+ (void) local_z.assign( index - lower_bound );
+ GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // This is only triggered with
+ *( z_p + index ) = z_b[ k ]; // mask_b[ k ], which in the above
+ GRB_UTIL_RESTORE_WARNINGS // loop also triggers initialising
+ // z_b[ k ]
+ }
+}
+```
+
diff --git a/docs/doxy.conf b/docs/doxy.conf
index d91dae080..d1e63f220 100644
--- a/docs/doxy.conf
+++ b/docs/doxy.conf
@@ -1,20 +1,4 @@
-# Doxyfile 1.8.14
-
-#
-# Copyright 2021 Huawei Technologies Co., Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
+# Doxyfile 1.9.3
# This file describes the settings to be used by the documentation system
# doxygen (www.doxygen.org) for a project.
@@ -33,10 +17,10 @@
# Project related configuration options
#---------------------------------------------------------------------------
-# This tag specifies the encoding used for all characters in the config file
-# that follow. The default is UTF-8 which is also the encoding used for all text
-# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
-# built into libc) for the transcoding. See
+# This tag specifies the encoding used for all characters in the configuration
+# file that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
# https://www.gnu.org/software/libiconv/ for the list of possible encodings.
# The default value is: UTF-8.
@@ -48,19 +32,19 @@ DOXYFILE_ENCODING = UTF-8
# title of most generated pages and in a few other places.
# The default value is: My Project.
-PROJECT_NAME = "ALP/GraphBLAS"
+PROJECT_NAME = "ALP Developer Documentation"
# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
# could be handy for archiving the generated documentation or if some version
# control system is used.
-PROJECT_NUMBER = 0.6.0
+PROJECT_NUMBER = 0.7.0
# Using the PROJECT_BRIEF tag one can provide an optional one line description
# for a project that appears at the top of each page and should give viewer a
# quick idea about the purpose of the project. Keep the description short.
-PROJECT_BRIEF =
+PROJECT_BRIEF = "Algebraic Programming Developer Documentation"
# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
# in the documentation. The maximum height of the logo should not exceed 55
@@ -74,7 +58,7 @@ PROJECT_LOGO =
# entered, it will be relative to the location where doxygen was started. If
# left blank the current directory will be used.
-OUTPUT_DIRECTORY = docs/code
+OUTPUT_DIRECTORY = docs/developer
# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
# directories (in 2 levels) under the output directory of each output format and
@@ -149,7 +133,7 @@ ALWAYS_DETAILED_SEC = NO
# operators of the base classes will not be shown.
# The default value is: NO.
-INLINE_INHERITED_MEMB = NO
+INLINE_INHERITED_MEMB = YES
# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path
# before files name in the file list and in the header files. If set to NO the
@@ -195,6 +179,16 @@ SHORT_NAMES = NO
JAVADOC_AUTOBRIEF = YES
+# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line
+# such as
+# /***************
+# as being the beginning of a Javadoc-style comment "banner". If set to NO, the
+# Javadoc-style will behave just like regular comments and it will not be
+# interpreted by doxygen.
+# The default value is: NO.
+
+JAVADOC_BANNER = NO
+
# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
# line (until the first dot) of a Qt-style comment as the brief description. If
# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
@@ -215,6 +209,14 @@ QT_AUTOBRIEF = NO
MULTILINE_CPP_IS_BRIEF = NO
+# By default Python docstrings are displayed as preformatted text and doxygen's
+# special commands cannot be used. By setting PYTHON_DOCSTRING to NO the
+# doxygen's special commands can be used and the contents of the docstring
+# documentation blocks is shown as doxygen documentation.
+# The default value is: YES.
+
+PYTHON_DOCSTRING = YES
+
# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
# documentation from any documented member that it re-implements.
# The default value is: YES.
@@ -238,21 +240,19 @@ TAB_SIZE = 4
# the documentation. An alias has the form:
# name=value
# For example adding
-# "sideeffect=@par Side Effects:\n"
+# "sideeffect=@par Side Effects:^^"
# will allow you to put the command \sideeffect (or @sideeffect) in the
# documentation, which will result in a user-defined paragraph with heading
-# "Side Effects:". You can put \n's in the value part of an alias to insert
-# newlines (in the resulting output). You can put ^^ in the value part of an
-# alias to insert a newline as if a physical newline was in the original file.
+# "Side Effects:". Note that you cannot put \n's in the value part of an alias
+# to insert newlines (in the resulting output). You can put ^^ in the value part
+# of an alias to insert a newline as if a physical newline was in the original
+# file. When you need a literal { or } or , in the value part of an alias you
+# have to escape them by means of a backslash (\), this can lead to conflicts
+# with the commands \{ and \} for these it is advised to use the version @{ and
+# @} or use a double escape (\\{ and \\})
ALIASES =
-# This tag can be used to specify a number of word-keyword mappings (TCL only).
-# A mapping has the form "name=value". For example adding "class=itcl::class"
-# will allow you to use the command class in the itcl::class meaning.
-
-TCL_SUBST =
-
# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
# only. Doxygen will then generate output that is more tailored for C. For
# instance, some of the names that are used will be different. The list of all
@@ -281,28 +281,40 @@ OPTIMIZE_FOR_FORTRAN = NO
OPTIMIZE_OUTPUT_VHDL = NO
+# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice
+# sources only. Doxygen will then generate output that is more tailored for that
+# language. For instance, namespaces will be presented as modules, types will be
+# separated into more groups, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_SLICE = NO
+
# Doxygen selects the parser to use depending on the extension of the files it
# parses. With this tag you can assign which parser to use for a given
# extension. Doxygen has a built-in mapping, but you can override or extend it
# using this tag. The format is ext=language, where ext is a file extension, and
-# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
-# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran:
-# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran:
-# Fortran. In the later case the parser tries to guess whether the code is fixed
-# or free formatted code, this is the default for Fortran type files), VHDL. For
-# instance to make doxygen treat .inc files as Fortran files (default is PHP),
-# and .f files as C (default is Fortran), use: inc=Fortran f=C.
+# language is one of the parsers supported by doxygen: IDL, Java, JavaScript,
+# Csharp (C#), C, C++, Lex, D, PHP, md (Markdown), Objective-C, Python, Slice,
+# VHDL, Fortran (fixed format Fortran: FortranFixed, free formatted Fortran:
+# FortranFree, unknown formatted Fortran: Fortran. In the later case the parser
+# tries to guess whether the code is fixed or free formatted code, this is the
+# default for Fortran type files). For instance to make doxygen treat .inc files
+# as Fortran files (default is PHP), and .f files as C (default is Fortran),
+# use: inc=Fortran f=C.
#
# Note: For files without extension you can use no_extension as a placeholder.
#
# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
-# the files are not read by doxygen.
+# the files are not read by doxygen. When specifying no_extension you should add
+# * to the FILE_PATTERNS.
+#
+# Note see also the list of default file extension mappings.
EXTENSION_MAPPING =
# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
# according to the Markdown format, which allows for more readable
-# documentation. See http://daringfireball.net/projects/markdown/ for details.
+# documentation. See https://daringfireball.net/projects/markdown/ for details.
# The output of markdown processing is further processed by doxygen, so you can
# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
# case of backward compatibilities issues.
@@ -314,7 +326,7 @@ MARKDOWN_SUPPORT = YES
# to that level are automatically included in the table of contents, even if
# they do not have an id attribute.
# Note: This feature currently applies only to Markdown headings.
-# Minimum value: 0, maximum value: 99, default value: 0.
+# Minimum value: 0, maximum value: 99, default value: 5.
# This tag requires that the tag MARKDOWN_SUPPORT is set to YES.
TOC_INCLUDE_HEADINGS = 0
@@ -430,6 +442,19 @@ TYPEDEF_HIDES_STRUCT = NO
LOOKUP_CACHE_SIZE = 0
+# The NUM_PROC_THREADS specifies the number threads doxygen is allowed to use
+# during processing. When set to 0 doxygen will based this on the number of
+# cores available in the system. You can set it explicitly to a value larger
+# than 0 to get more control over the balance between CPU load and processing
+# speed. At this moment only the input processing can be done using multiple
+# threads. Since this is still an experimental feature the default is set to 1,
+# which effectively disables parallel processing. Please report any issues you
+# encounter. Generating dot graphs in parallel is controlled by the
+# DOT_NUM_THREADS setting.
+# Minimum value: 0, maximum value: 32, default value: 1.
+
+NUM_PROC_THREADS = 1
+
#---------------------------------------------------------------------------
# Build related configuration options
#---------------------------------------------------------------------------
@@ -448,13 +473,19 @@ EXTRACT_ALL = NO
# be included in the documentation.
# The default value is: NO.
-EXTRACT_PRIVATE = NO
+EXTRACT_PRIVATE = YES
+
+# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual
+# methods of a class will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIV_VIRTUAL = NO
# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
# scope will be included in the documentation.
# The default value is: NO.
-EXTRACT_PACKAGE = NO
+EXTRACT_PACKAGE = YES
# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be
# included in the documentation.
@@ -476,7 +507,7 @@ EXTRACT_LOCAL_CLASSES = YES
# included.
# The default value is: NO.
-EXTRACT_LOCAL_METHODS = NO
+EXTRACT_LOCAL_METHODS = YES
# If this flag is set to YES, the members of anonymous namespaces will be
# extracted and appear in the documentation as a namespace called
@@ -487,6 +518,13 @@ EXTRACT_LOCAL_METHODS = NO
EXTRACT_ANON_NSPACES = NO
+# If this flag is set to YES, the name of an unnamed parameter in a declaration
+# will be determined by the corresponding definition. By default unnamed
+# parameters remain unnamed in the output.
+# The default value is: YES.
+
+RESOLVE_UNNAMED_PARAMS = YES
+
# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
# undocumented members inside documented classes or files. If set to NO these
# members will be included in the various overviews, but no documentation
@@ -504,8 +542,8 @@ HIDE_UNDOC_MEMBERS = NO
HIDE_UNDOC_CLASSES = NO
# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
-# (class|struct|union) declarations. If set to NO, these declarations will be
-# included in the documentation.
+# declarations. If set to NO, these declarations will be included in the
+# documentation.
# The default value is: NO.
HIDE_FRIEND_COMPOUNDS = NO
@@ -522,13 +560,20 @@ HIDE_IN_BODY_DOCS = NO
# will be excluded. Set it to YES to include the internal documentation.
# The default value is: NO.
-INTERNAL_DOCS = NO
-
-# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
-# names in lower-case letters. If set to YES, upper-case letters are also
-# allowed. This is useful if you have classes or files whose names only differ
-# in case and if your file system supports case sensitive file names. Windows
-# and Mac users are advised to set this option to NO.
+INTERNAL_DOCS = YES
+
+# With the correct setting of option CASE_SENSE_NAMES doxygen will better be
+# able to match the capabilities of the underlying filesystem. In case the
+# filesystem is case sensitive (i.e. it supports files in the same directory
+# whose names only differ in casing), the option must be set to YES to properly
+# deal with such files in case they appear in the input. For filesystems that
+# are not case sensitive the option should be be set to NO to properly deal with
+# output files written for symbols that only differ in casing, such as for two
+# classes, one named CLASS and the other named Class, and to also support
+# references to files without having to specify the exact matching casing. On
+# Windows (including Cygwin) and MacOS, users should typically set this option
+# to NO, whereas on Linux or other Unix flavors it should typically be set to
+# YES.
# The default value is: system dependent.
CASE_SENSE_NAMES = YES
@@ -547,6 +592,12 @@ HIDE_SCOPE_NAMES = YES
HIDE_COMPOUND_REFERENCE= NO
+# If the SHOW_HEADERFILE tag is set to YES then the documentation for a class
+# will show which file needs to be included to use the class.
+# The default value is: YES.
+
+SHOW_HEADERFILE = YES
+
# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
# the files that are included by a file in the documentation of that file.
# The default value is: YES.
@@ -704,7 +755,8 @@ FILE_VERSION_FILTER =
# output files in an output format independent way. To create the layout file
# that represents doxygen's defaults, run doxygen with the -l option. You can
# optionally specify a file name after the option, if omitted DoxygenLayout.xml
-# will be used as the name of the layout file.
+# will be used as the name of the layout file. See also section "Changing the
+# layout of pages" for information.
#
# Note that if you run doxygen from a directory containing a file called
# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
@@ -750,23 +802,35 @@ WARNINGS = YES
WARN_IF_UNDOCUMENTED = YES
# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
-# potential errors in the documentation, such as not documenting some parameters
-# in a documented function, or documenting parameters that don't exist or using
-# markup commands wrongly.
+# potential errors in the documentation, such as documenting some parameters in
+# a documented function twice, or documenting parameters that don't exist or
+# using markup commands wrongly.
# The default value is: YES.
WARN_IF_DOC_ERROR = YES
+# If WARN_IF_INCOMPLETE_DOC is set to YES, doxygen will warn about incomplete
+# function parameter documentation. If set to NO, doxygen will accept that some
+# parameters have no documentation without warning.
+# The default value is: YES.
+
+WARN_IF_INCOMPLETE_DOC = YES
+
# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
# are documented, but have no documentation for their parameters or return
-# value. If set to NO, doxygen will only warn about wrong or incomplete
-# parameter documentation, but not about the absence of documentation.
+# value. If set to NO, doxygen will only warn about wrong parameter
+# documentation, but not about the absence of documentation. If EXTRACT_ALL is
+# set to YES then this flag will automatically be disabled. See also
+# WARN_IF_INCOMPLETE_DOC
# The default value is: NO.
WARN_NO_PARAMDOC = NO
# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when
-# a warning is encountered.
+# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS
+# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but
+# at the end of the doxygen process doxygen will return with a non-zero status.
+# Possible values are: NO, YES and FAIL_ON_WARNINGS.
# The default value is: NO.
WARN_AS_ERROR = NO
@@ -783,7 +847,10 @@ WARN_FORMAT = "$file:$line: $text"
# The WARN_LOGFILE tag can be used to specify a file to which warning and error
# messages should be written. If left blank the output is written to standard
-# error (stderr).
+# error (stderr). In case the file specified cannot be opened for writing the
+# warning and error messages are written to standard error. When as file - is
+# specified the warning and error messages are written to standard output
+# (stdout).
WARN_LOGFILE =
@@ -802,8 +869,8 @@ INPUT = include/
# This tag can be used to specify the character encoding of the source files
# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
-# documentation (see: https://www.gnu.org/software/libiconv/) for the list of
-# possible encodings.
+# documentation (see:
+# https://www.gnu.org/software/libiconv/) for the list of possible encodings.
# The default value is: UTF-8.
INPUT_ENCODING = UTF-8
@@ -816,11 +883,15 @@ INPUT_ENCODING = UTF-8
# need to set EXTENSION_MAPPING for the extension otherwise the files are not
# read by doxygen.
#
+# Note the list of default checked file patterns might differ from the list of
+# default file extension mappings.
+#
# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp,
# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
-# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
-# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08,
-# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf and *.qsf.
+# *.hh, *.hxx, *.hpp, *.h++, *.l, *.cs, *.d, *.php, *.php4, *.php5, *.phtml,
+# *.inc, *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C
+# comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd,
+# *.vhdl, *.ucf, *.qsf and *.ice.
FILE_PATTERNS = *.hpp \
*.cpp \
@@ -862,7 +933,7 @@ EXCLUDE_PATTERNS =
# (namespaces, classes, functions, etc.) that should be excluded from the
# output. The symbol name can be a fully qualified name, a word, or if the
# wildcard * is used, a substring. Examples: ANamespace, AClass,
-# AClass::ANamespace, ANamespace::*Test
+# ANamespace::AClass, ANamespace::*Test
#
# Note that the wildcards are matched against the file with absolute path, so to
# exclude all test directories use the pattern */test/*
@@ -980,7 +1051,7 @@ INLINE_SOURCES = NO
STRIP_CODE_COMMENTS = YES
# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
-# function all documented functions referencing it will be listed.
+# entity all documented functions referencing it will be listed.
# The default value is: NO.
REFERENCED_BY_RELATION = NO
@@ -1017,7 +1088,7 @@ SOURCE_TOOLTIPS = YES
#
# To use it do the following:
# - Install the latest version of global
-# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
+# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file
# - Make sure the INPUT points to the root of the source tree
# - Run doxygen as normal
#
@@ -1050,13 +1121,6 @@ VERBATIM_HEADERS = YES
ALPHABETICAL_INDEX = YES
-# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
-# which the alphabetical index list will be split.
-# Minimum value: 1, maximum value: 20, default value: 5.
-# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
-
-COLS_IN_ALPHA_INDEX = 5
-
# In case all classes in a project start with a common prefix, all classes will
# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
# can be used to specify a prefix (or a list of prefixes) that should be ignored
@@ -1156,7 +1220,7 @@ HTML_EXTRA_FILES =
# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
# will adjust the colors in the style sheet and background images according to
-# this color. Hue is specified as an angle on a colorwheel, see
+# this color. Hue is specified as an angle on a color-wheel, see
# https://en.wikipedia.org/wiki/Hue for more information. For instance the value
# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
# purple, and 360 is red again.
@@ -1166,7 +1230,7 @@ HTML_EXTRA_FILES =
HTML_COLORSTYLE_HUE = 220
# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
-# in the HTML output. For a value of 0 the output will use grayscales only. A
+# in the HTML output. For a value of 0 the output will use gray-scales only. A
# value of 255 will produce the most vivid colors.
# Minimum value: 0, maximum value: 255, default value: 100.
# This tag requires that the tag GENERATE_HTML is set to YES.
@@ -1195,9 +1259,9 @@ HTML_TIMESTAMP = YES
# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML
# documentation will contain a main index with vertical navigation menus that
-# are dynamically created via Javascript. If disabled, the navigation index will
+# are dynamically created via JavaScript. If disabled, the navigation index will
# consists of multiple levels of tabs that are statically embedded in every HTML
-# page. Disable this option to support browsers that do not have Javascript,
+# page. Disable this option to support browsers that do not have JavaScript,
# like the Qt help browser.
# The default value is: YES.
# This tag requires that the tag GENERATE_HTML is set to YES.
@@ -1227,13 +1291,14 @@ HTML_INDEX_NUM_ENTRIES = 100
# If the GENERATE_DOCSET tag is set to YES, additional index files will be
# generated that can be used as input for Apple's Xcode 3 integrated development
-# environment (see: https://developer.apple.com/tools/xcode/), introduced with
-# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
-# Makefile in the HTML output directory. Running make will produce the docset in
-# that directory and running make install will install the docset in
+# environment (see:
+# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To
+# create a documentation set, doxygen will generate a Makefile in the HTML
+# output directory. Running make will produce the docset in that directory and
+# running make install will install the docset in
# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
-# startup. See https://developer.apple.com/tools/creatingdocsetswithdoxygen.html
-# for more information.
+# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy
+# genXcode/_index.html for more information.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.
@@ -1247,6 +1312,13 @@ GENERATE_DOCSET = NO
DOCSET_FEEDNAME = "Doxygen generated docs"
+# This tag determines the URL of the docset feed. A documentation feed provides
+# an umbrella under which multiple documentation sets from a single provider
+# (such as a company or product suite) can be grouped.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_FEEDURL =
+
# This tag specifies a string that should uniquely identify the documentation
# set bundle. This should be a reverse domain-name style string, e.g.
# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
@@ -1272,8 +1344,12 @@ DOCSET_PUBLISHER_NAME = Publisher
# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
-# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
-# Windows.
+# on Windows. In the beginning of 2021 Microsoft took the original page, with
+# a.o. the download links, offline the HTML help workshop was already many years
+# in maintenance mode). You can download the HTML help workshop from the web
+# archives at Installation executable (see:
+# http://web.archive.org/web/20160201063255/http://download.microsoft.com/downlo
+# ad/0/A/9/0A939EF6-E31C-430F-A3DF-DFAE7960D564/htmlhelp.exe).
#
# The HTML Help Workshop contains a compiler that can convert all HTML output
# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
@@ -1303,7 +1379,7 @@ CHM_FILE =
HHC_LOCATION =
# The GENERATE_CHI flag controls if a separate .chi index file is generated
-# (YES) or that it should be included in the master .chm file (NO).
+# (YES) or that it should be included in the main .chm file (NO).
# The default value is: NO.
# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
@@ -1348,7 +1424,8 @@ QCH_FILE =
# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
# Project output. For more information please see Qt Help Project / Namespace
-# (see: http://doc.qt.io/qt-4.8/qthelpproject.html#namespace).
+# (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace).
# The default value is: org.doxygen.Project.
# This tag requires that the tag GENERATE_QHP is set to YES.
@@ -1356,7 +1433,8 @@ QHP_NAMESPACE = org.doxygen.Project
# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
# Help Project output. For more information please see Qt Help Project / Virtual
-# Folders (see: http://doc.qt.io/qt-4.8/qthelpproject.html#virtual-folders).
+# Folders (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders).
# The default value is: doc.
# This tag requires that the tag GENERATE_QHP is set to YES.
@@ -1364,28 +1442,30 @@ QHP_VIRTUAL_FOLDER = doc
# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
# filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://doc.qt.io/qt-4.8/qthelpproject.html#custom-filters).
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
# This tag requires that the tag GENERATE_QHP is set to YES.
QHP_CUST_FILTER_NAME =
# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
# custom filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://doc.qt.io/qt-4.8/qthelpproject.html#custom-filters).
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
# This tag requires that the tag GENERATE_QHP is set to YES.
QHP_CUST_FILTER_ATTRS =
# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
# project's filter section matches. Qt Help Project / Filter Attributes (see:
-# http://doc.qt.io/qt-4.8/qthelpproject.html#filter-attributes).
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes).
# This tag requires that the tag GENERATE_QHP is set to YES.
QHP_SECT_FILTER_ATTRS =
-# The QHG_LOCATION tag can be used to specify the location of Qt's
-# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
-# generated .qhp file.
+# The QHG_LOCATION tag can be used to specify the location (absolute path
+# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to
+# run qhelpgenerator on the generated .qhp file.
# This tag requires that the tag GENERATE_QHP is set to YES.
QHG_LOCATION =
@@ -1428,16 +1508,28 @@ DISABLE_INDEX = NO
# to work a browser that supports JavaScript, DHTML, CSS and frames is required
# (i.e. any modern browser). Windows users are probably better off using the
# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
-# further fine-tune the look of the index. As an example, the default style
-# sheet generated by doxygen has an example that shows how to put an image at
-# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
-# the same information as the tab index, you could consider setting
-# DISABLE_INDEX to YES when enabling this option.
+# further fine tune the look of the index (see "Fine-tuning the output"). As an
+# example, the default style sheet generated by doxygen has an example that
+# shows how to put an image at the root of the tree instead of the PROJECT_NAME.
+# Since the tree basically has the same information as the tab index, you could
+# consider setting DISABLE_INDEX to YES when enabling this option.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.
GENERATE_TREEVIEW = NO
+# When both GENERATE_TREEVIEW and DISABLE_INDEX are set to YES, then the
+# FULL_SIDEBAR option determines if the side bar is limited to only the treeview
+# area (value NO) or if it should extend to the full height of the window (value
+# YES). Setting this to YES gives a layout similar to
+# https://docs.readthedocs.io with more room for contents, but less room for the
+# project logo, title, and description. If either GENERATE_TREEVIEW or
+# DISABLE_INDEX is set to NO, this option has no effect.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FULL_SIDEBAR = NO
+
# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
# doxygen will group on one line in the generated HTML documentation.
#
@@ -1462,6 +1554,24 @@ TREEVIEW_WIDTH = 250
EXT_LINKS_IN_WINDOW = NO
+# If the OBFUSCATE_EMAILS tag is set to YES, doxygen will obfuscate email
+# addresses.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+OBFUSCATE_EMAILS = YES
+
+# If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg
+# tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see
+# https://inkscape.org) to generate formulas as SVG images instead of PNGs for
+# the HTML output. These images will generally look nicer at scaled resolutions.
+# Possible values are: png (the default) and svg (looks nicer but requires the
+# pdf2svg or inkscape tool).
+# The default value is: png.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FORMULA_FORMAT = png
+
# Use this tag to change the font size of LaTeX formulas included as images in
# the HTML documentation. When you change the font size after a successful
# doxygen run you need to manually remove any form_*.png images from the HTML
@@ -1482,8 +1592,14 @@ FORMULA_FONTSIZE = 10
FORMULA_TRANSPARENT = YES
+# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands
+# to create new LaTeX commands to be used in formulas as building blocks. See
+# the section "Including formulas" for details.
+
+FORMULA_MACROFILE =
+
# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
-# https://www.mathjax.org) which uses client side Javascript for the rendering
+# https://www.mathjax.org) which uses client side JavaScript for the rendering
# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
# installed or if you want to formulas look prettier in the HTML output. When
# enabled you may also need to install MathJax separately and configure the path
@@ -1493,11 +1609,29 @@ FORMULA_TRANSPARENT = YES
USE_MATHJAX = NO
+# With MATHJAX_VERSION it is possible to specify the MathJax version to be used.
+# Note that the different versions of MathJax have different requirements with
+# regards to the different settings, so it is possible that also other MathJax
+# settings have to be changed when switching between the different MathJax
+# versions.
+# Possible values are: MathJax_2 and MathJax_3.
+# The default value is: MathJax_2.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_VERSION = MathJax_2
+
# When MathJax is enabled you can set the default output format to be used for
-# the MathJax output. See the MathJax site (see:
-# http://docs.mathjax.org/en/latest/output.html) for more details.
+# the MathJax output. For more details about the output format see MathJax
+# version 2 (see:
+# http://docs.mathjax.org/en/v2.7-latest/output.html) and MathJax version 3
+# (see:
+# http://docs.mathjax.org/en/latest/web/components/output.html).
# Possible values are: HTML-CSS (which is slower, but has the best
-# compatibility), NativeMML (i.e. MathML) and SVG.
+# compatibility. This is the name for Mathjax version 2, for MathJax version 3
+# this will be translated into chtml), NativeMML (i.e. MathML. Only supported
+# for NathJax 2. For MathJax version 3 chtml will be used instead.), chtml (This
+# is the name for Mathjax version 3, for MathJax version 2 this will be
+# translated into HTML-CSS) and SVG.
# The default value is: HTML-CSS.
# This tag requires that the tag USE_MATHJAX is set to YES.
@@ -1510,22 +1644,29 @@ MATHJAX_FORMAT = HTML-CSS
# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
# Content Delivery Network so you can quickly see the result without installing
# MathJax. However, it is strongly recommended to install a local copy of
-# MathJax from https://www.mathjax.org before deployment.
-# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/.
+# MathJax from https://www.mathjax.org before deployment. The default value is:
+# - in case of MathJax version 2: https://cdn.jsdelivr.net/npm/mathjax@2
+# - in case of MathJax version 3: https://cdn.jsdelivr.net/npm/mathjax@3
# This tag requires that the tag USE_MATHJAX is set to YES.
MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest
# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
# extension names that should be enabled during MathJax rendering. For example
+# for MathJax version 2 (see
+# https://docs.mathjax.org/en/v2.7-latest/tex.html#tex-and-latex-extensions):
# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# For example for MathJax version 3 (see
+# http://docs.mathjax.org/en/latest/input/tex/extensions/index.html):
+# MATHJAX_EXTENSIONS = ams
# This tag requires that the tag USE_MATHJAX is set to YES.
MATHJAX_EXTENSIONS =
# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
# of code that will be used on startup of the MathJax code. See the MathJax site
-# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# (see:
+# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an
# example see the documentation.
# This tag requires that the tag USE_MATHJAX is set to YES.
@@ -1553,7 +1694,7 @@ MATHJAX_CODEFILE =
SEARCHENGINE = YES
# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
-# implemented using a web server instead of a web client using Javascript. There
+# implemented using a web server instead of a web client using JavaScript. There
# are two flavors of web server based searching depending on the EXTERNAL_SEARCH
# setting. When disabled, doxygen will generate a PHP script for searching and
# an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
@@ -1572,7 +1713,8 @@ SERVER_BASED_SEARCH = NO
#
# Doxygen ships with an example indexer (doxyindexer) and search engine
# (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: https://xapian.org/).
+# Xapian (see:
+# https://xapian.org/).
#
# See the section "External Indexing and Searching" for details.
# The default value is: NO.
@@ -1585,8 +1727,9 @@ EXTERNAL_SEARCH = NO
#
# Doxygen ships with an example indexer (doxyindexer) and search engine
# (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: https://xapian.org/). See the section "External Indexing and
-# Searching" for details.
+# Xapian (see:
+# https://xapian.org/). See the section "External Indexing and Searching" for
+# details.
# This tag requires that the tag SEARCHENGINE is set to YES.
SEARCHENGINE_URL =
@@ -1637,21 +1780,35 @@ LATEX_OUTPUT = latex
# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
# invoked.
#
-# Note that when enabling USE_PDFLATEX this option is only used for generating
-# bitmaps for formulas in the HTML output, but not in the Makefile that is
-# written to the output directory.
-# The default file is: latex.
+# Note that when not enabling USE_PDFLATEX the default is latex when enabling
+# USE_PDFLATEX the default is pdflatex and when in the later case latex is
+# chosen this is overwritten by pdflatex. For specific output languages the
+# default can have been set differently, this depends on the implementation of
+# the output language.
# This tag requires that the tag GENERATE_LATEX is set to YES.
LATEX_CMD_NAME = latex
# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
# index for LaTeX.
+# Note: This tag is used in the Makefile / make.bat.
+# See also: LATEX_MAKEINDEX_CMD for the part in the generated output file
+# (.tex).
# The default file is: makeindex.
# This tag requires that the tag GENERATE_LATEX is set to YES.
MAKEINDEX_CMD_NAME = makeindex
+# The LATEX_MAKEINDEX_CMD tag can be used to specify the command name to
+# generate index for LaTeX. In case there is no backslash (\) as first character
+# it will be automatically added in the LaTeX code.
+# Note: This tag is used in the generated output file (.tex).
+# See also: MAKEINDEX_CMD_NAME for the part in the Makefile / make.bat.
+# The default value is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_MAKEINDEX_CMD = makeindex
+
# If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
# documents. This may be useful for small projects and may help to save some
# trees in general.
@@ -1681,29 +1838,31 @@ PAPER_TYPE = a4
EXTRA_PACKAGES = amsmath
-# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
-# generated LaTeX document. The header should contain everything until the first
-# chapter. If it is left blank doxygen will generate a standard header. See
-# section "Doxygen usage" for information on how to let doxygen write the
-# default header to a separate file.
+# The LATEX_HEADER tag can be used to specify a user-defined LaTeX header for
+# the generated LaTeX document. The header should contain everything until the
+# first chapter. If it is left blank doxygen will generate a standard header. It
+# is highly recommended to start with a default header using
+# doxygen -w latex new_header.tex new_footer.tex new_stylesheet.sty
+# and then modify the file new_header.tex. See also section "Doxygen usage" for
+# information on how to generate the default header that doxygen normally uses.
#
-# Note: Only use a user-defined header if you know what you are doing! The
-# following commands have a special meaning inside the header: $title,
-# $datetime, $date, $doxygenversion, $projectname, $projectnumber,
-# $projectbrief, $projectlogo. Doxygen will replace $title with the empty
-# string, for the replacement values of the other commands the user is referred
-# to HTML_HEADER.
+# Note: Only use a user-defined header if you know what you are doing!
+# Note: The header is subject to change so you typically have to regenerate the
+# default header when upgrading to a newer version of doxygen. The following
+# commands have a special meaning inside the header (and footer): For a
+# description of the possible markers and block names see the documentation.
# This tag requires that the tag GENERATE_LATEX is set to YES.
LATEX_HEADER =
-# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
-# generated LaTeX document. The footer should contain everything after the last
-# chapter. If it is left blank doxygen will generate a standard footer. See
+# The LATEX_FOOTER tag can be used to specify a user-defined LaTeX footer for
+# the generated LaTeX document. The footer should contain everything after the
+# last chapter. If it is left blank doxygen will generate a standard footer. See
# LATEX_HEADER for more information on how to generate a default footer and what
-# special commands can be used inside the footer.
-#
-# Note: Only use a user-defined footer if you know what you are doing!
+# special commands can be used inside the footer. See also section "Doxygen
+# usage" for information on how to generate the default footer that doxygen
+# normally uses. Note: Only use a user-defined footer if you know what you are
+# doing!
# This tag requires that the tag GENERATE_LATEX is set to YES.
LATEX_FOOTER =
@@ -1736,9 +1895,11 @@ LATEX_EXTRA_FILES =
PDF_HYPERLINKS = YES
-# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
-# the PDF file directly from the LaTeX files. Set this option to YES, to get a
-# higher quality PDF documentation.
+# If the USE_PDFLATEX tag is set to YES, doxygen will use the engine as
+# specified with LATEX_CMD_NAME to generate the PDF file directly from the LaTeX
+# files. Set this option to YES, to get a higher quality PDF documentation.
+#
+# See also section LATEX_CMD_NAME for selecting the engine.
# The default value is: YES.
# This tag requires that the tag GENERATE_LATEX is set to YES.
@@ -1746,8 +1907,7 @@ USE_PDFLATEX = YES
# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
# command to the generated LaTeX files. This will instruct LaTeX to keep running
-# if errors occur, instead of asking the user for help. This option is also used
-# when generating formulas in HTML.
+# if errors occur, instead of asking the user for help.
# The default value is: NO.
# This tag requires that the tag GENERATE_LATEX is set to YES.
@@ -1760,16 +1920,6 @@ LATEX_BATCHMODE = NO
LATEX_HIDE_INDICES = NO
-# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source
-# code with syntax highlighting in the LaTeX output.
-#
-# Note that which sources are shown also depends on other settings such as
-# SOURCE_BROWSER.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_SOURCE_CODE = NO
-
# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
# bibliography, e.g. plainnat, or ieeetr. See
# https://en.wikipedia.org/wiki/BibTeX and \cite for more info.
@@ -1786,6 +1936,14 @@ LATEX_BIB_STYLE = plain
LATEX_TIMESTAMP = NO
+# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute)
+# path from which the emoji images will be read. If a relative path is entered,
+# it will be relative to the LATEX_OUTPUT directory. If left blank the
+# LATEX_OUTPUT directory will be used.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EMOJI_DIRECTORY =
+
#---------------------------------------------------------------------------
# Configuration options related to the RTF output
#---------------------------------------------------------------------------
@@ -1825,9 +1983,9 @@ COMPACT_RTF = NO
RTF_HYPERLINKS = NO
-# Load stylesheet definitions from file. Syntax is similar to doxygen's config
-# file, i.e. a series of assignments. You only have to provide replacements,
-# missing definitions are set to their default value.
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# configuration file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
#
# See also section "Doxygen usage" for information on how to generate the
# default style sheet that doxygen normally uses.
@@ -1836,22 +1994,12 @@ RTF_HYPERLINKS = NO
RTF_STYLESHEET_FILE =
# Set optional variables used in the generation of an RTF document. Syntax is
-# similar to doxygen's config file. A template extensions file can be generated
-# using doxygen -e rtf extensionFile.
+# similar to doxygen's configuration file. A template extensions file can be
+# generated using doxygen -e rtf extensionFile.
# This tag requires that the tag GENERATE_RTF is set to YES.
RTF_EXTENSIONS_FILE =
-# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code
-# with syntax highlighting in the RTF output.
-#
-# Note that which sources are shown also depends on other settings such as
-# SOURCE_BROWSER.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_SOURCE_CODE = NO
-
#---------------------------------------------------------------------------
# Configuration options related to the man page output
#---------------------------------------------------------------------------
@@ -1923,6 +2071,13 @@ XML_OUTPUT = xml
XML_PROGRAMLISTING = YES
+# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, doxygen will include
+# namespace members in file scope as well, matching the HTML output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_NS_MEMB_FILE_SCOPE = NO
+
#---------------------------------------------------------------------------
# Configuration options related to the DOCBOOK output
#---------------------------------------------------------------------------
@@ -1941,15 +2096,6 @@ GENERATE_DOCBOOK = NO
DOCBOOK_OUTPUT = docbook
-# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the
-# program listings (including syntax highlighting and cross-referencing
-# information) to the DOCBOOK output. Note that enabling this will significantly
-# increase the size of the DOCBOOK output.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
-
-DOCBOOK_PROGRAMLISTING = NO
-
#---------------------------------------------------------------------------
# Configuration options for the AutoGen Definitions output
#---------------------------------------------------------------------------
@@ -2124,34 +2270,10 @@ EXTERNAL_GROUPS = YES
EXTERNAL_PAGES = YES
-# The PERL_PATH should be the absolute path and name of the perl script
-# interpreter (i.e. the result of 'which perl').
-# The default file (with absolute path) is: /usr/bin/perl.
-
-PERL_PATH = /usr/bin/perl
-
#---------------------------------------------------------------------------
# Configuration options related to the dot tool
#---------------------------------------------------------------------------
-# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram
-# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
-# NO turns the diagrams off. Note that this option also works with HAVE_DOT
-# disabled, but it is recommended to install and use dot, since it yields more
-# powerful graphs.
-# The default value is: YES.
-
-CLASS_DIAGRAMS = YES
-
-# You can define message sequence charts within doxygen comments using the \msc
-# command. Doxygen will then run the mscgen tool (see:
-# http://www.mcternan.me.uk/mscgen/)) to produce the chart and insert it in the
-# documentation. The MSCGEN_PATH tag allows you to specify the directory where
-# the mscgen tool resides. If left empty the tool is assumed to be found in the
-# default search path.
-
-MSCGEN_PATH =
-
# You can include diagrams made with dia in doxygen documentation. Doxygen will
# then run dia to produce the diagram and insert it in the documentation. The
# DIA_PATH tag allows you to specify the directory where the dia binary resides.
@@ -2208,11 +2330,14 @@ DOT_FONTSIZE = 10
DOT_FONTPATH =
-# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
-# each documented class showing the direct and indirect inheritance relations.
-# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO.
+# If the CLASS_GRAPH tag is set to YES (or GRAPH) then doxygen will generate a
+# graph for each documented class showing the direct and indirect inheritance
+# relations. In case HAVE_DOT is set as well dot will be used to draw the graph,
+# otherwise the built-in generator will be used. If the CLASS_GRAPH tag is set
+# to TEXT the direct and indirect inheritance relations will be shown as texts /
+# links.
+# Possible values are: NO, YES, TEXT and GRAPH.
# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
CLASS_GRAPH = YES
@@ -2249,10 +2374,32 @@ UML_LOOK = NO
# but if the number exceeds 15, the total amount of fields shown is limited to
# 10.
# Minimum value: 0, maximum value: 100, default value: 10.
-# This tag requires that the tag HAVE_DOT is set to YES.
+# This tag requires that the tag UML_LOOK is set to YES.
UML_LIMIT_NUM_FIELDS = 10
+# If the DOT_UML_DETAILS tag is set to NO, doxygen will show attributes and
+# methods without types and arguments in the UML graphs. If the DOT_UML_DETAILS
+# tag is set to YES, doxygen will add type and arguments for attributes and
+# methods in the UML graphs. If the DOT_UML_DETAILS tag is set to NONE, doxygen
+# will not generate fields with class member information in the UML graphs. The
+# class diagrams will look similar to the default class diagrams but using UML
+# notation for the relationships.
+# Possible values are: NO, YES and NONE.
+# The default value is: NO.
+# This tag requires that the tag UML_LOOK is set to YES.
+
+DOT_UML_DETAILS = NO
+
+# The DOT_WRAP_THRESHOLD tag can be used to set the maximum number of characters
+# to display on a single line. If the actual line length exceeds this threshold
+# significantly it will be wrapped across multiple lines. Some heuristics are
+# applied to avoid ugly line breaks.
+# Minimum value: 0, maximum value: 1000, default value: 17.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_WRAP_THRESHOLD = 17
+
# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
# collaboration graphs will show the relations between templates and their
# instances.
@@ -2319,6 +2466,13 @@ GRAPHICAL_HIERARCHY = YES
DIRECTORY_GRAPH = YES
+# The DIR_GRAPH_MAX_DEPTH tag can be used to limit the maximum number of levels
+# of child directories generated in directory dependency graphs by dot.
+# Minimum value: 1, maximum value: 25, default value: 1.
+# This tag requires that the tag DIRECTORY_GRAPH is set to YES.
+
+DIR_GRAPH_MAX_DEPTH = 1
+
# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
# generated by dot. For an explanation of the image formats see the section
# output formats in the documentation of the dot tool (Graphviz (see:
@@ -2372,10 +2526,10 @@ MSCFILE_DIRS =
DIAFILE_DIRS =
# When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
-# path where java can find the plantuml.jar file. If left blank, it is assumed
-# PlantUML is not used or called during a preprocessing step. Doxygen will
-# generate a warning when it encounters a \startuml command in this case and
-# will not generate output for the diagram.
+# path where java can find the plantuml.jar file or to the filename of the jar
+# file to be used. If left blank, it is assumed PlantUML is not used or called
+# during a preprocessing step. Doxygen will generate a warning when it
+# encounters a \startuml command in this case and will not generate output for
+# the diagram.
PLANTUML_JAR_PATH =
@@ -2437,14 +2591,18 @@ DOT_MULTI_TARGETS = YES
# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
# explaining the meaning of the various boxes and arrows in the dot generated
# graphs.
+# Note: This tag requires that UML_LOOK isn't set, i.e. the doxygen internal
+# graphical representation for inheritance and collaboration diagrams is used.
# The default value is: YES.
# This tag requires that the tag HAVE_DOT is set to YES.
GENERATE_LEGEND = YES
-# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot
+# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate
# files that are used to generate the various graphs.
+#
+# Note: This setting is not only used for dot files but also for msc temporary
+# files.
# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
DOT_CLEANUP = YES
diff --git a/docs/user.conf b/docs/user.conf
new file mode 100644
index 000000000..c39f53a38
--- /dev/null
+++ b/docs/user.conf
@@ -0,0 +1,2634 @@
+# Doxyfile 1.9.3
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a double hash (##) is considered a comment and is placed in
+# front of the TAG it is preceding.
+#
+# All text after a single hash (#) is considered a comment and will be ignored.
+# The format is:
+# TAG = value [value, ...]
+# For lists, items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (\" \").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the configuration
+# file that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# https://www.gnu.org/software/libiconv/ for the list of possible encodings.
+# The default value is: UTF-8.
+
+DOXYFILE_ENCODING = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by
+# double-quotes, unless you are using Doxywizard) that should identify the
+# project for which the documentation is generated. This name is used in the
+# title of most generated pages and in a few other places.
+# The default value is: My Project.
+
+PROJECT_NAME = "ALP User Documentation"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
+# could be handy for archiving the generated documentation or if some version
+# control system is used.
+
+PROJECT_NUMBER = 0.7.0
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give viewer a
+# quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF = "Algebraic Programming User Documentation"
+
+# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
+# in the documentation. The maximum height of the logo should not exceed 55
+# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy
+# the logo to the output directory.
+
+PROJECT_LOGO =
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
+# into which the generated documentation will be written. If a relative path is
+# entered, it will be relative to the location where doxygen was started. If
+# left blank the current directory will be used.
+
+OUTPUT_DIRECTORY = docs/user
+
+# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
+# directories (in 2 levels) under the output directory of each output format and
+# will distribute the generated files over these directories. Enabling this
+# option can be useful when feeding doxygen a huge amount of source files, where
+# putting all generated files in the same directory would otherwise cause
+# performance problems for the file system.
+# The default value is: NO.
+
+CREATE_SUBDIRS = NO
+
+# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII
+# characters to appear in the names of generated files. If set to NO, non-ASCII
+# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode
+# U+3044.
+# The default value is: NO.
+
+ALLOW_UNICODE_NAMES = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese,
+# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States),
+# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian,
+# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages),
+# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian,
+# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian,
+# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish,
+# Ukrainian and Vietnamese.
+# The default value is: English.
+
+OUTPUT_LANGUAGE = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
+# descriptions after the members that are listed in the file and class
+# documentation (similar to Javadoc). Set to NO to disable this.
+# The default value is: YES.
+
+BRIEF_MEMBER_DESC = YES
+
+# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief
+# description of a member or function before the detailed description
+#
+# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+# The default value is: YES.
+
+REPEAT_BRIEF = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator that is
+# used to form the text in various listings. Each string in this list, if found
+# as the leading text of the brief description, will be stripped from the text
+# and the result, after processing the whole list, is used as the annotated
+# text. Otherwise, the brief description is used as-is. If left blank, the
+# following values are used ($name is automatically replaced with the name of
+# the entity):The $name class, The $name widget, The $name file, is, provides,
+# specifies, contains, represents, a, an and the.
+
+ABBREVIATE_BRIEF =
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# doxygen will generate a detailed section even if there is only a brief
+# description.
+# The default value is: NO.
+
+ALWAYS_DETAILED_SEC = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+# The default value is: NO.
+
+INLINE_INHERITED_MEMB = NO
+
+# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path
+# before files name in the file list and in the header files. If set to NO the
+# shortest path that makes the file name unique will be used
+# The default value is: YES.
+
+FULL_PATH_NAMES = NO
+
+# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
+# Stripping is only done if one of the specified strings matches the left-hand
+# part of the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the path to
+# strip.
+#
+# Note that you can specify absolute paths here, but also relative paths, which
+# will be relative from the directory where doxygen is started.
+# This tag requires that the tag FULL_PATH_NAMES is set to YES.
+
+STRIP_FROM_PATH =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
+# path mentioned in the documentation of a class, which tells the reader which
+# header file to include in order to use a class. If left blank only the name of
+# the header file containing the class definition is used. Otherwise one should
+# specify the list of include paths that are normally passed to the compiler
+# using the -I flag.
+
+STRIP_FROM_INC_PATH =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
+# less readable) file names. This can be useful if your file system doesn't
+# support long names like on DOS, Mac, or CD-ROM.
+# The default value is: NO.
+
+SHORT_NAMES = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the
+# first line (until the first dot) of a Javadoc-style comment as the brief
+# description. If set to NO, the Javadoc-style will behave just like regular Qt-
+# style comments (thus requiring an explicit @brief command for a brief
+# description.)
+# The default value is: NO.
+
+JAVADOC_AUTOBRIEF = YES
+
+# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line
+# such as
+# /***************
+# as being the beginning of a Javadoc-style comment "banner". If set to NO, the
+# Javadoc-style will behave just like regular comments and it will not be
+# interpreted by doxygen.
+# The default value is: NO.
+
+JAVADOC_BANNER = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
+# line (until the first dot) of a Qt-style comment as the brief description. If
+# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
+# requiring an explicit \brief command for a brief description.)
+# The default value is: NO.
+
+QT_AUTOBRIEF = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a
+# multi-line C++ special comment block (i.e. a block of //! or /// comments) as
+# a brief description. This used to be the default behavior. The new default is
+# to treat a multi-line C++ comment block as a detailed description. Set this
+# tag to YES if you prefer the old behavior instead.
+#
+# Note that setting this tag to YES also means that Rational Rose comments are
+# no longer recognized.
+# The default value is: NO.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# By default Python docstrings are displayed as preformatted text and doxygen's
+# special commands cannot be used. By setting PYTHON_DOCSTRING to NO, doxygen's
+# special commands can be used and the contents of the docstring documentation
+# blocks are shown as doxygen documentation.
+# The default value is: YES.
+
+PYTHON_DOCSTRING = YES
+
+# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
+# documentation from any documented member that it re-implements.
+# The default value is: YES.
+
+INHERIT_DOCS = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new
+# page for each member. If set to NO, the documentation of a member will be part
+# of the file/class/namespace that contains it.
+# The default value is: NO.
+
+SEPARATE_MEMBER_PAGES = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen
+# uses this value to replace tabs by spaces in code fragments.
+# Minimum value: 1, maximum value: 16, default value: 4.
+
+TAB_SIZE = 4
+
+# This tag can be used to specify a number of aliases that act as commands in
+# the documentation. An alias has the form:
+# name=value
+# For example adding
+# "sideeffect=@par Side Effects:^^"
+# will allow you to put the command \sideeffect (or @sideeffect) in the
+# documentation, which will result in a user-defined paragraph with heading
+# "Side Effects:". Note that you cannot put \n's in the value part of an alias
+# to insert newlines (in the resulting output). You can put ^^ in the value part
+# of an alias to insert a newline as if a physical newline was in the original
+# file. When you need a literal { or } or , in the value part of an alias you
+# have to escape them by means of a backslash (\), this can lead to conflicts
+# with the commands \{ and \} for these it is advised to use the version @{ and
+# @} or use a double escape (\\{ and \\})
+
+ALIASES =
+
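+# For illustration only (this project defines no aliases), the sideeffect alias
+# described in the comment above would be declared as:
+#
+#   ALIASES += "sideeffect=@par Side Effects:^^"
+#
+# after which \sideeffect (or @sideeffect) in a doc comment renders a paragraph
+# with the heading "Side Effects:".
+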
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
+# only. Doxygen will then generate output that is more tailored for C. For
+# instance, some of the names that are used will be different. The list of all
+# members will be omitted, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_FOR_C = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or
+# Python sources only. Doxygen will then generate output that is more tailored
+# for that language. For instance, namespaces will be presented as packages,
+# qualified scopes will look different, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_JAVA = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources. Doxygen will then generate output that is tailored for Fortran.
+# The default value is: NO.
+
+OPTIMIZE_FOR_FORTRAN = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for VHDL.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_VHDL = NO
+
+# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice
+# sources only. Doxygen will then generate output that is more tailored for that
+# language. For instance, namespaces will be presented as modules, types will be
+# separated into more groups, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_SLICE = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given
+# extension. Doxygen has a built-in mapping, but you can override or extend it
+# using this tag. The format is ext=language, where ext is a file extension, and
+# language is one of the parsers supported by doxygen: IDL, Java, JavaScript,
+# Csharp (C#), C, C++, Lex, D, PHP, md (Markdown), Objective-C, Python, Slice,
+# VHDL, Fortran (fixed format Fortran: FortranFixed, free formatted Fortran:
+# FortranFree, unknown formatted Fortran: Fortran. In the latter case the parser
+# tries to guess whether the code is fixed or free formatted code, this is the
+# default for Fortran type files). For instance to make doxygen treat .inc files
+# as Fortran files (default is PHP), and .f files as C (default is Fortran),
+# use: inc=Fortran f=C.
+#
+# Note: For files without extension you can use no_extension as a placeholder.
+#
+# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
+# the files are not read by doxygen. When specifying no_extension you should add
+# * to the FILE_PATTERNS.
+#
+# Note see also the list of default file extension mappings.
+
+EXTENSION_MAPPING =
+
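+# For illustration, the mapping mentioned in the comment above would be written
+# as:
+#
+#   EXTENSION_MAPPING = inc=Fortran f=C
+#
+# making doxygen parse .inc files as Fortran and .f files as C.
+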
+# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
+# according to the Markdown format, which allows for more readable
+# documentation. See https://daringfireball.net/projects/markdown/ for details.
+# The output of markdown processing is further processed by doxygen, so you can
+# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
+# case of backward compatibility issues.
+# The default value is: YES.
+
+MARKDOWN_SUPPORT = YES
+
+# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up
+# to that level are automatically included in the table of contents, even if
+# they do not have an id attribute.
+# Note: This feature currently applies only to Markdown headings.
+# Minimum value: 0, maximum value: 99, default value: 5.
+# This tag requires that the tag MARKDOWN_SUPPORT is set to YES.
+
+TOC_INCLUDE_HEADINGS = 0
+
+# When enabled doxygen tries to link words that correspond to documented
+# classes, or namespaces to their corresponding documentation. Such a link can
+# be prevented in individual cases by putting a % sign in front of the word or
+# globally by setting AUTOLINK_SUPPORT to NO.
+# The default value is: YES.
+
+AUTOLINK_SUPPORT = YES
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should set this
+# tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string);
+# versus func(std::string) {}). This also makes the inheritance and
+# collaboration diagrams that involve STL classes more complete and accurate.
+# The default value is: NO.
+
+BUILTIN_STL_SUPPORT = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+# The default value is: NO.
+
+CPP_CLI_SUPPORT = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
+# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen
+# will parse them like normal C++ but will assume all classes use public instead
+# of private inheritance when no explicit protection keyword is present.
+# The default value is: NO.
+
+SIP_SUPPORT = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate
+# getter and setter methods for a property. Setting this option to YES will make
+# doxygen replace the get and set methods by a property in the documentation.
+# This will only work if the methods are indeed getting or setting a simple
+# type. If this is not the case, or you want to show the methods anyway, you
+# should set this option to NO.
+# The default value is: YES.
+
+IDL_PROPERTY_SUPPORT = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+# The default value is: NO.
+
+DISTRIBUTE_GROUP_DOC = NO
+
+# If one adds a struct or class to a group and this option is enabled, then also
+# any nested class or struct is added to the same group. By default this option
+# is disabled and one has to add nested compounds explicitly via \ingroup.
+# The default value is: NO.
+
+GROUP_NESTED_COMPOUNDS = NO
+
+# Set the SUBGROUPING tag to YES to allow class member groups of the same type
+# (for instance a group of public functions) to be put as a subgroup of that
+# type (e.g. under the Public Functions section). Set it to NO to prevent
+# subgrouping. Alternatively, this can be done per class using the
+# \nosubgrouping command.
+# The default value is: YES.
+
+SUBGROUPING = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
+# are shown inside the group in which they are included (e.g. using \ingroup)
+# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
+# and RTF).
+#
+# Note that this feature does not work in combination with
+# SEPARATE_MEMBER_PAGES.
+# The default value is: NO.
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions
+# with only public data fields or simple typedef fields will be shown inline in
+# the documentation of the scope in which they are defined (i.e. file,
+# namespace, or group documentation), provided this scope is documented. If set
+# to NO, structs, classes, and unions are shown on a separate page (for HTML and
+# Man pages) or section (for LaTeX and RTF).
+# The default value is: NO.
+
+INLINE_SIMPLE_STRUCTS = NO
+
+# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
+# enum is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically be
+# useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+# The default value is: NO.
+
+TYPEDEF_HIDES_STRUCT = NO
+
+# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
+# cache is used to resolve symbols given their name and scope. Since this can be
+# an expensive process and often the same symbol appears multiple times in the
+# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
+# doxygen will become slower. If the cache is too large, memory is wasted. The
+# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
+# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
+# symbols. At the end of a run doxygen will report the cache usage and suggest
+# the optimal cache size from a speed point of view.
+# Minimum value: 0, maximum value: 9, default value: 0.
+
+LOOKUP_CACHE_SIZE = 0
+
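+# As a worked example of the formula above: a value of 3 gives a cache of
+# 2^(16+3) = 2^19 = 524288 symbols, eight times the default of 2^16 = 65536.
+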
+# The NUM_PROC_THREADS tag specifies the number of threads doxygen is allowed to
+# use during processing. When set to 0 doxygen will base this on the number of
+# cores available in the system. You can set it explicitly to a value larger
+# than 0 to get more control over the balance between CPU load and processing
+# speed. At this moment only the input processing can be done using multiple
+# threads. Since this is still an experimental feature the default is set to 1,
+# which effectively disables parallel processing. Please report any issues you
+# encounter. Generating dot graphs in parallel is controlled by the
+# DOT_NUM_THREADS setting.
+# Minimum value: 0, maximum value: 32, default value: 1.
+
+NUM_PROC_THREADS = 1
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in
+# documentation are documented, even if no documentation was available. Private
+# class members and static file members will be hidden unless the
+# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.
+# Note: This will also disable the warnings about undocumented members that are
+# normally produced when WARNINGS is set to YES.
+# The default value is: NO.
+
+EXTRACT_ALL = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will
+# be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIVATE = NO
+
+# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual
+# methods of a class will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIV_VIRTUAL = NO
+
+# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
+# scope will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PACKAGE = NO
+
+# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be
+# included in the documentation.
+# The default value is: NO.
+
+EXTRACT_STATIC = YES
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined
+# locally in source files will be included in the documentation. If set to NO,
+# only classes defined in header files are included. Does not have any effect
+# for Java sources.
+# The default value is: YES.
+
+EXTRACT_LOCAL_CLASSES = YES
+
+# This flag is only useful for Objective-C code. If set to YES, local methods,
+# which are defined in the implementation section but not in the interface are
+# included in the documentation. If set to NO, only methods in the interface are
+# included.
+# The default value is: NO.
+
+EXTRACT_LOCAL_METHODS = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base name of
+# the file that contains the anonymous namespace. By default anonymous namespace
+# are hidden.
+# The default value is: NO.
+
+EXTRACT_ANON_NSPACES = NO
+
+# If this flag is set to YES, the name of an unnamed parameter in a declaration
+# will be determined by the corresponding definition. By default unnamed
+# parameters remain unnamed in the output.
+# The default value is: YES.
+
+RESOLVE_UNNAMED_PARAMS = YES
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
+# undocumented members inside documented classes or files. If set to NO these
+# members will be included in the various overviews, but no documentation
+# section is generated. This option has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_MEMBERS = YES
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy. If set
+# to NO, these classes will be included in the various overviews. This option
+# has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_CLASSES = YES
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
+# declarations. If set to NO, these declarations will be included in the
+# documentation.
+# The default value is: NO.
+
+HIDE_FRIEND_COMPOUNDS = YES
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
+# documentation blocks found inside the body of a function. If set to NO, these
+# blocks will be appended to the function's detailed documentation block.
+# The default value is: NO.
+
+HIDE_IN_BODY_DOCS = NO
+
+# The INTERNAL_DOCS tag determines if documentation that is typed after a
+# \internal command is included. If the tag is set to NO then the documentation
+# will be excluded. Set it to YES to include the internal documentation.
+# The default value is: NO.
+
+INTERNAL_DOCS = NO
+
+# With the correct setting of the CASE_SENSE_NAMES option doxygen will be better
+# able to match the capabilities of the underlying filesystem. In case the
+# filesystem is case sensitive (i.e. it supports files in the same directory
+# whose names only differ in casing), the option must be set to YES to properly
+# deal with such files in case they appear in the input. For filesystems that
+# are not case sensitive the option should be set to NO to properly deal with
+# output files written for symbols that only differ in casing, such as for two
+# classes, one named CLASS and the other named Class, and to also support
+# references to files without having to specify the exact matching casing. On
+# Windows (including Cygwin) and MacOS, users should typically set this option
+# to NO, whereas on Linux or other Unix flavors it should typically be set to
+# YES.
+# The default value is: system dependent.
+
+CASE_SENSE_NAMES = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
+# their full class and namespace scopes in the documentation. If set to YES, the
+# scope will be hidden.
+# The default value is: NO.
+
+HIDE_SCOPE_NAMES = YES
+
+# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will
+# append additional text to a page's title, such as Class Reference. If set to
+# YES the compound reference will be hidden.
+# The default value is: NO.
+
+HIDE_COMPOUND_REFERENCE= NO
+
+# If the SHOW_HEADERFILE tag is set to YES then the documentation for a class
+# will show which file needs to be included to use the class.
+# The default value is: YES.
+
+SHOW_HEADERFILE = YES
+
+# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
+# the files that are included by a file in the documentation of that file.
+# The default value is: YES.
+
+SHOW_INCLUDE_FILES = NO
+
+# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each
+# grouped member an include statement to the documentation, telling the reader
+# which file to include in order to use the member.
+# The default value is: NO.
+
+SHOW_GROUPED_MEMB_INC = NO
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
+# files with double quotes in the documentation rather than with sharp brackets.
+# The default value is: NO.
+
+FORCE_LOCAL_INCLUDES = NO
+
+# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the
+# documentation for inline members.
+# The default value is: YES.
+
+INLINE_INFO = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
+# (detailed) documentation of file and class members alphabetically by member
+# name. If set to NO, the members will appear in declaration order.
+# The default value is: YES.
+
+SORT_MEMBER_DOCS = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
+# descriptions of file, namespace and class members alphabetically by member
+# name. If set to NO, the members will appear in declaration order. Note that
+# this will also influence the order of the classes in the class list.
+# The default value is: NO.
+
+SORT_BRIEF_DOCS = YES
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the
+# (brief and detailed) documentation of class members so that constructors and
+# destructors are listed first. If set to NO the constructors will appear in the
+# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS.
+# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief
+# member documentation.
+# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting
+# detailed member documentation.
+# The default value is: NO.
+
+SORT_MEMBERS_CTORS_1ST = YES
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy
+# of group names into alphabetical order. If set to NO the group names will
+# appear in their defined order.
+# The default value is: NO.
+
+SORT_GROUP_NAMES = YES
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by
+# fully-qualified names, including namespaces. If set to NO, the class list will
+# be sorted only by class name, not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the alphabetical
+# list.
+# The default value is: NO.
+
+SORT_BY_SCOPE_NAME = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper
+# type resolution of all parameters of a function it will reject a match between
+# the prototype and the implementation of a member function even if there is
+# only one candidate or it is obvious which candidate to choose by doing a
+# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still
+# accept a match between prototype and implementation in such cases.
+# The default value is: NO.
+
+STRICT_PROTO_MATCHING = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo
+# list. This list is created by putting \todo commands in the documentation.
+# The default value is: YES.
+
+GENERATE_TODOLIST = NO
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test
+# list. This list is created by putting \test commands in the documentation.
+# The default value is: YES.
+
+GENERATE_TESTLIST = NO
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug
+# list. This list is created by putting \bug commands in the documentation.
+# The default value is: YES.
+
+GENERATE_BUGLIST = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO)
+# the deprecated list. This list is created by putting \deprecated commands in
+# the documentation.
+# The default value is: YES.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional documentation
+# sections, marked by \if ... \endif and \cond
+# ... \endcond blocks.
+
+ENABLED_SECTIONS =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
+# initial value of a variable or macro / define can have for it to appear in the
+# documentation. If the initializer consists of more lines than specified here
+# it will be hidden. Use a value of 0 to hide initializers completely. The
+# appearance of the value of individual variables and macros / defines can be
+# controlled using \showinitializer or \hideinitializer command in the
+# documentation regardless of this setting.
+# Minimum value: 0, maximum value: 10000, default value: 30.
+
+MAX_INITIALIZER_LINES = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
+# the bottom of the documentation of classes and structs. If set to YES, the
+# list will mention the files that were used to generate the documentation.
+# The default value is: YES.
+
+SHOW_USED_FILES = YES
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
+# will remove the Files entry from the Quick Index and from the Folder Tree View
+# (if specified).
+# The default value is: YES.
+
+SHOW_FILES = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
+# page. This will remove the Namespaces entry from the Quick Index and from the
+# Folder Tree View (if specified).
+# The default value is: YES.
+
+SHOW_NAMESPACES = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output is used as
+# the file version. For an example see the documentation.
+
+FILE_VERSION_FILTER =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option. You can
+# optionally specify a file name after the option, if omitted DoxygenLayout.xml
+# will be used as the name of the layout file. See also section "Changing the
+# layout of pages" for information.
+#
+# Note that if you run doxygen from a directory containing a file called
+# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
+# tag is left empty.
+
+LAYOUT_FILE =
+
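+# For illustration, a default layout file can be produced with the -l option
+# mentioned above and then referenced (the name alp_layout.xml is a
+# hypothetical example, not a file in this repository):
+#
+#   doxygen -l alp_layout.xml
+#   LAYOUT_FILE = alp_layout.xml
+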
+# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
+# the reference definitions. This must be a list of .bib files. The .bib
+# extension is automatically appended if omitted. This requires the bibtex tool
+# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info.
+# For LaTeX the style of the bibliography can be controlled using
+# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
+# search path. See also \cite for info how to create references.
+
+CITE_BIB_FILES =
+
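+# For illustration (docs/alp_refs.bib is a hypothetical path, not a file that
+# exists in this repository), bibliography files would be listed as:
+#
+#   CITE_BIB_FILES = docs/alp_refs.bib
+#   LATEX_BIB_STYLE = plain
+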
+#---------------------------------------------------------------------------
+# Configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated to
+# standard output by doxygen. If QUIET is set to YES this implies that the
+# messages are off.
+# The default value is: NO.
+
+QUIET = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES
+# this implies that the warnings are on.
+#
+# Tip: Turn warnings on while writing the documentation.
+# The default value is: YES.
+
+WARNINGS = YES
+
+# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate
+# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
+# will automatically be disabled.
+# The default value is: YES.
+
+WARN_IF_UNDOCUMENTED = YES
+
+# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as documenting some parameters in
+# a documented function twice, or documenting parameters that don't exist or
+# using markup commands wrongly.
+# The default value is: YES.
+
+WARN_IF_DOC_ERROR = YES
+
+# If WARN_IF_INCOMPLETE_DOC is set to YES, doxygen will warn about incomplete
+# function parameter documentation. If set to NO, doxygen will accept that some
+# parameters have no documentation without warning.
+# The default value is: YES.
+
+WARN_IF_INCOMPLETE_DOC = YES
+
+# The WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
+# are documented, but have no documentation for their parameters or return
+# value. If set to NO, doxygen will only warn about wrong parameter
+# documentation, but not about the absence of documentation. If EXTRACT_ALL is
+# set to YES then this flag will automatically be disabled. See also
+# WARN_IF_INCOMPLETE_DOC.
+# The default value is: NO.
+
+WARN_NO_PARAMDOC = NO
+
+# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when
+# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS
+# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but
+# at the end of the doxygen process doxygen will return with a non-zero status.
+# Possible values are: NO, YES and FAIL_ON_WARNINGS.
+# The default value is: NO.
+
+WARN_AS_ERROR = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that doxygen
+# can produce. The string should contain the $file, $line, and $text tags, which
+# will be replaced by the file and line number from which the warning originated
+# and the warning text. Optionally the format may contain $version, which will
+# be replaced by the version of the file (if it could be obtained via
+# FILE_VERSION_FILTER)
+# The default value is: $file:$line: $text.
+
+WARN_FORMAT = "$file:$line: $text"
+
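+# For illustration, a format that also prints the file version obtained via
+# FILE_VERSION_FILTER would look like:
+#
+#   WARN_FORMAT = "$file:$line ($version): $text"
+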
+# The WARN_LOGFILE tag can be used to specify a file to which warning and error
+# messages should be written. If left blank the output is written to standard
+# error (stderr). In case the file specified cannot be opened for writing the
+# warning and error messages are written to standard error. When as file - is
+# specified the warning and error messages are written to standard output
+# (stdout).
+
+WARN_LOGFILE =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag is used to specify the files and/or directories that contain
+# documented source files. You may enter file names like myfile.cpp or
+# directories like /usr/src/myproject. Separate the files or directories with
+# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
+# Note: If this tag is empty the current directory is searched.
+
+INPUT = include/graphblas.hpp \
+ include/graphblas/base \
+ include/graphblas/algorithms \
+ include/graphblas/interfaces \
+ include/transition \
+ include/graphblas/descriptors.hpp \
+ include/graphblas/semiring.hpp \
+ include/graphblas/monoid.hpp \
+ include/graphblas/iomode.hpp \
+ include/graphblas/ops.hpp \
+ include/graphblas/descriptors.hpp \
+ include/graphblas/rc.hpp \
+ include/graphblas/reference/config.hpp \
+ include/graphblas/nonblocking/config.hpp \
+ include/graphblas/bsp1d/config.hpp \
+ include/graphblas/identities.hpp \
+ include/graphblas/phase.hpp \
+ include/graphblas/type_traits.hpp \
+ include/graphblas/backends.hpp \
+ include/graphblas/blas0.hpp #\
+# include/graphblas/utils \
+# include/graphblas/utils.hpp
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
+# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
+# documentation (see:
+# https://www.gnu.org/software/libiconv/) for the list of possible encodings.
+# The default value is: UTF-8.
+
+INPUT_ENCODING = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
+# *.h) to filter out the source-files in the directories.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# read by doxygen.
+#
+# Note the list of default checked file patterns might differ from the list of
+# default file extension mappings.
+#
+# If left blank the following patterns are tested: *.c, *.cc, *.cxx, *.cpp,
+# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
+# *.hh, *.hxx, *.hpp, *.h++, *.l, *.cs, *.d, *.php, *.php4, *.php5, *.phtml,
+# *.inc, *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C
+# comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd,
+# *.vhdl, *.ucf, *.qsf and *.ice.
+
+FILE_PATTERNS = *.hpp \
+ *.cpp \
+ *.h \
+ *.c
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories should
+# be searched for input files as well.
+# The default value is: NO.
+
+RECURSIVE = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+#
+# Note that relative paths are relative to the directory from which doxygen is
+# run.
+
+EXCLUDE = include/graphblas/base/alloc.hpp \
+ include/graphblas/base/coordinates.hpp \
+ include/graphblas/base/distribution.hpp \
+ include/graphblas/base/internalops.hpp \
+ include/graphblas/algorithms/hpcg #\
+# include/graphblas/base/init.hpp
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+# The default value is: NO.
+
+EXCLUDE_SYMLINKS = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories.
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories for example use the pattern */test/*
+
+EXCLUDE_PATTERNS =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# ANamespace::AClass, ANamespace::*Test
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories use the pattern */test/*
+
+EXCLUDE_SYMBOLS = internal
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or directories
+# that contain example code fragments that are included (see the \include
+# command).
+
+EXAMPLE_PATH = examples/sp.cpp \
+ include/graphblas/ops.hpp \
+ include/graphblas/internalops.hpp
+
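+# For illustration, with examples/sp.cpp listed above a doc comment can embed
+# the full example program via the \include command:
+#
+#   /** \include sp.cpp */
+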
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
+# *.h) to filter out the source-files in the directories. If left blank all
+# files are included.
+
+EXAMPLE_PATTERNS =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude commands
+# irrespective of the value of the RECURSIVE tag.
+# The default value is: NO.
+
+EXAMPLE_RECURSIVE = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or directories
+# that contain images that are to be included in the documentation (see the
+# \image command).
+
+IMAGE_PATH =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command:
+#
+#   <filter> <input-file>
+#
+# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
+# name of an input file. Doxygen will then use the output that the filter
+# program writes to standard output. If FILTER_PATTERNS is specified, this tag
+# will be ignored.
+#
+# Note that the filter must not add or remove lines; it is applied before the
+# code is scanned, but not when the output code is generated. If lines are added
+# or removed, the anchors will not be placed correctly.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
+
+INPUT_FILTER =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form: pattern=filter
+# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
+# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
+# patterns match the file name, INPUT_FILTER is applied.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
+
+FILTER_PATTERNS =
+
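+# For illustration (my_cpp_filter is the hypothetical filter program from the
+# comment above; no filter is used in this configuration), a per-pattern filter
+# would be written as:
+#
+#   FILTER_PATTERNS = *.cpp=my_cpp_filter
+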
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will also be used to filter the input files that are used for
+# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
+# The default value is: NO.
+
+FILTER_SOURCE_FILES = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and
+# it is also possible to disable source filtering for a specific pattern using
+# *.ext= (so without naming a filter).
+# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
+
+FILTER_SOURCE_PATTERNS =
+
+# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
+# is part of the input, its contents will be placed on the main page
+# (index.html). This can be useful if you have a project on for instance GitHub
+# and want to reuse the introduction page also for the doxygen output.
+
+USE_MDFILE_AS_MAINPAGE =
+
+#---------------------------------------------------------------------------
+# Configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
+# generated. Documented entities will be cross-referenced with these sources.
+#
+# Note: To get rid of all source code in the generated output, make sure that
+# also VERBATIM_HEADERS is set to NO.
+# The default value is: NO.
+
+SOURCE_BROWSER = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body of functions,
+# classes and enums directly into the documentation.
+# The default value is: NO.
+
+INLINE_SOURCES = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
+# special comment blocks from generated source code fragments. Normal C, C++ and
+# Fortran comments will always remain visible.
+# The default value is: YES.
+
+STRIP_CODE_COMMENTS = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
+# entity all documented functions referencing it will be listed.
+# The default value is: NO.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES then for each documented function
+# all documented entities called/used by that function will be listed.
+# The default value is: NO.
+
+REFERENCES_RELATION = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
+# to YES then the hyperlinks from functions in REFERENCES_RELATION and
+# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
+# link to the documentation.
+# The default value is: YES.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
+# source code will show a tooltip with additional information such as prototype,
+# brief description and links to the definition and documentation. Since this
+# will make the HTML file larger and loading of large files a bit slower, you
+# can opt to disable this feature.
+# The default value is: YES.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+SOURCE_TOOLTIPS = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code will
+# point to the HTML generated by the htags(1) tool instead of doxygen built-in
+# source browser. The htags tool is part of GNU's global source tagging system
+# (see https://www.gnu.org/software/global/global.html). You will need version
+# 4.8.6 or higher.
+#
+# To use it do the following:
+# - Install the latest version of global
+# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file
+# - Make sure the INPUT points to the root of the source tree
+# - Run doxygen as normal
+#
+# Doxygen will invoke htags (and that will in turn invoke gtags), so these
+# tools must be available from the command line (i.e. in the search path).
+#
+# The result: instead of the source browser generated by doxygen, the links to
+# source code will now point to the output of htags.
+# The default value is: NO.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+USE_HTAGS = NO
+
+# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a
+# verbatim copy of the header file for each class for which an include is
+# specified. Set to NO to disable this.
+# See also: Section \class.
+# The default value is: YES.
+
+VERBATIM_HEADERS = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
+# compounds will be generated. Enable this if the project contains a lot of
+# classes, structs, unions or interfaces.
+# The default value is: YES.
+
+ALPHABETICAL_INDEX = YES
+
+# In case all classes in a project start with a common prefix, all classes will
+# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
+# can be used to specify a prefix (or a list of prefixes) that should be ignored
+# while generating the index headers.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+IGNORE_PREFIX =
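+
+# For illustration (the prefix below is hypothetical): a project whose classes
+# all start with xyz_ could set
+#   IGNORE_PREFIX = xyz_
+# so that xyz_Matrix is indexed under M rather than X.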
+
+#---------------------------------------------------------------------------
+# Configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output
+# The default value is: YES.
+
+GENERATE_HTML = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_OUTPUT = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
+# generated HTML page (for example: .htm, .php, .asp).
+# The default value is: .html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FILE_EXTENSION = .html
+
+# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
+# each generated HTML page. If the tag is left blank doxygen will generate a
+# standard header.
+#
+# To get valid HTML, the header file must include any scripts and style sheets
+# that doxygen needs, which depend on the configuration options used (e.g.
+# the setting GENERATE_TREEVIEW). It is highly recommended to start with a
+# default header using
+# doxygen -w html new_header.html new_footer.html new_stylesheet.css
+# YourConfigFile
+# and then modify the file new_header.html. See also section "Doxygen usage"
+# for information on how to generate the default header that doxygen normally
+# uses.
+# Note: The header is subject to change so you typically have to regenerate the
+# default header when upgrading to a newer version of doxygen. For a description
+# of the possible markers and block names see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_HEADER =
+
+# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
+# generated HTML page. If the tag is left blank doxygen will generate a standard
+# footer. See HTML_HEADER for more information on how to generate a default
+# footer and what special commands can be used inside the footer. See also
+# section "Doxygen usage" for information on how to generate the default footer
+# that doxygen normally uses.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FOOTER =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
+# sheet that is used by each HTML page. It can be used to fine-tune the look of
+# the HTML output. If left blank doxygen will generate a default style sheet.
+# See also section "Doxygen usage" for information on how to generate the style
+# sheet that doxygen normally uses.
+# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
+# it is more robust and this tag (HTML_STYLESHEET) will in the future become
+# obsolete.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_STYLESHEET =
+
+# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined
+# cascading style sheets that are included after the standard style sheets
+# created by doxygen. Using this option one can overrule certain style aspects.
+# This is preferred over using HTML_STYLESHEET since it does not replace the
+# standard style sheet and is therefore more robust against future updates.
+# Doxygen will copy the style sheet files to the output directory.
+# Note: The order of the extra style sheet files is of importance (e.g. the last
+# style sheet in the list overrules the setting of the previous ones in the
+# list). For an example see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_STYLESHEET =
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
+# files will be copied as-is; there are no commands or markers available.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_FILES =
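+
+# A hedged example (logo.png and the custom header are hypothetical): after
+# setting
+#   HTML_EXTRA_FILES = logo.png
+# the file can be referenced from a custom HTML_HEADER as
+#   <img src="$relpath^logo.png" alt="logo"/>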
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
+# will adjust the colors in the style sheet and background images according to
+# this color. Hue is specified as an angle on a color-wheel, see
+# https://en.wikipedia.org/wiki/Hue for more information. For instance the value
+# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
+# purple, and 360 is red again.
+# Minimum value: 0, maximum value: 359, default value: 220.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_HUE = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
+# in the HTML output. For a value of 0 the output will use gray-scales only. A
+# value of 255 will produce the most vivid colors.
+# Minimum value: 0, maximum value: 255, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_SAT = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
+# luminance component of the colors in the HTML output. Values below 100
+# gradually make the output lighter, whereas values above 100 make the output
+# darker. The value divided by 100 is the actual gamma applied, so 80 represents
+# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not
+# change the gamma.
+# Minimum value: 40, maximum value: 240, default value: 80.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_GAMMA = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting this
+# to YES can help to show when doxygen was last run and thus if the
+# documentation is up to date.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_TIMESTAMP = YES
+
+# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML
+# documentation will contain a main index with vertical navigation menus that
+# are dynamically created via JavaScript. If disabled, the navigation index will
+# consist of multiple levels of tabs that are statically embedded in every HTML
+# page. Disable this option to support browsers that do not have JavaScript,
+# like the Qt help browser.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_MENUS = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_SECTIONS = NO
+
+# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
+# shown in the various tree structured indices initially; the user can expand
+# and collapse entries dynamically later on. Doxygen will expand the tree to
+# such a level that at most the specified number of entries are visible (unless
+# a fully collapsed tree already exceeds this amount). So setting the number of
+# entries to 1 will produce a fully collapsed tree by default. 0 is a special
+# value representing an infinite number of entries and will result in a fully
+# expanded tree by default.
+# Minimum value: 0, maximum value: 9999, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_INDEX_NUM_ENTRIES = 100
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files will be
+# generated that can be used as input for Apple's Xcode 3 integrated development
+# environment (see:
+# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To
+# create a documentation set, doxygen will generate a Makefile in the HTML
+# output directory. Running make will produce the docset in that directory and
+# running make install will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
+# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy
+# genXcode/_index.html for more information.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_DOCSET = NO
+
+# This tag determines the name of the docset feed. A documentation feed provides
+# an umbrella under which multiple documentation sets from a single provider
+# (such as a company or product suite) can be grouped.
+# The default value is: Doxygen generated docs.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_FEEDNAME = "Doxygen generated docs"
+
+# This tag determines the URL of the docset feed. A documentation feed provides
+# an umbrella under which multiple documentation sets from a single provider
+# (such as a company or product suite) can be grouped.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_FEEDURL =
+
+# This tag specifies a string that should uniquely identify the documentation
+# set bundle. This should be a reverse domain-name style string, e.g.
+# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_BUNDLE_ID = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+# The default value is: org.doxygen.Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_ID = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+# The default value is: Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_NAME = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
+# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
+# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
+# on Windows. In the beginning of 2021 Microsoft took the original page, with
+# among others the download links, offline (the HTML help workshop was already
+# many years in maintenance mode). You can download the HTML help workshop
+# from the web archives at Installation executable (see:
+# http://web.archive.org/web/20160201063255/http://download.microsoft.com/downlo
+# ad/0/A/9/0A939EF6-E31C-430F-A3DF-DFAE7960D564/htmlhelp.exe).
+#
+# The HTML Help Workshop contains a compiler that can convert all HTML output
+# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
+# files are now used as the Windows 98 help format, and will replace the old
+# Windows help format (.hlp) on all Windows platforms in the future. Compressed
+# HTML files also contain an index, a table of contents, and you can search for
+# words in the documentation. The HTML workshop also contains a viewer for
+# compressed HTML files.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_HTMLHELP = NO
+
+# The CHM_FILE tag can be used to specify the file name of the resulting .chm
+# file. You can add a path in front of the file if the result should not be
+# written to the html output directory.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_FILE =
+
+# The HHC_LOCATION tag can be used to specify the location (absolute path
+# including file name) of the HTML help compiler (hhc.exe). If non-empty,
+# doxygen will try to run the HTML help compiler on the generated index.hhp.
+# The file has to be specified with full path.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+HHC_LOCATION =
+
+# The GENERATE_CHI flag controls if a separate .chi index file is generated
+# (YES) or that it should be included in the main .chm file (NO).
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+GENERATE_CHI = NO
+
+# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)
+# and project file content.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_INDEX_ENCODING =
+
+# The BINARY_TOC flag controls whether a binary table of contents is generated
+# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
+# enables the Previous and Next buttons.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+BINARY_TOC = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members to
+# the table of contents of the HTML help documentation and to the tree view.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+TOC_EXPAND = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
+# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
+# (.qch) of the generated HTML documentation.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_QHP = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
+# the file name of the resulting .qch file. The path specified is relative to
+# the HTML output folder.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QCH_FILE =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
+# Project output. For more information please see Qt Help Project / Namespace
+# (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace).
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_NAMESPACE = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
+# Help Project output. For more information please see Qt Help Project / Virtual
+# Folders (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders).
+# The default value is: doc.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_VIRTUAL_FOLDER = doc
+
+# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
+# filter to add. For more information please see Qt Help Project / Custom
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_NAME =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see Qt Help Project / Custom
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_ATTRS =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's filter section matches. Qt Help Project / Filter Attributes (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_SECT_FILTER_ATTRS =
+
+# The QHG_LOCATION tag can be used to specify the location (absolute path
+# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to
+# run qhelpgenerator on the generated .qhp file.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHG_LOCATION =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
+# generated that, together with the HTML files, form an Eclipse help plugin. To
+# install this plugin and make it available under the help contents menu in
+# Eclipse, the contents of the directory containing the HTML and XML files needs
+# to be copied into the plugins directory of eclipse. The name of the directory
+# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
+# After copying Eclipse needs to be restarted before the help appears.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_ECLIPSEHELP = NO
+
+# A unique identifier for the Eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have this
+# name. Each documentation set should have its own identifier.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
+
+ECLIPSE_DOC_ID = org.doxygen.Project
+
+# If you want full control over the layout of the generated HTML pages it might
+# be necessary to disable the index and replace it with your own. The
+# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
+# of each HTML page. A value of NO enables the index and the value YES disables
+# it. Since the tabs in the index contain the same information as the navigation
+# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+DISABLE_INDEX = NO
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information. If the tag
+# value is set to YES, a side panel will be generated containing a tree-like
+# index structure (just like the one that is generated for HTML Help). For this
+# to work a browser that supports JavaScript, DHTML, CSS and frames is required
+# (i.e. any modern browser). Windows users are probably better off using the
+# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
+# further fine tune the look of the index (see "Fine-tuning the output"). As an
+# example, the default style sheet generated by doxygen has an example that
+# shows how to put an image at the root of the tree instead of the PROJECT_NAME.
+# Since the tree basically has the same information as the tab index, you could
+# consider setting DISABLE_INDEX to YES when enabling this option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_TREEVIEW = NO
+
+# When both GENERATE_TREEVIEW and DISABLE_INDEX are set to YES, then the
+# FULL_SIDEBAR option determines if the side bar is limited to only the treeview
+# area (value NO) or if it should extend to the full height of the window (value
+# YES). Setting this to YES gives a layout similar to
+# https://docs.readthedocs.io with more room for contents, but less room for the
+# project logo, title, and description. If either GENERATE_TREEVIEW or
+# DISABLE_INDEX is set to NO, this option has no effect.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FULL_SIDEBAR = NO
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
+# doxygen will group on one line in the generated HTML documentation.
+#
+# Note that a value of 0 will completely suppress the enum values from appearing
+# in the overview section.
+# Minimum value: 0, maximum value: 20, default value: 4.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+ENUM_VALUES_PER_LINE = 4
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
+# to set the initial width (in pixels) of the frame in which the tree is shown.
+# Minimum value: 0, maximum value: 1500, default value: 250.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+TREEVIEW_WIDTH = 250
+
+# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to
+# external symbols imported via tag files in a separate window.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+EXT_LINKS_IN_WINDOW = NO
+
+# If the OBFUSCATE_EMAILS tag is set to YES, doxygen will obfuscate email
+# addresses.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+OBFUSCATE_EMAILS = YES
+
+# If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg
+# tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see
+# https://inkscape.org) to generate formulas as SVG images instead of PNGs for
+# the HTML output. These images will generally look nicer at scaled resolutions.
+# Possible values are: png (the default) and svg (looks nicer but requires the
+# pdf2svg or inkscape tool).
+# The default value is: png.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FORMULA_FORMAT = png
+
+# Use this tag to change the font size of LaTeX formulas included as images in
+# the HTML documentation. When you change the font size after a successful
+# doxygen run you need to manually remove any form_*.png images from the HTML
+# output directory to force them to be regenerated.
+# Minimum value: 8, maximum value: 50, default value: 10.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_FONTSIZE = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are not
+# supported properly for IE 6.0, but are supported on all modern browsers.
+#
+# Note that when changing this option you need to delete any form_*.png files in
+# the HTML output directory before the changes have effect.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_TRANSPARENT = YES
+
+# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands
+# to create new LaTeX commands to be used in formulas as building blocks. See
+# the section "Including formulas" for details.
+
+FORMULA_MACROFILE =
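+
+# A minimal sketch (file name and macros are hypothetical): a file
+# formula_macros.tex containing
+#   \newcommand{\R}{\mathbb{R}}
+#   \newcommand{\norm}[1]{\lVert #1 \rVert}
+# could be activated with FORMULA_MACROFILE = formula_macros.tex, making \R
+# and \norm available inside \f$ ... \f$ formulas.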
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
+# https://www.mathjax.org) which uses client side JavaScript for the rendering
+# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
+# installed or if you want the formulas to look prettier in the HTML output.
+# When
+# enabled you may also need to install MathJax separately and configure the path
+# to it using the MATHJAX_RELPATH option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+USE_MATHJAX = YES
+
+# With MATHJAX_VERSION it is possible to specify the MathJax version to be used.
+# Note that the different versions of MathJax have different requirements with
+# regards to the different settings, so it is possible that also other MathJax
+# settings have to be changed when switching between the different MathJax
+# versions.
+# Possible values are: MathJax_2 and MathJax_3.
+# The default value is: MathJax_2.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_VERSION = MathJax_2
+
+# When MathJax is enabled you can set the default output format to be used for
+# the MathJax output. For more details about the output format see MathJax
+# version 2 (see:
+# http://docs.mathjax.org/en/v2.7-latest/output.html) and MathJax version 3
+# (see:
+# http://docs.mathjax.org/en/latest/web/components/output.html).
+# Possible values are: HTML-CSS (which is slower, but has the best
+# compatibility. This is the name for Mathjax version 2, for MathJax version 3
+# this will be translated into chtml), NativeMML (i.e. MathML. Only supported
+# for MathJax 2. For MathJax version 3 chtml will be used instead.), chtml (This
+# is the name for Mathjax version 3, for MathJax version 2 this will be
+# translated into HTML-CSS) and SVG.
+# The default value is: HTML-CSS.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_FORMAT = HTML-CSS
+
+# When MathJax is enabled you need to specify the location relative to the HTML
+# output directory using the MATHJAX_RELPATH option. The destination directory
+# should contain the MathJax.js script. For instance, if the mathjax directory
+# is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
+# Content Delivery Network so you can quickly see the result without installing
+# MathJax. However, it is strongly recommended to install a local copy of
+# MathJax from https://www.mathjax.org before deployment. The default value is:
+# - in case of MathJax version 2: https://cdn.jsdelivr.net/npm/mathjax@2
+# - in case of MathJax version 3: https://cdn.jsdelivr.net/npm/mathjax@3
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_RELPATH = https://cdn.jsdelivr.net/npm/mathjax@2
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
+# extension names that should be enabled during MathJax rendering. For example
+# for MathJax version 2 (see
+# https://docs.mathjax.org/en/v2.7-latest/tex.html#tex-and-latex-extensions):
+# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# For example for MathJax version 3 (see
+# http://docs.mathjax.org/en/latest/input/tex/extensions/index.html):
+# MATHJAX_EXTENSIONS = ams
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_EXTENSIONS =
+
+# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
+# of code that will be used on startup of the MathJax code. See the MathJax site
+# (see:
+# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow, in which
+# case enabling SERVER_BASED_SEARCH may provide a better solution. It is
+# possible to search using the keyboard; to jump to the search box use
+# <access key> + S (what the <access key> is depends on the OS and browser,
+# but it is typically <CTRL>, <ALT>/<option>, or both). Inside the search box
+# use the <cursor down key> to jump into the search results window, the
+# results can be navigated using the <cursor keys>. Press <Enter> to select an
+# item or <escape> to cancel the search. The filter options can be selected
+# when the cursor is inside the search box by pressing <Shift>+<cursor down>.
+# Also here use the <cursor keys> to select a filter and <Enter> or <escape>
+# to activate or cancel the filter option.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+SEARCHENGINE = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a web server instead of a web client using JavaScript. There
+# are two flavors of web server based searching depending on the EXTERNAL_SEARCH
+# setting. When disabled, doxygen will generate a PHP script for searching and
+# an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
+# and searching needs to be provided by external tools. See the section
+# "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SERVER_BASED_SEARCH = NO
+
+# When EXTERNAL_SEARCH tag is enabled doxygen will no longer generate the PHP
+# script for searching. Instead the search results are written to an XML file
+# which needs to be processed by an external indexer. Doxygen will invoke an
+# external search engine pointed to by the SEARCHENGINE_URL option to obtain the
+# search results.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see:
+# https://xapian.org/).
+#
+# See the section "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH = NO
+
+# The SEARCHENGINE_URL should point to a search engine hosted by a web server
+# which will return the search results when EXTERNAL_SEARCH is enabled.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see:
+# https://xapian.org/). See the section "External Indexing and Searching" for
+# details.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHENGINE_URL =
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
+# search data is written to a file for indexing by an external tool. With the
+# SEARCHDATA_FILE tag the name of this file can be specified.
+# The default file is: searchdata.xml.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHDATA_FILE = searchdata.xml
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the
+# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is
+# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple
+# projects and redirect the results back to the right project.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH_ID =
+
+# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
+# projects other than the one defined by this configuration file, but that are
+# all added to the same external search index. Each project needs to have a
+# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id
+# to a relative location where the documentation can be found. The format is:
+# EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTRA_SEARCH_MAPPINGS =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES, doxygen will generate LaTeX output.
+# The default value is: YES.
+
+GENERATE_LATEX = YES
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: latex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_OUTPUT = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked.
+#
+# Note that when not enabling USE_PDFLATEX the default is latex; when
+# enabling USE_PDFLATEX the default is pdflatex, and if in the latter case
+# latex is chosen this is overwritten by pdflatex. For specific output
+# languages the default may have been set differently; this depends on the
+# implementation of the output language.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_CMD_NAME = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
+# index for LaTeX.
+# Note: This tag is used in the Makefile / make.bat.
+# See also: LATEX_MAKEINDEX_CMD for the part in the generated output file
+# (.tex).
+# The default file is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+MAKEINDEX_CMD_NAME = makeindex
+
+# The LATEX_MAKEINDEX_CMD tag can be used to specify the command name to
+# generate index for LaTeX. In case there is no backslash (\) as first character
+# it will be automatically added in the LaTeX code.
+# Note: This tag is used in the generated output file (.tex).
+# See also: MAKEINDEX_CMD_NAME for the part in the Makefile / make.bat.
+# The default value is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_MAKEINDEX_CMD = makeindex
+
+# If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+COMPACT_LATEX = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used by the
+# printer.
+# Possible values are: a4 (210 x 297 mm), letter (8.5 x 11 inches), legal (8.5 x
+# 14 inches) and executive (7.25 x 10.5 inches).
+# The default value is: a4.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PAPER_TYPE = a4
+
+# The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names
+# that should be included in the LaTeX output. The package can be specified just
+# by its name or with the correct syntax as to be used with the LaTeX
+# \usepackage command. To get the times font for instance you can specify:
+# EXTRA_PACKAGES=times or EXTRA_PACKAGES={times}
+# To use the option intlimits with the amsmath package you can specify:
+# EXTRA_PACKAGES=[intlimits]{amsmath}
+# If left blank no extra packages will be included.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+EXTRA_PACKAGES = amsmath
+
+# The LATEX_HEADER tag can be used to specify a user-defined LaTeX header for
+# the generated LaTeX document. The header should contain everything until the
+# first chapter. If it is left blank doxygen will generate a standard header. It
+# is highly recommended to start with a default header using
+# doxygen -w latex new_header.tex new_footer.tex new_stylesheet.sty
+# and then modify the file new_header.tex. See also section "Doxygen usage" for
+# information on how to generate the default header that doxygen normally uses.
+#
+# Note: Only use a user-defined header if you know what you are doing!
+# Note: The header is subject to change so you typically have to regenerate the
+# default header when upgrading to a newer version of doxygen. The following
+# commands have a special meaning inside the header (and footer); for a
+# description of the possible markers and block names see the documentation.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HEADER =
+
+# The LATEX_FOOTER tag can be used to specify a user-defined LaTeX footer for
+# the generated LaTeX document. The footer should contain everything after the
+# last chapter. If it is left blank doxygen will generate a standard footer. See
+# LATEX_HEADER for more information on how to generate a default footer and what
+# special commands can be used inside the footer. See also section "Doxygen
+# usage" for information on how to generate the default footer that doxygen
+# normally uses. Note: Only use a user-defined footer if you know what you are
+# doing!
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_FOOTER =
+
+# The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined
+# LaTeX style sheets that are included after the standard style sheets created
+# by doxygen. Using this option one can overrule certain style aspects. Doxygen
+# will copy the style sheet files to the output directory.
+# Note: The order of the extra style sheet files is of importance (e.g. the last
+# style sheet in the list overrules the setting of the previous ones in the
+# list).
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_STYLESHEET =
+
+# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the LATEX_OUTPUT output
+# directory. Note that the files will be copied as-is; there are no commands or
+# markers available.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_FILES =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
+# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
+# contain links (just like the HTML output) instead of page references. This
+# makes the output suitable for online browsing using a PDF viewer.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PDF_HYPERLINKS = YES
+
+# If the USE_PDFLATEX tag is set to YES, doxygen will use the engine as
+# specified with LATEX_CMD_NAME to generate the PDF file directly from the LaTeX
+# files. Set this option to YES to get higher quality PDF documentation.
+#
+# See also section LATEX_CMD_NAME for selecting the engine.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+USE_PDFLATEX = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep running
+# if errors occur, instead of asking the user for help.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BATCHMODE = NO
+
+# If the LATEX_HIDE_INDICES tag is set to YES then doxygen will not include the
+# index chapters (such as File Index, Compound Index, etc.) in the output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HIDE_INDICES = NO
+
+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
+# bibliography, e.g. plainnat, or ieeetr. See
+# https://en.wikipedia.org/wiki/BibTeX and \cite for more info.
+# The default value is: plain.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BIB_STYLE = plain
+
+# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated
+# page will contain the date and time when the page was generated. Setting this
+# to NO can help when comparing the output of multiple runs.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_TIMESTAMP = NO
+
+# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute)
+# path from which the emoji images will be read. If a relative path is entered,
+# it will be relative to the LATEX_OUTPUT directory. If left blank the
+# LATEX_OUTPUT directory will be used.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EMOJI_DIRECTORY =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES, doxygen will generate RTF output. The
+# RTF output is optimized for Word 97 and may not look too pretty with other RTF
+# readers/editors.
+# The default value is: NO.
+
+GENERATE_RTF = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: rtf.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_OUTPUT = rtf
+
+# If the COMPACT_RTF tag is set to YES, doxygen generates more compact RTF
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+COMPACT_RTF = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated will
+# contain hyperlink fields. The RTF file will contain links (just like the HTML
+# output) instead of page references. This makes the output suitable for online
+# browsing using Word or some other Word compatible readers that support those
+# fields.
+#
+# Note: WordPad (write) and others do not support links.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_HYPERLINKS = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# configuration file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+#
+# See also section "Doxygen usage" for information on how to generate the
+# default style sheet that doxygen normally uses.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_STYLESHEET_FILE =
+
+# Set optional variables used in the generation of an RTF document. Syntax is
+# similar to doxygen's configuration file. A template extensions file can be
+# generated using doxygen -e rtf extensionFile.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_EXTENSIONS_FILE =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES, doxygen will generate man pages for
+# classes and files.
+# The default value is: NO.
+
+GENERATE_MAN = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it. A directory man3 will be created inside the directory specified by
+# MAN_OUTPUT.
+# The default directory is: man.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_OUTPUT = man
+
+# The MAN_EXTENSION tag determines the extension that is added to the generated
+# man pages. In case the manual section does not start with a number, the number
+# 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is
+# optional.
+# The default value is: .3.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_EXTENSION = .3
+
+# The MAN_SUBDIR tag determines the name of the directory created within
+# MAN_OUTPUT in which the man pages are placed. It defaults to man followed by
+# MAN_EXTENSION with the initial . removed.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_SUBDIR =
+
+# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
+# will generate one additional man file for each entity documented in the real
+# man page(s). These additional files only source the real man page, but without
+# them the man command would be unable to find the correct page.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_LINKS = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES, doxygen will generate an XML file that
+# captures the structure of the code including all documentation.
+# The default value is: NO.
+
+GENERATE_XML = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: xml.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_OUTPUT = xml
+
+# If the XML_PROGRAMLISTING tag is set to YES, doxygen will dump the program
+# listings (including syntax highlighting and cross-referencing information) to
+# the XML output. Note that enabling this will significantly increase the size
+# of the XML output.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_PROGRAMLISTING = YES
+
+# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, doxygen will include
+# namespace members in file scope as well, matching the HTML output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_NS_MEMB_FILE_SCOPE = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the DOCBOOK output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_DOCBOOK tag is set to YES, doxygen will generate Docbook files
+# that can be used to generate PDF.
+# The default value is: NO.
+
+GENERATE_DOCBOOK = NO
+
+# The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be put in
+# front of it.
+# The default directory is: docbook.
+# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
+
+DOCBOOK_OUTPUT = docbook
+
+#---------------------------------------------------------------------------
+# Configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an
+# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures
+# the structure of the code including all documentation. Note that this feature
+# is still experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_AUTOGEN_DEF = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES, doxygen will generate a Perl module
+# file that captures the structure of the code including all documentation.
+#
+# Note that this feature is still experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_PERLMOD = NO
+
+# If the PERLMOD_LATEX tag is set to YES, doxygen will generate the necessary
+# Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI
+# output from the Perl module output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_LATEX = NO
+
+# If the PERLMOD_PRETTY tag is set to YES, the Perl module output will be nicely
+# formatted so it can be parsed by a human reader. This is useful if you want to
+# understand what is going on. On the other hand, if this tag is set to NO, the
+# size of the Perl module output will be much smaller and Perl will parse it
+# just the same.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_PRETTY = YES
+
+# The names of the make variables in the generated doxyrules.make file are
+# prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. This is useful
+# so different doxyrules.make files included by the same Makefile don't
+# overwrite each other's variables.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES, doxygen will evaluate all
+# C-preprocessor directives found in the sources and include files.
+# The default value is: YES.
+
+ENABLE_PREPROCESSING = YES
+
+# If the MACRO_EXPANSION tag is set to YES, doxygen will expand all macro names
+# in the source code. If set to NO, only conditional compilation will be
+# performed. Macro expansion can be done in a controlled way by setting
+# EXPAND_ONLY_PREDEF to YES.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+MACRO_EXPANSION = NO
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
+# the macro expansion is limited to the macros specified with the PREDEFINED and
+# EXPAND_AS_DEFINED tags.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_ONLY_PREDEF = NO
+
+# If the SEARCH_INCLUDES tag is set to YES, the include files in the
+# INCLUDE_PATH will be searched if a #include is found.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SEARCH_INCLUDES = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by the
+# preprocessor.
+# This tag requires that the tag SEARCH_INCLUDES is set to YES.
+
+INCLUDE_PATH =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will be
+# used.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+INCLUDE_FILE_PATTERNS =
+
+# The PREDEFINED tag can be used to specify one or more macro names that are
+# defined before the preprocessor is started (similar to the -D option of e.g.
+# gcc). The argument of the tag is a list of macros of the form: name or
+# name=definition (no spaces). If the definition and the "=" are omitted, "=1"
+# is assumed. To prevent a macro definition from being undefined via #undef or
+# recursively expanded use the := operator instead of the = operator.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+PREDEFINED = __DOXYGEN__
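+
+# As a sketch (the extra macros below are hypothetical), further macros can be
+# added space-separated in name or name=definition form:
+#   PREDEFINED = __DOXYGEN__ MY_API_EXPORT= MY_VERSION=2
+# where MY_API_EXPORT= expands to nothing and MY_VERSION expands to 2.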
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
+# tag can be used to specify a list of macro names that should be expanded. The
+# macro definition that is found in the sources will be used. Use the PREDEFINED
+# tag if you want to use a different macro definition that overrules the
+# definition found in the source code.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_AS_DEFINED =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
+# remove all references to function-like macros that are alone on a line, have
+# an all uppercase name, and do not end with a semicolon. Such function macros
+# are typically used for boiler-plate code, and will confuse the parser if not
+# removed.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SKIP_FUNCTION_MACROS = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES tag can be used to specify one or more tag files. For each tag
+# file the location of the external documentation should be added. The format of
+# a tag file without this location is as follows:
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where loc1 and loc2 can be relative or absolute paths or URLs. See the
+# section "Linking to external documentation" for more information about the use
+# of tag files.
+# Note: Each tag file must have a unique name (where the name does NOT include
+# the path). If a tag file is not located in the directory in which doxygen is
+# run, you must also specify the path to the tagfile here.
+
+TAGFILES =
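+
+# A hypothetical example (tag file name and location are illustrative only):
+#   TAGFILES = ../otherproject/otherproject.tag=../../otherproject/html
+# makes references to entities documented in otherproject link to the HTML
+# output found at the given relative location.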
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
+# tag file that is based on the input files it reads. See section "Linking to
+# external documentation" for more information about the usage of tag files.
+
+GENERATE_TAGFILE =
+
+# If the ALLEXTERNALS tag is set to YES, all external class will be listed in
+# the class index. If set to NO, only the inherited external classes will be
+# listed.
+# The default value is: NO.
+
+ALLEXTERNALS = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES, all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will be
+# listed.
+# The default value is: YES.
+
+EXTERNAL_GROUPS = YES
+
+# If the EXTERNAL_PAGES tag is set to YES, all external pages will be listed in
+# the related pages index. If set to NO, only the current project's pages will
+# be listed.
+# The default value is: YES.
+
+EXTERNAL_PAGES = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# You can include diagrams made with dia in doxygen documentation. Doxygen will
+# then run dia to produce the diagram and insert it in the documentation. The
+# DIA_PATH tag allows you to specify the directory where the dia binary resides.
+# If left empty dia is assumed to be found in the default search path.
+
+DIA_PATH =
+
+# If set to YES the inheritance and collaboration graphs will hide inheritance
+# and usage relations if the target is undocumented or is not a class.
+# The default value is: YES.
+
+HIDE_UNDOC_RELATIONS = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz (see:
+# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
+# Bell Labs. The other options in this section have no effect if this option is
+# set to NO.
+# The default value is: NO.
+
+HAVE_DOT = NO
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed
+# to run in parallel. When set to 0 doxygen will base this on the number of
+# processors available in the system. You can set it explicitly to a value
+# larger than 0 to get control over the balance between CPU load and processing
+# speed.
+# Minimum value: 0, maximum value: 32, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_NUM_THREADS = 0
+
+# When you want a differently looking font in the dot files that doxygen
+# generates you can specify the font name using DOT_FONTNAME. You need to make
+# sure dot is able to find the font, which can be done by putting it in a
+# standard location or by setting the DOTFONTPATH environment variable or by
+# setting DOT_FONTPATH to the directory containing the font.
+# The default value is: Helvetica.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTNAME = Helvetica
+
+# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of
+# dot graphs.
+# Minimum value: 4, maximum value: 24, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTSIZE = 10
+
+# By default doxygen will tell dot to use the default font as specified with
+# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set
+# the path where dot can find it using this tag.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTPATH =
+
+# If the CLASS_GRAPH tag is set to YES (or GRAPH) then doxygen will generate a
+# graph for each documented class showing the direct and indirect inheritance
+# relations. In case HAVE_DOT is set as well dot will be used to draw the graph,
+# otherwise the built-in generator will be used. If the CLASS_GRAPH tag is set
+# to TEXT the direct and indirect inheritance relations will be shown as texts /
+# links.
+# Possible values are: NO, YES, TEXT and GRAPH.
+# The default value is: YES.
+
+CLASS_GRAPH = YES
+
+# If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a
+# graph for each documented class showing the direct and indirect implementation
+# dependencies (inheritance, containment, and class references variables) of the
+# class with other documented classes.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+COLLABORATION_GRAPH = YES
+
+# If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for
+# groups, showing the direct groups dependencies.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GROUP_GRAPHS = YES
+
+# If the UML_LOOK tag is set to YES, doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+UML_LOOK = NO
+
+# If the UML_LOOK tag is enabled, the fields and methods are shown inside the
+# class node. If there are many fields or methods and many nodes the graph may
+# become too big to be useful. The UML_LIMIT_NUM_FIELDS threshold limits the
+# number of items for each type to make the size more manageable. Set this to 0
+# for no limit. Note that the threshold may be exceeded by 50% before the limit
+# is enforced. So when you set the threshold to 10, up to 15 fields may appear,
+# but if the number exceeds 15, the total amount of fields shown is limited to
+# 10.
+# Minimum value: 0, maximum value: 100, default value: 10.
+# This tag requires that the tag UML_LOOK is set to YES.
+
+UML_LIMIT_NUM_FIELDS = 10
+
+# If the DOT_UML_DETAILS tag is set to NO, doxygen will show attributes and
+# methods without types and arguments in the UML graphs. If the DOT_UML_DETAILS
+# tag is set to YES, doxygen will add type and arguments for attributes and
+# methods in the UML graphs. If the DOT_UML_DETAILS tag is set to NONE, doxygen
+# will not generate fields with class member information in the UML graphs. The
+# class diagrams will look similar to the default class diagrams but using UML
+# notation for the relationships.
+# Possible values are: NO, YES and NONE.
+# The default value is: NO.
+# This tag requires that the tag UML_LOOK is set to YES.
+
+DOT_UML_DETAILS = NO
+
+# The DOT_WRAP_THRESHOLD tag can be used to set the maximum number of characters
+# to display on a single line. If the actual line length exceeds this threshold
+# significantly it will be wrapped across multiple lines. Some heuristics are
+# applied to avoid ugly line breaks.
+# Minimum value: 0, maximum value: 1000, default value: 17.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_WRAP_THRESHOLD = 17
+
+# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
+# collaboration graphs will show the relations between templates and their
+# instances.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+TEMPLATE_RELATIONS = NO
+
+# If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to
+# YES then doxygen will generate a graph for each documented file showing the
+# direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDE_GRAPH = YES
+
+# If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are
+# set to YES then doxygen will generate a graph for each documented file showing
+# the direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDED_BY_GRAPH = YES
+
+# If the CALL_GRAPH tag is set to YES then doxygen will generate a call
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable call graphs for selected
+# functions only using the \callgraph command. Disabling a call graph can be
+# accomplished by means of the command \hidecallgraph.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALL_GRAPH = NO
+
+# If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable caller graphs for selected
+# functions only using the \callergraph command. Disabling a caller graph can be
+# accomplished by means of the command \hidecallergraph.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALLER_GRAPH = NO
+
+# If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will show the
+# graphical hierarchy of all classes instead of a textual one.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GRAPHICAL_HIERARCHY = YES
+
+# If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the
+# dependencies a directory has on other directories in a graphical way. The
+# dependency relations are determined by the #include relations between the
+# files in the directories.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DIRECTORY_GRAPH = YES
+
+# The DIR_GRAPH_MAX_DEPTH tag can be used to limit the maximum number of levels
+# of child directories generated in directory dependency graphs by dot.
+# Minimum value: 1, maximum value: 25, default value: 1.
+# This tag requires that the tag DIRECTORY_GRAPH is set to YES.
+
+DIR_GRAPH_MAX_DEPTH = 1
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. For an explanation of the image formats see the section
+# output formats in the documentation of the dot tool (Graphviz (see:
+# http://www.graphviz.org/)).
+# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
+# to make the SVG files visible in IE 9+ (other browsers do not have this
+# requirement).
+# Possible values are: png, jpg, gif, svg, png:gd, png:gd:gd, png:cairo,
+# png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and
+# png:gdiplus:gdiplus.
+# The default value is: png.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_IMAGE_FORMAT = png
+
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
+# enable generation of interactive SVG images that allow zooming and panning.
+#
+# Note that this requires a modern browser other than Internet Explorer. Tested
+# and working are Firefox, Chrome, Safari, and Opera.
+# Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make
+# the SVG files visible. Older versions of IE do not have SVG support.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INTERACTIVE_SVG = NO
+
+# The DOT_PATH tag can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_PATH =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the \dotfile
+# command).
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOTFILE_DIRS =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the \mscfile
+# command).
+
+MSCFILE_DIRS =
+
+# The DIAFILE_DIRS tag can be used to specify one or more directories that
+# contain dia files that are included in the documentation (see the \diafile
+# command).
+
+DIAFILE_DIRS =
+
+# When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
+# path where java can find the plantuml.jar file or to the filename of jar file
+# to be used. If left blank, it is assumed PlantUML is not used or called during
+# a preprocessing step. Doxygen will generate a warning when it encounters a
+# \startuml command in this case and will not generate output for the diagram.
+
+PLANTUML_JAR_PATH =
+
+# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a
+# configuration file for plantuml.
+
+PLANTUML_CFG_FILE =
+
+# When using plantuml, the specified paths are searched for files specified by
+# the !include statement in a plantuml block.
+
+PLANTUML_INCLUDE_PATH =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
+# that will be shown in the graph. If the number of nodes in a graph becomes
+# larger than this value, doxygen will truncate the graph, which is visualized
+# by representing a node as a red box. Note that if the number of direct
+# children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note that
+# the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+# Minimum value: 0, maximum value: 10000, default value: 50.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_GRAPH_MAX_NODES = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs
+# generated by dot. A depth value of 3 means that only nodes reachable from the
+# root by following a path via at most 3 edges will be shown. Nodes that lie
+# further from the root node will be omitted. Note that setting this option to 1
+# or 2 may greatly reduce the computation time needed for large code bases. Also
+# note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+# Minimum value: 0, maximum value: 1000, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+MAX_DOT_GRAPH_DEPTH = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not seem
+# to support this out of the box.
+#
+# Warning: Depending on the platform used, enabling this option may lead to
+# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
+# read).
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_TRANSPARENT = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10) support
+# this, this feature is disabled by default.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_MULTI_TARGETS = YES
+
+# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
+# explaining the meaning of the various boxes and arrows in the dot generated
+# graphs.
+# Note: This tag requires that UML_LOOK isn't set, i.e. the doxygen internal
+# graphical representation for inheritance and collaboration diagrams is used.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GENERATE_LEGEND = YES
+
+# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate
+# files that are used to generate the various graphs.
+#
+# Note: This setting is not only used for dot files but also for msc temporary
+# files.
+# The default value is: YES.
+
+DOT_CLEANUP = YES
diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt
index 458eadbcb..c1268623b 100644
--- a/include/CMakeLists.txt
+++ b/include/CMakeLists.txt
@@ -21,8 +21,9 @@
# and definitions to compile against each backend, but MUST explicitly
# set a default backend (if they want to do so).
#
-assert_defined_variables( REFERENCE_INCLUDE_DEFS REFERENCE_OMP_INCLUDE_DEFS LPF_INCLUDE_DEFS
- WITH_REFERENCE_BACKEND_HEADERS WITH_OMP_BACKEND_HEADERS WITH_BSP1D_BACKEND WITH_HYBRID_BACKEND
+assert_defined_variables( REFERENCE_INCLUDE_DEFS REFERENCE_OMP_INCLUDE_DEFS NONBLOCKING_INCLUDE_DEFS LPF_INCLUDE_DEFS
+ WITH_REFERENCE_BACKEND_HEADERS WITH_OMP_BACKEND_HEADERS WITH_NONBLOCKING_BACKEND WITH_BSP1D_BACKEND WITH_HYBRID_BACKEND
+ HYPERDAGS_INCLUDE_DEFS WITH_HYPERDAGS_BACKEND_HEADERS WITH_HYPERDAGS_BACKEND
)
assert_valid_variables( INCLUDE_INSTALL_DIR NO_NUMA_DEF )
@@ -41,7 +42,7 @@ set( HEADERS_REGEX ".+\.(hpp|h|hxx|hh|h\\+\\+)$" )
# to avoid flaky acrobatics with regex or glob expressions, copy main files directly
install( FILES "graphblas.hpp" DESTINATION "${INCLUDE_INSTALL_DIR}" )
set( root_files
- "graphblas.hpp" "graphblas/backends.hpp" "graphblas/benchmark.hpp"
+ "graphblas/backends.hpp" "graphblas/benchmark.hpp"
"graphblas/blas0.hpp" "graphblas/blas1.hpp" "graphblas/blas2.hpp"
"graphblas/blas3.hpp" "graphblas/collectives.hpp" "graphblas/config.hpp"
"graphblas/coordinates.hpp" "graphblas/descriptors.hpp" "graphblas/distribution.hpp"
@@ -104,7 +105,6 @@ install( TARGETS alp_utils_headers EXPORT GraphBLASTargets
INCLUDES DESTINATION "${INCLUDE_INSTALL_DIR}"
)
-
if( WITH_REFERENCE_BACKEND_HEADERS )
add_library( backend_reference_headers INTERFACE )
target_link_libraries( backend_reference_headers INTERFACE backend_headers_nodefs )
@@ -137,9 +137,34 @@ if( WITH_OMP_BACKEND_HEADERS )
FILES_MATCHING REGEX "${HEADERS_REGEX}"
)
install( TARGETS backend_reference_omp_headers EXPORT GraphBLASTargets )
+endif()
+if( WITH_HYPERDAGS_BACKEND )
+ add_library( backend_hyperdags_headers INTERFACE )
+ target_link_libraries( backend_hyperdags_headers INTERFACE "backend_${WITH_HYPERDAGS_USING}_headers" )
+ target_compile_definitions( backend_hyperdags_headers INTERFACE "${HYPERDAGS_INCLUDE_DEFS}" )
+ install( TARGETS backend_hyperdags_headers EXPORT GraphBLASTargets )
+ install( DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/graphblas/hyperdags/"
+ DESTINATION "${GRB_INCLUDE_INSTALL_DIR}/hyperdags"
+ FILES_MATCHING REGEX "${HEADERS_REGEX}"
+ )
endif()
+if( WITH_NONBLOCKING_BACKEND )
+ add_library( backend_nonblocking_headers INTERFACE )
+ # the nonblocking backend depends on the reference backend
+ target_link_libraries( backend_nonblocking_headers INTERFACE backend_reference_headers )
+ target_link_libraries( backend_nonblocking_headers INTERFACE OpenMP::OpenMP_CXX )
+ target_compile_definitions( backend_nonblocking_headers INTERFACE
+ "${NONBLOCKING_INCLUDE_DEFS}"
+ )
+
+ install( DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/graphblas/nonblocking/"
+ DESTINATION "${GRB_INCLUDE_INSTALL_DIR}/nonblocking"
+ FILES_MATCHING REGEX "${HEADERS_REGEX}"
+ )
+ install( TARGETS backend_nonblocking_headers EXPORT GraphBLASTargets )
+endif()
if( WITH_BSP1D_BACKEND OR WITH_HYBRID_BACKEND )
# copy headers, which are common to both distributed backends
@@ -187,6 +212,11 @@ install( DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/graphblas/algorithms/"
FILES_MATCHING REGEX "${HEADERS_REGEX}"
)
+install( DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/graphblas/interfaces/"
+ DESTINATION "${GRB_INCLUDE_INSTALL_DIR}/interfaces"
+ FILES_MATCHING REGEX "${HEADERS_REGEX}"
+)
+
install( TARGETS algorithms EXPORT GraphBLASTargets )
# this target lists the transition path headers
diff --git a/include/graphblas.hpp b/include/graphblas.hpp
index 64df3b9e4..a539a5c0d 100644
--- a/include/graphblas.hpp
+++ b/include/graphblas.hpp
@@ -15,53 +15,178 @@
* limitations under the License.
*/
-/*
- * @author: A. N. Yzelman.
- * @date: 8th of August, 2016.
+/**
+ * @file
+ *
+ * The main header to include in order to use the ALP/GraphBLAS API.
+ *
+ * @author A. N. Yzelman.
+ * @date 8th of August, 2016.
*/
/** \mainpage
*
- * This is a GraphBLAS implementation in ANSI C++11. Authors:
- * -# A. N. Yzelman, Huawei Technologies France; 2016-2020.
- * -# A. N. Yzelman, Huawei Technologies Switzerland AG; 2020-current.
- * -# Aristeidis Mastoras, Huawei Technologies Switzerland AG; 2020-current.
- * -# Alberto Scolari, Huawei Technologies Switzerland AG; 2021-current.
- * -# Verner Vlacic, Huawei Technologies Switzerland AG; 2021-current.
- * -# Auke Booij, Huawei Technologies Switzerland AG; 2021.
- * -# Dan Iorga, Huawei Technologies Switzerland AG; 2021.
- * -# Daniel Di Nardo, Huawei Technologies France; 2017.
- * -# Jonathan M. Nash, Huawei Technologies France; 2017.
+ * The Algebraic Programming (ALP) project is a modern and humble C++
+ * programming framework that achieves scalable, high performance.
+ *
+ * With ALP, programmers are encouraged to express programs using algebraic
+ * concepts directly. ALP is a humble programming model in that it hides all
+ * optimisations pertaining to parallelisation, vectorisation, and other
+ * complexities of programming large-scale and heterogeneous systems.
+ *
+ * ALP presently exposes the following interfaces:
+ * -# generalised sparse linear algebra, \ref GraphBLAS;
+ * -# vertex-centric programming, \ref Pregel.
+ *
+ * Several other programming interfaces are under design at present.
*
- * Contact: albertjan.yzelman@huawei.com
+ * For authors who contributed to ALP, please see the NOTICE file.
*
- * This API exposes only two containers: a #grb::Vector and a #grb::Matrix.
+ * Contact:
+ * - https://github.com/Algebraic-Programming/ALP
+ * - https://gitee.com/CSL-ALP/graphblas/
+ * - albertjan.yzelman@huawei.com
*
- * All primitives defined on these containers must be given a (binary)
- * operator, a #grb::Monoid, or a #grb::Semiring. These monoid and semiring are
- * generalised from their mathematical counterpart in that they holds multiple
- * domains. The monoid consists of one binary operator and a corresponding
- * identity. The semiring consists of one additive operator, one multiplicative
- * operator, one identity under addition, and one identity under multiplication.
+ * @author A. N. Yzelman, Huawei Technologies France (2016-2020)
+ * @author A. N. Yzelman, Huawei Technologies Switzerland AG (2020-current)
+ *
+ * \defgroup GraphBLAS ALP/GraphBLAS
+ * @{
+ *
+ * @brief ALP/GraphBLAS enables sparse linear algebraic programming.
+ *
+ * \parblock
+ * \par API introduction
+ *
+ * ALP/GraphBLAS is an ANSI C++11 variant of the C GraphBLAS standard with a few
+ * different choices and an emphasis on portability and auto-parallelisation. It
+ * exposes only two containers: #grb::Vector and #grb::Matrix. A template
+ * argument controls the type of the values contained within a container.
+ *
+ * A container may have between \f$ 0 \f$ and \f$ c \f$ values, and each such
+ * value has a coordinate. The value \f$ c \f$ is the \em capacity of a
+ * container, and at most equals the \em size of that container. The size of a
+ * matrix is the product of its number of rows and its number of columns.
+ * Containers with fewer values than their size are considered \em sparse, while
+ * those with as many values as their size are considered \em dense. Scalars
+ * correspond to the standard C++ plain-old-data types, and, as such, have size,
+ * capacity, and number of values equal to one-- scalars are always dense.
+ *
+ * For matrices, their size can be derived from #grb::nrows and #grb::ncols,
+ * while for vectors their size may be immediately retrieved via #grb::size.
+ * For both vectors and matrices, their capacity and current number of values
+ * may be retrieved via #grb::capacity and #grb::nnz, respectively. Finally,
+ * containers have a unique identifier that may be retrieved via #grb::getID.
+ * These identifiers are assigned in a deterministic fashion, so that for
+ * deterministic programs executed with the same number of processes, the same
+ * containers will be assigned the same IDs.
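+ *
+ * For example, the following relations hold for any newly constructed
+ * vector (a sketch; the assertions follow directly from the above
+ * definitions):
+ * \code
+ * grb::Vector< double > x( 10 ), y( 10 );
+ * assert( grb::size( x ) == 10 );
+ * assert( grb::nnz( x ) == 0 );  // no values were ingested yet
+ * assert( grb::capacity( x ) <= grb::size( x ) );
+ * assert( grb::getID( x ) != grb::getID( y ) ); // IDs are unique
+ * \endcode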
+ *
+ * Containers may be populated using #grb::set or by using dedicated I/O
+ * routines such as #grb::buildVectorUnique or #grb::buildMatrixUnique. Here,
+ * \em unique refers to the collection of values that should be ingested having
+ * no duplicate coordinates; i.e., there are no two values that map to the same
+ * coordinate. The first argument to either function is the output container,
+ * which is followed by an iterator pair that points to a collection of values
+ * to be ingested into the output container.
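+ *
+ * A minimal ingestion sketch follows; the index and value arrays are
+ * illustrative, while #grb::SEQUENTIAL refers to one of the I/O modes
+ * discussed next:
+ * \code
+ * grb::Vector< double > x( 4 );
+ * const size_t inds[ 2 ] = { 0, 3 };
+ * const double vals[ 2 ] = { 3.14, 2.71 };
+ * grb::RC rc = grb::buildVectorUnique(
+ *     x, inds, inds + 2, vals, vals + 2, grb::SEQUENTIAL
+ * );
+ * \endcode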
+ *
+ * ALP/GraphBLAS supports multiple user processes \f$ P \f$. If \f$ P > 1 \f$,
+ * there is a difference between #grb::SEQUENTIAL and #grb::PARALLEL I/O. The
+ * default I/O mode is #grb::PARALLEL, which may be overridden by supplying
+ * #grb::SEQUENTIAL as a fourth and final argument to the input routines. In
+ * sequential I/O, the iterator pair must point to the exact same collection
+ * of input values on each of the \f$ P \f$ user processes. In the parallel
+ * mode, however, each iterator pair points to disjoint value sets at each of
+ * the processes, while their union is what is logically ingested into the
+ * output container.
+ *
+ * Output iteration is done using standard STL-style iterators. ALP,
+ * however, only supports const_iterators on output. Output iterators also
+ * default to sequential mode.
+ *
+ * Primitives perform algebraic operations on containers while using explicitly
+ * supplied algebraic structures. Primitives may be as simple as the
+ * element-wise application of a binary operator to two input vectors,
+ * generating values in a third output vector (\f$ z = x \odot y \f$,
+ * #grb::eWiseApply), or may be as rich as multiplying two matrices together
+ * whose result is to be added in-place to a third matrix
+ * (\f$ C \leftarrow C + AB \f$, #grb::mxm). The latter is typically deemed
+ * richer since it requires a semiring structure rather than a more basic binary
+ * operator.
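+ *
+ * For example, given initialised containers of matching sizes and a
+ * semiring \c ring over doubles (all names here are illustrative), a
+ * minimal sketch of both primitives reads:
+ * \code
+ * grb::RC rc = grb::eWiseApply( z, x, y,
+ *     grb::operators::add< double >() ); // z = x .+ y
+ * if( rc == grb::SUCCESS ) {
+ *     rc = grb::mxm( C, A, B, ring );    // C += AB
+ * }
+ * \endcode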
+ *
+ * Primitives are grouped according to their classical BLAS levels:
+ * - \ref BLAS0
+ * - \ref BLAS1
+ * - \ref BLAS2
+ * - \ref BLAS3
+ *
+ * The "level-0" primitives operate on scalars, and in terms of arithmetic
+ * intensity match those of level-1 primitives-- however, since standard BLAS
+ * need not define scalar operations this specification groups them separately.
+ * All primitives except for #grb::set and #grb::eWiseApply are \em in-place,
+ * meaning that new output values are "added" to any pre-existing contents in
+ * output containers. The operator used for addition is derived from the
+ * algebraic structure that the primitive is called with.
+ *
+ * ALP requires that every primitive is \em parallelisable. Every backend that
+ * implements a primitive for a specific system furthermore must specify its
+ * performance semantics. Contrary to the functional semantics that this
+ * reference specifies, performance semantics guarantee certain observable
+ * behaviours when it comes to the amount of work, data movement,
+ * synchronisation across parallel systems, and/or memory use.
+ *
+ * @see perfSemantics
+ * \endparblock
+ *
+ * \parblock
+ * \par Algebraic Structures
+ *
+ * ALP/GraphBLAS defines three types of algebra structures, namely, a
+ * -# binary operator such as #grb::operators::add (numerical addition),
+ * -# #grb::Monoid, and
+ * -# #grb::Semiring.
+ *
+ * Binary operators are parametrised in two input domains and one output domain,
+ * \f$ D_1 \times D_2 \to D_3 \f$. The \f$ D_i \f$ are given as template
+ * arguments to the operator. A #grb::Monoid is composed from a binary operator
+ * coupled with an identity. For example, the additive monoid is defined as
+ * \code
+ * grb::Monoid<
+ * grb::operators::add< double >,
+ * grb::identities::zero
+ * >
+ * \endcode
+ * Note that passing a single domain as a template argument to a binary operator
+ * is a short-hand for an operator with \f$ D_{\{1,2,3\}} \f$ equal to the same
+ * domain.
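+ *
+ * For instance, a mixed-domain operator may be declared as follows (a
+ * sketch):
+ * \code
+ * // D1 = int, D2 = double, D3 = double
+ * grb::operators::add< int, double, double > mixedAdd;
+ * \endcode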
+ *
+ * Likewise, a #grb::Semiring is composed from two monoids, where the first,
+ * the so-called additive monoid, furthermore must be commutative. The classic
+ * semiring over integers taught in elementary school, for example, reads
+ * \code
+ * grb::Semiring<
+ * grb::operators::add< unsigned int >,
+ * grb::operators::mul< unsigned int >,
+ * grb::identities::zero,
+ * grb::identities::one
+ * >
+ * \endcode
*
* Monoids and semirings must comply with their regular axioms-- a type system
* assists users by checking for incorrect operators acting as additive or
- * multiplicative operators. Standard operators and identities are found in
- * their respective namespaces, #grb::operators and #grb::identities,
- * respectively.
+ * multiplicative monoids. Errors are reported at compile time, through
+ * the use of algebraic type traits such as #grb::is_associative.
*
- * Monoids and semirings must be supplied with the domain(s) it will operate
- * on. These must be available at compile time. Also the element type of
- * GraphBLAS containers must be set at compile time. The size of a container is
- * set at run-time, but may not change during its life time.
+ * @see typeTraits
*
- * This implementation provides various \ref BLAS1 and \ref BLAS2 primitives. To
- * simplify writing generalised algebraic routines, it also provides \ref BLAS0
- * primitives.
+ * Standard operators and identities are found in their respective namespaces,
+ * #grb::operators and #grb::identities, respectively. The ALP monoids and
+ * semirings are generalised from their standard mathematical definitions in
+ * that they hold multiple domains. The description of #grb::Semiring details
+ * the underlying mathematical structure that nevertheless can be identified.
+ * \endparblock
*
- * The three aforementioned ingredients, namely, containers, algebraic relations
- * (such as semirings), and level-{1,2,3} primitives make up the full interface
- * of this DSL.
+ * \parblock
+ * \par ALP/GraphBLAS by example
*
* An example is provided within examples/sp.cpp. It demonstrates usage of this
* API. We now follow with some code snippets from that example. First, the
@@ -104,64 +229,249 @@
* Full example use case:
*
* \snippet sp.cpp Example shortest-paths with semiring adapted to find the most reliable route instead
+ * \endparblock
+ *
+ * @author A. N. Yzelman, Huawei Technologies France (2016-2020)
+ * @author A. N. Yzelman, Huawei Technologies Switzerland AG (2020-current)
+ * @}
+ *
+ * \defgroup typeTraits Algebraic Type Traits
+ * @{
*
- * Any GraphBLAS code may execute using any of the backends this implementation
- * defines. Currently, the following backends are stable:
+ * Algebraic type traits allow compile-time reasoning on algebraic structures.
+ *
+ * Under algebraic type traits, ALP defines two classes of type traits:
+ * 1. classical type traits, akin to, e.g., std::is_integral, defined
+ * over the ALP-specific algebraic objects such as #grb::Semiring, and
+ * 2. algebraic type traits that allow for the compile-time introspection of
+ * algebraic structures.
+ *
+ * Under the first class, the following type traits are defined by ALP:
+ * - #grb::is_operator, #grb::is_monoid, and #grb::is_semiring, but also
+ * - #grb::is_container and #grb::is_object.
+ *
+ * Under the second class, the following type traits are defined by ALP:
+ * - #grb::is_associative, #grb::is_commutative, #grb::is_idempotent, and
+ * #grb::has_immutable_nonzeroes.
+ *
+ * Algebraic type traits are a central concept to ALP; depending on algebraic
+ * properties, ALP applies different optimisations. Properties such as
+ * associativity furthermore often define whether primitives may be
+ * automatically parallelised. Therefore, some primitives only allow algebraic
+ * structures with certain properties.
+ *
+ * Since algebraic type traits are evaluated at compile time, the composition
+ * of invalid structures (e.g., composing a monoid out of a non-associative
+ * binary operator), or the calling of a primitive using an incompatible
+ * algebraic structure, results in a compile-time error. Such errors are
+ * furthermore accompanied by clear messages and suggestions.
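+ *
+ * The traits may also be inspected directly; a minimal sketch:
+ * \code
+ * static_assert(
+ *     grb::is_associative< grb::operators::add< double > >::value,
+ *     "numerical addition should be associative"
+ * );
+ * \endcode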
+ *
+ * @}
+ *
+ * \defgroup backends Backends
+ * @{
+ *
+ * ALP code is compiled using a compiler wrapper, which optionally takes a
+ * backend parameter as an argument. The backend selection controls for which
+ * use case the code is compiled. Options that are always included are:
* -# #grb::reference, a single-process, auto-vectorising, sequential backend;
* -# #grb::reference_omp, a single-process, auto-parallelising, shared-memory
* parallel backend based on OpenMP and the aforementioned vectorising
* backend;
+ * -# #grb::hyperdags, a backend that captures the meta-data of computations
+ * while delegating the actual work to the #grb::reference backend. At
+ * program exit, the #grb::hyperdags backend dumps a HyperDAG of the
+ * computations performed.
+ *
+ * Additionally, the following backends may be enabled by providing their
+ * dependences before building ALP:
* -# #grb::BSP1D, an auto-parallelising, distributed-memory parallel
* backend based on the Lightweight Parallel Foundations (LPF). This is a
* multi-process backend and may rely on any single-process backend for
- * process-local computations. Its combination with the #grb::reference_omp
+ * process-local computations, which by default is #grb::reference.
+ * Distributed-memory auto-parallelisation is achieved using a row-wise
+ * one-dimensional block-cyclic distribution.
+ * Its combination with the #grb::reference_omp
* backend results in a fully hybrid shared- and distributed-memory
* GraphBLAS implementation.
- *
- * Backends that are currently under development:
+ * -# #grb::hybrid, essentially the same backend as #grb::BSP1D, but now
+ * composed with the #grb::reference_omp backend for process-local
+ * computations. This backend facilitates full hybrid shared- and
+ * distributed-memory parallelisation.
* -# #grb::banshee, a single-process, reference-based backend for the Banshee
* RISC-V hardware simulator making use of indirection stream semantic
- * registers (ISSR, in collaboration with Prof. Benini at ETHZ);
+ * registers (ISSR). Written by Dan Iorga in collaboration with ETHZ. This
+ * backend is outdated but, when last tested, remained functional.
+ *
+ * The #grb::Backend enum lists all backends known to ALP. Properties of a
+ * backend that may affect more advanced user code are collected in
+ * #grb::Properties.
*
- * @author A. N. Yzelman, Huawei Technologies France (2016-2020)
* @author A. N. Yzelman, Huawei Technologies Switzerland AG (2020-current)
+ * @}
+ *
+ * \defgroup perfSemantics Performance Semantics
+ * @{
+ *
+ * Each ALP primitive, every constructor, and every destructor come with
+ * performance semantics, in addition to functional semantics.
+ *
+ * Performance semantics may differ for different backends-- ALP stringently
+ * mandates that backends define them, thus imposing a significant degree of
+ * predictability on implementations of ALP, but does not significantly limit
+ * possible implementation choices.
+ *
+ * \warning Performance semantics should not be mistaken for performance
+ * \em guarantees. The vast majority of computing platforms exhibit
+ * performance variabilities that preclude defining stringent such
+ * guarantees.
+ *
+ * Performance semantics includes classical asymptotic work analysis in the
+ * style of Cormen et alii, as commonly taught as part of basic computer science
+ * courses. Aside from making the reasonable (although arguably too uncommon)
+ * demand that ALP libraries must clearly document the work complexity of the
+ * primitives it defines, ALP furthermore demands such analyses for the
+ * following quantities:
+ * - how many times operator(s) may be applied,
+ * - intra-process data movement from main memory to processing units,
+ * - new dynamic memory allocations and/or releases of previously allocated
+ * memory, and
+ * - whether system calls may occur during a call to the given primitive.
+ *
+ * \note Typically (but not always) the amount of work is proportional to the
+ * number of operator applications.
+ *
+ * \note Typically (but not necessarily always) if primitives are allowed to
+ * allocate or free dynamic memory, then they may also make system
+ * calls.
+ *
+ * For backends that allow for more than one user process, the following
+ * additional performance semantics must be defined:
+ * - inter-process data movement, and
+ * - how many synchronisation steps a primitive requires to complete.
+ *
+ * Defining such performance semantics is crucial to
+ * 1. allow algorithm designers to design the best possible algorithms even if
+ * the target platforms and target use cases vary,
+ * 2. allow users to determine scalability under increasing problem sizes, and
+ * 3. allow system architects to determine the qualitative effect of scaling up
+ * system resources in an a-priori fashion.
+ *
+ * These advantages furthermore do not require expensive experimentation on the
+ * part of algorithm designers, users, or system architects. However, it puts a
+ * significant demand on the implementers and maintainers of ALP.
+ *
+ * @see backends
+ *
+ * @author A. N. Yzelman, Huawei Technologies Switzerland AG (2020-current)
+ * @}
*/
#ifdef __DOXYGEN__
+
/**
- * Define this macro to disable libnuma use.
+ * Define this macro to disable the dependence on libnuma.
+ *
+ * \warning Defining this macro is discouraged and not tested thoroughly.
+ *
+ * \note The CMake bootstrap treats libnuma as a non-optional dependence.
*/
#define _GRB_NO_LIBNUMA
/**
+ * \internal
* Define this macro to disable thread pinning.
+ * \todo Make sure this macro is taken into account for backends that perform
+ * automatic pinning.
+ * \endinternal
*/
#define _GRB_NO_PINNING
/**
- * Defie this macro to compile with PlatformBSP support.
+ * Define this macro to turn off standard input/output support.
+ *
+ * \warning This macro has only been fully supported within the #grb::banshee
+ * backend, where neither standard iostream nor stdio.h were
+ * available. If support throughout the full ALP
+ * implementation would be useful, please raise an issue through
+ * GitHub or Gitee so that we may consider and plan for supporting
+ * this macro more fully.
*/
-#define _GRB_WITH_LPF
+#define _GRB_NO_STDIO
/**
- * Which GraphBLAS backend should be default.
+ * Define this macro to turn off reliance on standard C++ exceptions.
+ *
+ * \deprecated Support for this macro is being phased out.
+ *
+ * \note Its intended use is to support ALP/GraphBLAS deployments on platforms
+ * that do not support C++ exceptions, such as some older Android SDK
+ * applications.
*
- * Known single user-process options:
- * -# reference
- * -# reference_omp
+ * \warning The safe usage of ALP/GraphBLAS while exceptions are disabled
+ * relies, at present, on the inspection of internal states and the
+ * usage of internal functions. We have no standardised exception-free
+ * way of using ALP/GraphBLAS at present and have no plans to
+ * (continue and/or extend) support for it.
+ */
+#define _GRB_NO_EXCEPTIONS
+
+/**
+ * Define this macro to compile with LPF support.
+ *
+ * \note The CMake bootstrap automatically defines this flag when a valid LPF
+ * installation is found. This flag is also defined by the ALP/GraphBLAS
+ * compiler wrapper whenever an LPF-enabled backend is selected.
+ */
+#define _GRB_WITH_LPF
+
+/**
+ * \internal
+ * Which ALP/GraphBLAS backend should be the default.
*
- * Known multiple user-process options:
- * -# BSP1D
+ * This flag is overridden by the compiler wrapper, and it is set by the base
+ * config.hpp header.
+ * \endinternal
*/
#define _GRB_BACKEND reference
/**
- * Which GraphBLAS backend the BSP1D backend should use within a single user
- * process. For possible values, see the single user process options for
- * #_GRB_BACKEND.
+ * Which ALP/GraphBLAS backend the BSP1D backend should use for computations
+ * within a single user process. The ALP/GraphBLAS compiler wrapper sets this
+ * value automatically depending on the choice of backend-- compare, e.g., the
+ * #grb::BSP1D backend versus the #grb::hybrid backend.
*/
#define _GRB_BSP1D_BACKEND
-#endif
+
+/**
+ * The ALP/GraphBLAS namespace.
+ *
+ * All ALP/GraphBLAS primitives, container types, algebraic structures, and type
+ * traits are defined within.
+ */
+namespace grb {
+
+ /**
+ * The namespace for ALP/GraphBLAS algorithms.
+ */
+ namespace algorithms {
+
+ /**
+ * The namespace for ALP/Pregel algorithms.
+ */
+ namespace pregel {}
+
+ }
+
+ /**
+ * The namespace for programming APIs that automatically translate to
+ * ALP/GraphBLAS.
+ */
+ namespace interfaces {}
+
+}
+
+#endif // end ``#ifdef __DOXYGEN__''
#ifndef _H_GRAPHBLAS
#define _H_GRAPHBLAS
diff --git a/include/graphblas/algorithms/bicgstab.hpp b/include/graphblas/algorithms/bicgstab.hpp
index a4f338156..289a53554 100644
--- a/include/graphblas/algorithms/bicgstab.hpp
+++ b/include/graphblas/algorithms/bicgstab.hpp
@@ -15,11 +15,17 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Implements the BiCGstab algorithm.
+ *
* @author A. N. Yzelman
* @date 15th of February, 2022
*
- * Implementation time, to be taken with a pinch of salt:
+ * \par Implementation time
+ *
+ * To be taken with a pinch of salt, as it is highly subjective:
* - 50 minutes, excluding error handling, documentation, and testing.
* - 10 minutes to get it to compile, once the smoke test was generated.
* - 15 minutes to incorporate proper error handling plus printing of warnings
@@ -87,12 +93,12 @@ namespace grb {
*
* Additional outputs of this algorithm:
*
- * @param[out] iterations When #grb::SUCCESS is returned, the number of
- * iterations that were required to obtain an
- * acceptable approximate solution.
- * @param[out] residual When #grb::SUCCESS is returned, the square of the
- * 2-norm of the residual; i.e., \f$ (r,r) \f$,
- * where \f$ r = b - Ax \f$.
+ * @param[out] iterations When #grb::SUCCESS is returned, the number of
+ * iterations that were required to obtain an
+ * acceptable approximate solution.
+ * @param[out] residual When #grb::SUCCESS is returned, the square of the
+ * 2-norm of the residual; i.e., \f$ (r,r) \f$,
+ * where \f$ r = b - Ax \f$.
*
* To operate, this algorithm requires a workspace consisting of six vectors
* of length and capacity \f$ n \f$. If vectors with less capacity are passed
@@ -100,6 +106,18 @@ namespace grb {
*
* @param[in] r, rhat, p, v, s, t Workspace vectors required for BiCGstab.
*
+ * The BiCGstab algorithm operates over a field defined by the following
+ * algebraic structures:
+ *
+ * @param[in] semiring Defines the domains as well as the additive and the
+ * multicative monoid.
+ * @param[in] minus The inverse of the additive operator.
+ * @param[in] divide The inverse of the multiplicative operator.
+ *
+ * \note When compiling with the _DEBUG macro defined, the print-out
+ * statements require sqrt as an additional algebraic concept.
+ * This concept presently lives "outside" of ALP.
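+ *
+ * A sketch of the default structures over doubles, matching the template
+ * defaults of this algorithm:
+ * \code
+ * grb::Semiring<
+ *     grb::operators::add< double >, grb::operators::mul< double >,
+ *     grb::identities::zero, grb::identities::one
+ * > semiring;
+ * grb::operators::subtract< double > minus;
+ * grb::operators::divide< double > divide;
+ * \endcode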
+ *
* Valid descriptors to this algorithm are:
* -# descriptors::no_casting
* -# descriptors::transpose
@@ -120,6 +138,7 @@ namespace grb {
* output as well as the state of ALP/GraphBLAS is
* undefined.
*
+ * \parblock
* \par Performance semantics
*
* -# This function does not allocate nor free dynamic memory, nor shall it
@@ -130,8 +149,10 @@ namespace grb {
* the specification of the ALP primitives this function relies on. These
* performance semantics, with the exception of getters such as #grb::nnz, are
* specific to the backend selected during compilation.
+ * \endparblock
*/
- template< Descriptor descr = descriptors::no_operation,
+ template<
+ Descriptor descr = descriptors::no_operation,
typename IOType, typename NonzeroType, typename InputType,
typename ResidualType,
class Semiring = Semiring<
@@ -142,7 +163,8 @@ namespace grb {
class Minus = operators::subtract< ResidualType >,
class Divide = operators::divide< ResidualType >
>
- RC bicgstab( grb::Vector< IOType > &x,
+ RC bicgstab(
+ grb::Vector< IOType > &x,
const grb::Matrix< NonzeroType > &A,
const grb::Vector< InputType > &b,
const size_t max_iterations,
@@ -330,13 +352,13 @@ namespace grb {
// p = r + beta ( p - omega * v )
ret = ret ? ret : eWiseLambda(
- [&r,&beta,&p,&v,&omega,&semiring,&minus] (const size_t i) {
+ [&r,beta,&p,&v,omega,&semiring,&minus] (const size_t i) {
InputType tmp;
apply( tmp, omega, v[i], semiring.getMultiplicativeOperator() );
foldl( p[ i ], tmp, minus );
foldr( beta, p[ i ], semiring.getMultiplicativeOperator() );
foldr( r[ i ], p[ i ], semiring.getAdditiveOperator() );
- }, v, b
+ }, v, p, r
);
// v = Ap
@@ -371,9 +393,10 @@ namespace grb {
// check residual
residual = zero;
ret = ret ? ret : dot< dense_descr >( residual, s, s, semiring );
- assert( residual > zero ); // we just assert this one rather than checking for it
+ assert( residual > zero );
#ifdef _DEBUG
- std::cout << "\t\t running residual, pre-stabilisation: " << sqrt(residual) << "\n";
+ std::cout << "\t\t running residual, pre-stabilisation: " << sqrt(residual)
+ << "\n";
#endif
if( ret == SUCCESS && residual < tol ) {
// update result (x += alpha * p) and exit
@@ -400,7 +423,7 @@ namespace grb {
#ifdef _DEBUG
std::cout << "\t\t (t, t) = " << omega << "\n";
#endif
- assert( omega > zero ); // we just assert this one rather than checking for it
+ assert( omega > zero );
ret = ret ? ret : foldr( temp, omega, divide );
#ifdef _DEBUG
std::cout << "\t\t omega = " << omega << "\n";
@@ -421,9 +444,10 @@ namespace grb {
// check residual
residual = zero;
ret = ret ? ret : dot< dense_descr >( residual, r, r, semiring );
- assert( residual > zero ); // we just assert this one rather than checking for it
+ assert( residual > zero );
#ifdef _DEBUG
- std::cout << "\t\t running residual, post-stabilisation: " << sqrt(residual) << ". "
+ std::cout << "\t\t running residual, post-stabilisation: "
+ << sqrt(residual) << ". "
<< "Residual squared: " << residual << ".\n";
#endif
if( ret == SUCCESS ) {
diff --git a/include/graphblas/algorithms/conjugate_gradient.hpp b/include/graphblas/algorithms/conjugate_gradient.hpp
index 7ed2f3888..9a68f248e 100644
--- a/include/graphblas/algorithms/conjugate_gradient.hpp
+++ b/include/graphblas/algorithms/conjugate_gradient.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Implements the Conjugate Gradient (CG) algorithm.
+ *
* @author Aristeidis Mastoras
*/
@@ -85,7 +89,7 @@ namespace grb {
* Additional outputs (besides \a x):
*
* @param[out] iterations The number of iterations the algorithm has
- * performed.
+ * started.
* @param[out] residual The residual corresponding to output \a x.
*
* The CG algorithm requires three workspace buffers with capacity \f$ n \f$:
@@ -118,6 +122,17 @@ namespace grb {
* output as well as the state of ALP/GraphBLAS is
* undefined.
*
+ * On output, the contents of the workspace \a r, \a u, and \a temp are
+ * always undefined. For non-#grb::SUCCESS error codes, additional containers
+ * or states may be left undefined:
+ * -# when #grb::PANIC is returned, the entire program state, including the
+ * contents of all containers, become undefined;
+ * -# when #grb::ILLEGAL or #grb::MISMATCH are returned and \a iterations
+ * equals zero, then all outputs are left unmodified compared to their
+ * contents at function entry;
+ * -# when #grb::ILLEGAL or #grb::MISMATCH are returned and \a iterations is
+ * nonzero, then the contents of \a x are undefined.
+ *
* \par Performance semantics
*
* -# This function does not allocate nor free dynamic memory, nor shall it
@@ -234,6 +249,15 @@ namespace grb {
}
}
+ // set pure output fields to neutral defaults
+ iterations = 0;
+ residual = std::numeric_limits< double >::infinity();
+
+ // trivial shortcuts
+ if( max_iterations == 0 ) {
+ return FAILED;
+ }
+
// make x and b structurally dense (if not already) so that the remainder
// algorithm can safely use the dense descriptor for faster operations
{
@@ -283,7 +307,7 @@ namespace grb {
} else {
ret = ret ? ret : grb::dot< descr_dense >( sigma, r, r, ring );
}
-
+
assert( ret == SUCCESS );
// bnorm = b' * b;
@@ -306,6 +330,9 @@ namespace grb {
size_t iter = 0;
do {
+ assert( iter < max_iterations );
+ (void) ++iter;
+
// temp = 0
ret = ret ? ret : grb::set( temp, 0 );
assert( ret == SUCCESS );
@@ -363,7 +390,7 @@ namespace grb {
assert( ret == SUCCESS );
if( ret == SUCCESS ) {
- if( sqrt( residual ) < tol ) {
+ if( sqrt( residual ) < tol || iter >= max_iterations ) {
break;
}
}
@@ -383,17 +410,19 @@ namespace grb {
std::swap( u, temp );
sigma = beta;
+ } while( ret == SUCCESS );
- } while( iter++ < max_iterations && ret == SUCCESS );
-
- // output
+ // output that is independent of error code
iterations = iter;
- if( ret != SUCCESS ) {
- return FAILED;
- } else {
- return SUCCESS;
+ // return correct error code
+ if( ret == SUCCESS ) {
+ if( sqrt( residual ) >= tol ) {
+ // did not converge within iterations
+ return FAILED;
+ }
}
+ return ret;
}
} // namespace algorithms
diff --git a/include/graphblas/algorithms/cosine_similarity.hpp b/include/graphblas/algorithms/cosine_similarity.hpp
index a51f16bc6..6dff53af2 100644
--- a/include/graphblas/algorithms/cosine_similarity.hpp
+++ b/include/graphblas/algorithms/cosine_similarity.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Implements cosine similarity.
+ *
* @author: A. N. Yzelman.
* @date: 13th of December, 2017.
*/
@@ -77,14 +81,16 @@ namespace grb {
* The argument \a div is optional. It will map to grb::operators::divide by
* default.
*
- * @returns SUCCESS If the computation was successful.
- * @returns MISMATCH If the vector sizes do not match. The output
- * \a similarity is undefined.
- * @returns ILLEGAL In case \a x is all zero, and/or when \a y is all zero.
- * The output \a similarity is undefined.
- * @returns PANIC If an unrecoverable error has been encountered. The
- * output as well as the state of ALP/GraphBLAS is
- * undefined.
+ * @returns #grb::SUCCESS If the computation was successful.
+ * @returns #grb::MISMATCH If the vector sizes do not match. The output
+ * \a similarity is untouched -- the call to this
+ * algorithm will have no other effects than returning
+ * #grb::MISMATCH.
+ * @returns #grb::ILLEGAL In case \a x is all zero, and/or when \a y is all zero.
+ * The output \a similarity is undefined.
+ * @returns #grb::PANIC If an unrecoverable error has been encountered. The
+ * output as well as the state of ALP/GraphBLAS is
+ * undefined.
*
* \par Performance semantics
*
@@ -97,7 +103,8 @@ namespace grb {
* performance semantics, with the exception of getters such as #grb::nnz, are
* specific to the backend selected during compilation.
*/
- template< Descriptor descr = descriptors::no_operation,
+ template<
+ Descriptor descr = descriptors::no_operation,
typename OutputType,
typename InputType1,
typename InputType2,
@@ -161,14 +168,14 @@ namespace grb {
const auto &mul = ring.getMultiplicativeOperator();
const auto &add = ring.getAdditiveOperator();
OutputType temp;
- (void)grb::apply( temp, x[ i ], y[ i ], mul );
- (void)grb::foldl( nominator, temp, add );
- (void)grb::apply( temp, x[ i ], x[ i ], mul );
- (void)grb::foldl( norm1, temp, add );
- (void)grb::apply( temp, y[ i ], y[ i ], mul );
- (void)grb::foldl( norm2, temp, add );
- },
- x, y );
+ (void) grb::apply( temp, x[ i ], y[ i ], mul );
+ (void) grb::foldl( nominator, temp, add );
+ (void) grb::apply( temp, x[ i ], x[ i ], mul );
+ (void) grb::foldl( norm1, temp, add );
+ (void) grb::apply( temp, y[ i ], y[ i ], mul );
+ (void) grb::foldl( norm2, temp, add );
+ }, x, y
+ );
denominator = sqrt( norm1 ) * sqrt( norm2 );
} else {
// cannot stream each vector once, stream each one twice instead using
diff --git a/include/graphblas/algorithms/kcore_decomposition.hpp b/include/graphblas/algorithms/kcore_decomposition.hpp
new file mode 100644
index 000000000..e17fcc5f3
--- /dev/null
+++ b/include/graphblas/algorithms/kcore_decomposition.hpp
@@ -0,0 +1,296 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Implements the algebraic k-core decomposition algorithm by Li et al.
+ *
+ * @author Anders Hansson
+ * @date January, 2023
+ */
+
+
+#ifndef _H_GRB_KCORE_DECOMPOSITION
+#define _H_GRB_KCORE_DECOMPOSITION
+
+#include <graphblas.hpp>
+
+
+namespace grb {
+
+ namespace algorithms {
+
+ /**
+ * The \f$ k \f$-core decomposition algorithm.
+ *
+ * \note This algorithm is smoke-tested using a ground-truth output coreness
+ * vector corresponding to the EPA matrix. However, the ground truth
+ * was generated using an earlier version of this algorithm, run using
+ * an earlier version of ALP/GraphBLAS. This solution was manually
+ * verified against an external algorithm. A better testing methodology
+ * compares against a ground truth generated by such an external
+ * baseline-- see GitHub issue #160, to which contributions would be
+ * warmly received.
+ *
+ * Divides the input graph into subgraphs according to coreness level. The
+ * coreness level of a node is the largest \f$ k \f$ such that the node
+ * belongs to a subgraph in which every node has at least \f$ k \f$
+ * neighbours within that subgraph.
+ *
+ * @tparam IOType The value type of the \f$ k \f$-core vectors,
+ * usually an integer type.
+ * @tparam NZType The type of the nonzero elements in the matrix.
+ *
+ * @param[in] A Matrix representing a graph with nonzero value at
+ * \f$ (i, j) \f$ an edge between node \f$ i \f$ and
+ * \f$ j \f$.
+ * @param[out] core Empty vector of size and capacity \f$ n \f$. On
+ * output, if #grb::SUCCESS is returned, stores the
+ * coreness level for each node.
+ * @param[out] k The number of coreness levels that were found in the
+ * graph.
+ *
+ * To operate, this algorithm requires a workspace of four vectors. The size
+ * \em and capacities of these must equal \f$ n \f$. The contents on input are
+ * ignored, and the contents on output are undefined. The work space consists
+ * of the buffer vectors \a distances, \a temp, \a update, and \a status.
+ *
+ * @param[in,out] distances Distance buffer
+ * @param[in,out] temp First node update buffer
+ * @param[in,out] update Second node update buffer
+ * @param[in,out] status Finished/unfinished buffer
+ *
+ * @returns #grb::SUCCESS If the coreness for all nodes are found.
+ * @returns #grb::ILLEGAL If \a A is not square. All outputs are left
+ * untouched.
+ * @returns #grb::MISMATCH If the dimensions of \a core or any of the buffer
+ * vectors does not match \a A. All outputs are left
+ * untouched.
+ * @returns #grb::ILLEGAL If the capacity of one or more of \a core and the
+ * buffer vectors is less than \f$ n \f$.
+ * @returns #grb::PANIC If an unrecoverable error has been encountered. The
+ * output as well as the state of ALP/GraphBLAS is
+ * undefined.
+ *
+ * If any non #grb::SUCCESS error code is returned, then the contents of
+ * \a core are undefined, while \a k will be untouched by the algorithm.
+ *
+ * \note For undirected, unweighted graphs, use a pattern matrix for \a A;
+ * i.e., use \a NZType void.
+ *
+ * \note For unweighted graphs, \a IOType should be an unsigned integer
+ * type. The value of any \a IOType element will be no more than the
+ * maximum degree found in the graph \a A.
+ *
+ * @tparam criticalSection The original MR had an eWiseLambda-based
+ * implementation that contains a critical section.
+ * This may or may not be faster than a pure
+ * ALP/GraphBLAS implementation, depending also on
+ * which backend is selected. Setting this template
+ * argument to true selects the original
+ * eWiseLambda-based implementation, while otherwise
+ * a pure ALP/GraphBLAS implementation takes effect.
+ *
+ * \note In some non-exhaustive experiments, setting \a criticalSection to
+ * false leads to better performance on shared-memory parallel
+ * systems (using #grb::reference_omp).
+ *
+ * \warning Setting \a criticalSection to true is not supported for
+ * the distributed-memory backends #grb::BSP1D and #grb::hybrid; see
+ * the corresponding code comment in the below algorithm for details.
+ *
+ * Given the above considerations, the default for \a criticalSection is
+ * presently set to false.
+ *
+ * \parblock
+ * \par Performance semantics
+ *
+ * -# This function does not allocate nor free dynamic memory, nor shall it
+ * make any system calls.
+ *
+ * For additional performance semantics regarding work, inter-process data
+ * movement, intra-process data movement, synchronisations, and memory use,
+ * please see the specification of the ALP primitives this function relies on.
+ * These performance semantics, with the exception of getters such as
+ * #grb::nnz, are specific to the backend selected during compilation.
+ * \endparblock
+ *
+ * This algorithm is modelled after Li et al., "The K-Core Decomposition
+ * Algorithm Under the Framework of GraphBLAS", 2021 IEEE High Performance
+ * Extreme Computing Conference (HPEC), doi: 10.1109/HPEC49654.2021.9622845.
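+ *
+ * \par Usage sketch
+ *
+ * Assuming a pre-existing square pattern matrix \a A of size
+ * \f$ n \times n \f$, a minimal call sequence could read as follows (all
+ * names are illustrative):
+ * \code
+ * grb::Vector< unsigned int > core( n ), dists( n ), tmp( n ), upd( n );
+ * grb::Vector< bool > status( n );
+ * unsigned int k = 0;
+ * grb::RC rc = grb::algorithms::kcore_decomposition(
+ *     A, core, dists, tmp, upd, status, k
+ * );
+ * \endcode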
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ bool criticalSection = false,
+ typename IOType, typename NZType
+ >
+ RC kcore_decomposition(
+ const Matrix< NZType > &A,
+ Vector< IOType > &core,
+ Vector< IOType > &distances,
+ Vector< IOType > &temp,
+ Vector< IOType > &update,
+ Vector< bool > &status,
+ IOType &k
+ ) {
+ // Add constants/expressions
+ Semiring<
+ operators::add< IOType >, operators::mul< IOType >,
+ identities::zero, identities::one
+ > ring;
+ Monoid<
+ operators::logical_or< bool >,
+ identities::logical_false
+ > lorMonoid;
+
+ // Runtime sanity checks
+ const size_t n = nrows(A);
+ {
+ // Verify that A is square
+ if( n != ncols( A ) ) {
+ return ILLEGAL;
+ }
+ // Verify sizes of vectors
+ if( size( core ) != n ||
+ size( distances ) != n ||
+ size( temp ) != n ||
+ size( update ) != n ||
+ size( status ) != n
+ ) {
+ return MISMATCH;
+ }
+ // Verify capacity
+ if( capacity( core ) != n ||
+ capacity( distances ) != n ||
+ capacity( temp ) != n ||
+ capacity( update ) != n ||
+ capacity( status ) != n
+ ) {
+ return ILLEGAL;
+ }
+ }
+
+ // Initialise
+ IOType current_k = 0; // current coreness level
+
+ // Set initial values
+ RC ret = grb::SUCCESS;
+ ret = ret ? ret : set( temp, static_cast< IOType >( 1 ) );
+ ret = ret ? ret : set( distances, static_cast< IOType >( 0 ) );
+ ret = ret ? ret : set( core, static_cast< IOType >( 0 ) );
+ ret = ret ? ret : set( status, true );
+ ret = ret ? ret : clear( update );
+ assert( ret == SUCCESS );
+
+ ret = ret ? ret : grb::mxv< descr | descriptors::dense >(
+ distances, A, temp, ring );
+ assert( ret == SUCCESS );
+
+ if( SUCCESS != ret ) {
+ std::cerr << " Initialization of k-core decomposition failed with error "
+ << grb::toString( ret ) << "\n";
+ return ret;
+ }
+
+ size_t count = 0;
+ while( count < n && SUCCESS == ret ) {
+ bool flag = true;
+
+ // Update filter to exclude completed nodes
+ ret = ret ? ret : set( update, status, status );
+
+ while( flag ) {
+ flag = false;
+
+ // Update nodes in parallel
+ if( criticalSection ) {
+ ret = ret ? ret : clear( temp );
+ ret = ret ? ret : eWiseLambda( [ &, current_k ]( const size_t i ) {
+ if( status[ i ] && distances[ i ] <= current_k ) {
+ core[ i ] = current_k;
+ // Remove node from checking
+ status[ i ] = false;
+ // Set update
+ flag = true;
+ #pragma omp critical
+ {
+ // Add node index to update neighbours
+ setElement( temp, 1, i );
+ }
+ }
+ }, update,
+ status, distances, core, temp
+ );
+ // WARN: even with the below, this variant does not auto-parallelise in
+ // the distributed-memory sense. The reason is a performance
+ // contract violation by the above critical section -- setElement
+ // should be a collective call, but its use from within eWiseLambda
+ // does not ensure a collective call. The result is that PANIC will
+ // at some point be returned.
+ //ret = ret ? ret : collectives<>::allreduce( flag,
+ // lorMonoid.getOperator() );
+ } else {
+ ret = ret ? ret : eWiseApply( temp, status, distances, current_k,
+ operators::leq< IOType >() );
+ ret = ret ? ret : foldl( core, temp, current_k,
+ operators::right_assign< IOType >() );
+ ret = ret ? ret : foldl( status, temp, false,
+ operators::right_assign< bool >() );
+ ret = ret ? ret : foldl( flag, temp, lorMonoid );
+ ret = ret ? ret : set( update, temp, 1 );
+ if( ret == SUCCESS ) {
+ std::swap( update, temp );
+ }
+ }
+ assert( ret == SUCCESS );
+
+ if( ret == SUCCESS && flag ) {
+ ret = clear( update );
+ assert( ret == SUCCESS );
+
+ // Increase number of nodes completed
+ count += nnz( temp );
+
+ // Get the neighbours of the updated nodes
+ ret = ret ? ret : grb::mxv< descr >( update, A, temp, ring );
+ assert( ret == SUCCESS );
+
+ // Decrease distances of the neighbours
+ ret = ret ? ret : grb::eWiseApply( distances, distances, update,
+ operators::subtract< IOType >() );
+ assert( ret == SUCCESS );
+ }
+ }
+ (void) ++current_k;
+ }
+
+ if( SUCCESS != ret ) {
+ std::cerr << " Execution of k-core decomposition failed with error "
+ << grb::toString( ret ) << "\n";
+ } else {
+ k = current_k;
+ }
+
+ return ret;
+ }
+
+ } // namespace algorithms
+
+} // namespace grb
+
+#endif // end _H_GRB_KCORE_DECOMPOSITION
+
diff --git a/include/graphblas/algorithms/kmeans.hpp b/include/graphblas/algorithms/kmeans.hpp
index b94b09e90..061d1e0d0 100644
--- a/include/graphblas/algorithms/kmeans.hpp
+++ b/include/graphblas/algorithms/kmeans.hpp
@@ -15,7 +15,12 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Implements k-means. The state of the algorithms defined within is
+ * \em experimental.
+ *
* @author Verner Vlacic
*/
@@ -40,8 +45,8 @@ namespace grb {
* @param[in,out] K k by m matrix containing the current k means as row vectors
* @param[in] X m by n matrix containing the n points to be classified as
* column vectors
- * @param[in] op coordinatewise distance operator, squared difference by
- * default
+ * @param[in] dist_op Coordinatewise distance operator, squared difference by
+ * default
*
* \todo more efficient implementation using Walker's alias method
*
@@ -52,14 +57,20 @@ namespace grb {
typename IOType = double,
class Operator = operators::square_diff< IOType, IOType, IOType >
>
- RC kpp_initialisation( Matrix< IOType > &K, const Matrix< IOType > &X,
+ RC kpp_initialisation(
+ Matrix< IOType > &K,
+ const Matrix< IOType > &X,
const Operator &dist_op = Operator()
) {
// declare monoids and semirings
Monoid< grb::operators::add< IOType >, grb::identities::zero > add_monoid;
- Monoid< grb::operators::min< IOType >, grb::identities::infinity > min_monoid;
+ Monoid<
+ grb::operators::min< IOType >,
+ grb::identities::infinity
+ > min_monoid;
Semiring<
- grb::operators::add< IOType >, grb::operators::right_assign_if< bool, IOType, IOType >,
+ grb::operators::add< IOType >,
+ grb::operators::right_assign_if< bool, IOType, IOType >,
grb::identities::zero, grb::identities::logical_true
> pattern_sum;
@@ -117,23 +128,30 @@ namespace grb {
ret = ret ? ret : grb::setElement( col_select, true, i );
- ret = ret ? ret : grb::vxm< grb::descriptors::transpose_matrix >( selected, col_select, X, pattern_sum );
+ ret = ret ? ret : grb::vxm< grb::descriptors::transpose_matrix >(
+ selected, col_select, X, pattern_sum );
- ret = ret ? ret : grb::vxm( selected_distances, selected, X, add_monoid, dist_op );
+ ret = ret ? ret : grb::vxm( selected_distances, selected, X, add_monoid,
+ dist_op );
- ret = ret ? ret : grb::foldl( min_distances, selected_distances, min_monoid );
+ ret = ret ? ret : grb::foldl( min_distances, selected_distances,
+ min_monoid );
- // TODO the remaining part of the loop should be replaced with the alias algorithm
+ // TODO the remaining part of the loop should be replaced with the alias
+ // algorithm
IOType range = add_monoid.template getIdentity< IOType >();
ret = ret ? ret : grb::foldl( range, min_distances, add_monoid );
double sample = -1;
if( ret == SUCCESS ) {
- const size_t seed = std::chrono::system_clock::now().time_since_epoch().count();
- std::default_random_engine generator( seed );
- std::uniform_real_distribution< double > uniform( 0, 1 );
- sample = uniform( generator );
+ {
+ const size_t seed =
+ std::chrono::system_clock::now().time_since_epoch().count();
+ std::default_random_engine generator( seed );
+ std::uniform_real_distribution< double > uniform( 0, 1 );
+ sample = uniform( generator );
+ }
ret = grb::collectives<>::broadcast( sample, 0 );
}
assert( sample >= 0 );
@@ -152,7 +170,8 @@ namespace grb {
}
}
- // create the matrix K by selecting the columns of X indexed by selected_indices
+ // create the matrix K by selecting the columns of X indexed by
+ // selected_indices
// declare pattern matrix
Matrix< void > M( k, n );
@@ -164,7 +183,8 @@ namespace grb {
return std::make_pair( ind, val );
}
);
- ret = grb::buildMatrixUnique( M, converter.begin(), converter.end(), PARALLEL );
+ ret = grb::buildMatrixUnique( M, converter.begin(), converter.end(),
+ PARALLEL );
}
ret = ret ? ret : grb::mxm< descriptors::transpose_right >( K, M, X,
@@ -182,23 +202,27 @@ namespace grb {
/**
* The kmeans iteration given an initialisation
*
- * @param[in,out] K k by m matrix containing the current k means as row vectors
+ * @param[in,out] K k by m matrix containing the current k means as row
+ * vectors
* @param[in] clusters_and_distances Vector containing the class and distance
* to centroid for each point
* @param[in] X m by n matrix containing the n points to be classified as
* column vectors
* @param[in] max_iter Maximum number of iterations
- * @param[in] op Coordinatewise distance operator, squared difference by
- * default
+ * @param[in] dist_op Coordinatewise distance operator, squared difference by
+ * default
*
+ * \internal
* \todo expand documentation
+ * \endinternal
*/
template<
Descriptor descr = descriptors::no_operation,
typename IOType = double,
class Operator = operators::square_diff< IOType, IOType, IOType >
>
- RC kmeans_iteration( Matrix< IOType > &K,
+ RC kmeans_iteration(
+ Matrix< IOType > &K,
Vector< std::pair< size_t, IOType > > &clusters_and_distances,
const Matrix< IOType > &X,
const size_t max_iter = 1000,
@@ -221,16 +245,19 @@ namespace grb {
> comparison_monoid;
Semiring<
- grb::operators::add< IOType >, grb::operators::right_assign_if< bool, IOType, IOType >,
+ grb::operators::add< IOType >,
+ grb::operators::right_assign_if< bool, IOType, IOType >,
grb::identities::zero, grb::identities::logical_true
> pattern_sum;
Semiring<
- grb::operators::add< size_t >, grb::operators::right_assign_if< size_t, size_t, size_t >,
+ grb::operators::add< size_t >,
+ grb::operators::right_assign_if< size_t, size_t, size_t >,
grb::identities::zero, grb::identities::logical_true
> pattern_count;
- // runtime sanity checks: the row dimension of X should match the column dimension of K
+ // runtime sanity checks: the row dimension of X should match the column
+ // dimension of K
if( ncols( K ) != nrows( X ) ) {
return MISMATCH;
}
@@ -274,12 +301,12 @@ namespace grb {
bool converged;
do {
- ++iter;
+ (void) ++iter;
- ret = ret ? ret : grb::set( clusters_and_distances_prev, clusters_and_distances );
+ ret = ret ? ret : grb::set( clusters_and_distances_prev,
+ clusters_and_distances );
- ret = ret ? ret : mxm( Dist, K, X, add_monoid, dist_op,
- RESIZE );
+ ret = ret ? ret : mxm( Dist, K, X, add_monoid, dist_op, RESIZE );
ret = ret ? ret : mxm( Dist, K, X, add_monoid, dist_op );
ret = ret ? ret : vxm( clusters_and_distances, labels, Dist, argmin_monoid,
@@ -287,15 +314,15 @@ namespace grb {
auto converter = grb::utils::makeVectorToMatrixConverter<
void, indexIOType
- >(
+ > (
clusters_and_distances,
- []( const size_t & ind, const indexIOType & pair ) {
+ []( const size_t &ind, const indexIOType &pair ) {
return std::make_pair( pair.first, ind );
}
);
- ret = ret ? ret : grb::buildMatrixUnique( M,
- converter.begin(), converter.end(), PARALLEL );
+ ret = ret ? ret : grb::buildMatrixUnique( M, converter.begin(),
+ converter.end(), PARALLEL );
ret = ret ? ret : grb::mxm< descriptors::transpose_right >( K_aux, M, X,
pattern_sum, RESIZE );
diff --git a/include/graphblas/algorithms/knn.hpp b/include/graphblas/algorithms/knn.hpp
index fdffd5ba3..6df7d2fc7 100644
--- a/include/graphblas/algorithms/knn.hpp
+++ b/include/graphblas/algorithms/knn.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Implements the \f$ k \f$-hop nearest neighbours from a given source vertex.
+ *
* @author A. N. Yzelman
* @date: 27th of April, 2017
*/
@@ -27,6 +31,7 @@
#include
+
namespace grb {
namespace algorithms {
@@ -48,7 +53,6 @@ namespace grb {
* This algorithm requires the following workspace:
*
* @param[in,out] buf1 A buffer vector. Must match the size of \a A.
- * @param[in,out] buf2 A buffer vector. Must match the size of \a A.
*
* For \f$ n \times n \f$ matrices \a A, the capacity of \a u, \a buf1, and
* \a buf2 must equal \f$ n \f$.
diff --git a/include/graphblas/algorithms/label.hpp b/include/graphblas/algorithms/label.hpp
index cfebc824f..a58ebad9c 100644
--- a/include/graphblas/algorithms/label.hpp
+++ b/include/graphblas/algorithms/label.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Implements label propagation.
+ *
* @author J. M. Nash
* @date 21st of March, 2017
*/
@@ -116,10 +120,11 @@ namespace grb {
* accelerating the PageRank computation', ACM Press, 2003.
*/
template< typename IOType >
- RC label( Vector< IOType > &out,
+ RC label(
+ Vector< IOType > &out,
const Vector< IOType > &y, const Matrix< IOType > &W,
const size_t n, const size_t l,
- const size_t MaxIterations = 1000
+ const size_t maxIterations = 1000
) {
// label propagation vectors and matrices operate over the real domain
Semiring<
@@ -198,7 +203,7 @@ namespace grb {
// compute f as P*f
// main loop completes when function f is stable
size_t iter = 1;
- while( ret == SUCCESS && different && iter < MaxIterations ) {
+ while( ret == SUCCESS && different && iter < maxIterations ) {
#ifdef _DEBUG
if( n < MaxAnyPrinting ) {
@@ -230,7 +235,12 @@ namespace grb {
<< "nnz( mask ) = " << nnz( mask ) << "\n";
#endif
// clamps the first l labelled nodes
- ret = ret ? ret : set( fNext, mask, f );
+ ret = ret ? ret : foldl(
+ fNext, mask,
+ f,
+ grb::operators::right_assign< IOType >()
+ );
+ assert( ret == SUCCESS );
#ifdef _DEBUG
std::cerr << "\t post-set nnz( fNext ) = " << nnz( fNext ) << "\n";
printVector(
@@ -246,31 +256,36 @@ namespace grb {
#ifdef _DEBUG
std::cerr << "\t pre-set nnz(f) = " << nnz( f ) << "\n";
#endif
- ret = ret ? ret : set( f, fNext );
+ std::swap( f, fNext );
#ifdef _DEBUG
std::cerr << "\t post-set nnz(f) = " << nnz( f ) << "\n";
#endif
// go to next iteration
- (void)++iter;
+ (void) ++iter;
}
if( ret == SUCCESS ) {
if( different ) {
if( s == 0 ) {
- std::cout << "Warning: label propagation did not converge after "
+ std::cerr << "Info: label propagation did not converge after "
<< (iter-1) << " iterations\n";
}
return FAILED;
} else {
if( s == 0 ) {
- std::cout << "Info: label propagation converged in "
+ std::cerr << "Info: label propagation converged in "
<< (iter-1) << " iterations\n";
}
- return set( out, f );
+ std::swap( out, f );
+ return SUCCESS;
}
}
// done
+ if( s == 0 ) {
+ std::cerr << "Warning: label propagation exiting with " << toString(ret)
+ << "\n";
+ }
return ret;
}
diff --git a/include/graphblas/algorithms/mpv.hpp b/include/graphblas/algorithms/mpv.hpp
index 22bde7cf2..78ae3a1db 100644
--- a/include/graphblas/algorithms/mpv.hpp
+++ b/include/graphblas/algorithms/mpv.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Implements the matrix powers kernel \f$ y=A^kx \f$ over arbitrary semirings.
+ *
* @author A. N. Yzelman
* @date 30th of March 2017
*/
@@ -46,6 +50,7 @@ namespace grb {
* supplied vector must match the row dimension size of \a A.
* @param[in] A The square input matrix A. The supplied matrix must match
* the dimensions of \a u and \a v.
+ * @param[in] k How many matrix--vector multiplications are requested.
* @param[in] v The input vector v. The supplied vector must match the
* column dimension size of \a A. It may not be the same
* vector as \a u.
@@ -144,7 +149,7 @@ namespace grb {
ret = mxv< descr >( temp, A, u, ring );
// check if this was the final multiplication
assert( iterate <= k );
- if( iterate == k || ret != SUCCESS ) {
+ if( iterate + 1 == k || ret != SUCCESS ) {
break;
}
// multiply with output into u
diff --git a/include/graphblas/algorithms/norm.hpp b/include/graphblas/algorithms/norm.hpp
index 33f2df836..c74ff910a 100644
--- a/include/graphblas/algorithms/norm.hpp
+++ b/include/graphblas/algorithms/norm.hpp
@@ -15,14 +15,20 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Implements the 2-norm.
+ *
* @author A. N. Yzelman
* @date 17th of March 2022
*
+ * \internal
* Factored out of graphblas/blas1.hpp, promoted to a (simple) algorithm since
* semiring structures are insufficient to capture sqrt .
*
* \todo Provide implementations of other norms.
+ * \endinternal
*/
#ifndef _H_GRB_ALGORITHMS_NORM
diff --git a/include/graphblas/algorithms/pregel_connected_components.hpp b/include/graphblas/algorithms/pregel_connected_components.hpp
new file mode 100644
index 000000000..8d134bd89
--- /dev/null
+++ b/include/graphblas/algorithms/pregel_connected_components.hpp
@@ -0,0 +1,178 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Implements the (strongly) connected components algorithm over undirected
+ * graphs using the ALP/Pregel interface.
+ *
+ * @author: A. N. Yzelman.
+ */
+
+#ifndef _H_GRB_PREGEL_CONNECTEDCOMPONENTS
+#define _H_GRB_PREGEL_CONNECTEDCOMPONENTS
+
+#include
+
+
+namespace grb {
+
+ namespace algorithms {
+
+ namespace pregel {
+
+ /**
+ * A vertex-centric Connected Components algorithm.
+ *
+ * @tparam VertexIDType A type large enough to assign an ID to each vertex
+ * in the graph the algorithm is to run on.
+ *
+ * \ingroup Pregel
+ */
+ template< typename VertexIDType >
+ struct ConnectedComponents {
+
+ /**
+ * This vertex-centric Connected Components algorithm does not require any
+ * algorithm parameters.
+ */
+ struct Data {};
+
+ /**
+ * The vertex-centric program for computing connected components. On
+ * termination, the number of individual IDs in \a current_max_ID signifies
+ * the number of components, while the value at each entry signifies which
+ * component the vertex corresponds to.
+ *
+ * @param[in,out] current_max_ID On input: each entry is set to an unique
+ * ID, corresponding to a unique ID for each
+ * vertex. On output: the ID of the component
+ * the corresponding vertex belongs to.
+ * @param[in] incoming_message A buffer for incoming messages to a vertex
+ * program.
+ * @param[in] outgoing_message A buffer for outgoing messages to a vertex
+ * program.
+ * @param[in] parameters Global algorithm parameters, currently an
+ * instance of an empty struct (no
+ * parameters).
+ * @param[in,out] pregel The Pregel state the program may refer to.
+ *
+ * This program 1) broadcasts its current ID to its neighbours, 2) checks
+ * if any received IDs are larger than the current ID, then 3a) if not,
+ * votes to halt; 3b) if yes, replaces the current ID with the received
+ * maximum. It is meant to be executed using a max monoid as message
+ * aggregator.
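+ *
+ * For example, on a path graph over three vertices with initial IDs 1, 2,
+ * and 3, successive rounds propagate the maximum ID 3 through the path
+ * until every vertex holds it, after which all vertices vote to halt.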
+ */
+ static void program(
+ VertexIDType &current_max_ID,
+ const VertexIDType &incoming_message,
+ VertexIDType &outgoing_message,
+ const Data &parameters,
+ grb::interfaces::PregelState &pregel
+ ) {
+ (void) parameters;
+ if( pregel.round > 0 ) {
+ if( pregel.indegree == 0 ) {
+ pregel.voteToHalt = true;
+ } else if( current_max_ID < incoming_message ) {
+ current_max_ID = incoming_message;
+ } else {
+ pregel.voteToHalt = true;
+ }
+ }
+ if( pregel.outdegree > 0 ) {
+ outgoing_message = current_max_ID;
+ } else {
+ pregel.voteToHalt = true;
+ }
+ }
+
+ /**
+ * A convenience function that, given a Pregel instance, executes the
+ * #program.
+ *
+ * @param[in,out] pregel A Pregel instance over which to execute the
+ * program.
+ * @param[out] group_ids The ID of the component the corresponding vertex
+ * belongs to.
+ * @param[in] max_steps A maximum number of rounds the program is allowed
+ * to run. If \a 0, no maximum number of rounds will
+ * be in effect.
+ *
+ * On successful termination, the number of rounds is optionally written
+ * out:
+ *
+ * @param[out] steps_taken A pointer to where the number of rounds should
+ * be recorded. Will not be used if equal to
+ * nullptr .
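+ *
+ * \par Example
+ *
+ * A minimal usage sketch, assuming a pre-constructed Pregel instance
+ * \a pregel (its construction is not shown here):
+ *
+ * \code
+ * grb::Vector< size_t > group_ids( pregel.num_vertices() );
+ * size_t rounds;
+ * grb::RC rc = grb::algorithms::pregel::ConnectedComponents< size_t >::
+ * 	execute( pregel, group_ids, 0, &rounds );
+ * \endcode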
+ */
+ template< typename PregelType >
+ static grb::RC execute(
+ grb::interfaces::Pregel< PregelType > &pregel,
+ grb::Vector< VertexIDType > &group_ids,
+ const size_t max_steps = 0,
+ size_t * const steps_taken = nullptr
+ ) {
+ const size_t n = pregel.num_vertices();
+ if( grb::size( group_ids ) != n ) {
+ return MISMATCH;
+ }
+
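+ // the use_index descriptor assigns each vertex its own index, yielding
+ // a unique initial component ID per vertex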
+ grb::RC ret = grb::set< grb::descriptors::use_index >( group_ids, 1 );
+ if( ret != SUCCESS ) {
+ return ret;
+ }
+
+ grb::Vector< VertexIDType > in( n );
+ grb::Vector< VertexIDType > out( n );
+ grb::Vector< VertexIDType > out_buffer = interfaces::config::out_sparsify
+ ? grb::Vector< VertexIDType >( n )
+ : grb::Vector< VertexIDType >( 0 );
+
+ size_t steps;
+
+ ret = pregel.template execute<
+ grb::operators::max< VertexIDType >,
+ grb::identities::negative_infinity
+ > (
+ program,
+ group_ids,
+ Data(),
+ in, out,
+ steps,
+ out_buffer,
+ max_steps
+ );
+
+ if( ret == grb::SUCCESS && steps_taken != nullptr ) {
+ *steps_taken = steps;
+ }
+
+ return ret;
+ }
+
+ };
+
+ } // end namespace ``grb::algorithms::pregel''
+
+ } // end namespace ``grb::algorithms''
+
+} // end namespace ``grb''
+
+#endif
+
diff --git a/include/graphblas/algorithms/pregel_pagerank.hpp b/include/graphblas/algorithms/pregel_pagerank.hpp
new file mode 100644
index 000000000..5064f7f24
--- /dev/null
+++ b/include/graphblas/algorithms/pregel_pagerank.hpp
@@ -0,0 +1,224 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Implements a traditional vertex-centric page ranking algorithm using
+ * ALP/Pregel.
+ *
+ * @author A. N. Yzelman
+ */
+
+#ifndef _H_GRB_PREGEL_PAGERANK
+#define _H_GRB_PREGEL_PAGERANK
+
+#include
+
+
+namespace grb {
+
+ namespace algorithms {
+
+ namespace pregel {
+
+ /**
+ * A Pregel-style PageRank-like algorithm.
+ *
+ * This vertex-centric program does not correspond to the canonical PageRank
+ * algorithm by Brin and Page. In particular, it misses corrections for
+ * dangling nodes and does not perform convergence checks in any norm.
+ *
+ * @tparam IOType The type of the PageRank scores (e.g., double ).
+ * @tparam localConverge Whether vertices become inactive once their local
+ * scores have converged, or whether to terminate only
+ * when all vertices have converged.
+ *
+ * \ingroup Pregel
+ */
+ template< typename IOType, bool localConverge >
+ struct PageRank {
+
+ /**
+ * The algorithm parameters.
+ */
+ struct Data {
+
+ /**
+ * The probability of jumping to a random page instead of a linked page.
+ */
+ IOType alpha = 0.15;
+
+ /**
+ * The local convergence criterion.
+ */
+ IOType tolerance = 0.00001;
+
+ };
+
+ /**
+ * The vertex-centric PageRank-like program.
+ *
+ * @param[out] current_score The current rank corresponding to this
+ * vertex.
+ * @param[in] incoming_message Neighbour contributions to our score.
+ * @param[out] outgoing_message The score contribution to send to our
+ * neighbours.
+ * @param[in] parameters The algorithm parameters.
+ * @param[in,out] pregel The state of the Pregel interface.
+ *
+ * The Pregel program expects incoming messages to be aggregated using a
+ * plus monoid over elements of \a IOType.
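+ *
+ * For example, with the default \f$ \alpha = 0.15 \f$, a vertex that
+ * receives an aggregated incoming contribution of \f$ 0.5 \f$ computes
+ * the new score \f$ 0.15 + 0.85 \cdot 0.5 = 0.575 \f$.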
+ */
+ static void program(
+ IOType &current_score,
+ const IOType &incoming_message,
+ IOType &outgoing_message,
+ const Data &parameters,
+ grb::interfaces::PregelState &pregel
+ ) {
+ // initialise
+ if( pregel.round == 0 ) {
+ current_score = static_cast< IOType >( 1 );
+ }
+
+#ifdef _DEBUG
+ // in debug mode one typically does not wish to track the state of every
+ // vertex individually, hence we include a simple guard by default:
+ const bool dbg = pregel.vertexID == 0;
+ if( dbg ) {
+ std::cout << "ID: " << pregel.vertexID << "\n"
+ << "\t active: " << pregel.active << "\n"
+ << "\t round: " << pregel.round << "\n"
+ << "\t previous score: " << current_score << "\n"
+ << "\t incoming message: " << incoming_message << "\n";
+ }
+#endif
+
+ // compute
+ if( pregel.round > 0 ) {
+ const IOType old_score = current_score;
+ current_score = parameters.alpha +
+ (static_cast< IOType >(1) - parameters.alpha) * incoming_message;
+ if( fabs(current_score-old_score) < parameters.tolerance ) {
+#ifdef _DEBUG
+ std::cout << "\t\t vertex " << pregel.vertexID << " converged\n";
+#endif
+ if( localConverge ) {
+ pregel.active = false;
+ } else {
+ pregel.voteToHalt = true;
+ }
+ }
+ }
+
+ // broadcast
+ if( pregel.outdegree > 0 ) {
+ outgoing_message =
+ current_score /
+ static_cast< IOType >(pregel.outdegree);
+ }
+
+#ifdef _DEBUG
+ if( dbg ) {
+ std::cout << "\t current score: " << current_score << "\n"
+ << "\t voteToHalt: " << pregel.voteToHalt << "\n"
+ << "\t outgoing message: " << outgoing_message << "\n";
+ }
+#endif
+
+ }
+
+ /**
+ * A convenience function for launching a PageRank algorithm over a given
+ * Pregel instance.
+ *
+ * @tparam PregelType The nonzero type of an edge in the Pregel instance.
+ *
+ * This convenience function materialises the buffers expected to be passed
+ * into the Pregel instance, and selects the expected monoid for executing
+ * this program.
+ *
+ * \warning In performance-critical code, one may want to pre-allocate the
+ * buffers instead of having this convenience function allocate
+ * those. In such cases, please call the Pregel execute function
+ * manually, i.e., #grb::interfaces::Pregel< PregelType >::execute.
+ *
+ * The following arguments are mandatory:
+ *
+ * @param[in] pregel The Pregel instance that this program should
+ * execute on.
+ * @param[out] scores A vector holding one score per vertex. It
+ * must be of size
+ * equal to the number of vertices \f$ n \f$ in the
+ * \a pregel instance, and must have \f$ n \f$
+ * capacity \em and values. The initial contents are
+ * ignored by this algorithm.
+ * @param[out] steps_taken How many rounds the program took until
+ * termination.
+ *
+ * The following arguments are optional:
+ *
+ * @param[in] parameters The algorithm parameters. If not given, default
+ * values will be substituted.
+ * @param[in] max_steps The maximum number of rounds this program may take.
+ * If not given, the number of rounds will be
+ * unlimited.
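+ *
+ * \par Example
+ *
+ * A minimal usage sketch, assuming a pre-constructed Pregel instance
+ * \a pregel (its construction is not shown here):
+ *
+ * \code
+ * grb::Vector< double > scores( pregel.num_vertices() );
+ * grb::RC rc = grb::set( scores, 0.0 ); // ensure scores is dense
+ * size_t rounds;
+ * PageRank< double, true >::Data params; // alpha = 0.15, tolerance = 1e-5
+ * rc = rc ? rc : PageRank< double, true >::execute(
+ * 	pregel, scores, rounds, params );
+ * \endcode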
+ */
+ template< typename PregelType >
+ static grb::RC execute(
+ grb::interfaces::Pregel< PregelType > &pregel,
+ grb::Vector< IOType > &scores,
+ size_t &steps_taken,
+ const Data ¶meters = Data(),
+ const size_t max_steps = 0
+ ) {
+ const size_t n = pregel.num_vertices();
+ if( grb::size( scores ) != n ) {
+ return MISMATCH;
+ }
+
+ grb::Vector< IOType > in( n );
+ grb::Vector< IOType > out( n );
+ grb::Vector< IOType > out_buffer = interfaces::config::out_sparsify
+ ? grb::Vector< IOType >( n )
+ : grb::Vector< IOType >( 0 );
+
+ return pregel.template execute<
+ grb::operators::add< IOType >,
+ grb::identities::zero
+ > (
+ program,
+ scores,
+ parameters,
+ in, out,
+ steps_taken,
+ out_buffer,
+ max_steps
+ );
+ }
+
+ };
+
+ } // end namespace ``grb::algorithms::pregel''
+
+ } // end namespace ``grb::algorithms''
+
+} // end namespace ``grb''
+
+#endif
+
diff --git a/include/graphblas/algorithms/simple_pagerank.hpp b/include/graphblas/algorithms/simple_pagerank.hpp
index c1b9243e8..268e088f3 100644
--- a/include/graphblas/algorithms/simple_pagerank.hpp
+++ b/include/graphblas/algorithms/simple_pagerank.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Implements the canonical PageRank algorithm by Brin and Page.
+ *
* @author A. N. Yzelman
* @date: 21st of March, 2017
*/
@@ -29,6 +33,7 @@
#include
#endif
+
namespace grb {
namespace algorithms {
@@ -86,7 +91,7 @@ namespace grb {
* @param[out] iterations If not nullptr , the number of iterations
* the call to this algorithm took will be written to
* the location pointed to.
- * @param[out] quality If not nullptr,/tt>, the last computed residual
+ * @param[out] quality If not nullptr , the last computed residual
* will be written to the location pointed to.
*
* @returns #grb::SUCCESS If the computation converged within \a max
diff --git a/include/graphblas/algorithms/sparse_nn_single_inference.hpp b/include/graphblas/algorithms/sparse_nn_single_inference.hpp
index 64d132982..ff9b11f31 100644
--- a/include/graphblas/algorithms/sparse_nn_single_inference.hpp
+++ b/include/graphblas/algorithms/sparse_nn_single_inference.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Implements (non-batched) sparse neural network inference.
+ *
* @author Aristeidis Mastoras
*/
@@ -25,6 +29,7 @@
#include
#include
+
namespace grb {
namespace algorithms {
@@ -216,8 +221,6 @@ namespace grb {
* inference proceeds:
*
* @param[in] relu The non-linear ReLU function to apply element-wise.
- * @param[in] min Operator used for thresholding. Maximum feature value
- * is hard-coded to 32, as per the GraphChallenge.
* @param[in] ring The semiring under which to perform the inference.
*
* The default algebraic structures are standard \a relu (i.e., max), \a min
diff --git a/include/graphblas/algorithms/spy.hpp b/include/graphblas/algorithms/spy.hpp
index 6bc6d5939..413622780 100644
--- a/include/graphblas/algorithms/spy.hpp
+++ b/include/graphblas/algorithms/spy.hpp
@@ -15,11 +15,19 @@
* limitations under the License.
*/
+/**
+ * @file
+ *
+ * Implements a simple matrix spy algorithm.
+ *
+ * @author A. N. Yzelman
+ */
#ifndef _H_GRB_ALGORITHMS_SPY
#define _H_GRB_ALGORITHMS_SPY
#include
+#include
#include
diff --git a/include/graphblas/backends.hpp b/include/graphblas/backends.hpp
index 72f1bcec1..3fd2f0ec1 100644
--- a/include/graphblas/backends.hpp
+++ b/include/graphblas/backends.hpp
@@ -16,140 +16,203 @@
*/
/**
+ * @file
+ *
+ * This file contains a register of all backends that are either implemented,
+ * under implementation, or conceived and recorded for future consideration to
+ * implement.
+ *
* @author: A. N. Yzelman
* @date 21st of December, 2016
- *
- * @file This file contains a register of all backends that are either
- * implemented, under implementation, or were at any point in time
- * conceived and noteworthy enough to be recorded for future
- * consideration to implement. It does so via the grb::Backend
- * enum.
*/
#ifndef _H_GRB_BACKENDS
#define _H_GRB_BACKENDS
+
namespace grb {
/**
- * This enum collects all implemented backends. Depending on compile flags,
- * some of these options may be disabled.
+ * A collection of all backends. Depending on which dependencies were
+ * configured during the bootstrapping of this ALP installation, some of these
+ * backends may be disabled.
+ *
+ * \internal
+ * The collection includes backend identifiers that are for internal use only.
+ * \endinternal
+ *
+ * \ingroup backends
*/
enum Backend {
/**
* The sequential reference implementation. Supports fast operations with
- * both sparse and dense vectors.
+ * both sparse and dense vectors, and employs auto-vectorisation.
*/
reference,
/**
* The threaded reference implementation. Supports fast operations with both
- * sparse and dense vectors.
+ * sparse and dense vectors. Employs OpenMP used with a mixture of fork/join
+ * and SPMD programming styles.
*/
reference_omp,
/**
- * A shared-memory parallel distribution based on a row-wise 1D data
- * distribution using shared vector data.
+ * A backend that automatically extracts hyperDAGs from user computations. It
+ * only captures metadata for recording the hyperDAG, and relies on another
+ * backend to actually execute the requested computations-- by default, this
+ * is the #reference backend.
+ */
+ hyperdags,
+
+ /**
+ * The threaded nonblocking implementation. Supports fast operations with both
+ * sparse and dense vectors. This backend is currently under development.
+ */
+ nonblocking,
+
+ /**
+ * \internal
+ * A shared-memory parallel distribution based on a row-wise 1D block-cyclic
+ * data distribution using shared vector data.
+ * \endinternal
*/
shmem1D,
/**
+ * \internal
* Like shmem1D, but using interleaved vector allocation. Useful for multi-
* socket single-node targets. From experience, this is a good choice for up
* to four sockets-- after which BSP2D becomes preferred.
+ * \endinternal
*/
NUMA1D,
/**
- * A superclass of all BSP-based implementations.
+ * \internal
+ * A superclass of all LPF-based implementations. Not a "real" (selectable)
+ * backend.
+ * \endinternal
*/
GENERIC_BSP,
/**
* A parallel implementation based on a row-wise 1D data distribution,
- * implemented using PlatformBSP.
+ * implemented using LPF.
+ *
+ * This backend manages multiple user processes, manages data distributions
+ * of containers between those user processes, and decomposes primitives into
+ * local compute phases with intermittent communications. For local compute
+ * phases it composes with a single user process backend, #reference by
+ * default.
*/
BSP1D,
/**
+ * \internal
* Like BSP1D, but stores each matrix twice. Combined with the normal
* reference implementation, this actually stores all matrices four times
* This implementation is useful for maximum performance, at the cost of
* the additional memory usage.
+ * \endinternal
*/
doublyBSP1D,
/**
+ * \internal
* A parallel implementation based on a block-cyclic 2D data distribution,
* implemented using PlatformBSP. This implementation will likely outperform
* BSP1D and doublyBSP1D as the number of nodes involved in the computation
* increases with the problem sizes.
+ * \endinternal
*/
BSP2D,
/**
+ * \internal
* Like BSP2D, but automatically improves the distribution while executing
* user code-- while initial computations are slowed down, the user
* application will speed up as this GraphBLAS implementation infers more
* information about the best data distribution.
* When enough statistics are gathered, data is redistributed and all future
* operations execute much faster than with BSP2D alone.
+ * \endinternal
*/
autoBSP,
/**
+ * \internal
* Like autoBSP, except that the best distribution is precomputed whenever a
* matrix is read in. This pre-processing step is very expensive. Use autoBSP
* when unsure if the costs of a full preprocessing stage is worth it.
+ * \endinternal
*/
optBSP,
/**
- * A hybrid that uses shmem1D within each socket and BSP1D between sockets.
+ * A composed backend that uses #reference_omp within each user process and
+ * #BSP1D between sockets.
+ *
+ * This backend is implemented using the #BSP1D code, with the process-local
+ * backend overridden from #reference to #reference_omp.
+ */
+ hybrid,
+
+ /**
+ * \internal
+ * A hybrid that uses #shmem1D within each socket and #BSP1D between sockets.
* Recommended for a limited number of sockets and a limited amount of nodes,
* i.e., for a small cluster.
+ * \endinternal
*/
hybridSmall,
/**
- * A hybrid that uses numa1D within each socket and BSP1D between sockets.
+ * \internal
+ * A hybrid that uses #numa1D within each socket and #BSP1D between sockets.
* Recommended for a limited number of nodes with up to two sockets each.
*
- * This variant is expected to perform better than hybrid1D for middle-sized
- * clusters.
+ * This variant is expected to perform better than #hybridSmall for
+ * middle-sized clusters.
+ * \endinternal
*/
hybridMid,
/**
- * A hybrid that uses numa1D within each socket and autoBSP between sockets.
+ * \internal
+ * A hybrid that uses #numa1D within each socket and #autoBSP between sockets.
* Recommended for a large number of nodes with up to two sockets each.
*
- * This variant is expected to perform better than hybridSmall and hybridMid
+ * This variant is expected to perform better than #hybridSmall and #hybridMid
* for larger clusters.
*
* If there are many nodes each with many sockets (four or more) each, then
* the use of flat (non-hybrid) #BSP2D or #autoBSP is recommended instead.
+ * \endinternal
*/
hybridLarge,
/**
+ * \internal
* A hybrid variant that is optimised for a minimal memory footprint.
+ * \endinternal
*/
minFootprint,
/**
- * A variant for RISC-V processors.
- *
- * Collaboration with ETH Zurich (ongoing).
+ * A variant for Snitch RISC-V cores. It is based on an older #reference
+ * backend.
*/
banshee,
/**
- * A variant for RISC-V processors with (I)SSR extensions
+ * \internal
+ * A variant for RISC-V processors with (I)SSR extensions.
*
- * Collaboration with ETH Zurich (ongoing).
+ * \note This backend is used internally by the #banshee backend; it is not
+ * selectable.
+ * \endinternal
*/
banshee_ssr
@@ -158,3 +221,4 @@ namespace grb {
} // namespace grb
#endif
+
diff --git a/include/graphblas/banshee/config.hpp b/include/graphblas/banshee/config.hpp
index b70b0b9d2..c4a9a8baf 100644
--- a/include/graphblas/banshee/config.hpp
+++ b/include/graphblas/banshee/config.hpp
@@ -29,6 +29,7 @@
#include
+
namespace grb {
/**
@@ -46,3 +47,4 @@ namespace grb {
} // namespace grb
#endif // end ``_H_GRB_BANSHEE_CONFIG''
+
diff --git a/include/graphblas/base/benchmark.hpp b/include/graphblas/base/benchmark.hpp
index 74c666ec7..56a2fade6 100644
--- a/include/graphblas/base/benchmark.hpp
+++ b/include/graphblas/base/benchmark.hpp
@@ -15,7 +15,12 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * This file contains a variant on the #grb::Launcher specialised for
+ * benchmarks.
+ *
* @author J. W. Nash & A. N. Yzelman
* @date 17th of April, 2017
*/
@@ -39,298 +44,557 @@
#include "exec.hpp"
#ifndef _GRB_NO_STDIO
-#include
+ #include
#endif
#ifndef _GRB_NO_EXCEPTIONS
-#include
+ #include
#endif
#include
+
+/**
+ * \defgroup benchmarking Benchmarking
+ *
+ * ALP has a specialised class for benchmarking ALP programs, grb::Benchmarker,
+ * which is a variant on the #grb::Launcher. It encodes a particular
+ * benchmarking strategy for any given ALP program, as described below.
+ *
+ * The program is called \a inner times, and this batch of calls is in turn
+ * repeated \a outer times. Between any two batches of \a inner repetitions
+ * there is a one-second sleep, so that machine variability is taken into
+ * account. Several statistics are measured across the \a outer repetitions:
+ * the minimum, maximum, average, and the (unbiased) sample standard
+ * deviation. By contrast, for the \a inner repetitions only an average is
+ * computed -- the sole function of the \a inner repetitions is to avoid
+ * timing programs whose execution time is of the same order as the time it
+ * takes to query the system timer.
+ *
+ * \note As a result, \a inner should always equal \em one when benchmarking
+ * any non-trivial ALP program, while for benchmarking ALP kernels on
+ * small data \a inner may be taken (much) larger.
+ *
+ * \note In published experiments, \a inner is chosen such that a single
+ * outer repetition takes 10 to 100 milliseconds.
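+ *
+ * An illustrative usage sketch follows. It assumes an ALP program
+ * my_program with a matching input/output type pair (all names here
+ * are hypothetical), and relies on the backend template argument of the
+ * benchmarker taking its configured default:
+ *
+ * \code
+ * grb::Benchmarker< AUTOMATIC > bench;
+ * MyInput in; MyOutput out;
+ * grb::RC rc = bench.exec( &my_program, in, out, 1, 10, true );
+ * \endcode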
+ */
+
namespace grb {
namespace internal {
+ /**
+ * The common functionalities used by all #grb::Benchmarker classes.
+ *
+ * \ingroup benchmarking
+ */
class BenchmarkerBase {
- protected:
+ protected:
+
#ifndef _GRB_NO_STDIO
- /** \todo TODO add documentation. */
- static void printTimeSinceEpoch( const bool printHeader = true ) {
- const auto now = std::chrono::system_clock::now();
- const auto since = now.time_since_epoch();
- if( printHeader ) {
- std::cout << "Time since epoch (in ms.): ";
+ /**
+ * A helper function that prints the time elapsed since epoch.
+ *
+ * @param[in] printHeader An optional Boolean parameter with default value
+ * true . If set, this function prints
+ * a human-readable header before outputting the
+ * time-since-epoch.
+ */
+ static void printTimeSinceEpoch( const bool printHeader = true ) {
+ const auto now = std::chrono::system_clock::now();
+ const auto since = now.time_since_epoch();
+ if( printHeader ) {
+ std::cout << "Time since epoch (in ms.): ";
+ }
+ std::cout << std::chrono::duration_cast<
+ std::chrono::milliseconds
+ >( since ).count() << "\n";
}
- std::cout << std::chrono::duration_cast< std::chrono::milliseconds >( since ).count() << "\n";
- }
#endif
- // calculate inner loop performance stats
- static void benchmark_calc_inner( const size_t loop,
- const size_t total,
- grb::utils::TimerResults & inner_times,
- grb::utils::TimerResults & total_times,
- grb::utils::TimerResults & min_times,
- grb::utils::TimerResults & max_times,
- grb::utils::TimerResults * sdev_times ) {
- inner_times.normalize( total );
- total_times.accum( inner_times );
- min_times.min( inner_times );
- max_times.max( inner_times );
- sdev_times[ loop ] = inner_times;
- }
-
- // calculate outer loop performance stats
- static void benchmark_calc_outer( const size_t total,
- grb::utils::TimerResults & total_times,
- grb::utils::TimerResults & min_times,
- grb::utils::TimerResults & max_times,
- grb::utils::TimerResults * sdev_times,
- const size_t pid ) {
- total_times.normalize( total );
- grb::utils::TimerResults sdev;
- // compute standard dev of average times, leaving sqrt calculation until the output of the values
- sdev.set( 0 );
- for( size_t i = 0; i < total; i++ ) {
- double diff = sdev_times[ i ].io - total_times.io;
- sdev.io += diff * diff;
- diff = sdev_times[ i ].preamble - total_times.preamble;
- sdev.preamble += diff * diff;
- diff = sdev_times[ i ].useful - total_times.useful;
- sdev.useful += diff * diff;
- diff = sdev_times[ i ].postamble - total_times.postamble;
- sdev.postamble += diff * diff;
+ /**
+ * Calculate inner loop performance stats
+ */
+ static void benchmark_calc_inner(
+ const size_t loop,
+ const size_t total,
+ grb::utils::TimerResults &inner_times,
+ grb::utils::TimerResults &total_times,
+ grb::utils::TimerResults &min_times,
+ grb::utils::TimerResults &max_times,
+ grb::utils::TimerResults * sdev_times
+ ) {
+ inner_times.normalize( total );
+ total_times.accum( inner_times );
+ min_times.min( inner_times );
+ max_times.max( inner_times );
+ sdev_times[ loop ] = inner_times;
}
- // unbiased normalisation of the standard deviation
- sdev.normalize( total - 1 );
+
+ /**
+ * Calculate outer loop performance stats
+ */
+ static void benchmark_calc_outer(
+ const size_t total,
+ grb::utils::TimerResults &total_times,
+ grb::utils::TimerResults &min_times,
+ grb::utils::TimerResults &max_times,
+ grb::utils::TimerResults * sdev_times,
+ const size_t pid
+ ) {
+ total_times.normalize( total );
+ grb::utils::TimerResults sdev;
+ // compute standard dev of average times, leaving sqrt calculation until
+ // the output of the values
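+ // i.e., per timing category: sdev = sum_i ( t_i - avg )^2 / ( total - 1 ),
+ // with the square root taken only when printing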
+ sdev.set( 0 );
+ for( size_t i = 0; i < total; i++ ) {
+ double diff = sdev_times[ i ].io - total_times.io;
+ sdev.io += diff * diff;
+ diff = sdev_times[ i ].preamble - total_times.preamble;
+ sdev.preamble += diff * diff;
+ diff = sdev_times[ i ].useful - total_times.useful;
+ sdev.useful += diff * diff;
+ diff = sdev_times[ i ].postamble - total_times.postamble;
+ sdev.postamble += diff * diff;
+ }
+ // unbiased normalisation of the standard deviation
+ sdev.normalize( total - 1 );
#ifndef _GRB_NO_STDIO
- // output results
- if( pid == 0 ) {
- std::cout << "Overall timings (io, preamble, useful, "
- "postamble):\n"
- << std::scientific;
- std::cout << "Avg: " << total_times.io << ", " << total_times.preamble << ", " << total_times.useful << ", " << total_times.postamble << "\n";
- std::cout << "Min: " << min_times.io << ", " << min_times.preamble << ", " << min_times.useful << ", " << min_times.postamble << "\n";
- std::cout << "Max: " << max_times.io << ", " << max_times.preamble << ", " << max_times.useful << ", " << max_times.postamble << "\n";
- std::cout << "Std: " << sqrt( sdev.io ) << ", " << sqrt( sdev.preamble ) << ", " << sqrt( sdev.useful ) << ", " << sqrt( sdev.postamble ) << "\n";
-#if __GNUC__ > 4
- std::cout << std::defaultfloat;
-#endif
- printTimeSinceEpoch();
- }
+ // output results
+ if( pid == 0 ) {
+ std::cout << "Overall timings (io, preamble, useful, postamble):\n"
+ << std::scientific;
+ std::cout << "Avg: " << total_times.io << ", " << total_times.preamble
+ << ", " << total_times.useful << ", " << total_times.postamble << "\n";
+ std::cout << "Min: " << min_times.io << ", " << min_times.preamble << ", "
+ << min_times.useful << ", " << min_times.postamble << "\n";
+ std::cout << "Max: " << max_times.io << ", " << max_times.preamble << ", "
+ << max_times.useful << ", " << max_times.postamble << "\n";
+ std::cout << "Std: " << sqrt( sdev.io ) << ", " << sqrt( sdev.preamble )
+ << ", " << sqrt( sdev.useful ) << ", " << sqrt( sdev.postamble ) << "\n";
+ #if __GNUC__ > 4
+ std::cout << std::defaultfloat;
+ #endif
+ printTimeSinceEpoch();
+ }
#else
- // write to file(?)
- (void)min_times;
- (void)max_times;
- (void)pid;
+ // we ran the benchmark, but may not have a way to output the results in
+ // this case; this code path is currently only reached by the #grb::banshee
+ // backend, which provides other timing mechanisms.
+ (void) min_times;
+ (void) max_times;
+ (void) pid;
#endif
- }
-
- template< typename U, enum Backend implementation = config::default_backend >
- static RC benchmark( void ( *grb_program )( const void *,
- const size_t,
- U & ), // user GraphBLAS program
- const void * data_in,
- const size_t in_size,
- U & data_out, // input & output data
- const size_t inner,
- const size_t outer,
- const size_t pid ) {
- const double inf = std::numeric_limits< double >::infinity();
- grb::utils::TimerResults total_times, min_times, max_times;
- grb::utils::TimerResults * sdev_times = new grb::utils::TimerResults[ outer ];
- total_times.set( 0 );
- min_times.set( inf );
- max_times.set( 0 );
-
- // outer loop
- for( size_t out = 0; out < outer; out++ ) {
- grb::utils::TimerResults inner_times;
- inner_times.set( 0 );
-
- // inner loop
- for( size_t in = 0; in < inner; in++ ) {
- data_out.times.set( 0 );
- ( *grb_program )( data_in, in_size, data_out );
- grb::collectives< implementation >::reduce( data_out.times.io, 0, grb::operators::max< double >() );
- grb::collectives< implementation >::reduce( data_out.times.preamble, 0, grb::operators::max< double >() );
- grb::collectives< implementation >::reduce( data_out.times.useful, 0, grb::operators::max< double >() );
- grb::collectives< implementation >::reduce( data_out.times.postamble, 0, grb::operators::max< double >() );
- inner_times.accum( data_out.times );
- }
+ }
- // calculate performance stats
- benchmark_calc_inner( out, inner, inner_times, total_times, min_times, max_times, sdev_times );
+ /**
+ * Benchmarks a given ALP program.
+ *
+ * This variant applies to input data as a byte blob and output data as a
+ * user-defined POD struct.
+ *
+ * @tparam U Output type of the given user program.
+ * @tparam backend Which backend the program is using.
+ *
+ * @param[in] alp_program The user program to be benchmarked
+ * @param[in] data_in Input data as a raw data blob
+ * @param[in] in_size The size, in bytes, of the input data
+ * @param[out] data_out Output data
+ * @param[in] inner The number of inner repetitions of the benchmark
+ * @param[in] outer The number of outer repetitions of the benchmark
+ * @param[in] pid Unique ID of the calling user process
+ *
+ * @see benchmarking
+ *
+ * @ingroup benchmarking
+ */
+ template<
+ typename U,
+ enum Backend implementation = config::default_backend
+ >
+ static RC benchmark(
+ void ( *alp_program )( const void *, const size_t, U & ),
+ const void * data_in,
+ const size_t in_size,
+ U &data_out,
+ const size_t inner,
+ const size_t outer,
+ const size_t pid
+ ) {
+ const double inf = std::numeric_limits< double >::infinity();
+ grb::utils::TimerResults total_times, min_times, max_times;
+ grb::utils::TimerResults * sdev_times =
+ new grb::utils::TimerResults[ outer ];
+ total_times.set( 0 );
+ min_times.set( inf );
+ max_times.set( 0 );
+
+ // outer loop
+ for( size_t out = 0; out < outer; ++out ) {
+ grb::utils::TimerResults inner_times;
+ inner_times.set( 0 );
+
+ // inner loop
+ for( size_t in = 0; in < inner; in++ ) {
+ data_out.times.set( 0 );
+ ( *alp_program )( data_in, in_size, data_out );
+ grb::collectives< implementation >::reduce(
+ data_out.times.io, 0, grb::operators::max< double >() );
+ grb::collectives< implementation >::reduce(
+ data_out.times.preamble, 0, grb::operators::max< double >() );
+ grb::collectives< implementation >::reduce(
+ data_out.times.useful, 0, grb::operators::max< double >() );
+ grb::collectives< implementation >::reduce(
+ data_out.times.postamble, 0, grb::operators::max< double >() );
+ inner_times.accum( data_out.times );
+ }
+
+ // calculate performance stats
+ benchmark_calc_inner( out, inner, inner_times, total_times, min_times,
+ max_times, sdev_times );
#ifndef _GRB_NO_STDIO
- // give experiment output line
- if( pid == 0 ) {
- std::cout << "Outer iteration #" << out
- << " timings (io, preamble, useful, "
- "postamble, time since epoch): ";
- std::cout << inner_times.io << ", " << inner_times.preamble << ", " << inner_times.useful << ", " << inner_times.postamble << ", ";
- printTimeSinceEpoch( false );
- }
+ // give experiment output line
+ if( pid == 0 ) {
+ std::cout << "Outer iteration #" << out << " timings (io, preamble, "
+ << "useful, postamble, time since epoch): ";
+ std::cout << inner_times.io << ", " << inner_times.preamble << ", "
+ << inner_times.useful << ", " << inner_times.postamble << ", ";
+ printTimeSinceEpoch( false );
+ }
#endif
- // pause for next outer loop
- if( sleep( 1 ) != 0 ) {
+ // pause for next outer loop
+ if( sleep( 1 ) != 0 ) {
#ifndef _GRB_NO_STDIO
- std::cerr << "Sleep interrupted, assume benchmark is "
- "unreliable and exiting.\n";
+ std::cerr << "Sleep interrupted, assume benchmark is unreliable; "
+ << "exiting.\n";
#endif
- abort();
+ abort();
+ }
}
- }
- // calculate performance stats
- benchmark_calc_outer( outer, total_times, min_times, max_times, sdev_times, pid );
- delete[] sdev_times;
+ // calculate performance stats
+ benchmark_calc_outer( outer, total_times, min_times, max_times, sdev_times,
+ pid );
+ delete [] sdev_times;
- return SUCCESS;
- }
+ return SUCCESS;
+ }
- template< typename T, typename U, enum Backend implementation = config::default_backend >
- static RC benchmark( void ( *grb_program )( const T &, U & ), // user GraphBLAS program
- const T & data_in,
- U & data_out, // input & output data
- const size_t inner,
- const size_t outer,
- const size_t pid ) {
- const double inf = std::numeric_limits< double >::infinity();
- grb::utils::TimerResults total_times, min_times, max_times;
- grb::utils::TimerResults * sdev_times = new grb::utils::TimerResults[ outer ];
- total_times.set( 0 );
- min_times.set( inf );
- max_times.set( 0 );
-
- // outer loop
- for( size_t out = 0; out < outer; out++ ) {
- grb::utils::TimerResults inner_times;
- inner_times.set( 0 );
-
- // inner loop
- for( size_t in = 0; in < inner; in++ ) {
- data_out.times.set( 0 );
-
- ( *grb_program )( data_in, data_out );
- grb::collectives< implementation >::reduce( data_out.times.io, 0, grb::operators::max< double >() );
- grb::collectives< implementation >::reduce( data_out.times.preamble, 0, grb::operators::max< double >() );
- grb::collectives< implementation >::reduce( data_out.times.useful, 0, grb::operators::max< double >() );
- grb::collectives< implementation >::reduce( data_out.times.postamble, 0, grb::operators::max< double >() );
- inner_times.accum( data_out.times );
- }
-
- // calculate performance stats
- benchmark_calc_inner( out, inner, inner_times, total_times, min_times, max_times, sdev_times );
+ /**
+ * Benchmarks a given ALP program.
+ *
+ * This variant applies to input data as a user-defined POD struct and
+ * output data as a user-defined POD struct.
+ *
+ * @tparam T Input type of the given user program.
+ * @tparam U Output type of the given user program.
+ *
+ * @param[in] alp_program The user program to be benchmarked
+ * @param[in] data_in Input data as a user-defined POD struct
+ * @param[out] data_out Output data
+ * @param[in] inner The number of inner repetitions of the benchmark
+ * @param[in] outer The number of outer repetitions of the benchmark
+ * @param[in] pid Unique ID of the calling user process
+ *
+ * @see benchmarking
+ *
+ * @ingroup benchmarking
+ */
+ template<
+ typename T, typename U,
+ enum Backend implementation = config::default_backend
+ >
+ static RC benchmark(
+ void ( *alp_program )( const T &, U & ),
+ const T &data_in,
+ U &data_out,
+ const size_t inner,
+ const size_t outer,
+ const size_t pid
+ ) {
+ const double inf = std::numeric_limits< double >::infinity();
+ grb::utils::TimerResults total_times, min_times, max_times;
+ grb::utils::TimerResults * sdev_times =
+ new grb::utils::TimerResults[ outer ];
+ total_times.set( 0 );
+ min_times.set( inf );
+ max_times.set( 0 );
+
+ // outer loop
+ for( size_t out = 0; out < outer; ++out ) {
+ grb::utils::TimerResults inner_times;
+ inner_times.set( 0 );
+
+ // inner loop
+ for( size_t in = 0; in < inner; ++in ) {
+ data_out.times.set( 0 );
+
+ ( *alp_program )( data_in, data_out );
+ grb::collectives< implementation >::reduce( data_out.times.io, 0,
+ grb::operators::max< double >() );
+ grb::collectives< implementation >::reduce( data_out.times.preamble, 0,
+ grb::operators::max< double >() );
+ grb::collectives< implementation >::reduce( data_out.times.useful, 0,
+ grb::operators::max< double >() );
+ grb::collectives< implementation >::reduce( data_out.times.postamble, 0,
+ grb::operators::max< double >() );
+ inner_times.accum( data_out.times );
+ }
+
+ // calculate performance stats
+ benchmark_calc_inner( out, inner, inner_times, total_times, min_times,
+ max_times, sdev_times );
#ifndef _GRB_NO_STDIO
- // give experiment output line
- if( pid == 0 ) {
- std::cout << "Outer iteration #" << out
- << " timings (io, preamble, useful, "
- "postamble, time since epoch): "
- << std::fixed;
- std::cout << inner_times.io << ", " << inner_times.preamble << ", " << inner_times.useful << ", " << inner_times.postamble << ", ";
- printTimeSinceEpoch( false );
- std::cout << std::scientific;
- }
+ // give experiment output line
+ if( pid == 0 ) {
+ std::cout << "Outer iteration #" << out << " timings "
+ << "(io, preamble, useful, postamble, time since epoch): " << std::fixed
+ << inner_times.io << ", " << inner_times.preamble << ", "
+ << inner_times.useful << ", " << inner_times.postamble << ", ";
+ printTimeSinceEpoch( false );
+ std::cout << std::scientific;
+ }
#endif
- // pause for next outer loop
- if( sleep( 1 ) != 0 ) {
+ // pause for next outer loop
+ if( sleep( 1 ) != 0 ) {
#ifndef _GRB_NO_STDIO
- std::cerr << "Sleep interrupted, assume benchmark is "
- "unreliable and exiting.\n";
+ std::cerr << "Sleep interrupted, assume benchmark is unreliable; "
+ << "exiting.\n";
#endif
- abort();
+ abort();
+ }
}
+
+ // calculate performance stats
+ benchmark_calc_outer( outer, total_times, min_times, max_times, sdev_times,
+ pid );
+ delete[] sdev_times;
+
+ return SUCCESS;
}
- // calculate performance stats
- benchmark_calc_outer( outer, total_times, min_times, max_times, sdev_times, pid );
- delete[] sdev_times;
- return SUCCESS;
- }
+ public:
- public:
- BenchmarkerBase() {
+ BenchmarkerBase() {
#ifndef _GRB_NO_STDIO
- printTimeSinceEpoch();
+ printTimeSinceEpoch();
#endif
- }
+ }
+
};
} // namespace internal
/**
- * Benchmarking function, called from an exec function.
- * Takes the grbProgram and its input and output data and accumultes times
- * given in the output structure.
+ * A class that follows the API of the #grb::Launcher, but instead of launching
+ * the given ALP program once, it launches it multiple times while benchmarking
+ * its execution times.
+ *
+ * @ingroup benchmarking
+ * @see benchmarking
*/
template< enum EXEC_MODE mode, enum Backend implementation >
class Benchmarker {
public :
- Benchmarker( size_t process_id = 0, // user process ID
- size_t nprocs = 1, // total number of user processes
- std::string hostname = "localhost", // one of the user process hostnames
- std::string port = "0" // a free port at hostname
- ) { (void)process_id; (void)nprocs; (void)hostname; (void)port;
+ /**
+ * Constructs an instance of the benchmarker class.
+ *
+ * @param[in] process_id A unique ID for the calling user process.
+ * @param[in] nprocs The total number of user processes participating in
+ * the benchmark. The given \a process_id must be
+ * strictly smaller than this given value.
+ * @param[in] hostname The hostname where one of the user processes
+ * participating in the benchmark resides.
+ * @param[in] port A free TCP/IP port at the host corresponding to
+ * the given \a hostname.
+ *
+ * The \a hostname and \a port arguments are unused if \a nprocs equals one.
+ *
+ * All arguments are optional-- their defaults are:
+ * - 0 for \a process_id,
+ * - 1 for \a nprocs,
+ * - \em localhost for \a hostname, and
+ * - 0 for \a port.
+ *
+ * This constructor may throw the same errors as #grb::Launcher.
+ *
+ * @see #grb::Launcher
+ * @see benchmarking
+ *
+ * \internal This is the base class which should be overridden by given
+ * backend implementations.
+ */
+ Benchmarker(
+ const size_t process_id = 0,
+ size_t nprocs = 1,
+ std::string hostname = "localhost",
+ std::string port = "0"
+ ) {
+ (void) process_id; (void) nprocs; (void) hostname; (void) port;
#ifndef _GRB_NO_EXCEPTIONS
- throw std::logic_error( "Benchmarker class called with unsupported "
- "mode or implementation" );
+ throw std::logic_error( "Benchmarker class called with unsupported mode or "
+ "implementation" );
#endif
}
- template< typename T, typename U >
- RC exec( void ( *grb_program )( const T &, U & ), // user GraphBLAS program
- const T & data_in,
- U & data_out, // input & output data
- const size_t inner,
- const size_t outer,
- const bool broadcast = false ) const {
- (void)grb_program;
- (void)data_in;
- (void)data_out;
- (void)inner;
- (void)outer;
- (void)broadcast;
- // stub implementation, should be overridden by specialised implementation,
- // so return error code
- return PANIC;
- }
-
- template< typename U >
- RC exec( void ( *grb_program )( const void *, const size_t, U & ), const void * data_in, const size_t in_size, U & data_out, const size_t inner, const size_t outer, const bool broadcast = false )
- const {
- (void)grb_program;
- (void)data_in;
- (void)in_size;
- (void)data_out;
- (void)inner;
- (void)outer;
- (void)broadcast;
- return PANIC;
- }
+ /**
+ * Benchmarks a given ALP program.
+ *
+ * This variant applies to input data as a user-defined POD struct and
+ * output data as a user-defined POD struct.
+ *
+ * @tparam T Input type of the given user program.
+ * @tparam U Output type of the given user program.
+ *
+ * @param[in] alp_program The ALP program to be benchmarked
+ * @param[in] data_in Input data as a user-defined POD struct
+ * @param[out] data_out Output data
+ * @param[in] inner The number of inner repetitions of the benchmark
+ * @param[in] outer The number of outer repetitions of the benchmark
+ * @param[in] broadcast An optional argument that dictates whether the
+ * \a data_in argument should be broadcast across all
+ * user processes participating in the benchmark,
+ * prior to \em each invocation of \a alp_program.
+ *
+ * The default value of \a broadcast is false .
+ *
+ * @returns #grb::SUCCESS The benchmarking has completed successfully.
+ * @returns #grb::FAILED An error during benchmarking has occurred. The
+ * benchmark attempt could be retried, and an error
+ * for the failure is reported to the standard error
+ * stream.
+ * @returns #grb::PANIC If an unrecoverable error was encountered while
+ * starting the benchmark, while benchmarking, or
+ * while aggregating the final results.
+ *
+ * @see benchmarking
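+ *
+ * \par Example
+ *
+ * A minimal sketch, assuming a benchmarker instance bench and an
+ * ALP program my_program with the matching typed signature (both
+ * names are hypothetical):
+ *
+ * \code
+ * // void my_program( const MyInput &in, MyOutput &out );
+ * grb::RC rc = bench.exec( &my_program, in, out, 1, 10 );
+ * \endcode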
+ *
+ * \internal This is the base implementation that should be specialised by
+ * each backend separately.
+ */
+ template< typename T, typename U >
+ RC exec(
+ void ( *alp_program )( const T &, U & ),
+ const T &data_in,
+ U &data_out,
+ const size_t inner,
+ const size_t outer,
+ const bool broadcast = false
+ ) const {
+ (void) alp_program;
+ (void) data_in;
+ (void) data_out;
+ (void) inner;
+ (void) outer;
+ (void) broadcast;
+
+ // stub implementation, should be overridden by specialised implementation.
+ // furthermore, it should be impossible to call this function without
+ // triggering an exception during construction of this stub class, so we
+ // just return PANIC here
+ return PANIC;
+ }
- /**
- * Releases all GraphBLAS resources. After a call to this function, no
- * GraphBLAS library functions may be called any longer.
- *
- * @return SUCCESS A call to this function may never fail.
- */
- static RC finalize() {
- return Launcher< mode, implementation >::finalize();
- }
+ /**
+ * Benchmarks a given ALP program.
+ *
+ * This variant takes input data as a byte blob and output data as a
+ * user-defined POD struct.
+ *
+ * @tparam U Output type of the given user program.
+ *
+ * @param[in] alp_program The user program to be benchmarked
+ * @param[in] data_in Input data as a raw data blob
+ * @param[in] in_size The size, in bytes, of the input data
+ * @param[out] data_out Output data
+ * @param[in] inner The number of inner repetitions of the benchmark
+ * @param[in] outer The number of outer repetitions of the benchmark
+ * @param[in] broadcast An optional argument that dictates whether the
+ * \a data_in argument should be broadcast across all
+ * user processes participating in the benchmark,
+ * prior to \em each invocation of \a alp_program.
+ *
+ * The default value of \a broadcast is false .
+ *
+ * @returns #grb::SUCCESS The benchmarking has completed successfully.
+ * @returns #grb::ILLEGAL If \a in_size is nonzero but \a data_in compares
+ * equal to nullptr .
+ * @returns #grb::FAILED An error during benchmarking has occurred. The
+ * benchmark attempt could be retried; an error
+ * describing the failure is reported to the standard error
+ * stream.
+ * @returns #grb::PANIC If an unrecoverable error was encountered while
+ * starting the benchmark, while benchmarking, or
+ * while aggregating the final results.
+ *
+ * @see benchmarking
+ *
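+ * \par Example
+ *
+ * An illustrative sketch of the raw-blob variant; the helper that produces
+ * the blob is hypothetical:
+ * \code
+ * void myProgram( const void * data, const size_t size, double &out );
+ *
+ * grb::Benchmarker< grb::AUTOMATIC > bench;
+ * std::vector< char > blob = serialiseInput(); // hypothetical helper
+ * double out = 0.0;
+ * const grb::RC rc = bench.exec(
+ * &myProgram, blob.data(), blob.size(), out, 10, 3, true
+ * );
+ * \endcode
+ *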
+ * \internal This is the base implementation that should be specialised by
+ * each backend separately.
+ */
+ template< typename U >
+ RC exec(
+ void ( *alp_program )( const void *, const size_t, U & ),
+ const void * data_in, const size_t in_size,
+ U &data_out,
+ const size_t inner, const size_t outer,
+ const bool broadcast = false
+ ) const {
+ (void) alp_program;
+ (void) data_in;
+ (void) in_size;
+ (void) data_out;
+ (void) inner;
+ (void) outer;
+ (void) broadcast;
+
+ // stub implementation, should be overridden by specialised implementation.
+ // furthermore, it should be impossible to call this function without
+ // triggering an exception during construction of this stub class, so we
+ // just return PANIC here
+ return PANIC;
+ }
-}; // namespace grb
+ /**
+ * Releases all ALP resources.
+ *
+ * Calling this function is equivalent to calling #grb::Launcher::finalize.
+ *
+ * After a call to this function, no further ALP programs may be benchmarked
+ * nor launched-- i.e., both the #grb::Launcher and #grb::Benchmarker
+ * functionalities may no longer be used.
+ *
+ * A well-behaving program calls this function, or #grb::Launcher::finalize,
+ * exactly once and just before exiting (or just before the guaranteed last
+ * invocation of an ALP program).
+ *
+ * @return #grb::SUCCESS The resources have successfully and permanently been
+ * released.
+ * @return #grb::PANIC An unrecoverable error has been encountered and the
+ * user program is encouraged to exit as quickly as
+ * possible. The state of the ALP library has become
+ * undefined and should no longer be used.
+ *
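+ * \par Example
+ *
+ * A sketch of a well-behaving exit sequence, assuming automatic mode:
+ * \code
+ * // ...the last call to bench.exec has completed...
+ * const grb::RC rc = grb::Benchmarker< grb::AUTOMATIC >::finalize();
+ * assert( rc == grb::SUCCESS );
+ * \endcode
+ *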
+ * \internal This is the base implementation that should be specialised by
+ * each backend separately.
+ */
+ static RC finalize() {
+ return Launcher< mode, implementation >::finalize();
+ }
+
+ };
} // end namespace ``grb''
#endif // end _H_GRB_BENCH_BASE
+
diff --git a/include/graphblas/base/blas1.hpp b/include/graphblas/base/blas1.hpp
index e3d4649af..9d451c1f6 100644
--- a/include/graphblas/base/blas1.hpp
+++ b/include/graphblas/base/blas1.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Defines the ALP/GraphBLAS level-1 API
+ *
* @author A. N. Yzelman
* @date 5th of December 2016
*/
@@ -38,160 +42,3573 @@
namespace grb {
/**
- * \defgroup BLAS1 The Level-1 ALP/GraphBLAS routines
- *
- * A collection of functions that allow ALP/GraphBLAS operators, monoids, and
- * semirings work on a mix of zero-dimensional and one-dimensional containers;
- * i.e., allows various linear algebra operations on scalars and objects of
- * type #grb::Vector.
+ * \defgroup BLAS1 Level-1 Primitives
+ * \ingroup GraphBLAS
+ *
+ * A collection of functions that allow ALP/GraphBLAS operators, monoids, and
+ * semirings to work on a mix of zero-dimensional and one-dimensional containers;
+ * i.e., allows various linear algebra operations on scalars and objects of
+ * type #grb::Vector.
+ *
+ * All functions return an error code of the enum-type #grb::RC.
+ *
+ * Primitives which produce vector output:
+ * -# #grb::set (three variants);
+ * -# #grb::foldr (in-place reduction to the right, scalar-to-vector and
+ * vector-to-vector);
+ * -# #grb::foldl (in-place reduction to the left, scalar-to-vector and
+ * vector-to-vector);
+ * -# #grb::eWiseApply (out-of-place application of a binary function);
+ * -# #grb::eWiseAdd (in-place addition of two vectors, or of a vector and
+ * a scalar, into a vector); and
+ * -# #grb::eWiseMul (in-place multiplication of two vectors, or of a vector
+ * and a scalar, into a vector).
+ *
+ * \note When #grb::eWiseAdd or #grb::eWiseMul with two input scalars is
+ * required, consider first forming the resulting scalar using level-0
+ * primitives, and then using #grb::set, #grb::foldl, or #grb::foldr, as
+ * appropriate.
+ *
+ * Primitives that produce scalar output:
+ * -# #grb::foldr (reduction to the right, vector-to-scalar);
+ * -# #grb::foldl (reduction to the left, vector-to-scalar).
+ *
+ * Primitives that do not require an operator, monoid, or semiring:
+ * -# #grb::set (three variants).
+ *
+ * Primitives that could take an operator (see #grb::operators):
+ * -# #grb::foldr, #grb::foldl, and #grb::eWiseApply.
+ * Such operators typically can only be applied on \em dense vectors, i.e.,
+ * vectors with #grb::nnz equal to their #grb::size. Operations on sparse
+ * vectors require an interpretation of missing vector elements, which monoids
+ * or semirings provide.
+ *
+ * Therefore, all aforementioned functions are also defined for monoids instead
+ * of operators.
+ *
+ * The following functions are defined for monoids and semirings, but not for
+ * operators alone:
+ * -# #grb::eWiseAdd (in-place addition).
+ *
+ * The following functions require a semiring, and are not defined for
+ * operators or monoids alone:
+ * -# #grb::dot (in-place reduction of two vectors into a scalar); and
+ * -# #grb::eWiseMul (in-place multiplication).
+ *
+ * Sometimes, we would like operations that are defined for semirings to also
+ * be enabled on \em improper semirings. ALP/GraphBLAS statically checks most
+ * properties required for composing proper semirings, and as such, attempts to
+ * compose improper ones will result in a compilation error. In such cases, we
+ * allow passing an additive monoid and a multiplicative operator instead of a
+ * semiring. The following functions allow this:
+ * -# #grb::dot, #grb::eWiseAdd, #grb::eWiseMul.
+ * The given multiplicative operator can be any binary operator, and in
+ * particular does not need to be associative.
+ *
+ * The algebraic structures lost with improper semirings typically correspond to
+ * distributivity, zero being an annihilator of multiplication, as well as the
+ * concept of \em one. Due to the latter lost structure, the above functions on
+ * improper semirings are \em not defined for pattern inputs.
+ *
+ * \warning I.e., any attempt to use containers of the form
+ * \code
+ * grb::Vector< void >
+ * grb::Matrix< void >
+ * \endcode
+ * with an improper semiring will result in a compile-time error.
+ *
+ * \note Pattern containers are perfectly fine to use with proper semirings.
+ *
+ * \warning If an improper semiring does not have the property that the zero
+ * identity acts as an annihilator over the multiplicative operator,
+ * then the result of #grb::eWiseMul may be unintuitive. Please take
+ * great care in the use of improper semirings.
+ *
+ * For fusing multiple BLAS-1 style operations on any number of inputs and
+ * outputs, users can pass their own operator function to be executed for
+ * every index \a i.
+ * -# #grb::eWiseLambda.
+ * This requires manual application of operators, monoids, and/or semirings
+ * via the level-0 interface -- see #grb::apply, #grb::foldl, and #grb::foldr.
+ *
+ * For all of these functions, the element types of the input and output
+ * containers do not have to match the domains of the given operator, monoid, or
+ * semiring unless the #grb::descriptors::no_casting descriptor was passed.
+ *
+ * An implementation, whether blocking or non-blocking, should have clear
+ * performance semantics for every sequence of GraphBLAS calls, no matter
+ * whether those are made from sequential or parallel contexts. Backends
+ * may define different performance semantics depending on the #grb::Phase
+ * in which primitives execute.
+ *
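+ * \par Example
+ *
+ * As an illustrative sketch of the level-1 style, the following computes a
+ * dot product of two dense vectors under the standard plus-times semiring
+ * (all names assume the standard ALP operators and identities):
+ * \code
+ * const size_t n = 100;
+ * grb::Vector< double > x( n ), y( n );
+ * grb::set( x, 1.0 );
+ * grb::set( y, 2.0 );
+ * double alpha = 0.0;
+ * grb::Semiring<
+ * grb::operators::add< double >, grb::operators::mul< double >,
+ * grb::identities::zero, grb::identities::one
+ * > plusTimes;
+ * const grb::RC rc = grb::dot( alpha, x, y, plusTimes );
+ * // on success, alpha equals 2.0 * n
+ * \endcode
+ *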
+ * @{
+ */
+
+ /**
+ * A standard vector to use for mask parameters.
+ *
+ * Indicates that no mask shall be used.
+ *
+ * \internal Do not use this symbol within backend implementations.
+ */
+ #define NO_MASK Vector< bool >( 0 )
+
+ /**
+ * Computes \f$ z = \alpha \odot \beta \f$, out of place, operator version.
+ *
+ * @tparam descr The descriptor to be used. Equal to
+ * descriptors::no_operation if left unspecified.
+ * @tparam OP The operator to use.
+ * @tparam InputType1 The value type of the left-hand scalar.
+ * @tparam InputType2 The value type of the right-hand scalar.
+ * @tparam OutputType The value type of the output vector.
+ *
+ * @param[out] z The output vector.
+ * @param[in] alpha The left-hand input scalar.
+ * @param[in] beta The right-hand input scalar.
+ * @param[in] op The operator \f$ \odot \f$.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * Specialisation for scalar inputs, unmasked operator version.
+ *
+ * A call to this function is equivalent to the following code:
+ *
+ * \code
+ * typename OP::D3 tmp;
+ * grb::apply( tmp, alpha, beta, op );
+ * grb::set( z, tmp, phase );
+ * \endcode
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP, enum Backend backend,
+ typename OutputType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, backend, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "In eWiseApply ([T1]<-T2<-T3), operator, base\n";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseApplyOpASS_base = false;
+ assert( should_not_call_eWiseApplyOpASS_base );
+#endif
+ (void) z;
+ (void) alpha;
+ (void) beta;
+ (void) op;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Computes \f$ z = \alpha \odot \beta \f$, out of place, operator and masked
+ * version.
+ *
+ * @tparam descr The descriptor to be used. Equal to
+ * descriptors::no_operation if left unspecified.
+ * @tparam OP The operator to use.
+ * @tparam InputType1 The value type of the left-hand scalar.
+ * @tparam InputType2 The value type of the right-hand scalar.
+ * @tparam OutputType The value type of the output vector.
+ * @tparam MaskType The value type of the output mask vector.
+ *
+ * @param[out] z The output vector.
+ * @param[in] mask The output mask.
+ * @param[in] alpha The left-hand input scalar.
+ * @param[in] beta The right-hand input scalar.
+ * @param[in] op The operator \f$ \odot \f$.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * Specialisation for scalar inputs, masked operator version.
+ *
+ * A call to this function is equivalent to the following code:
+ *
+ * \code
+ * typename OP::D3 tmp;
+ * grb::apply( tmp, alpha, beta, op );
+ * grb::set( z, mask, tmp, phase );
+ * \endcode
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP, enum Backend backend,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< MaskType, backend, Coords > &mask,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "In masked eWiseApply ([T1]<-T2<-T3), operator, base\n";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseApplyOpAMSS_base = false;
+ assert( should_not_call_eWiseApplyOpAMSS_base );
+#endif
+ (void) z;
+ (void) mask;
+ (void) alpha;
+ (void) beta;
+ (void) op;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Computes \f$ z = \alpha \odot \beta \f$, out of place, monoid version.
+ *
+ * @tparam descr The descriptor to be used. Equal to
+ * descriptors::no_operation if left unspecified.
+ * @tparam Monoid The monoid to use.
+ * @tparam InputType1 The value type of the left-hand scalar.
+ * @tparam InputType2 The value type of the right-hand scalar.
+ * @tparam OutputType The value type of the output vector.
+ *
+ * @param[out] z The output vector.
+ * @param[in] alpha The left-hand input scalar.
+ * @param[in] beta The right-hand input scalar.
+ * @param[in] monoid The monoid with underlying operator \f$ \odot \f$.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * Specialisation for scalar inputs, unmasked monoid version.
+ *
+ * A call to this function is equivalent to the following code:
+ *
+ * \code
+ * typename Monoid::D3 tmp;
+ * grb::apply( tmp, alpha, beta, monoid.getOperator() );
+ * grb::set( z, tmp, phase );
+ * \endcode
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid, enum Backend backend,
+ typename OutputType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, backend, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "In eWiseApply ([T1]<-T2<-T3), monoid, base\n";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseApplyMonASS_base = false;
+ assert( should_not_call_eWiseApplyMonASS_base );
+#endif
+ (void) z;
+ (void) alpha;
+ (void) beta;
+ (void) monoid;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Computes \f$ z = \alpha \odot \beta \f$, out of place, masked monoid
+ * version.
+ *
+ * @tparam descr The descriptor to be used. Equal to
+ * descriptors::no_operation if left unspecified.
+ * @tparam Monoid The monoid to use.
+ * @tparam InputType1 The value type of the left-hand scalar.
+ * @tparam InputType2 The value type of the right-hand scalar.
+ * @tparam OutputType The value type of the output vector.
+ * @tparam MaskType The value type of the output mask vector.
+ *
+ * @param[out] z The output vector.
+ * @param[in] mask The output mask.
+ * @param[in] alpha The left-hand input scalar.
+ * @param[in] beta The right-hand input scalar.
+ * @param[in] monoid The monoid with underlying operator \f$ \odot \f$.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * Specialisation for scalar inputs, masked monoid version.
+ *
+ * A call to this function is equivalent to the following code:
+ *
+ * \code
+ * typename Monoid::D3 tmp;
+ * grb::apply( tmp, alpha, beta, monoid.getOperator() );
+ * grb::set( z, mask, tmp, phase );
+ * \endcode
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid, enum Backend backend,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< MaskType, backend, Coords > &mask,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "In masked eWiseApply ([T1]<-T2<-T3), monoid, base\n";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseApplyMonAMSS_base = false;
+ assert( should_not_call_eWiseApplyMonAMSS_base );
+#endif
+ (void) z;
+ (void) mask;
+ (void) alpha;
+ (void) beta;
+ (void) monoid;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Computes \f$ z = \alpha \odot y \f$, out of place, operator version.
+ *
+ * Calculates the element-wise operation between one scalar and the elements of
+ * one vector, \f$ z = \alpha \odot y \f$, using the given operator. The input and
+ * output vectors must be of equal length.
+ *
+ * For all indices \a i of \a z, its element \f$ z_i \f$ after the call to this
+ * function completes equals \f$ \alpha \odot y_i \f$. Any old entries of \a z
+ * are removed. Entries \a i for which \a y has no nonzero will be skipped.
+ *
+ * After a successful call to this primitive, the sparsity structure of \a z
+ * shall match that of \a y.
+ *
+ * \note When applying element-wise operators on sparse vectors using
+ * semirings, there is a difference between interpreting missing values
+ * as an annihilating identity or as a neutral identity-- intuitively,
+ * such identities are known as `zero' or `one', respectively. As a
+ * consequence, there are two different variants for element-wise
+ * operations whose names correspond to their intuitive meanings:
+ * - #grb::eWiseAdd (neutral), and
+ * - #grb::eWiseMul (annihilating).
+ * The above two primitives require a semiring. The same functionality is
+ * provided by #grb::eWiseApply depending on whether a monoid or operator
+ * is provided:
+ * - #grb::eWiseApply using monoids (neutral),
+ * - #grb::eWiseApply using operators (annihilating).
+ *
+ * \note However, #grb::eWiseAdd and #grb::eWiseMul provide in-place semantics,
+ * while #grb::eWiseApply does not.
+ *
+ * \note An #grb::eWiseAdd with some semiring and a #grb::eWiseApply using its
+ * additive monoid thus are equivalent when operating on
+ * empty outputs.
+ *
+ * \note An #grb::eWiseMul with some semiring and a #grb::eWiseApply using its
+ * multiplicative operator thus are equivalent when operating on empty
+ * outputs.
+ *
+ * @tparam descr The descriptor to be used. Equal to
+ * descriptors::no_operation if left unspecified.
+ * @tparam OP The operator to use.
+ * @tparam InputType1 The value type of the left-hand scalar.
+ * @tparam InputType2 The value type of the right-hand vector.
+ * @tparam OutputType The value type of the output vector.
+ *
+ * @param[out] z The output vector.
+ * @param[in] alpha The left-hand input scalar.
+ * @param[in] y The right-hand input vector.
+ * @param[in] op The operator \f$ \odot \f$.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
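+ * \par Example
+ *
+ * An illustrative sketch, assuming the standard multiplication operator:
+ * \code
+ * const size_t n = 100;
+ * grb::Vector< double > y( n ), z( n );
+ * grb::setElement( y, 3.0, 0 ); // y has a single nonzero at index 0
+ * const grb::RC rc = grb::eWiseApply(
+ * z, 2.0, y, grb::operators::mul< double >()
+ * );
+ * // on success, z has a single nonzero: 6.0 at index 0
+ * \endcode
+ *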
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a y and \a z do not
+ * match. All input data containers are left untouched
+ * if this exit code is returned; it will be as though
+ * this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP, enum Backend backend,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, backend, Coords > &z,
+ const InputType1 alpha,
+ const Vector< InputType2, backend, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "In eWiseApply ([T1]<-T2<-[T3]), operator, base\n";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseApplyOpASA_base = false;
+ assert( should_not_call_eWiseApplyOpASA_base );
+#endif
+ (void) z;
+ (void) alpha;
+ (void) y;
+ (void) op;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Computes \f$ z = \alpha \odot y \f$, out of place, masked operator version.
+ *
+ * Calculates the element-wise operation between one scalar and the elements of
+ * one vector, \f$ z = \alpha \odot y \f$, using the given operator. The input and
+ * output vectors must be of equal length.
+ *
+ * For all indices \a i of \a z, its element \f$ z_i \f$ after the call to this
+ * function completes equals \f$ \alpha \odot y_i \f$. Any old entries of \a z
+ * are removed. Entries \a i for which \a y has no nonzero will be skipped, as
+ * will entries \a i for which \a mask evaluates false .
+ *
+ * \note When applying element-wise operators on sparse vectors using
+ * semirings, there is a difference between interpreting missing values
+ * as an annihilating identity or as a neutral identity-- intuitively,
+ * such identities are known as `zero' or `one', respectively. As a
+ * consequence, there are two different variants for element-wise
+ * operations whose names correspond to their intuitive meanings:
+ * - #grb::eWiseAdd (neutral), and
+ * - #grb::eWiseMul (annihilating).
+ * The above two primitives require a semiring. The same functionality is
+ * provided by #grb::eWiseApply depending on whether a monoid or operator
+ * is provided:
+ * - #grb::eWiseApply using monoids (neutral),
+ * - #grb::eWiseApply using operators (annihilating).
+ *
+ * \note However, #grb::eWiseAdd and #grb::eWiseMul provide in-place semantics,
+ * while #grb::eWiseApply does not.
+ *
+ * \note An #grb::eWiseAdd with some semiring and a #grb::eWiseApply using its
+ * additive monoid thus are equivalent when operating on
+ * empty outputs.
+ *
+ * \note An #grb::eWiseMul with some semiring and a #grb::eWiseApply using its
+ * multiplicative operator thus are equivalent when operating on empty
+ * outputs.
+ *
+ * @tparam descr The descriptor to be used. Equal to
+ * descriptors::no_operation if left unspecified.
+ * @tparam OP The operator to use.
+ * @tparam InputType1 The value type of the left-hand scalar.
+ * @tparam InputType2 The value type of the right-hand vector.
+ * @tparam OutputType The value type of the output vector.
+ * @tparam MaskType The value type of the mask vector.
+ *
+ * @param[out] z The output vector.
+ * @param[in] mask The output mask.
+ * @param[in] alpha The left-hand input scalar.
+ * @param[in] y The right-hand input vector.
+ * @param[in] op The operator \f$ \odot \f$.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
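+ * \par Example
+ *
+ * An illustrative sketch of masked application, assuming the standard
+ * multiplication operator:
+ * \code
+ * const size_t n = 100;
+ * grb::Vector< double > y( n ), z( n );
+ * grb::Vector< bool > mask( n );
+ * grb::set( y, 1.5 );
+ * grb::setElement( mask, true, 0 ); // only index 0 may be written
+ * const grb::RC rc = grb::eWiseApply(
+ * z, mask, 2.0, y, grb::operators::mul< double >()
+ * );
+ * // on success, z has a single nonzero: 3.0 at index 0
+ * \endcode
+ *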
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a y and \a z do not
+ * match. All input data containers are left untouched
+ * if this exit code is returned; it will be as though
+ * this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP, enum Backend backend,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< MaskType, backend, Coords > &mask,
+ const InputType1 alpha,
+ const Vector< InputType2, backend, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "In masked eWiseApply ([T1]<-T2<-[T3], operator, base)\n";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseApplyOpAMSA_base = false;
+ assert( should_not_call_eWiseApplyOpAMSA_base );
+#endif
+ (void) z;
+ (void) mask;
+ (void) alpha;
+ (void) y;
+ (void) op;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Computes \f$ z = \alpha \odot y \f$, out of place, monoid version.
+ *
+ * Calculates the element-wise operation between one scalar and the elements of
+ * one vector, \f$ z = \alpha \odot y \f$, using the given operator. The input and
+ * output vectors must be of equal length.
+ *
+ * For all indices \a i of \a z, its element \f$ z_i \f$ after the call to this
+ * function completes equals \f$ \alpha \odot y_i \f$. Any old entries of \a z
+ * are removed.
+ *
+ * After a successful call to this primitive, \a z shall be dense.
+ *
+ * \note When applying element-wise operators on sparse vectors using
+ * semirings, there is a difference between interpreting missing values
+ * as an annihilating identity or as a neutral identity-- intuitively,
+ * such identities are known as `zero' or `one', respectively. As a
+ * consequence, there are two different variants for element-wise
+ * operations whose names correspond to their intuitive meanings:
+ * - #grb::eWiseAdd (neutral), and
+ * - #grb::eWiseMul (annihilating).
+ * The above two primitives require a semiring. The same functionality is
+ * provided by #grb::eWiseApply depending on whether a monoid or operator
+ * is provided:
+ * - #grb::eWiseApply using monoids (neutral),
+ * - #grb::eWiseApply using operators (annihilating).
+ *
+ * \note However, #grb::eWiseAdd and #grb::eWiseMul provide in-place semantics,
+ * while #grb::eWiseApply does not.
+ *
+ * \note An #grb::eWiseAdd with some semiring and a #grb::eWiseApply using its
+ * additive monoid thus are equivalent when operating on
+ * empty outputs.
+ *
+ * \note An #grb::eWiseMul with some semiring and a #grb::eWiseApply using its
+ * multiplicative operator thus are equivalent when operating on empty
+ * outputs.
+ *
+ * @tparam descr The descriptor to be used. Equal to
+ * descriptors::no_operation if left unspecified.
+ * @tparam Monoid The monoid to use.
+ * @tparam InputType1 The value type of the left-hand scalar.
+ * @tparam InputType2 The value type of the right-hand vector.
+ * @tparam OutputType The value type of the output vector.
+ *
+ * @param[out] z The output vector.
+ * @param[in] alpha The left-hand input scalar.
+ * @param[in] y The right-hand input vector.
+ * @param[in] monoid The monoid that provides the operator \f$ \odot \f$.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
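+ * \par Example
+ *
+ * An illustrative sketch that contrasts this monoid variant with the
+ * operator variant: under a monoid, missing entries of \a y are interpreted
+ * as the monoid identity, hence \a z becomes dense. It assumes the standard
+ * additive monoid:
+ * \code
+ * const size_t n = 100;
+ * grb::Vector< double > y( n ), z( n );
+ * grb::setElement( y, 3.0, 0 ); // y is sparse
+ * grb::Monoid<
+ * grb::operators::add< double >, grb::identities::zero
+ * > plusMonoid;
+ * const grb::RC rc = grb::eWiseApply( z, 2.0, y, plusMonoid );
+ * // on success, z is dense: 5.0 at index 0, 2.0 everywhere else
+ * \endcode
+ *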
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a y and \a z do not
+ * match. All input data containers are left untouched
+ * if this exit code is returned; it will be as though
+ * this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid, enum Backend backend,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, backend, Coords > &z,
+ const InputType1 alpha,
+ const Vector< InputType2, backend, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "In unmasked eWiseApply ([T1]<-T2<-[T3], monoid, base)\n";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseApplyMonoidASA_base = false;
+ assert( should_not_call_eWiseApplyMonoidASA_base );
+#endif
+ (void) z;
+ (void) alpha;
+ (void) y;
+ (void) monoid;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Computes \f$ z = \alpha \odot y \f$, out of place, masked monoid variant.
+ *
+ * Calculates the element-wise operation between one scalar and the elements of
+ * one vector, \f$ z = \alpha \odot y \f$, using the given operator. The input and
+ * output vectors must be of equal length.
+ *
+ * For all indices \a i of \a z, its element \f$ z_i \f$ after the call to this
+ * function completes equals \f$ \alpha \odot y_i \f$. Any old entries of \a z
+ * are removed. Entries \a i for which \a mask evaluates false will be
+ * skipped.
+ *
+ * After a successful call to this primitive, the sparsity structure of \a z
+ * shall match that of \a mask (after interpretation).
+ *
+ * \note When applying element-wise operators on sparse vectors using
+ * semirings, there is a difference between interpreting missing values
+ * as an annihilating identity or as a neutral identity-- intuitively,
+ * such identities are known as `zero' or `one', respectively. As a
+ * consequence, there are two different variants for element-wise
+ * operations whose names correspond to their intuitive meanings:
+ * - #grb::eWiseAdd (neutral), and
+ * - #grb::eWiseMul (annihilating).
+ * The above two primitives require a semiring. The same functionality is
+ * provided by #grb::eWiseApply depending on whether a monoid or operator
+ * is provided:
+ * - #grb::eWiseApply using monoids (neutral),
+ * - #grb::eWiseApply using operators (annihilating).
+ *
+ * \note However, #grb::eWiseAdd and #grb::eWiseMul provide in-place semantics,
+ * while #grb::eWiseApply does not.
+ *
+ * \note An #grb::eWiseAdd with some semiring and a #grb::eWiseApply using its
+ * additive monoid thus are equivalent when operating on
+ * empty outputs.
+ *
+ * \note An #grb::eWiseMul with some semiring and a #grb::eWiseApply using its
+ * multiplicative operator thus are equivalent when operating on empty
+ * outputs.
+ *
+ * @tparam descr The descriptor to be used. Equal to
+ * descriptors::no_operation if left unspecified.
+ * @tparam Monoid The monoid to use.
+ * @tparam InputType1 The value type of the left-hand scalar.
+ * @tparam InputType2 The value type of the right-hand vector.
+ * @tparam OutputType The value type of the output vector.
+ * @tparam MaskType The value type of the output mask vector.
+ *
+ * @param[out] z The output vector.
+ * @param[in] mask The output mask.
+ * @param[in] alpha The left-hand input scalar.
+ * @param[in] y The right-hand input vector.
+ * @param[in] monoid The monoid that provides the operator \f$ \odot \f$.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a mask, \a y, and \a z do
+ * not match. All input data containers are left
+ * untouched if this exit code is returned; it will be
+ * as though this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid, enum Backend backend,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< MaskType, backend, Coords > &mask,
+ const InputType1 alpha,
+ const Vector< InputType2, backend, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value,
+ void >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "In masked eWiseApply ([T1]<-T2<-[T3], using monoid)\n";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseApplyMonoidAMSA_base = false;
+ assert( should_not_call_eWiseApplyMonoidAMSA_base );
+#endif
+ (void) z;
+ (void) mask;
+ (void) alpha;
+ (void) y;
+ (void) monoid;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Computes \f$ z = x \odot \beta \f$, out of place, operator variant.
+ *
+ * Calculates the element-wise operation between the elements of one vector and
+ * one scalar, \f$ z = x \odot \beta \f$, using the given operator. The input and
+ * output vectors must be of equal length.
+ *
+ * For all valid indices \a i of \a z, its element \f$ z_i \f$ after the call
+ * to this function completes equals \f$ x_i \odot \beta \f$. Any old entries
+ * of \a z are removed.
+ *
+ * Entries \a i for which no nonzero exists in \a x are skipped. Therefore,
+ * after a successful call to this primitive, the nonzero structure of \a z
+ * will match that of \a x.
+ *
+ * \note When applying element-wise operators on sparse vectors using
+ * semirings, there is a difference between interpreting missing values
+ * as an annihilating identity or as a neutral identity-- intuitively,
+ * such identities are known as `zero' or `one', respectively. As a
+ * consequence, there are two different variants for element-wise
+ * operations whose names correspond to their intuitive meanings:
+ * - #grb::eWiseAdd (neutral), and
+ * - #grb::eWiseMul (annihilating).
+ * The above two primitives require a semiring. The same functionality is
+ * provided by #grb::eWiseApply depending on whether a monoid or operator
+ * is provided:
+ * - #grb::eWiseApply using monoids (neutral),
+ * - #grb::eWiseApply using operators (annihilating).
+ *
+ * \note However, #grb::eWiseAdd and #grb::eWiseMul provide in-place semantics,
+ * while #grb::eWiseApply does not.
+ *
+ * \note An #grb::eWiseAdd with some semiring and a #grb::eWiseApply using its
+ * additive monoid thus are equivalent when operating on
+ * empty outputs.
+ *
+ * \note An #grb::eWiseMul with some semiring and a #grb::eWiseApply using its
+ * multiplicative operator thus are equivalent when operating on empty
+ * outputs.
+ *
+ * @tparam descr The descriptor to be used. Equal to
+ * descriptors::no_operation if left unspecified.
+ * @tparam OP The operator to use.
+ * @tparam InputType1 The value type of the left-hand vector.
+ * @tparam InputType2 The value type of the right-hand scalar.
+ * @tparam OutputType The value type of the output vector.
+ *
+ * @param[out] z The output vector.
+ * @param[in] x The left-hand input vector.
+ * @param[in] beta The right-hand input scalar.
+ * @param[in] op The operator \f$ \odot \f$.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
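+ * \par Example
+ *
+ * An illustrative sketch with the scalar on the right-hand side, assuming
+ * the standard multiplication operator:
+ * \code
+ * const size_t n = 100;
+ * grb::Vector< double > x( n ), z( n );
+ * grb::setElement( x, 4.0, 2 ); // x has a single nonzero at index 2
+ * const grb::RC rc = grb::eWiseApply(
+ * z, x, 0.5, grb::operators::mul< double >()
+ * );
+ * // on success, z matches the sparsity of x: 2.0 at index 2
+ * \endcode
+ *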
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a x and \a z do not
+ * match. All input data containers are left untouched
+ * if this exit code is returned; it will be as though
+ * this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP, enum Backend backend,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< InputType1, backend, Coords > &x,
+ const InputType2 beta,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "In eWiseApply ([T1]<-[T2]<-T3), operator, base\n";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseApplyOpAAS_base = false;
+ assert( should_not_call_eWiseApplyOpAAS_base );
+#endif
+ (void) z;
+ (void) x;
+ (void) beta;
+ (void) op;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Computes \f$ z = x \odot \beta \f$, out of place, masked operator variant.
+ *
+ * Calculates the element-wise operation between the elements of one vector and
+ * one scalar, \f$ z = x \odot \beta \f$, using the given operator. The input and
+ * output vectors must be of equal length.
+ *
+ * For all valid indices \a i of \a z, its element \f$ z_i \f$ after the call
+ * to this function completes equals \f$ x_i \odot \beta \f$. Any old entries
+ * of \a z are removed.
+ *
+ * Entries \a i for which no nonzero exists in \a x are skipped. Entries \a i
+ * for which the mask evaluates false are skipped as well.
+ *
+ * \note When applying element-wise operators on sparse vectors using
+ * semirings, there is a difference between interpreting missing values
+ * as an annihilating identity or as a neutral identity-- intuitively,
+ * such identities are known as `zero' or `one', respectively. As a
+ * consequence, there are two different variants for element-wise
+ * operations whose names correspond to their intuitive meanings:
+ * - #grb::eWiseAdd (neutral), and
+ * - #grb::eWiseMul (annihilating).
+ * The above two primitives require a semiring. The same functionality is
+ * provided by #grb::eWiseApply depending on whether a monoid or operator
+ * is provided:
+ * - #grb::eWiseApply using monoids (neutral),
+ * - #grb::eWiseApply using operators (annihilating).
+ *
+ * \note However, #grb::eWiseAdd and #grb::eWiseMul provide in-place semantics,
+ * while #grb::eWiseApply does not.
+ *
+ * \note An #grb::eWiseAdd with some semiring and a #grb::eWiseApply using its
+ * additive monoid thus are equivalent when operating on
+ * empty outputs.
+ *
+ * \note An #grb::eWiseMul with some semiring and a #grb::eWiseApply using its
+ * multiplicative operator thus are equivalent when operating on empty
+ * outputs.
+ *
+ * @tparam descr The descriptor to be used. Equal to
+ * descriptors::no_operation if left unspecified.
+ * @tparam OP The operator to use.
+ * @tparam InputType1 The value type of the left-hand vector.
+ * @tparam InputType2 The value type of the right-hand scalar.
+ * @tparam OutputType The value type of the output vector.
+ * @tparam MaskType The value type of the output mask vector.
+ *
+ * @param[out] z The output vector.
+ * @param[in] mask The output mask.
+ * @param[in] x The left-hand input vector.
+ * @param[in] beta The right-hand input scalar.
+ * @param[in] op The operator \f$ \odot \f$.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a mask, \a x, and \a z do
+ * not match. All input data containers are left
+ * untouched if this exit code is returned; it will be
+ * as though this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP, enum Backend backend,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< MaskType, backend, Coords > &mask,
+ const Vector< InputType1, backend, Coords > &x,
+ const InputType2 beta,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "In masked eWiseApply ([T1]<-[T2]<-T3, operator, base)\n";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseApplyOpAMAS_base = false;
+ assert( should_not_call_eWiseApplyOpAMAS_base );
+#endif
+ (void) z;
+ (void) mask;
+ (void) x;
+ (void) beta;
+ (void) op;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Computes \f$ z = x \odot \beta \f$, out of place, monoid variant.
+ *
+ * Calculates the element-wise operation between the elements of one vector and
+ * one scalar, \f$ z = x \odot \beta \f$, using the given operator. The input and
+ * output vectors must be of equal length.
+ *
+ * For all indices \a i of \a z, its element \f$ z_i \f$ after the call to this
+ * function completes equals \f$ x_i \odot \beta \f$. Any old entries of \a z
+ * are removed.
+ *
+ * After a successful call to this primitive, \a z shall be dense.
+ *
+ * \note When applying element-wise operators on sparse vectors using
+ * semirings, there is a difference between interpreting missing values
+ * as an annihilating identity or as a neutral identity-- intuitively,
+ * such identities are known as `zero' or `one', respectively. As a
+ * consequence, there are two different variants for element-wise
+ * operations whose names correspond to their intuitive meanings:
+ * - #grb::eWiseAdd (neutral), and
+ * - #grb::eWiseMul (annihilating).
+ * The above two primitives require a semiring. The same functionality is
+ * provided by #grb::eWiseApply depending on whether a monoid or operator
+ * is provided:
+ * - #grb::eWiseApply using monoids (neutral),
+ * - #grb::eWiseApply using operators (annihilating).
+ *
+ * \note However, #grb::eWiseAdd and #grb::eWiseMul provide in-place semantics,
+ * while #grb::eWiseApply does not.
+ *
+ * \note An #grb::eWiseAdd with some semiring and a #grb::eWiseApply using its
+ * additive monoid thus are equivalent when operating on
+ * empty outputs.
+ *
+ * \note An #grb::eWiseMul with some semiring and a #grb::eWiseApply using its
+ * multiplicative operator thus are equivalent when operating on empty
+ * outputs.
+ *
+ * @tparam descr The descriptor to be used. Equal to
+ * descriptors::no_operation if left unspecified.
+ * @tparam Monoid The monoid to use.
+ * @tparam InputType1 The value type of the left-hand vector.
+ * @tparam InputType2 The value type of the right-hand scalar.
+ * @tparam OutputType The value type of the output vector.
+ *
+ * @param[out] z The output vector.
+ * @param[in] x The left-hand input vector.
+ * @param[in] beta The right-hand input scalar.
+ * @param[in] monoid The monoid that provides the operator \f$ \odot \f$.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a x and \a z do not
+ * match. All input data containers are left untouched
+ * if this exit code is returned; it will be as though
+ * this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid, enum Backend backend,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< InputType1, backend, Coords > &x,
+ const InputType2 beta,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value,
+ void >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "In unmasked eWiseApply ([T1]<-[T2]<-T3, monoid, base)\n";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseApplyMonoidAAS_base = false;
+ assert( should_not_call_eWiseApplyMonoidAAS_base );
+#endif
+ (void) z;
+ (void) x;
+ (void) beta;
+ (void) monoid;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Computes \f$ z = x \odot \beta \f$, out of place, masked monoid variant.
+ *
+ * Calculates the element-wise operation between the elements of one vector and
+ * one scalar, \f$ z = x \odot \beta \f$, using the given operator. The input and
+ * output vectors must be of equal length.
+ *
+ * For all indices \a i of \a z, its element \f$ z_i \f$ after the call to this
+ * function completes equals \f$ x_i \odot \beta \f$. Any old entries of \a z
+ * are removed. Entries \a i for which \a mask evaluates false will be
+ * skipped.
+ *
+ * After a successful call to this primitive, the sparsity structure of \a z
+ * matches that of \a mask (after interpretation).
+ *
+ * \note When applying element-wise operators on sparse vectors using
+ * semirings, there is a difference between interpreting missing values
+ * as an annihilating identity or as a neutral identity-- intuitively,
+ * such identities are known as `zero' or `one', respectively. As a
+ * consequence, there are two different variants for element-wise
+ * operations whose names correspond to their intuitive meanings:
+ * - #grb::eWiseAdd (neutral), and
+ * - #grb::eWiseMul (annihilating).
+ * The above two primitives require a semiring. The same functionality is
+ * provided by #grb::eWiseApply depending on whether a monoid or operator
+ * is provided:
+ * - #grb::eWiseApply using monoids (neutral),
+ * - #grb::eWiseApply using operators (annihilating).
+ *
+ * \note However, #grb::eWiseAdd and #grb::eWiseMul provide in-place semantics,
+ * while #grb::eWiseApply does not.
+ *
+ * \note An #grb::eWiseAdd with some semiring and a #grb::eWiseApply using its
+ * additive monoid thus are equivalent when operating on
+ * empty outputs.
+ *
+ * \note An #grb::eWiseMul with some semiring and a #grb::eWiseApply using its
+ * multiplicative operator thus are equivalent when operating on empty
+ * outputs.
+ *
+ * @tparam descr The descriptor to be used. Equal to
+ * descriptors::no_operation if left unspecified.
+ * @tparam Monoid The monoid to use.
+ * @tparam InputType1 The value type of the left-hand vector.
+ * @tparam InputType2 The value type of the right-hand scalar.
+ * @tparam OutputType The value type of the output vector.
+ * @tparam MaskType The value type of the mask vector.
+ *
+ * @param[out] z The output vector.
+ * @param[in] mask The output mask.
+ * @param[in] x The left-hand input vector.
+ * @param[in] beta The right-hand input scalar.
+ * @param[in] monoid The monoid that provides the operator \f$ \odot \f$.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a mask, \a x and \a z do
+ * not match. All input data containers are left
+ * untouched if this exit code is returned; it will be
+ * as though this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid, enum Backend backend,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< MaskType, backend, Coords > &mask,
+ const Vector< InputType1, backend, Coords > &x,
+ const InputType2 beta,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "In masked eWiseApply ([T1]<-[T2]<-T3, monoid, base)\n";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseApplyMonoidAMAS_base = false;
+ assert( should_not_call_eWiseApplyMonoidAMAS_base );
+#endif
+ (void) z;
+ (void) mask;
+ (void) x;
+ (void) beta;
+ (void) monoid;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Computes \f$ z = x \odot y \f$, out of place, operator variant.
+ *
+ * Calculates the element-wise operation on the elements of two vectors,
+ * \f$ z = x \odot y \f$, using the given operator. The input and output
+ * vectors must be of equal length.
+ *
+ * For all valid indices \a i of \a z, its element \f$ z_i \f$ after the call
+ * to this function completes equals \f$ x_i \odot y_i \f$. Any old entries
+ * of \a z are removed. Entries \a i that are missing from \a x, from \a y,
+ * or from both, are skipped.
+ *
+ * After a successful call to this primitive, the nonzero structure of \a z
+ * will match that of the intersection of \a x and \a y.
+ *
+ * \note When applying element-wise operators on sparse vectors using
+ * semirings, there is a difference between interpreting missing values
+ * as an annihilating identity or as a neutral identity; intuitively,
+ * such identities are known as `zero' or `one', respectively. As a
+ * consequence, there are two different variants for element-wise
+ * operations whose names correspond to their intuitive meanings:
+ * - #grb::eWiseAdd (neutral), and
+ * - #grb::eWiseMul (annihilating).
+ * The above two primitives require a semiring. The same functionality is
+ * provided by #grb::eWiseApply depending on whether a monoid or operator
+ * is provided:
+ * - #grb::eWiseApply using monoids (neutral),
+ * - #grb::eWiseApply using operators (annihilating).
+ *
+ * \note However, #grb::eWiseAdd and #grb::eWiseMul provide in-place semantics,
+ * while #grb::eWiseApply does not.
+ *
+ * \note An #grb::eWiseAdd with some semiring and a #grb::eWiseApply using its
+ * additive monoid are thus equivalent when operating on empty
+ * outputs.
+ *
+ * \note An #grb::eWiseMul with some semiring and a #grb::eWiseApply using its
+ * multiplicative operator thus are equivalent when operating on empty
+ * outputs.
+ *
+ * @tparam descr The descriptor to be used. Optional; the default is
+ * #grb::descriptors::no_operation.
+ * @tparam OP The operator to use.
+ * @tparam InputType1 The value type of the left-hand vector.
+ * @tparam InputType2 The value type of the right-hand vector.
+ * @tparam OutputType The value type of the output vector.
+ *
+ * @param[out] z The output vector.
+ * @param[in] x The left-hand input vector.
+ * @param[in] y The right-hand input vector.
+ * @param[in] op The operator \f$ \odot \f$.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a x, \a y and \a z do not
+ * match. All input data containers are left untouched
+ * if this exit code is returned; it will be as though
+ * this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
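+ *
+ * \par Example
+ * A sketch (illustrative only) of the intersection semantics of the
+ * operator variant; the vector names, values, and the times-operator are
+ * assumptions:
+ * \code
+ * const size_t n = 10;
+ * grb::Vector< double > z( n ), x( n ), y( n );
+ * grb::setElement( x, 1.0, 0 ); // x has its only nonzero at index 0
+ * grb::setElement( y, 2.0, 1 ); // y has its only nonzero at index 1
+ * grb::operators::mul< double > times;
+ * // the supports of x and y do not intersect, so z remains empty
+ * grb::RC rc = grb::eWiseApply( z, x, y, times );
+ * \endcode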
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP, enum Backend backend,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< InputType1, backend, Coords > &x,
+ const Vector< InputType2, backend, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "In eWiseApply ([T1]<-[T2]<-[T3]), operator variant\n";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseApplyOpAAA_base = false;
+ assert( should_not_call_eWiseApplyOpAAA_base );
+#endif
+ (void) z;
+ (void) x;
+ (void) y;
+ (void) op;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Computes \f$ z = x \odot y \f$, out of place, masked operator variant.
+ *
+ * Calculates the element-wise operation on the elements of two vectors,
+ * \f$ z = x \odot y \f$, using the given operator. The input and output
+ * vectors must be of equal length.
+ *
+ * For all valid indices \a i of \a z, its element \f$ z_i \f$ after the call
+ * to this function completes equals \f$ x_i \odot y_i \f$. Any old entries
+ * of \a z are removed. Entries \a i that are missing from \a x, from \a y,
+ * or from both, are skipped, as are entries \a i for which \a mask
+ * evaluates false.
+ *
+ * After a successful call to this primitive, the nonzero structure of \a z
+ * will match that of the intersection of \a x and \a y, restricted to
+ * entries where \a mask evaluates true.
+ *
+ * \note When applying element-wise operators on sparse vectors using
+ * semirings, there is a difference between interpreting missing values
+ * as an annihilating identity or as a neutral identity; intuitively,
+ * such identities are known as `zero' or `one', respectively. As a
+ * consequence, there are two different variants for element-wise
+ * operations whose names correspond to their intuitive meanings:
+ * - #grb::eWiseAdd (neutral), and
+ * - #grb::eWiseMul (annihilating).
+ * The above two primitives require a semiring. The same functionality is
+ * provided by #grb::eWiseApply depending on whether a monoid or operator
+ * is provided:
+ * - #grb::eWiseApply using monoids (neutral),
+ * - #grb::eWiseApply using operators (annihilating).
+ *
+ * \note However, #grb::eWiseAdd and #grb::eWiseMul provide in-place semantics,
+ * while #grb::eWiseApply does not.
+ *
+ * \note An #grb::eWiseAdd with some semiring and a #grb::eWiseApply using its
+ * additive monoid are thus equivalent when operating on empty
+ * outputs.
+ *
+ * \note An #grb::eWiseMul with some semiring and a #grb::eWiseApply using its
+ * multiplicative operator thus are equivalent when operating on empty
+ * outputs.
+ *
+ * @tparam descr The descriptor to be used. Optional; the default is
+ * #grb::descriptors::no_operation.
+ * @tparam OP The operator to use.
+ * @tparam InputType1 The value type of the left-hand vector.
+ * @tparam InputType2 The value type of the right-hand vector.
+ * @tparam OutputType The value type of the output vector.
+ * @tparam MaskType The value type of the output mask vector.
+ *
+ * @param[out] z The output vector.
+ * @param[in] mask The output mask.
+ * @param[in] x The left-hand input vector.
+ * @param[in] y The right-hand input vector.
+ * @param[in] op The operator \f$ \odot \f$.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a mask, \a x, \a y, and
+ * \a z do not match. All input data containers are left
+ * untouched if this exit code is returned; it will be
+ * as though this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP, enum Backend backend,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< MaskType, backend, Coords > &mask,
+ const Vector< InputType1, backend, Coords > &x,
+ const Vector< InputType2, backend, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "In masked eWiseApply ([T1]<-[T2]<-[T3], operator, base)\n";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseApplyOpAMAA_base = false;
+ assert( should_not_call_eWiseApplyOpAMAA_base );
+#endif
+ (void) z;
+ (void) mask;
+ (void) x;
+ (void) y;
+ (void) op;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Computes \f$ z = x \odot y \f$, out of place, monoid variant.
+ *
+ * Calculates the element-wise operation on the elements of two vectors,
+ * \f$ z = x \odot y \f$, using the given monoid. The input and output
+ * vectors must be of equal length.
+ *
+ * For all valid indices \a i of \a z, its element \f$ z_i \f$ after the call
+ * to this function completes equals \f$ x_i \odot y_i \f$. Any old entries
+ * of \a z are removed.
+ *
+ * After a successful call to this primitive, the nonzero structure of \a z
+ * will match that of the union of \a x and \a y. An implementing backend may
+ * skip processing indices \a i that are not in the union of the nonzero
+ * structure of \a x and \a y.
+ *
+ * \note When applying element-wise operators on sparse vectors using
+ * semirings, there is a difference between interpreting missing values
+ * as an annihilating identity or as a neutral identity; intuitively,
+ * such identities are known as `zero' or `one', respectively. As a
+ * consequence, there are two different variants for element-wise
+ * operations whose names correspond to their intuitive meanings:
+ * - #grb::eWiseAdd (neutral), and
+ * - #grb::eWiseMul (annihilating).
+ * The above two primitives require a semiring. The same functionality is
+ * provided by #grb::eWiseApply depending on whether a monoid or operator
+ * is provided:
+ * - #grb::eWiseApply using monoids (neutral),
+ * - #grb::eWiseApply using operators (annihilating).
+ *
+ * \note However, #grb::eWiseAdd and #grb::eWiseMul provide in-place semantics,
+ * while #grb::eWiseApply does not.
+ *
+ * \note An #grb::eWiseAdd with some semiring and a #grb::eWiseApply using its
+ * additive monoid are thus equivalent when operating on empty
+ * outputs.
+ *
+ * \note An #grb::eWiseMul with some semiring and a #grb::eWiseApply using its
+ * multiplicative operator thus are equivalent when operating on empty
+ * outputs.
+ *
+ * @tparam descr The descriptor to be used. Optional; the default is
+ * #grb::descriptors::no_operation.
+ * @tparam Monoid The monoid to use.
+ * @tparam InputType1 The value type of the left-hand vector.
+ * @tparam InputType2 The value type of the right-hand vector.
+ * @tparam OutputType The value type of the output vector.
+ *
+ * @param[out] z The output vector.
+ * @param[in] x The left-hand input vector.
+ * @param[in] y The right-hand input vector.
+ * @param[in] monoid The monoid structure that \f$ \odot \f$ corresponds to.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a x, \a y and \a z do not
+ * match. All input data containers are left untouched
+ * if this exit code is returned; it will be as though
+ * this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
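+ *
+ * \par Example
+ * A sketch (illustrative only) contrasting the monoid variant with the
+ * operator variant: missing entries act as the monoid identity, so the
+ * output structure is the union of the input structures. All names and
+ * values are assumptions:
+ * \code
+ * const size_t n = 10;
+ * grb::Vector< double > z( n ), x( n ), y( n );
+ * grb::setElement( x, 1.0, 0 );
+ * grb::setElement( y, 2.0, 1 );
+ * grb::Monoid<
+ *     grb::operators::add< double >, grb::identities::zero
+ * > plusM;
+ * // z will hold 1.0 at index 0 and 2.0 at index 1
+ * grb::RC rc = grb::eWiseApply( z, x, y, plusM );
+ * \endcode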
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid, enum Backend backend,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< InputType1, backend, Coords > &x,
+ const Vector< InputType2, backend, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "In unmasked eWiseApply ([T1]<-[T2]<-[T3], monoid, base)\n";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseApplyMonoidAAA_base = false;
+ assert( should_not_call_eWiseApplyMonoidAAA_base );
+#endif
+ (void) z;
+ (void) x;
+ (void) y;
+ (void) monoid;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Computes \f$ z = x \odot y \f$, out of place, masked monoid variant.
+ *
+ * Calculates the element-wise operation on the elements of two vectors,
+ * \f$ z = x \odot y \f$, using the given monoid. The input and output
+ * vectors must be of equal length.
+ *
+ * For all valid indices \a i of \a z, its element \f$ z_i \f$ after the call
+ * to this function completes equals \f$ x_i \odot y_i \f$. Any old entries
+ * of \a z are removed. Entries \a i for which \a mask evaluates false
+ * will be skipped.
+ *
+ * \note When applying element-wise operators on sparse vectors using
+ * semirings, there is a difference between interpreting missing values
+ * as an annihilating identity or as a neutral identity; intuitively,
+ * such identities are known as `zero' or `one', respectively. As a
+ * consequence, there are two different variants for element-wise
+ * operations whose names correspond to their intuitive meanings:
+ * - #grb::eWiseAdd (neutral), and
+ * - #grb::eWiseMul (annihilating).
+ * The above two primitives require a semiring. The same functionality is
+ * provided by #grb::eWiseApply depending on whether a monoid or operator
+ * is provided:
+ * - #grb::eWiseApply using monoids (neutral),
+ * - #grb::eWiseApply using operators (annihilating).
+ *
+ * \note However, #grb::eWiseAdd and #grb::eWiseMul provide in-place semantics,
+ * while #grb::eWiseApply does not.
+ *
+ * \note An #grb::eWiseAdd with some semiring and a #grb::eWiseApply using its
+ * additive monoid are thus equivalent when operating on empty
+ * outputs.
+ *
+ * \note An #grb::eWiseMul with some semiring and a #grb::eWiseApply using its
+ * multiplicative operator thus are equivalent when operating on empty
+ * outputs.
+ *
+ * @tparam descr The descriptor to be used. Optional; the default is
+ * #grb::descriptors::no_operation.
+ * @tparam Monoid The monoid to use.
+ * @tparam InputType1 The value type of the left-hand vector.
+ * @tparam InputType2 The value type of the right-hand vector.
+ * @tparam OutputType The value type of the output vector.
+ * @tparam MaskType The value type of the mask vector.
+ *
+ * @param[out] z The output vector.
+ * @param[in] mask The output mask.
+ * @param[in] x The left-hand input vector.
+ * @param[in] y The right-hand input vector.
+ * @param[in] monoid The monoid structure that \f$ \odot \f$ corresponds to.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a mask, \a x, \a y,
+ * and \a z do not match. All input data containers are
+ * left untouched if this exit code is returned; it
+ * will be as though this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
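+ *
+ * \par Example
+ * A sketch (illustrative only) combining masking with the monoid (union)
+ * semantics; all names and values are assumptions:
+ * \code
+ * const size_t n = 10;
+ * grb::Vector< double > z( n ), x( n ), y( n );
+ * grb::Vector< bool > mask( n );
+ * grb::setElement( mask, true, 0 ); // only index 0 may be written
+ * grb::setElement( x, 1.0, 0 );
+ * grb::setElement( y, 2.0, 1 );
+ * grb::Monoid<
+ *     grb::operators::add< double >, grb::identities::zero
+ * > plusM;
+ * // index 1 is in the union of x and y, but the mask filters it out;
+ * // z will hold 1.0 at index 0 only
+ * grb::RC rc = grb::eWiseApply( z, mask, x, y, plusM );
+ * \endcode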
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid, enum Backend backend,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< MaskType, backend, Coords > &mask,
+ const Vector< InputType1, backend, Coords > &x,
+ const Vector< InputType2, backend, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "In masked eWiseApply ([T1]<-[T2]<-[T3], monoid, base)\n";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseApplyMonoidAMAA_base = false;
+ assert( should_not_call_eWiseApplyMonoidAMAA_base );
+#endif
+ (void) z;
+ (void) mask;
+ (void) x;
+ (void) y;
+ (void) monoid;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Calculates the element-wise addition of two vectors, \f$ z += x .+ y \f$,
+ * under a given semiring.
+ *
+ * \note This is an in-place operation.
+ *
+ * \deprecated This function has been deprecated since v0.5. It may be removed
+ * at v1.0 of ALP/GraphBLAS, or at any earlier version.
+ *
+ * \note A call to this function is equivalent to two in-place fold operations
+ * using the additive monoid of the given semiring. Please update any
+ * code that calls #grb::eWiseAdd with such a sequence as soon as
+ * possible.
+ *
+ * \note We may consider providing this function as an algorithm in the
+ * #grb::algorithms namespace, similar to #grb::algorithms::mpv. Please
+ * let the maintainers know if you would prefer such a solution over
+ * outright removal and replacement with two folds.
+ *
+ * @tparam descr The descriptor to be used. Optional; default is
+ * #grb::descriptors::no_operation.
+ * @tparam Ring The semiring type to perform the element-wise addition
+ * on.
+ * @tparam InputType1 The left-hand side input type to the additive operator
+ * of the \a ring.
+ * @tparam InputType2 The right-hand side input type to the additive operator
+ * of the \a ring.
+ * @tparam OutputType The result type of the additive operator of the
+ * \a ring.
+ *
+ * @param[out] z The output vector of type \a OutputType. This may be a
+ * sparse vector.
+ * @param[in] x The left-hand input vector of type \a InputType1. This may
+ * be a sparse vector.
+ * @param[in] y The right-hand input vector of type \a InputType2. This may
+ * be a sparse vector.
+ * @param[in] ring The generalized semiring under which to perform this
+ * element-wise addition.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * \note There is also a masked variant of #grb::eWiseAdd, as well as variants
+ * where \a x and/or \a y are scalars.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a x, \a y, and \a z do
+ * not match. All input data containers are left
+ * untouched; it will be as though this call was never
+ * made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \parblock
+ * \par Valid descriptors
+ * grb::descriptors::no_operation, grb::descriptors::no_casting,
+ * grb::descriptors::dense.
+ *
+ * \note Invalid descriptors will be ignored.
+ *
+ * If #grb::descriptors::no_casting is specified, then 1) the third domain of
+ * \a ring must match \a InputType1, 2) the fourth domain of \a ring must match
+ * \a InputType2, 3) the fourth domain of \a ring must match \a OutputType. If
+ * one of these is not true, the code shall not compile.
+ * \endparblock
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
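+ *
+ * \par Example
+ * A migration sketch (illustrative only) of the fold-based replacement that
+ * the above note suggests. It assumes a plus-times semiring over doubles
+ * and that the semiring exposes its additive monoid via getAdditiveMonoid():
+ * \code
+ * grb::Semiring<
+ *     grb::operators::add< double >, grb::operators::mul< double >,
+ *     grb::identities::zero, grb::identities::one
+ * > ring;
+ * // instead of the deprecated grb::eWiseAdd( z, x, y, ring ):
+ * grb::RC rc = grb::foldl( z, x, ring.getAdditiveMonoid() );
+ * if( rc == grb::SUCCESS ) {
+ *     rc = grb::foldl( z, y, ring.getAdditiveMonoid() );
+ * }
+ * \endcode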
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring, enum Backend backend,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseAdd(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< InputType1, backend, Coords > &x,
+ const Vector< InputType2, backend, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "in eWiseAdd ([T1] <- [T2] + [T3]), unmasked, base";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseAddAAA_base = false;
+ assert( should_not_call_eWiseAddAAA_base );
+#endif
+ (void) z;
+ (void) x;
+ (void) y;
+ (void) ring;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Calculates the element-wise addition, \f$ z += \alpha .+ y \f$, under a
+ * given semiring.
+ *
+ * \note This is an in-place operation.
+ *
+ * \deprecated This function has been deprecated since v0.5. It may be removed
+ * at v1.0 of ALP/GraphBLAS, or at any earlier version.
+ *
+ * \note A call to this function is equivalent to two in-place fold operations
+ * using the additive monoid of the given semiring. Please update any
+ * code that calls #grb::eWiseAdd with such a sequence as soon as
+ * possible.
+ *
+ * \note We may consider providing this function as an algorithm in the
+ * #grb::algorithms namespace, similar to #grb::algorithms::mpv. Please
+ * let the maintainers know if you would prefer such a solution over
+ * outright removal and replacement with two folds.
+ *
+ * @tparam descr The descriptor to be used. Optional; default is
+ * #grb::descriptors::no_operation.
+ * @tparam Ring The semiring type to perform the element-wise addition
+ * on.
+ * @tparam InputType1 The left-hand side input type to the additive operator
+ * of the \a ring.
+ * @tparam InputType2 The right-hand side input type to the additive operator
+ * of the \a ring.
+ * @tparam OutputType The result type of the additive operator of the
+ * \a ring.
+ *
+ * @param[out] z The output vector of type \a OutputType. This may be a
+ * sparse vector.
+ * @param[in] alpha The left-hand input scalar of type \a InputType1.
+ * @param[in] y The right-hand input vector of type \a InputType2. This may
+ * be a sparse vector.
+ * @param[in] ring The generalized semiring under which to perform this
+ * element-wise addition.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a y and \a z do not
+ * match. All input data containers are left untouched;
+ * it will be as though this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \parblock
+ * \par Valid descriptors
+ * grb::descriptors::no_operation, grb::descriptors::no_casting,
+ * grb::descriptors::dense.
+ *
+ * \note Invalid descriptors will be ignored.
+ *
+ * If #grb::descriptors::no_casting is specified, then 1) the third domain of
+ * \a ring must match \a InputType1, 2) the fourth domain of \a ring must match
+ * \a InputType2, 3) the fourth domain of \a ring must match \a OutputType. If
+ * one of these is not true, the code shall not compile.
+ * \endparblock
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring, enum Backend backend,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename Coords
+ >
+ RC eWiseAdd(
+ Vector< OutputType, backend, Coords > &z,
+ const InputType1 alpha,
+ const Vector< InputType2, backend, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "in eWiseAdd ([T1] <- T2 + [T3]), unmasked, base";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseAddASA_base = false;
+ assert( should_not_call_eWiseAddASA_base );
+#endif
+ (void) z;
+ (void) alpha;
+ (void) y;
+ (void) ring;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Calculates the element-wise addition, \f$ z += x .+ \beta \f$, under a
+ * given semiring.
+ *
+ * \note This is an in-place operation.
+ *
+ * \deprecated This function has been deprecated since v0.5. It may be removed
+ * at v1.0 of ALP/GraphBLAS, or at any earlier version.
+ *
+ * \note A call to this function is equivalent to two in-place fold operations
+ * using the additive monoid of the given semiring. Please update any
+ * code that calls #grb::eWiseAdd with such a sequence as soon as
+ * possible.
+ *
+ * \note We may consider providing this function as an algorithm in the
+ * #grb::algorithms namespace, similar to #grb::algorithms::mpv. Please
+ * let the maintainers know if you would prefer such a solution over
+ * outright removal and replacement with two folds.
+ *
+ * @tparam descr The descriptor to be used. Optional; default is
+ * #grb::descriptors::no_operation.
+ * @tparam Ring The semiring type to perform the element-wise addition
+ * on.
+ * @tparam InputType1 The left-hand side input type to the additive operator
+ * of the \a ring.
+ * @tparam InputType2 The right-hand side input type to the additive operator
+ * of the \a ring.
+ * @tparam OutputType The result type of the additive operator of the
+ * \a ring.
+ *
+ * @param[out] z The output vector of type \a OutputType. This may be a
+ * sparse vector.
+ * @param[in] x The left-hand input vector of type \a InputType1. This may
+ * be a sparse vector.
+ * @param[in] beta The right-hand input scalar of type \a InputType2.
+ * @param[in] ring The generalized semiring under which to perform this
+ * element-wise addition.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a x and \a z do not
+ * match. All input data containers are left untouched;
+ * it will be as though this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \parblock
+ * \par Valid descriptors
+ * grb::descriptors::no_operation, grb::descriptors::no_casting,
+ * grb::descriptors::dense.
+ *
+ * \note Invalid descriptors will be ignored.
+ *
+ * If #grb::descriptors::no_casting is specified, then 1) the third domain of
+ * \a ring must match \a InputType1, 2) the fourth domain of \a ring must match
+ * \a InputType2, 3) the fourth domain of \a ring must match \a OutputType. If
+ * one of these is not true, the code shall not compile.
+ * \endparblock
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring, enum Backend backend,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename Coords
+ >
+ RC eWiseAdd(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< InputType1, backend, Coords > &x,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "in eWiseAdd ([T1] <- [T2] + T3), unmasked, base";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseAddAAS_base = false;
+ assert( should_not_call_eWiseAddAAS_base );
+#endif
+ (void) z;
+ (void) x;
+ (void) beta;
+ (void) ring;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Calculates the element-wise addition, \f$ z += \alpha .+ \beta \f$, under a
+ * given semiring.
+ *
+ * \note This is an in-place operation.
+ *
+ * \deprecated This function has been deprecated since v0.5. It may be removed
+ * at v1.0 of ALP/GraphBLAS, or at any earlier version.
+ *
+ * \note A call to this function is equivalent to two in-place fold operations
+ * using the additive monoid of the given semiring. Please update any
+ * code that calls #grb::eWiseAdd with such a sequence as soon as
+ * possible.
+ *
+ * \note We may consider providing this function as an algorithm in the
+ * #grb::algorithms namespace, similar to #grb::algorithms::mpv. Please
+ * let the maintainers know if you would prefer such a solution over
+ * outright removal and replacement with two folds.
+ *
+ * @tparam descr The descriptor to be used. Optional; default is
+ * #grb::descriptors::no_operation.
+ * @tparam Ring The semiring type to perform the element-wise addition
+ * on.
+ * @tparam InputType1 The left-hand side input type to the additive operator
+ * of the \a ring.
+ * @tparam InputType2 The right-hand side input type to the additive operator
+ * of the \a ring.
+ * @tparam OutputType The result type of the additive operator of the
+ * \a ring.
+ *
+ * @param[out] z The output vector of type \a OutputType. This may be a
+ * sparse vector.
+ * @param[in] alpha The left-hand input scalar of type \a InputType1.
+ * @param[in] beta The right-hand input scalar of type \a InputType2.
+ * @param[in] ring The generalized semiring under which to perform this
+ * element-wise addition.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \parblock
+ * \par Valid descriptors
+ * grb::descriptors::no_operation, grb::descriptors::no_casting,
+ * grb::descriptors::dense.
+ *
+ * \note Invalid descriptors will be ignored.
+ *
+ * If #grb::descriptors::no_casting is specified, then 1) the third domain of
+ * \a ring must match \a InputType1, 2) the fourth domain of \a ring must match
+ * \a InputType2, 3) the fourth domain of \a ring must match \a OutputType. If
+ * one of these is not true, the code shall not compile.
+ * \endparblock
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring, enum Backend backend,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename Coords
+ >
+ RC eWiseAdd(
+ Vector< OutputType, backend, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "in eWiseAdd ([T1] <- T2 + T3), unmasked, base";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseAddASS_base = false;
+ assert( should_not_call_eWiseAddASS_base );
+#endif
+ (void) z;
+ (void) alpha;
+ (void) beta;
+ (void) ring;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Calculates the element-wise addition of two vectors, \f$ z += x .+ y \f$,
+ * under a given semiring, masked variant.
+ *
+ * \note This is an in-place operation.
+ *
+ * \deprecated This function has been deprecated since v0.5. It may be removed
+ * at v1.0 of ALP/GraphBLAS, or at any earlier version.
+ *
+ * \note A call to this function is equivalent to two in-place fold operations
+ * using the additive monoid of the given semiring. Please update any
+ * code that calls #grb::eWiseAdd with such a sequence as soon as
+ * possible.
+ *
+ * \note We may consider providing this function as an algorithm in the
+ * #grb::algorithms namespace, similar to #grb::algorithms::mpv. Please
+ * let the maintainers know if you would prefer such a solution over
+ * outright removal and replacement with two folds.
+ *
+ * @tparam descr The descriptor to be used. Optional; default is
+ * #grb::descriptors::no_operation.
+ * @tparam Ring The semiring type to perform the element-wise addition
+ * on.
+ * @tparam InputType1 The left-hand side input type to the additive operator
+ * of the \a ring.
+ * @tparam InputType2 The right-hand side input type to the additive operator
+ * of the \a ring.
+ * @tparam OutputType The result type of the additive operator of the
+ * \a ring.
+ * @tparam MaskType The nonzero type of the output mask vector.
+ *
+ * @param[out] z The output vector of type \a OutputType. This may be a
+ * sparse vector.
+ * @param[in] mask The output mask vector of type \a MaskType.
+ * @param[in] x The left-hand input vector of type \a InputType1. This may
+ * be a sparse vector.
+ * @param[in] y The right-hand input vector of type \a InputType2. This may
+ * be a sparse vector.
+ * @param[in] ring The generalized semiring under which to perform this
+ * element-wise addition.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * \note There are also variants where \a x and/or \a y are scalars, as well
+ * as unmasked variants.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a mask, \a x, \a y, and
+ * \a z do not match. All input data containers are left
+ * untouched; it will be as though this call was never
+ * made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \parblock
+ * \par Valid descriptors
+ * - #grb::descriptors::no_operation,
+ * - #grb::descriptors::no_casting,
+ * - #grb::descriptors::dense,
+ * - #grb::descriptors::invert_mask,
+ * - #grb::descriptors::structural, and
+ * - #grb::descriptors::structural_complement.
+ *
+ * \note Invalid descriptors will be ignored.
+ *
+ * If #grb::descriptors::no_casting is specified, then 1) the third domain of
+ * \a ring must match \a InputType1, 2) the fourth domain of \a ring must match
+ * \a InputType2, 3) the fourth domain of \a ring must match \a OutputType. If
+ * one of these is not true, the code shall not compile.
+ * \endparblock
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
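+ *
+ * \par Example
+ * A sketch (illustrative only) of masked use; the inverted mask and the
+ * plus-times semiring are assumptions, not requirements:
+ * \code
+ * const size_t n = 10;
+ * grb::Vector< double > z( n ), x( n ), y( n );
+ * grb::Vector< bool > mask( n );
+ * grb::Semiring<
+ *     grb::operators::add< double >, grb::operators::mul< double >,
+ *     grb::identities::zero, grb::identities::one
+ * > ring;
+ * // updates z only at positions where mask evaluates false
+ * grb::RC rc = grb::eWiseAdd< grb::descriptors::invert_mask >(
+ *     z, mask, x, y, ring );
+ * \endcode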
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring, enum Backend backend,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseAdd(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< MaskType, backend, Coords > &mask,
+ const Vector< InputType1, backend, Coords > &x,
+ const Vector< InputType2, backend, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "in eWiseAdd ([T1] <- [T2] + [T3]), masked, base";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseAddAMAA_base = false;
+ assert( should_not_call_eWiseAddAMAA_base );
+#endif
+ (void) z;
+ (void) mask;
+ (void) x;
+ (void) y;
+ (void) ring;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Calculates the element-wise addition, \f$ z += \alpha .+ y \f$, under a
+ * given semiring, masked variant.
+ *
+ * \note This is an in-place operation.
+ *
+ * \deprecated This function has been deprecated since v0.5. It may be removed
+ * at v1.0 of ALP/GraphBLAS, or at any earlier version.
+ *
+ * \note A call to this function is equivalent to two in-place fold operations
+ * using the additive monoid of the given semiring. Please update any
+ * code that calls #grb::eWiseAdd with such a sequence as soon as
+ * possible.
+ *
+ * \note We may consider providing this function as an algorithm in the
+ * #grb::algorithms namespace, similar to #grb::algorithms::mpv. Please
+ * let the maintainers know if you would prefer such a solution over
+ * outright removal and replacement with two folds.
+ *
+ * @tparam descr The descriptor to be used. Optional; default is
+ * #grb::descriptors::no_operation.
+ * @tparam Ring The semiring type to perform the element-wise addition
+ * on.
+ * @tparam InputType1 The left-hand side input type to the additive operator
+ * of the \a ring.
+ * @tparam InputType2 The right-hand side input type to the additive operator
+ * of the \a ring.
+ * @tparam OutputType The result type of the additive operator of the
+ * \a ring.
+ * @tparam MaskType The nonzero type of the output mask vector.
+ *
+ * @param[out] z The output vector of type \a OutputType. This may be a
+ * sparse vector.
+ * @param[in] mask The output mask.
+ * @param[in] alpha The left-hand input scalar of type \a InputType1.
+ * @param[in] y The right-hand input vector of type \a InputType2. This may
+ * be a sparse vector.
+ * @param[in] ring The generalized semiring under which to perform this
+ * element-wise addition.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a mask, \a y, and \a z do
+ * not match. All input data containers are left
+ * untouched; it will be as though this call was never
+ * made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \parblock
+ * \par Valid descriptors
+ * - #grb::descriptors::no_operation,
+ * - #grb::descriptors::no_casting,
+ * - #grb::descriptors::dense,
+ * - #grb::descriptors::invert_mask,
+ * - #grb::descriptors::structural, and
+ * - #grb::descriptors::structural_complement.
+ *
+ * \note Invalid descriptors will be ignored.
+ *
+ * If #grb::descriptors::no_casting is specified, then 1) the third domain of
+ * \a ring must match \a InputType1, 2) the fourth domain of \a ring must match
+ * \a InputType2, 3) the fourth domain of \a ring must match \a OutputType. If
+ * one of these is not true, the code shall not compile.
+ * \endparblock
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring, enum Backend backend,
+ typename InputType1, typename InputType2,
+ typename OutputType, typename MaskType,
+ typename Coords
+ >
+ RC eWiseAdd(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< MaskType, backend, Coords > &mask,
+ const InputType1 alpha,
+ const Vector< InputType2, backend, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "in eWiseAdd ([T1] <- T2 + [T3]), masked, base";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseAddAMSA_base = false;
+ assert( should_not_call_eWiseAddAMSA_base );
+#endif
+ (void) z;
+ (void) mask;
+ (void) alpha;
+ (void) y;
+ (void) ring;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Calculates the element-wise addition, \f$ z += x .+ \beta \f$, under a
+ * given semiring, masked variant.
+ *
+ * \note This is an in-place operation.
+ *
+ * \deprecated This function has been deprecated since v0.5. It may be removed
+ * at v1.0 of ALP/GraphBLAS, or at any earlier version.
+ *
+ * \note A call to this function is equivalent to two in-place fold operations
+ * using the additive monoid of the given semiring. Please update any
+ * code that calls #grb::eWiseAdd with such a sequence as soon as
+ * possible.
+ *
+ * \note We may consider providing this function as an algorithm in the
+ * #grb::algorithms namespace, similar to #grb::algorithms::mpv. Please
+ * let the maintainers know if you would prefer such a solution over
+ * outright removal and replacement with two folds.
+ *
+ * @tparam descr The descriptor to be used. Optional; default is
+ * #grb::descriptors::no_operation.
+ * @tparam Ring The semiring type to perform the element-wise addition
+ * on.
+ * @tparam InputType1 The left-hand side input type to the additive operator
+ * of the \a ring.
+ * @tparam InputType2 The right-hand side input type to the additive operator
+ * of the \a ring.
+ * @tparam OutputType The result type of the additive operator of the
+ * \a ring.
+ * @tparam MaskType The nonzero type of the output mask vector.
+ *
+ * @param[out] z The output vector of type \a OutputType. This may be a
+ * sparse vector.
+ * @param[in] mask The output mask.
+ * @param[in] x The left-hand input vector of type \a InputType1. This may
+ * be a sparse vector.
+ * @param[in] beta The right-hand input scalar of type \a InputType2.
+ * @param[in] ring The generalized semiring under which to perform this
+ * element-wise addition.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a mask, \a x, and \a z do
+ * not match. All input data containers are left
+ * untouched; it will be as though this call was never
+ * made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \parblock
+ * \par Valid descriptors
+ * - #grb::descriptors::no_operation,
+ * - #grb::descriptors::no_casting,
+ * - #grb::descriptors::dense,
+ * - #grb::descriptors::invert_mask,
+ * - #grb::descriptors::structural, and
+ * - #grb::descriptors::structural_complement.
+ *
+ * \note Invalid descriptors will be ignored.
+ *
+ * If #grb::descriptors::no_casting is specified, then 1) the third domain of
+ * \a ring must match \a InputType1, 2) the fourth domain of \a ring must match
+ * \a InputType2, 3) the fourth domain of \a ring must match \a OutputType. If
+ * one of these is not true, the code shall not compile.
+ * \endparblock
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring, enum Backend backend,
+ typename InputType1, typename InputType2,
+ typename OutputType, typename MaskType,
+ typename Coords
+ >
+ RC eWiseAdd(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< MaskType, backend, Coords > &mask,
+ const Vector< InputType1, backend, Coords > &x,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "in eWiseAdd ([T1] <- [T2] + T3), masked, base";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseAddAMAS_base = false;
+ assert( should_not_call_eWiseAddAMAS_base );
+#endif
+ (void) z;
+ (void) mask;
+ (void) x;
+ (void) beta;
+ (void) ring;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Calculates the element-wise addition, \f$ z += \alpha .+ \beta \f$, under a
+ * given semiring, masked variant.
+ *
+ * \note This is an in-place operation.
+ *
+ * \deprecated This function has been deprecated since v0.5. It may be removed
+ * at v1.0 of ALP/GraphBLAS, or at any earlier version.
+ *
+ * \note A call to this function is equivalent to two in-place fold operations
+ * using the additive monoid of the given semiring. Please update any
+ * code that calls #grb::eWiseAdd with such a sequence as soon as
+ * possible.
+ *
+ * \note We may consider providing this function as an algorithm in the
+ * #grb::algorithms namespace, similar to #grb::algorithms::mpv. Please
+ * let the maintainers know if you would prefer such a solution over
+ * outright removal and replacement with two folds.
+ *
+ * @tparam descr The descriptor to be used. Optional; default is
+ * #grb::descriptors::no_operation.
+ * @tparam Ring The semiring type to perform the element-wise addition
+ * on.
+ * @tparam InputType1 The left-hand side input type to the additive operator
+ * of the \a ring.
+ * @tparam InputType2 The right-hand side input type to the additive operator
+ * of the \a ring.
+ * @tparam OutputType The result type of the additive operator of the
+ * \a ring.
+ * @tparam MaskType The nonzero type of the output mask vector.
+ *
+ * @param[out] z The output vector of type \a OutputType. This may be a
+ * sparse vector.
+ * @param[in] mask The output mask.
+ * @param[in] alpha The left-hand input scalar of type \a InputType1.
+ * @param[in] beta The right-hand input scalar of type \a InputType2.
+ * @param[in] ring The generalized semiring under which to perform this
+ * element-wise addition.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH If \a mask and \a z do not have the same size.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \parblock
+ * \par Valid descriptors
+ * - #grb::descriptors::no_operation,
+ * - #grb::descriptors::no_casting,
+ * - #grb::descriptors::dense,
+ * - #grb::descriptors::invert_mask,
+ * - #grb::descriptors::structural, and
+ * - #grb::descriptors::structural_complement.
+ *
+ * \note Invalid descriptors will be ignored.
+ *
+ * If #grb::descriptors::no_casting is specified, then 1) the third domain of
+ * \a ring must match \a InputType1, 2) the fourth domain of \a ring must match
+ * \a InputType2, 3) the fourth domain of \a ring must match \a OutputType. If
+ * one of these is not true, the code shall not compile.
+ * \endparblock
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring, enum Backend backend,
+ typename InputType1, typename InputType2,
+ typename OutputType, typename MaskType,
+ typename Coords
+ >
+ RC eWiseAdd(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< MaskType, backend, Coords > &mask,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "in eWiseAdd ([T1] <- T2 + T3), masked, base";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseAddAMSS_base = false;
+ assert( should_not_call_eWiseAddAMSS_base );
+#endif
+ (void) z;
+ (void) mask;
+ (void) alpha;
+ (void) beta;
+ (void) ring;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * In-place element-wise multiplication of two vectors, \f$ z += x .* y \f$,
+ * under a given semiring.
+ *
+ * @tparam descr The descriptor to be used. Optional; the default is
+ * #grb::descriptors::no_operation.
+ * @tparam Ring The semiring type to perform the element-wise multiply
+ * with.
+ * @tparam InputType1 The left-hand side input type.
+ * @tparam InputType2 The right-hand side input type.
+ * @tparam OutputType The output type.
+ *
+ * @param[out] z The output vector of type \a OutputType.
+ * @param[in] x The left-hand input vector of type \a InputType1.
+ * @param[in] y The right-hand input vector of type \a InputType2.
+ * @param[in] ring The generalized semiring under which to perform this
+ * element-wise multiplication.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a x, \a y, and \a z do
+ * not match. All input data containers are left
+ * untouched if this exit code is returned; it will be
+ * as though this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \warning Unlike #grb::eWiseApply using monoids, missing elements in
+ *          sparse input vectors are here interpreted as the zero identity,
+ *          and thus annihilate instead of acting as a neutral monoid
+ *          identity. Therefore, even when \a z is empty on input,
+ *          #grb::eWiseApply with monoids does not exhibit the same
+ *          behaviour as this function. The #grb::eWiseApply with operators
+ *          \em is similar, except that this function is in-place while
+ *          #grb::eWiseApply is not.
+ *
+ * \parblock
+ * \par Valid descriptors
+ * - #grb::descriptors::no_operation,
+ * - #grb::descriptors::no_casting, and
+ * - #grb::descriptors::dense.
+ *
+ * \note Invalid descriptors will be ignored.
+ *
+ * If #grb::descriptors::no_casting is specified, then 1) the first domain of
+ * \a ring must match \a InputType1, 2) the second domain of \a ring must match
+ * \a InputType2, 3) the third domain of \a ring must match \a OutputType. If
+ * one of these is not true, the code shall not compile.
+ * \endparblock
+ *
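+ * \par Example
+ * A minimal usage sketch, assuming equally sized vectors \a z, \a x, and
+ * \a y, and using the plus-times semiring over doubles for illustration:
+ * \code
+ * grb::Semiring<
+ *     grb::operators::add< double >, grb::operators::mul< double >,
+ *     grb::identities::zero, grb::identities::one
+ * > ring;
+ * grb::RC rc = grb::eWiseMul( z, x, y, ring );
+ * // on success: z_i += x_i * y_i wherever both x_i and y_i exist
+ * \endcode
+ *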
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring, enum Backend backend,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< InputType1, backend, Coords > &x,
+ const Vector< InputType2, backend, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "in eWiseMul ([T1] <- [T2] * [T3]), unmasked, base";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseMulAAA_base = false;
+ assert( should_not_call_eWiseMulAAA_base );
+#endif
+ (void) z;
+ (void) x;
+ (void) y;
+ (void) ring;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * In-place element-wise multiplication of a scalar and vector,
+ * \f$ z += \alpha .* y \f$, under a given semiring.
+ *
+ * @tparam descr The descriptor to be used. Optional; the default is
+ * #grb::descriptors::no_operation.
+ * @tparam Ring The semiring type to perform the element-wise multiply
+ * with.
+ * @tparam InputType1 The left-hand side input type.
+ * @tparam InputType2 The right-hand side input type.
+ * @tparam OutputType The output type.
+ *
+ * @param[out] z The output vector of type \a OutputType.
+ * @param[in] alpha The left-hand input scalar of type \a InputType1.
+ * @param[in] y The right-hand input vector of type \a InputType2.
+ * @param[in] ring The generalized semiring under which to perform this
+ * element-wise multiplication.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a y and \a z do not
+ * match. All input data containers are left untouched
+ * if this exit code is returned; it will be as though
+ * this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \warning Unlike #grb::eWiseApply using monoids, given sparse vectors,
+ * missing elements in sparse input vectors are here interpreted as
+ * the zero identity, thus annihilating instead of acting as a
+ * monoid identity. Hence, even when \a z is empty on input,
+ * #grb::eWiseApply with monoids does not exhibit the same behaviour as
+ * this function. The #grb::eWiseApply with operators \em is similar,
+ * except that this function is in-place and #grb::eWiseApply is not.
+ *
+ * \parblock
+ * \par Valid descriptors
+ * - #grb::descriptors::no_operation,
+ * - #grb::descriptors::no_casting, and
+ * - #grb::descriptors::dense.
+ *
+ * \note Invalid descriptors will be ignored.
+ *
+ * If #grb::descriptors::no_casting is specified, then 1) the first domain of
+ * \a ring must match \a InputType1, 2) the second domain of \a ring must match
+ * \a InputType2, 3) the third domain of \a ring must match \a OutputType. If
+ * one of these is not true, the code shall not compile.
+ * \endparblock
+ *
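+ * \par Example
+ * A minimal usage sketch, assuming equally sized vectors \a z and \a y and
+ * a plus-times semiring instance \a ring over doubles, as in the preceding
+ * example:
+ * \code
+ * grb::RC rc = grb::eWiseMul( z, 2.0, y, ring );
+ * // on success: z_i += 2.0 * y_i wherever y_i exists
+ * \endcode
+ *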
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring, enum Backend backend,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, backend, Coords > &z,
+ const InputType1 alpha,
+ const Vector< InputType2, backend, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "in eWiseMul ([T1] <- T2 * [T3]), unmasked, base";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseMulASA_base = false;
+ assert( should_not_call_eWiseMulASA_base );
+#endif
+ (void) z;
+ (void) alpha;
+ (void) y;
+ (void) ring;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * In-place element-wise multiplication of a vector and scalar,
+ * \f$ z += x .* \beta \f$, under a given semiring.
+ *
+ * @tparam descr The descriptor to be used. Optional; the default is
+ * #grb::descriptors::no_operation.
+ * @tparam Ring The semiring type to perform the element-wise multiply
+ * with.
+ * @tparam InputType1 The left-hand side input type.
+ * @tparam InputType2 The right-hand side input type.
+ * @tparam OutputType The output type.
+ *
+ * @param[out] z The output vector of type \a OutputType.
+ * @param[in] x The left-hand input vector of type \a InputType1.
+ * @param[in] beta The right-hand input scalar of type \a InputType2.
+ * @param[in] ring The generalized semiring under which to perform this
+ * element-wise multiplication.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a x and \a z do not
+ * match. All input data containers are left untouched
+ * if this exit code is returned; it will be as though
+ * this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \warning Unlike #grb::eWiseApply using monoids, given sparse vectors,
+ * missing elements in sparse input vectors are here interpreted as
+ * the zero identity, thus annihilating instead of acting as a
+ * monoid identity. Hence, even when \a z is empty on input,
+ * #grb::eWiseApply with monoids does not exhibit the same behaviour as
+ * this function. The #grb::eWiseApply with operators \em is similar,
+ * except that this function is in-place and #grb::eWiseApply is not.
+ *
+ * \parblock
+ * \par Valid descriptors
+ * - #grb::descriptors::no_operation,
+ * - #grb::descriptors::no_casting, and
+ * - #grb::descriptors::dense.
+ *
+ * \note Invalid descriptors will be ignored.
+ *
+ * If #grb::descriptors::no_casting is specified, then 1) the first domain of
+ * \a ring must match \a InputType1, 2) the second domain of \a ring must match
+ * \a InputType2, 3) the third domain of \a ring must match \a OutputType. If
+ * one of these is not true, the code shall not compile.
+ * \endparblock
+ *
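+ * \par Example
+ * A minimal usage sketch, assuming equally sized vectors \a z and \a x and
+ * a plus-times semiring instance \a ring over doubles:
+ * \code
+ * grb::RC rc = grb::eWiseMul( z, x, 0.5, ring );
+ * // on success: z_i += x_i * 0.5 wherever x_i exists
+ * \endcode
+ *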
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring, enum Backend backend,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< InputType1, backend, Coords > &x,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "in eWiseMul ([T1] <- [T2] * T3), unmasked, base";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseMulAAS_base = false;
+ assert( should_not_call_eWiseMulAAS_base );
+#endif
+ (void) z;
+ (void) x;
+ (void) beta;
+ (void) ring;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * In-place element-wise multiplication of two scalars,
+ * \f$ z += \alpha .* \beta \f$, under a given semiring.
+ *
+ * @tparam descr The descriptor to be used. Optional; the default is
+ * #grb::descriptors::no_operation.
+ * @tparam Ring The semiring type to perform the element-wise multiply
+ * with.
+ * @tparam InputType1 The left-hand side input type.
+ * @tparam InputType2 The right-hand side input type.
+ * @tparam OutputType The output type.
+ *
+ * @param[out] z The output vector of type \a OutputType.
+ * @param[in] alpha The left-hand input scalar of type \a InputType1.
+ * @param[in] beta The right-hand input scalar of type \a InputType2.
+ * @param[in] ring The generalized semiring under which to perform this
+ * element-wise multiplication.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \warning Unlike #grb::eWiseApply using monoids, given sparse vectors,
+ * missing elements in sparse input vectors are here interpreted as
+ * the zero identity, thus annihilating instead of acting as a
+ * monoid identity. Hence, even when \a z is empty on input,
+ * #grb::eWiseApply with monoids does not exhibit the same behaviour as
+ * this function. The #grb::eWiseApply with operators \em is similar,
+ * except that this function is in-place and #grb::eWiseApply is not.
+ *
+ * \parblock
+ * \par Valid descriptors
+ * - #grb::descriptors::no_operation,
+ * - #grb::descriptors::no_casting, and
+ * - #grb::descriptors::dense.
+ *
+ * \note Invalid descriptors will be ignored.
+ *
+ * If #grb::descriptors::no_casting is specified, then 1) the first domain of
+ * \a ring must match \a InputType1, 2) the second domain of \a ring must match
+ * \a InputType2, 3) the third domain of \a ring must match \a OutputType. If
+ * one of these is not true, the code shall not compile.
+ * \endparblock
+ *
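+ * \par Example
+ * A minimal usage sketch, assuming a vector \a z and a plus-times semiring
+ * instance \a ring over doubles:
+ * \code
+ * grb::RC rc = grb::eWiseMul( z, 2.0, 0.5, ring );
+ * // on success: the product 2.0 * 0.5 has been added element-wise into z
+ * \endcode
+ *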
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring, enum Backend backend,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, backend, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "in eWiseMul ([T1] <- T2 * T3), unmasked, base";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseMulASS_base = false;
+ assert( should_not_call_eWiseMulASS_base );
+#endif
+ (void) z;
+ (void) alpha;
+ (void) beta;
+ (void) ring;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * In-place element-wise multiplication of two vectors, \f$ z += x .* y \f$,
+ * under a given semiring, masked variant.
+ *
+ * @tparam descr The descriptor to be used. Optional; the default is
+ * #grb::descriptors::no_operation.
+ * @tparam Ring The semiring type to perform the element-wise multiply
+ * with.
+ * @tparam InputType1 The left-hand side input type.
+ * @tparam InputType2 The right-hand side input type.
+ * @tparam OutputType The output vector type.
+ * @tparam MaskType The output mask type.
+ *
+ * @param[in,out] z The output vector of type \a OutputType.
+ * @param[in] mask The output mask of type \a MaskType.
+ * @param[in] x The left-hand input vector of type \a InputType1.
+ * @param[in] y The right-hand input vector of type \a InputType2.
+ * @param[in] ring The generalized semiring under which to perform this
+ * element-wise multiplication.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a mask, \a x, \a y, and
+ * \a z do not match. All input data containers are left
+ * untouched if this exit code is returned; it will be
+ * as though this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \warning Unlike #grb::eWiseApply using monoids, given sparse vectors,
+ * missing elements in sparse input vectors are here interpreted as
+ * the zero identity, thus annihilating instead of acting as a
+ * monoid identity. Hence, even when \a z is empty on input,
+ * #grb::eWiseApply with monoids does not exhibit the same behaviour as
+ * this function. The #grb::eWiseApply with operators \em is similar,
+ * except that this function is in-place and #grb::eWiseApply is not.
+ *
+ * \parblock
+ * \par Valid descriptors
+ * - #grb::descriptors::no_operation,
+ * - #grb::descriptors::no_casting,
+ * - #grb::descriptors::dense,
+ * - #grb::descriptors::invert_mask,
+ * - #grb::descriptors::structural, and
+ * - #grb::descriptors::structural_complement.
+ *
+ * \note Invalid descriptors will be ignored.
+ *
+ * If #grb::descriptors::no_casting is specified, then 1) the first domain of
+ * \a ring must match \a InputType1, 2) the second domain of \a ring must match
+ * \a InputType2, 3) the third domain of \a ring must match \a OutputType. If
+ * one of these is not true, the code shall not compile.
+ * \endparblock
+ *
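+ * \par Example
+ * A minimal usage sketch, assuming equally sized vectors \a z, \a mask,
+ * \a x, and \a y, and a plus-times semiring instance \a ring over doubles;
+ * the optional descriptor inverts the mask:
+ * \code
+ * grb::RC rc = grb::eWiseMul< grb::descriptors::invert_mask >(
+ *     z, mask, x, y, ring );
+ * // on success: z_i += x_i * y_i only where the mask evaluates false
+ * \endcode
+ *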
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring, enum Backend backend,
+ typename InputType1, typename InputType2,
+ typename OutputType, typename MaskType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< MaskType, backend, Coords > &mask,
+ const Vector< InputType1, backend, Coords > &x,
+ const Vector< InputType2, backend, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "in eWiseMul ([T1] <- [T2] * [T3]), masked, base";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseMulAMAA_base = false;
+ assert( should_not_call_eWiseMulAMAA_base );
+#endif
+ (void) z;
+ (void) mask;
+ (void) x;
+ (void) y;
+ (void) ring;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * In-place element-wise multiplication of a scalar and vector,
+ * \f$ z += \alpha .* y \f$, under a given semiring, masked variant.
*
- * All functions return an error code of the enum-type #grb::RC.
+ * @tparam descr The descriptor to be used. Optional; the default is
+ * #grb::descriptors::no_operation.
+ * @tparam Ring The semiring type to perform the element-wise multiply
+ * with.
+ * @tparam InputType1 The left-hand side input type.
+ * @tparam InputType2 The right-hand side input type.
+ * @tparam OutputType The output vector type.
+ * @tparam MaskType The output mask type.
*
- * Primitives which produce vector output:
- * -# #grb::set (three variants);
- * -# #grb::foldr (in-place reduction to the right, scalar-to-vector and
- * vector-to-vector);
- * -# #grb::foldl (in-place reduction to the left, scalar-to-vector and
- * vector-to-vector);
- * -# #grb::eWiseApply (out-of-place application of a binary function);
- * -# #grb::eWiseAdd (in-place addition of two vectors, a vector and a
- * scalar, into a vector); and
- * -# #grb::eWiseMul (in-place multiplication of two vectors, a vector and a
- * scalar, into a vector).
+ * @param[in,out] z The output vector of type \a OutputType.
+ * @param[in] mask The output mask of type \a MaskType.
+ * @param[in] alpha The left-hand input scalar of type \a InputType1.
+ * @param[in] y The right-hand input vector of type \a InputType2.
+ * @param[in] ring The generalized semiring under which to perform this
+ * element-wise multiplication.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
*
- * \note When #grb::eWiseAdd or #grb::eWiseMul using two input scalars is
- * required, consider forming first the resulting scalar using level-0
- * primitives, and then using #grb::set, #grb::foldl, or #grb::foldr, as
- * appropriate.
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a mask, \a y, and \a z do
+ * not match. All input data containers are left
+ * untouched if this exit code is returned; it will be
+ * as though this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
*
- * Primitives that produce scalar output:
- * -# #grb::foldr (reduction to the right, vector-to-scalar);
- * -# #grb::foldl (reduction to the left, vector-to-scalar).
+ * \warning Unlike #grb::eWiseApply using monoids, given sparse vectors,
+ * missing elements in sparse input vectors are here interpreted as
+ * the zero identity, thus annihilating instead of acting as a
+ * monoid identity. Hence, even when \a z is empty on input,
+ * #grb::eWiseApply with monoids does not exhibit the same behaviour as
+ * this function. The #grb::eWiseApply with operators \em is similar,
+ * except that this function is in-place and #grb::eWiseApply is not.
*
- * Primitives that do not require an operator, monoid, or semiring:
- * -# #grb::set (three variants).
+ * \parblock
+ * \par Valid descriptors
+ * - #grb::descriptors::no_operation,
+ * - #grb::descriptors::no_casting,
+ * - #grb::descriptors::dense,
+ * - #grb::descriptors::invert_mask,
+ * - #grb::descriptors::structural, and
+ * - #grb::descriptors::structural_complement.
*
- * Primitives that could take an operator (see #grb::operators):
- * -# #grb::foldr, #grb::foldl, and #grb::eWiseApply.
- * Such operators typically can only be applied on \em dense vectors, i.e.,
- * vectors with #grb::nnz equal to its #grb::size. Operations on sparse
- * vectors require an intepretation of missing vector elements, which monoids
- * or semirings provide.
+ * \note Invalid descriptors will be ignored.
*
- * Therefore, all aforementioned functions are also defined for monoids instead
- * of operators.
+ * If #grb::descriptors::no_casting is specified, then 1) the first domain of
+ * \a ring must match \a InputType1, 2) the second domain of \a ring must match
+ * \a InputType2, 3) the third domain of \a ring must match \a OutputType. If
+ * one of these is not true, the code shall not compile.
+ * \endparblock
*
- * The following functions are defined for monoids and semirings, but not for
- * operators alone:
- * -# #grb::eWiseAdd (in-place addition).
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
*
- * The following functions require a semiring, and are not defined for
- * operators or monoids alone:
- * -# #grb::dot (in-place reduction of two vectors into a scalar); and
- * -# #grb::eWiseMul (in-place multiplication).
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring, enum Backend backend,
+ typename InputType1, typename InputType2,
+ typename OutputType, typename MaskType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< MaskType, backend, Coords > &mask,
+ const InputType1 alpha,
+ const Vector< InputType2, backend, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "in eWiseMul ([T1] <- T2 * [T3]), masked, base";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseMulAMSA_base = false;
+ assert( should_not_call_eWiseMulAMSA_base );
+#endif
+ (void) z;
+ (void) mask;
+ (void) alpha;
+ (void) y;
+ (void) ring;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * In-place element-wise multiplication of a vector and scalar,
+ * \f$ z += x .* \beta \f$, under a given semiring, masked variant.
*
- * Sometimes, operations that are defined for semirings we would sometimes also
- * like enabled on \em improper semirings. ALP/GraphBLAS statically checks most
- * properties required for composing proper semirings, and as such, attempts to
- * compose improper ones will result in a compilation error. In such cases, we
- * allow to pass an additive monoid and a multiplicative operator instead of a
- * semiring. The following functions allow this:
- * -# #grb::dot, #grb::eWiseAdd, #grb::eWiseMul.
- * The given multiplicative operator can be any binary operator, and in
- * particular does not need to be associative.
+ * @tparam descr The descriptor to be used. Optional; the default is
+ * #grb::descriptors::no_operation.
+ * @tparam Ring The semiring type to perform the element-wise multiply
+ * with.
+ * @tparam InputType1 The left-hand side input type.
+ * @tparam InputType2 The right-hand side input type.
+ * @tparam OutputType The output vector type.
+ * @tparam MaskType The output mask type.
*
- * The algebraic structures lost with improper semirings typically correspond to
- * distributivity, zero being an annihilator to multiplication, as well as the
- * concept of \em one. Due to the latter lost structure, the above functions on
- * impure semirings are \em not defined for pattern inputs.
+ * @param[in,out] z The output vector of type \a OutputType.
+ * @param[in] mask The output mask of type \a MaskType.
+ * @param[in] x The left-hand input vector of type \a InputType1.
+ * @param[in] beta The right-hand input scalar of type \a InputType2.
+ * @param[in] ring The generalized semiring under which to perform this
+ * element-wise multiplication.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
*
- * \warning I.e., any attempt to use containers of the form
- * \code
- * grb::Vector
- * grb::Matrix
- * \endcode
- * with an improper semiring will result in a compile-time error.
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a mask, \a x and \a z do
+ * not match. All input data containers are left
+ * untouched if this exit code is returned; it will be
+ * as though this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
*
- * \note Pattern containers are perfectly fine to use with proper semirings.
+ * \warning Unlike #grb::eWiseApply using monoids, given sparse vectors,
+ * missing elements in sparse input vectors are here interpreted as
+ * the zero identity, thus annihilating instead of acting as a
+ * monoid identity. Hence, even when \a z is empty on input,
+ * #grb::eWiseApply with monoids does not exhibit the same behaviour as
+ * this function. The #grb::eWiseApply with operators \em is similar,
+ * except that this function is in-place and #grb::eWiseApply is not.
*
- * \warning If an improper semiring does not have the property that the zero
- * identity acts as an annihilator over the multiplicative operator,
- * then the result of #grb::eWiseMul may be unintuitive. Please take
- * great care in the use of improper semrings.
+ * \parblock
+ * \par Valid descriptors
+ * - #grb::descriptors::no_operation,
+ * - #grb::descriptors::no_casting,
+ * - #grb::descriptors::dense,
+ * - #grb::descriptors::invert_mask,
+ * - #grb::descriptors::structural, and
+ * - #grb::descriptors::structural_complement.
*
- * For fusing multiple BLAS-1 style operations on any number of inputs and
- * outputs, users can pass their own operator function to be executed for
- * every index \a i.
- * -# grb::eWiseLambda.
- * This requires manual application of operators, monoids, and/or semirings
- * via level-0 interface -- see #grb::apply, #grb::foldl, and #grb::foldr.
+ * \note Invalid descriptors will be ignored.
*
- * For all of these functions, the element types of input and output types
- * do not have to match the domains of the given operator, monoid, or
- * semiring unless the #grb::descriptors::no_casting descriptor was passed.
+ * If #grb::descriptors::no_casting is specified, then 1) the first domain of
+ * \a ring must match \a InputType1, 2) the second domain of \a ring must match
+ * \a InputType2, 3) the third domain of \a ring must match \a OutputType. If
+ * one of these is not true, the code shall not compile.
+ * \endparblock
*
- * An implementation, whether blocking or non-blocking, should have clear
- * performance semantics for every sequence of graphBLAS calls, no matter
- * whether those are made from sequential or parallel contexts. Backends
- * may define different performance semantics depending on which #grb::Phase
- * primitives execute in.
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
*
- * @{
+ * @see perfSemantics
*/
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring, enum Backend backend,
+ typename InputType1, typename InputType2,
+ typename OutputType, typename MaskType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< MaskType, backend, Coords > &mask,
+ const Vector< InputType1, backend, Coords > &x,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "in eWiseMul ([T1] <- [T2] * T3), masked, base";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseMulAMAS_base = false;
+ assert( should_not_call_eWiseMulAMAS_base );
+#endif
+ (void) z;
+ (void) mask;
+ (void) x;
+ (void) beta;
+ (void) ring;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
/**
- * A standard vector to use for mask parameters.
+ * In-place element-wise multiplication of two scalars,
+ * \f$ z += \alpha .* \beta \f$, under a given semiring, masked variant.
*
- * Indicates that no mask shall be used.
+ * @tparam descr The descriptor to be used. Optional; the default is
+ * #grb::descriptors::no_operation.
+ * @tparam Ring The semiring type to perform the element-wise multiply
+ * with.
+ * @tparam InputType1 The left-hand side input type.
+ * @tparam InputType2 The right-hand side input type.
+ * @tparam OutputType The output vector type.
+ * @tparam MaskType The output mask type.
*
- * \internal Do not use this symbol within backend implementations.
+ * @param[in,out] z The output vector of type \a OutputType.
+ * @param[in] mask The output mask of type \a MaskType.
+ * @param[in] alpha The left-hand input scalar of type \a InputType1.
+ * @param[in] beta The right-hand input scalar of type \a InputType2.
+ * @param[in] ring The generalized semiring under which to perform this
+ * element-wise multiplication.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH If \a mask and \a z have different size.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \warning Unlike #grb::eWiseApply using monoids, given sparse vectors,
+ * missing elements in sparse input vectors are here interpreted as
+ * the zero identity, thus annihilating instead of acting as a
+ * monoid identity. Hence, even when \a z is empty on input,
+ * #grb::eWiseApply with monoids does not exhibit the same behaviour as
+ * this function. The #grb::eWiseApply with operators \em is similar,
+ * except that this function is in-place and #grb::eWiseApply is not.
+ *
+ * \parblock
+ * \par Valid descriptors
+ * - #grb::descriptors::no_operation,
+ * - #grb::descriptors::no_casting,
+ * - #grb::descriptors::dense,
+ * - #grb::descriptors::invert_mask,
+ * - #grb::descriptors::structural, and
+ * - #grb::descriptors::structural_complement.
+ *
+ * \note Invalid descriptors will be ignored.
+ *
+ * If #grb::descriptors::no_casting is specified, then 1) the first domain of
+ * \a ring must match \a InputType1, 2) the second domain of \a ring must match
+ * \a InputType2, 3) the third domain of \a ring must match \a OutputType. If
+ * one of these is not true, the code shall not compile.
+ * \endparblock
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
*/
- #define NO_MASK Vector< bool >( 0 )
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring, enum Backend backend,
+ typename InputType1, typename InputType2,
+ typename OutputType, typename MaskType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< MaskType, backend, Coords > &mask,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "in eWiseMul ([T1] <- T2 * T3), masked, base";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseMulAMSS_base = false;
+ assert( should_not_call_eWiseMulAMSS_base );
+#endif
+ (void) z;
+ (void) mask;
+ (void) alpha;
+ (void) beta;
+ (void) ring;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
/**
- * Executes an arbitrary element-wise user-defined function \a f using any
- * number of vectors of equal length, following the nonzero pattern of the
- * given vector \a x.
- *
- * The user-defined function is passed as a lambda which can capture, at
- * the very least, other instances of type grb::Vector. Use of this function
- * is preferable whenever multiple element-wise operations are requested that
- * use one or more identical input vectors. Performing the computation one
- * after the other in blocking mode would require the same vector to be
- * streamed multiple times, while with this function the operations can be
- * fused explicitly instead.
- *
- * It shall always be legal to capture non-GraphBLAS objects for read access
- * only. It shall \em not be legal to capture instances of type grb::Matrix
- * for read and/or write access.
- *
- * If grb::Properties::writableCaptured evaluates true then captured
- * non-GraphBLAS objects can also be written to, not just read from. The
- * captured variable is, however, completely local to the calling user process
- * only-- it will not be synchronised between user processes.
- * As a rule of thumb, data-centric GraphBLAS implementations \em cannot
- * support this and will thus have grb::Properties::writableCaptured evaluate
- * to false. A portable GraphBLAS algorithm should provide a different code
- * path to handle this case.
- * When it is legal to write to captured scalar, this function can, e.g., be
- * used to perform reduction-like operations on any number of equally sized
- * input vectors. This would be preferable to a chained number of calls to
- * grb::dot in case where some vectors are shared between subsequent calls,
- * for example; the shared vectors are streamed only once using this lambda-
- * enabled function.
- *
- * \warning The lambda shall only be executed on the data local to the user
- * process calling this function! This is different from the various
- * fold functions, or grb::dot, in that the semantics of those
- * functions always end with a globally synchronised result. To
- * achieve the same effect with user-defined lambdas, the users
- * should manually prescribe how to combine the local results into
- * global ones, for instance, by a subsequent call to
- * grb::collectives<>::allreduce.
- *
- * \note This is an addition to the GraphBLAS. It is alike user-defined
- * operators, monoids, and semirings, except it allows execution on
- * arbitrarily many inputs and arbitrarily many outputs.
+ * Executes an arbitrary element-wise user-defined function \a f on any number
+ * of vectors of equal length.
+ *
+ * \warning This is a relatively advanced function. It is recommended to read
+ * this specification and its warnings before using it, or to instead
+ * exclusively use the other primitives in \ref BLAS1.
+ *
+ * The vectors touched by \a f can be accessed in a read-only or a read/write
+ * fashion. The function \a f must be parametrised in a global index \em i, and
+ * \a f is only allowed to access elements of the captured vectors at that
+ * specific index.
+ *
+ * \warning Any attempt to access a vector element at a position differing
+ * from \em i will result in undefined behaviour.
+ *
+ * All vectors captured by \a f must furthermore be given as additional
+ * (variadic) arguments to this primitive. Captured vectors can only be used
+ * for dereferencing elements at a given position \em i; any other use invokes
+ * undefined behaviour.
+ *
+ * \warning In particular, captured vectors may not be passed to other
+ * ALP/GraphBLAS primitives \em within \a f.
+ *
+ * This primitive will execute \a f on all indices where the first given
+ * vector argument has nonzeroes. All other indices \em i will be ignored.
+ *
+ * \warning Therefore, any container of which \a f references the \em i-th
+ * element must indeed have a nonzero at position \em i, or otherwise
+ * undefined behaviour is invoked.
+ *
+ * This primitive hence allows a user to implement any level-1 like BLAS
+ * functionality over any number of input/output vectors, and also allows
+ * computing multiple level-1 (like) BLAS functionalities in a single pass
+ * over the involved containers.
+ *
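+ * For illustration, a fused element-wise multiply-add with a local partial
+ * reduction might read as follows; the vectors \a x, \a y, and \a z are
+ * assumed to share one common (dense) nonzero structure:
+ * \code
+ * double local_sum = 0.0; // writable capture; see the notes on
+ *                         // grb::Properties::writableCaptured below
+ * grb::RC rc = grb::eWiseLambda( [&x,&y,&z,&local_sum]( const size_t i ) {
+ *     z[ i ] += x[ i ] * y[ i ];
+ *     local_sum += z[ i ];
+ * }, x, y, z );
+ * \endcode
+ *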
+ * \note Since the introduction of the nonblocking backend, rewriting \a f in
+ * terms of native ALP/GraphBLAS primitives no longer implies performance
+ * penalties (when compiling for the nonblocking backend)-- rather, the
+ * nonblocking backend is likely to do better than manually fusing
+ * multiple level-1 like operations using this primitive, especially when
+ * the captured vectors are small relative to the private caches on the
+ * target architecture.
+ *
+ * The function \a f may also capture scalars for read-only access.
+ *
+ * \note As a convention, consider always passing scalars by value, since
+ * otherwise code compiled with a nonblocking backend may (and likely
+ * will) exhibit data races.
+ *
+ * If #grb::Properties::writableCaptured evaluates true then captured
+ * scalars may also safely be written to, instead of being restricted to
+ * read-only access.
+ *
+ * \note This is useful for fusing reductions within other level-1 like
+ * operations.
+ *
+ * \warning If updating scalars using this primitive, be aware that the
+ * updates are local to the current user process only.
+ *
+ * \note If, after execution of this primitive, an updated scalar is expected
+ * to be synchronised across all user processes, see #grb::collectives.
+ *
+ * \note As a rule of thumb, parallel GraphBLAS implementations, due to being
+ * data-centric, \em cannot support writable scalar captures and will
+ * have #grb::Properties::writableCaptured evaluate to false.
+ *
+ * \note A portable ALP/GraphBLAS algorithm should therefore either not rely on
+ * read/write captured scalars passed to this primitive, \em or provide
+ * different code paths to handle the two cases of the
+ * #grb::Properties::writableCaptured backend property.
+ *
+ * \note If the above sounds too tedious, consider rewriting \a f in terms of
+ * native ALP/GraphBLAS functions, with the scalar reductions performed
+ * by the scalar variants of #grb::foldl and #grb::foldr, for example.
+ *
+ * \warning When compiling with a blocking backend, rewriting \a f in terms of
+ * native GraphBLAS primitives typically results in a slowdown due to
+ * this primitive naturally fusing potentially multiple operations
+ * together (which was the original motivation of Yzelman et al., 2020
+ * for introducing this primitive). Rewriting \a f into a (sequence of)
+ * native GraphBLAS primitives does \em not carry a performance penalty
+ * when compiling with a nonblocking backend, however.
+ *
+ * \note This is an addition to the GraphBLAS C specification. It is alike
+ * user-defined operators, monoids, and semirings, except that this
+ * primitive allows execution on arbitrarily many inputs and arbitrarily
+ * many outputs.
*
* @tparam Func the user-defined lambda function type.
* @tparam DataType the type of the user-supplied vector example.
@@ -200,46 +3617,46 @@ namespace grb {
* @param[in] f The user-supplied lambda. This lambda should only capture
* and reference vectors of the same length as \a x. The lambda
* function should prescribe the operations required to execute
- * at a given index \a i. Captured GraphBLAS vectors can access
- * that element via the operator[]. It is illegal to access any
- * element not at position \a i. The lambda takes only the single
- * parameter \a i of type const size_t. Captured
- * scalars will not be globally updated-- the user must program
- * this explicitly. Scalars and other non-GraphBLAS containers
- * are always local to their user process.
+ * at a given index \a i. Captured ALP/GraphBLAS vectors can
+ * access that element via the operator[]. It is illegal to access
+ * any element not at position \a i. The lambda takes only the
+ * single parameter \a i of type const size_t.
+ * Captured scalars will not be globally updated-- the user must
+ * program this explicitly. Scalars and other non-GraphBLAS
+ * containers are always local to their user process.
* @param[in] x The vector the lambda will be executed on. This argument
* determines which indices \a i will be accessed during the
* elementwise operation-- elements with indices \a i that
* do not appear in \a x will be skipped during evaluation of
* \a f.
- * @param[in] args All vectors the lambda is to access elements of. Must be of
- * the same length as \a x. If this constraint is violated,
- * grb::MISMATCH shall be returned. This is a variadic
- * argument and can contain any number of containers of type
- * grb::Vector, passed as though they were separate
- * arguments.
- *
- * \note In future GraphBLAS implementations, \a args, apart from doing
- * dimension checking, should also facilitate any data distribution
- * necessary to successfully execute the element-wise operation. Current
- * implementations do not require this since they use the same static
- * distribution for all containers.
- *
- * \warning Using a grb::Vector inside a lambda passed to this function while
- * not passing that same vector into \a args, will result in undefined
- * behaviour.
- *
- * \note It would be natural to have \a x equal to one of the captured
- * GraphBLAS vectors in \a f.
+ *
+ * The remaining arguments must collect all vectors the lambda is to access
+ * elements of. Such vectors must be of the same length as \a x. If this
+ * constraint is violated, #grb::MISMATCH shall be returned.
+ *
+ * \note These are passed using variadic arguments and so can contain any
+ * number of containers of type #grb::Vector.
+ *
+ * \note Distributed-memory ALP/GraphBLAS backends, apart from performing
+ * dimension checking, may also require data redistribution in case
+ * different vectors are distributed differently.
+ *
+ * \warning Using a #grb::Vector inside a lambda passed to this function while
+ * not passing that same vector into its variadic argument list will
+ * result in undefined behaviour.
*
* \warning Due to the constraints on \a f described above, it is illegal to
* capture some vector \a y and have the following line in the body
 * of \a f: x[i] += x[i+1]. Vectors can only be
* dereferenced at position \a i and \a i alone.
*
- * @return grb::SUCCESS When the lambda is successfully executed.
- * @return grb::MISMATCH When two or more vectors passed to \a args are not of
- * equal length.
+ * @return #grb::SUCCESS When the lambda is successfully executed.
+ * @return #grb::MISMATCH When two or more vectors passed to this primitive
+ * are not of equal length.
+ * @return #grb::PANIC When ALP/GraphBLAS has encountered an unrecoverable
+ * error. The state of ALP becomes undefined after
+ * having returned this error code, and users can only
+ * attempt to exit the application gracefully.
*
* \parblock
* \par Example.
@@ -285,9 +3702,11 @@ namespace grb {
* grb::dot( alpha, x, y, ring );
* \endcode
*
- * The version using the lambdas, however, is expected to execute
- * faster as both \a x and \a y are streamed only once, while the
- * latter code may stream both vectors twice.
+ * When compiled using a blocking ALP/GraphBLAS backend, the version using the
+ * lambda is expected to execute faster, as both \a x and \a y are streamed
+ * only once, while the non-lambda code may stream both vectors twice. This
+ * performance difference disappears when the non-lambda code is compiled
+ * using a nonblocking backend instead.
* \endparblock
*
* \warning The following code is invalid:
@@ -306,17 +3725,16 @@ namespace grb {
* Only a Vector::lambda_reference to position exactly equal to \a i
* may be used within this function.
*
- * \warning There is no similar concept in the official GraphBLAS specs.
- *
- * \warning Captured scalars will be local to the user process executing the
- * lambda. To retrieve the global dot product, an allreduce must
- * explicitly be called.
- *
* @see Vector::operator[]()
* @see Vector::lambda_reference
*
- * \todo Revise specification regarding recent changes on phases, performance
- * semantics, and capacities.
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive. It is
+ * expected that the defined performance semantics depend on the given lambda
+ * function \a f, the size of the containers passed into this primitive, as
+ * well as how many containers are passed into this primitive.
+ *
+ * @see perfSemantics
*/
template<
typename Func,
@@ -333,8 +3751,8 @@ namespace grb {
const bool should_not_call_base_vector_ewiselambda = false;
assert( should_not_call_base_vector_ewiselambda );
#endif
- (void)f;
- (void)x;
+ (void) f;
+ (void) x;
return UNSUPPORTED;
}
@@ -375,9 +3793,9 @@ namespace grb {
* @tparam Monoid The monoid to use for reduction.
* @tparam InputType The type of the elements in the supplied ALP/GraphBLAS
* vector \a y.
+ * @tparam IOType The type of the output scalar \a x.
* @tparam MaskType The type of the elements in the supplied ALP/GraphBLAS
* vector \a mask.
- * @tparam IOType The type of the output scalar \a x.
*
* @param[out] x The result of the reduction.
* @param[in] y Any ALP/GraphBLAS vector. This vector may be sparse.
@@ -390,6 +3808,9 @@ namespace grb {
* @return grb::ILLEGAL If the provided input vector \a y was not dense, while
* #grb::descriptors::dense was given.
*
+ * @see grb::foldr provides similar in-place functionality.
+ * @see grb::eWiseApply provides out-of-place semantics.
+ *
* \parblock
* \par Valid descriptors
* grb::descriptors::no_operation, grb::descriptors::no_casting,
@@ -405,18 +3826,10 @@ namespace grb {
* shall not compile.
* \endparblock
*
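+ * \par Example
+ * A minimal usage sketch of the masked variant, assuming the mask directly
+ * follows \a y in the argument list, and assuming a vector \a y with a
+ * Boolean mask \a mask of equal size:
+ * \code
+ * grb::Monoid<
+ *     grb::operators::add< double >, grb::identities::zero
+ * > plusMonoid;
+ * double sum = 0.0;
+ * grb::RC rc = grb::foldl( sum, y, mask, plusMonoid );
+ * // on success: sum equals the reduction of all unmasked elements of y
+ * \endcode
+ *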
- * \parblock
* \par Performance semantics
- * Backends must specify performance semantics in the amount of work, intra-
- * process data movement, inter-process data movement, and the number of
- * user process synchronisations required. They should also specify whether
- * any system calls may be made, in particularly those related to dynamic
- * memory management. If new memory may be allocated, they must specify how
- * much.
- * \endparblock
+ * Each backend must define performance semantics for this primitive.
*
- * @see grb::foldr provides similar in-place functionality.
- * @see grb::eWiseApply provides out-of-place semantics.
+ * @see perfSemantics
*/
template<
Descriptor descr = descriptors::no_operation,
@@ -480,7 +3893,7 @@ namespace grb {
/**
* Folds a vector into a scalar, left-to-right.
*
- * Unmasked operator variant.
+ * Unmasked operator variant. See masked variant for the full documentation.
*
* \deprecated This signature is deprecated. It was implemented for reference
* (and reference_omp), but could not be implemented for BSP1D and
@@ -581,18 +3994,147 @@ namespace grb {
}
/**
- * Dot product over a given semiring.
+ * Calculates the dot product, \f$ z += (x,y) \f$, under a given additive
+ * monoid and multiplicative operator.
+ *
+ * @tparam descr The descriptor to be used. Optional; the default
+ * descriptor is #grb::descriptors::no_operation.
+ * @tparam AddMonoid The monoid used for addition during the computation of
+ * \f$ (x,y) \f$. The same monoid is used for accumulating
+ * the result into a given scalar.
+ * @tparam AnyOp A binary operator that acts as the multiplication during
+ * \f$ (x,y) \f$.
+ * @tparam OutputType The output type.
+ * @tparam InputType1 The input element type of the left-hand input vector.
+ * @tparam InputType2 The input element type of the right-hand input vector.
+ *
+ * @param[in,out] z Where to fold \f$ (x,y) \f$ into.
+ * @param[in] x The left-hand input vector.
+ * @param[in] y The right-hand input vector.
+ * @param[in] addMonoid The additive monoid under which the reduction of the
+ * results of element-wise multiplications of \a x and
+ * \a y are performed.
+ * @param[in] anyOp The multiplicative operator using which element-wise
+ * multiplications of \a x and \a y are performed. This
+ * may be any binary operator.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * \note Since this primitive allows a dot product to operate under any
+ * additive monoid and any binary operator, it follows that a dot
+ * product under any semiring can be reduced to a call to this
+ * primitive instead.
+ *
+ * @return #grb::MISMATCH When the dimensions of \a x and \a y do not match.
+ * All input data containers are left untouched if this
+ * exit code is returned; it will be as though this call
+ * was never made.
+ * @return #grb::SUCCESS On successful completion of this call.
+ *
+ * \parblock
+ * \par Valid descriptors
+ * -# grb::descriptors::no_operation
+ * -# grb::descriptors::no_casting
+ * -# grb::descriptors::dense
+ *
+ * If the dense descriptor is set, this implementation returns grb::ILLEGAL if
+ * it was detected that either \a x or \a y was sparse. In this case, it shall
+ * otherwise be as though the call to this function had not occurred (no side
+ * effects).
+ * \endparblock
+ *
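+ * \par Example
+ * A minimal usage sketch, assuming equally sized vectors \a x and \a y and
+ * using a plus monoid with the standard multiplication operator over
+ * doubles for illustration:
+ * \code
+ * grb::Monoid<
+ *     grb::operators::add< double >, grb::identities::zero
+ * > plusMonoid;
+ * double z = 0.0;
+ * grb::RC rc = grb::dot( z, x, y, plusMonoid,
+ *     grb::operators::mul< double >() );
+ * // on success: z += sum_i x_i * y_i
+ * \endcode
+ *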
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class AddMonoid, class AnyOp,
+ typename OutputType, typename InputType1, typename InputType2,
+ enum Backend backend, typename Coords
+ >
+ RC dot(
+ OutputType &z,
+ const Vector< InputType1, backend, Coords > &x,
+ const Vector< InputType2, backend, Coords > &y,
+ const AddMonoid &addMonoid = AddMonoid(),
+ const AnyOp &anyOp = AnyOp(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< AddMonoid >::value &&
+ grb::is_operator< AnyOp >::value,
+ void >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "Should not call base grb::dot (monoid-operator version)\n";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_base_dot_monOp = false;
+ assert( should_not_call_base_dot_monOp );
+#endif
+ (void) z;
+ (void) x;
+ (void) y;
+ (void) addMonoid;
+ (void) anyOp;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Calculates the dot product, \f$ z += (x,y) \f$, under a given semiring.
+ *
+ * @tparam descr The descriptor to be used. Optional; default descriptor
+ * is #grb::descriptors::no_operation.
+ * @tparam Ring The semiring type to use.
+ * @tparam OutputType The output type.
+ * @tparam InputType1 The input element type of the left-hand input vector.
+ * @tparam InputType2 The input element type of the right-hand input vector.
+ *
+ * @param[in,out] z The output element \f$ z += (x,y) \f$.
+ * @param[in] x The left-hand input vector \a x.
+ * @param[in] y The right-hand input vector \a y.
+ * @param[in] ring The semiring under which to compute the dot product
+ * \f$ (x,y) \f$. The additive monoid is used to accumulate
+ * the dot product result into \a z.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH If the dimensions of \a x and \a y do not match. All
+ * input data containers are left untouched if this exit
+ * code is returned; it will be as though this call was
+ * never made.
+ *
+ * \parblock
+ * \par Valid descriptors
+ * - grb::descriptors::no_operation
+ * - grb::descriptors::no_casting
+ * - grb::descriptors::dense
+ *
+ * If the dense descriptor is set, this implementation returns #grb::ILLEGAL if
+ * it was detected that either \a x or \a y was sparse. In this case, it shall
+ * otherwise be as though the call to this function had not occurred (no side
+ * effects).
+ * \endparblock
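+ *
+ * \par Example
+ * A minimal usage sketch, assuming equally sized vectors \a x and \a y and
+ * using the plus-times semiring over doubles for illustration:
+ * \code
+ * grb::Semiring<
+ *     grb::operators::add< double >, grb::operators::mul< double >,
+ *     grb::identities::zero, grb::identities::one
+ * > ring;
+ * double z = 0.0;
+ * grb::RC rc = grb::dot( z, x, y, ring );
+ * // on success: z += sum_i x_i * y_i
+ * \endcode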
*
- * \todo Write specification.
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
*/
template<
- Descriptor descr = descriptors::no_operation, class Ring,
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
typename IOType, typename InputType1, typename InputType2,
Backend backend, typename Coords
>
- RC dot( IOType &x,
- const Vector< InputType1, backend, Coords > &left,
- const Vector< InputType2, backend, Coords > &right,
+ RC dot(
+ IOType &z,
+ const Vector< InputType1, backend, Coords > &x,
+ const Vector< InputType2, backend, Coords > &y,
const Ring &ring = Ring(),
const Phase &phase = EXECUTE,
const typename std::enable_if<
@@ -609,9 +4151,9 @@ namespace grb {
const bool should_not_call_base_dot_semiring = false;
assert( should_not_call_base_dot_semiring );
#endif
+ (void) z;
(void) x;
- (void) left;
- (void) right;
+ (void) y;
(void) ring;
(void) phase;
return UNSUPPORTED;
diff --git a/include/graphblas/base/blas2.hpp b/include/graphblas/base/blas2.hpp
index 6b1bccf55..7f99122c5 100644
--- a/include/graphblas/base/blas2.hpp
+++ b/include/graphblas/base/blas2.hpp
@@ -18,7 +18,7 @@
/**
* @file
*
- * Defines the GraphBLAS level 2 API.
+ * Defines the ALP/GraphBLAS level-2 API
*
* @author A. N. Yzelman
* @date 30th of March 2017
@@ -39,10 +39,12 @@
#include "matrix.hpp"
#include "vector.hpp"
+
namespace grb {
/**
- * \defgroup BLAS2 The Level-2 Basic Linear Algebra Subroutines (BLAS)
+ * \defgroup BLAS2 Level-2 Primitives
+ * \ingroup GraphBLAS
*
* A collection of functions that allow GraphBLAS operators, monoids, and
* semirings work on a mix of zero-dimensional, one-dimensional, and
@@ -57,181 +59,414 @@ namespace grb {
*/
/**
- * Right-handed sparse matrix times vector multiplication, \f$ u = Av \f$.
- *
- * Let \f$ u \f$ and \f$ \mathit{mask} \f$ each be a #grb::Vector of #grb::size
- * \f$ m \f$, \f$ v \f$ be a #grb::Vector of #grb::size \f$ n \f$, and let
- * \f$ A \f$ be a #Matrix with #grb::nrows \f$ m \f$ and #grb::ncols \f$ n \f$.
- * Let furthermore \f$ z \f$ be an interal vector of size \f$ m \f$.
- * A call to this function first computes \f$ z = Av \f$ over the provided
- * \a ring. It then left-folds \f$ z \f$ into \f$ u \f$ using the provided
- * \a accumulator.
- *
- * @see Vector for an in-depth description of a GraphBLAS vector.
- * @see size for retrieving the length of a given GraphBLAS vector.
- * @see Matrix for an in-depth description of a GraphBLAS matrix.
- * @see nrows for retrieving the number of rows of a given GraphBLAS matrix.
- * @see ncols for retrieving the number of columns of a given GraphBLAS
- * vector.
- *
- * Formally, the exact operation executed is
- * \f$ u_i^\mathit{out} = u_i^\mathit{in} \bigodot z_i, \f$
- * for all \f$ i \in \{ 0, 1, \ldots, m-1 \} \f$ for which
- * \f$ \mathit{mask}_i \f$ evaluates true . If there is a nonzero at
- * \f$ z_i \f$ but no nonzero at \f$ u_i^\mathit{in} \f$ then the latter is interpreted as the additive
- * identity \f$ \mathbf{0} \f$ of the given \a ring.
- * For \f$ z \f$, we formally have:
- * \f$ z_i = \bigoplus{i=0}^{m-1} \left( A_{ij} \bigotimes v_j \right), \f$
- * where \f$ \bigodot \f$ represents the \a accumulator, \f$ \bigoplus \f$
- * represents the additive operator of the provided \a ring, and
- * \f$ \bigotimes \f$ represents the multiplicative operator of \a ring. If here
- * \f$ v_j \f$ does not exist, it is considered to be equal to the additive
- * identity of the given \a ring.
- *
- * \note The additive identity of a given \a ring is an annihilator of
- * nonzeroes from \f$ A \f$ under the multiplicative operator of \a ring;
- * that is, \f$ z_i \f$ will be \f$ \mathbf{0} \f$ always. This can, of
- * course, be exploited during sparse matrix--sparse vector (SpMSpV)
- * multiplication.
- *
- * \note A good implementation is very careful about forming \f$ z \f$
- * explicitly and, even if it is formed already, is very careful about
- * making use of \f$ z \f$. Making use of an explicit buffer will result
- * in \f$ \Theta(m) \f$ data movement and may only be warrented when
- * \f$ A \f$ has many nonzeroes per row and \f$ v \f$ is dense.
- *
- * @tparam descr Any combination of one or more #grb::descriptors. When
- * ommitted, the default #grb::descriptors:no_operation will
- * be assumed.
- * @tparam Ring The generalised semi-ring the matrix--vector multiplication
- * is to be executed under.
- * @tparam IOType The type of the elements of the output vector \a u.
+ * Right-handed in-place doubly-masked sparse matrix times vector
+ * multiplication, \f$ u = u + Av \f$.
+ *
+ * Aliases to this function exist that do not include masks:
+ * - grb::mxv( u, u_mask, A, v, semiring );
+ * - grb::mxv( u, A, v, semiring );
+ * When masks are omitted, the semantics shall be the same as though a dense
+ * Boolean vector of the appropriate size with all elements set to
+ * true was given as a mask. We thus describe the semantics of the
+ * fully masked variant only.
+ *
+ * \note If only an input mask \a v_mask is intended to be given (and no output
+ * mask \a u_mask), then \a u_mask must nonetheless be explicitly given.
+ * Passing an empty Boolean vector for \a u_mask is sufficient.
+ *
+ * Let \f$ u, \mathit{u\_mask} \f$ be vectors of size \f$ m \f$, let
+ * \f$ v, \mathit{v\_mask} \f$ be vectors of size \f$ n \f$, and let
+ * \f$ A \f$ be an \f$ m \times n \f$ matrix. Then, a call to this function
+ * computes \f$ u = u + Av \f$ but:
+ * 1. only for the elements \f$ u_i \f$ for which \f$ \mathit{u\_mask}_i \f$
+ * evaluates true ; and
+ * 2. only considering the elements \f$ v_j \f$ for which
+ *    \f$ \mathit{v\_mask}_j \f$ evaluates true , and otherwise
+ * substituting the zero element under the given semiring.
+ *
+ * When multiplying a matrix nonzero element \f$ a_{ij} \in A \f$, it shall
+ * be multiplied with the vector element \f$ v_j \f$ using the multiplicative
+ * operator of the given \a semiring.
+ *
+ * When accumulating multiple contributions of multiplications of nonzeroes on
+ * some row \f$ i \f$, the additive operator of the given \a semiring shall be
+ * used.
+ *
+ * Nonzeroes resulting from computing \f$ Av \f$ are accumulated into any pre-
+ * existing values in \f$ u \f$ by the additive operator of the given
+ * \a semiring.
+ *
+ * If elements from \f$ v \f$, \f$ A \f$, or \f$ u \f$ were missing, the zero
+ * identity of the given \a semiring is substituted.
+ *
+ * If nonzero values from \f$ A \f$ were missing, the one identity of the
+ * given \a semiring is substituted.
+ *
+ * \note A nonzero in \f$ A \f$ may not have a nonzero value in case it is
+ * declared as grb::Matrix< void > .
+ *
+ * The following template arguments \em may be explicitly given:
+ *
+ * @tparam descr Any combination of one or more #grb::descriptors. When
+ *                 omitted, the default #grb::descriptors::no_operation will
+ * be assumed.
+ * @tparam Semiring The generalised semiring the matrix--vector
+ * multiplication is to be executed under.
+ *
+ * The following template arguments will be inferred from the input arguments:
+ *
+ * @tparam IOType The type of the elements of the output vector \a u.
* @tparam InputType1 The type of the elements of the input vector \a v.
* @tparam InputType2 The type of the elements of the input matrix \a A.
- * @tparam Operator The type of the \a accumulator. Must be a GraphBLAS
- * operator; see also #grb::operators.
- * @tparam InputType3 The type of the elements of the mask vector \a mask.
- * @tparam implementation Which back-end the given vectors and matrices belong
- * to. These must all belong to the same back-end.
- *
- * @param[in,out] u The output vector. Depending on the provided
- * \a accumulator, old vector values may affect new values.
- * @param[in] mask The mask vector. The vector #grb::size must be equal to
- * that of \a u, \em or it must be equal to zero. A \a mask
- * of grb::size zero will be ignored (assumed true
- * always.
- * @param[in] accumulator The operator \f$ \bigodot \f$ in the above
- * description.
- * @param[in] A The input matrix. Its #grb::nrows must equal the
- * #grb::size of \a u.
- * @param[in] v The input vector. Its #grb::size must equal the
- * #grb::ncols of \a A.
- * @param[in] ring The semiring to perform the matrix--vector multiplication
- * under. Unless #grb::descriptors::no_casting is defined,
- * elements from \a u, \a A, and \a v will be cast to the
- * domains of the additive and multiplicative operators of
- * \a ring as they are applied during the multiplication.
- *
- * \warning Even if #grb::operators::right_assign is provided as accumulator,
- * old values of \a u may \em not be overwritten if the computation
- * ends up not writing any new values to those values. To throw away
- * old vector values use grb::descriptors::explicit_zero (for dense
- * vectors only if you wish to retain sparsity of the output vector),
- * or first simply use grb::clear on \a u.
+ * @tparam InputType3 The type of the output mask (\a u_mask) elements.
+ * @tparam InputType4 The type of the input mask (\a v_mask) elements.
+ *
+ * \internal
+ * The following template arguments will be inferred from the input arguments
+ * and generally do not concern end-users:
+ *
+ * @tparam Coords Which coordinate class is used to maintain sparsity
+ * structures.
+ * @tparam RIT The integer type used for row indices.
+ * @tparam CIT The integer type used for column indices.
+ * @tparam NIT The integer type used for nonzero indices.
+ * @tparam backend The backend implementing the SpMV multiplication. The input
+ * containers must all refer to the same backend.
+ * \endinternal
+ *
+ * The following arguments are mandatory:
+ *
+ * @param[in,out] u The output vector.
+ * @param[in] A The input matrix. Its #grb::nrows must equal the
+ * #grb::size of \a u.
+ * @param[in] v The input vector. Its #grb::size must equal the
+ * #grb::ncols of \a A.
+ * @param[in] semiring The semiring to perform the matrix--vector
+ * multiplication under. Unless
+ * #grb::descriptors::no_casting is defined, elements from
+ * \a u, \a A, and \a v will be cast to the domains of the
+ * additive and multiplicative operators of \a semiring.
+ *
+ * The vector \a v may not be the same as \a u.
+ *
+ * Instead of passing a \a semiring, users may opt to provide an additive
+ * commutative monoid and a binary multiplicative operator. In this case,
+ * \a A may not be a pattern matrix (that is, it must not be of type
+ * grb::Matrix< void > ).
+ *
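+ * \par Example
+ * For instance, a monoid--operator call may read as follows; all container
+ * and object names here are illustrative assumptions:
+ *
+ * \code
+ * grb::Monoid<
+ *     grb::operators::add< double >, grb::identities::zero
+ * > plusMonoid;
+ * grb::operators::mul< double > timesOp;
+ * // A may not be a pattern matrix in this variant
+ * grb::RC rc = grb::mxv( u, u_mask, A, v, v_mask, plusMonoid, timesOp );
+ * \endcode
+ *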
+ * The \a semiring (or the commutative monoid - binary operator pair) is
+ * optional if it is passed as a template argument instead.
+ *
+ * \note When providing a commutative monoid - binary operator pair, ALP
+ *       backends are precluded from employing distributive laws when
+ *       generating optimised code.
+ *
+ * Non-mandatory arguments are:
+ *
+ * @param[in] u_mask The output mask. The vector must be of equal size as \a u,
+ * \em or it must be empty (have size zero).
+ * @param[in] v_mask The input mask. The vector must be of equal size as \a v,
+ * \em or it must be empty (have size zero).
+ * @param[in] phase The requested phase for this primitive; see
+ *                  #grb::Phase for details.
+ *
+ * The vectors \a u_mask and \a v_mask may never be the same as \a u.
+ *
+ * An empty \a u_mask will behave semantically the same as providing no mask;
+ * i.e., as a mask that evaluates true at every position.
+ *
+ * If \a phase is not given, it will be set to the default #grb::EXECUTE.
+ *
+ * If \a phase is #grb::EXECUTE, then the capacity of \a u must be greater than
+ * or equal to the capacity required to hold all output elements of the
+ * requested computation.
*
* The above semantics may be changed by the following descriptors:
- * * #descriptors::invert_mask: \f$ u_i^\mathit{out} \f$ will be written to
- * if and only if \f$ \mathit{mask}_i \f$ evaluates false .
- * * #descriptors::transpose_matrix: \f$ A \f$ is interpreted as \f$ A^T \f$
+ * - #descriptors::transpose_matrix: \f$ A \f$ is interpreted as \f$ A^T \f$
* instead.
- * * #descriptors::structural: when evaluating \f$ \mathit{mask}_i \f$, only
- * the structure of \f$ \mathit{mask} \f$ is considered (as opposed to its
- * elements); if \f$ \mathit{mask} \f$ has a nonzero at its \f$ i \f$th
- * index, it is considered to evaluate true no matter what the
- * actual value of \f$ \mathit{mask}_i \f$ was.
- * * #descriptors::structural_complement: a combination of two descriptors:
- * #descriptors::structural and #descriptors::invert_mask (and thus
- * equivalent to structural | invert_mask ). Its net effect is if
- * \f$ \mathit{mask} \f$ does \em not have a nonzero at the \f$ i \f$th
- * index, the mask is considered to evaluate true .
- * * #descriptors::add_identity: the matrix \f$ A \f$ is instead interpreted
- * as \f$ A + \mathbf{1} \f$, where \f$ \mathbf{1} \f$ is the
- * multiplicative identity of the given ring.
- * * #descriptors::use_index: when referencing \f$ v_i \f$, if assigned, then
- * instead of using the value itself, its index \f$ i \f$ is used instead.
- * * #descriptors::in_place: the \a accumulator is ignored; the additive
- * operator of the given \a ring is used in its place. Under certain
- * conditions, an implementation can exploit this semantic to active
- * faster computations.
- * * #descriptors::explicit_zero: if \f$ \mathbf{0} \f$ would be assigned to
- * a previously unassigned index, assign \f$ \mathbf{0} \f$ explicitly to
- * that index. Here, \f$ \mathbf{0} \f$ is the additive identity of the
- * provided \a ring.
- *
- * \parblock
+ * - #descriptors::add_identity: the matrix \f$ A \f$ is instead interpreted
+ * as \f$ A + \mathbf{1} \f$, where \f$ \mathbf{1} \f$ is the one identity
+ * (i.e., multiplicative identity) of the given \a semiring.
+ * - #descriptors::invert_mask: \f$ u_i \f$ will be written to if and only if
+ * \f$ \mathit{u\_mask}_i \f$ evaluates false , and \f$ v_j \f$
+ * will be read from if and only if \f$ \mathit{v\_mask}_j \f$ evaluates
+ * false .
+ * - #descriptors::structural: when evaluating \f$ \mathit{u\_mask}_i \f$
+ *   or \f$ \mathit{v\_mask}_j \f$, only the structure of the masks is
+ *   considered, as opposed to considering their values.
+ * - #descriptors::structural_complement: a combination of two descriptors:
+ * #descriptors::structural and #descriptors::invert_mask.
+ * - #descriptors::use_index: when reading \f$ v_i \f$, then, if there is
+ * indeed a nonzero \f$ v_i \f$, use the value \f$ i \f$ instead. This
+ * casts the index from size_t to the \a InputType1 of \a v.
+ * - #descriptors::explicit_zero: if \f$ u_i \f$ was unassigned on entry and
+ * if \f$ (Av)_i \f$ is \f$ \mathbf{0} \f$, then instead of leaving
+ * \f$ u_i \f$ unassigned, it is set to \f$ \mathbf{0} \f$ explicitly.
+ * Here, \f$ \mathbf{0} \f$ is the additive identity of the provided
+ * \a semiring.
+ * - #descriptors::safe_overlap: the vectors \a u and \a v may now be the
+ *   same container. The user guarantees, however, that no race conditions
+ *   exist during the requested computation. The user may guarantee this
+ *   due to a very specific structure of \a A and \a v, or via an
+ *   intelligently constructed \a u_mask, for example.
+ *
+ * @returns #grb::SUCCESS If the computation completed successfully.
+ * @returns #grb::MISMATCH If there is at least one mismatch between vector
+ * dimensions or between vectors and the given matrix.
+ * @returns #grb::OVERLAP If two or more provided vectors refer to the same
+ * container while this was not allowed.
+ *
+ * When any of the above non-SUCCESS error codes is returned, it shall be as
+ * though the call was never made: the state of all container arguments and
+ * of the application remains unchanged, save for the returned error code.
+ *
+ * @returns grb::PANIC Indicates that the application has entered an undefined
+ * state.
+ *
+ * \note Should this error code be returned, the only sensible thing to do is
+ *       exit the application as soon as possible, while refraining from
+ *       using any other ALP primitives.
+ *
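+ * \par Example
+ * A minimal sketch of a doubly-masked call; the dimensions m and n, the
+ * population of the containers, and the plus-times semiring plusTimes over
+ * doubles are illustrative assumptions, not mandated by this specification:
+ *
+ * \code
+ * grb::Matrix< double > A( m, n );
+ * grb::Vector< double > u( m ), v( n );
+ * grb::Vector< bool > u_mask( m ), v_mask( n );
+ * // ... populate A, v, and (optionally) the masks ...
+ * grb::RC rc = grb::mxv( u, u_mask, A, v, v_mask, plusTimes );
+ * // on grb::SUCCESS, u_i += (Av)_i wherever u_mask_i evaluates true
+ * \endcode
+ *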
* \par Performance semantics
- * Performance semantics vary depending on whether a mask was provided, and on
- * whether the input vector is sparse or dense. If the input vector \f$ v \f$
- * is sparse, let \f$ J \f$ be its set of assigned indices. If a non-trivial
- * mask \f$ \mathit{mask} \f$ is given, let \f$ I \f$ be the set of indices for
- * which the corresponding \f$ \mathit{mask}_i \f$ evaluate true . Then:
- * -# For the performance guarantee on the amount of work this function
- * entails the following table applies:
- * \f$ \begin{tabular}{cccc}
- * Masked & Dense input & Sparse input \\
- * \noalign{\smallskip}
- * no & $\Theta(2\mathit{nnz}(A))$ & $\Theta(2\mathit{nnz}(A_{:,J}))$ \\
- * yes & $\Theta(2\mathit{nnz}(A_{I,:})$ & $\Theta(\min\{2\mathit{nnz}(A_{I,:}),2\mathit{nnz}(A_{:,J})\})$
- * \end{tabular}. \f$
- * -# For the amount of data movements, the following table applies:
- * \f$ \begin{tabular}{cccc}
- * Masked & Dense input & Sparse input \\
- * \noalign{\smallskip}
- * no & $\Theta(\mathit{nnz}(A)+\min\{m,n\}+m+n)$ & $\Theta(\mathit{nnz}(A_{:,J}+\min\{m,2|J|\}+|J|)+\mathcal{O}(2m)$ \\
- * yes & $\Theta(\mathit{nnz}(A_{I,:})+\min\{|I|,n\}+2|I|)+\mathcal{O}(n)$ &
- * $\Theta(\min\{\Theta(\mathit{nnz}(A_{I,:})+\min\{|I|,n\}+2|I|)+\mathcal{O}(n),\mathit{nnz}(A_{:,J}+\min\{m,|J|\}+2|J|)+\mathcal{O}(2m))$ \end{tabular}. \f$
- * -# A call to this function under no circumstance will allocate nor free
- * dynamic memory.
- * -# A call to this function under no circumstance will make system calls.
- * The above performance bounds may be changed by the following desciptors:
- * * #descriptors::invert_mask: replaces \f$ \Theta(|I|) \f$ data movement
- * costs with a \f$ \mathcal{O}(2m) \f$ cost instead, or a
- * \f$ \mathcal{O}(m) \f$ cost if #descriptors::structural was defined as
- * well (see below). In other words, implementations are not required to
- * implement inverted operations efficiently (\f$ 2\Theta(m-|I|) \f$ data
- * movements would be optimal but costs another \f$ \Theta(m) \f$ memory
- * to maintain).
- * * #descriptors::structural: removes \f$ \Theta(|I|) \f$ data movement
- * costs as the mask values need no longer be touched.
- * * #descriptors::add_identity: adds, at most, the costs of grb::foldl
- * (on vectors) to all performance metrics.
- * * #descriptors::use_index: removes \f$ \Theta(n) \f$ or
- * \f$ \Theta(|J|) \f$ data movement costs as the input vector values need
- * no longer be touched.
- * * #descriptors::in_place (see also above): turns \f$ \mathcal{O}(2m) \f$
- * data movements into \f$ \mathcal{O}(m) \f$ instead; i.e., it halves the
- * amount of data movements for writing the output.
- * * #descriptors::dense: the input, output, and mask vectors are assumed to
- * be dense. This allows the implementation to skip checks or other code
- * blocks related to handling of sparse vectors. This may result in use of
- * unitialised memory if any of the provided vectors were, in fact,
- * sparse.
- * Implementations that support multiple user processes must characterise data
- * movement between then.
- * \endparblock
- *
- * @returns grb::SUCCESS If the computation completed successfully.
- * @returns grb::MISMATCH If there is at least one mismatch between vector
- * dimensions or between vectors and the given matrix.
- * @returns grb::OVERLAP If two or more provided vectors refer to the same
- * vector.
- *
- * When a non-SUCCESS error code is returned, it shall be as though the call
- * was never made. Note that all GraphBLAS functions may additionally return
- * #grb::PANIC, which indicates the library has entered an undefined state; if
- * this error code is returned, the only sensible thing a user can do is exit,
- * or at least refrain from using any GraphBLAS functions for the remainder of
- * the application.
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Semiring,
+ typename IOType, typename InputType1, typename InputType2,
+ typename InputType3, typename InputType4,
+ typename Coords, typename RIT, typename CIT, typename NIT,
+ Backend backend
+ >
+ RC mxv(
+ Vector< IOType, backend, Coords > &u,
+ const Vector< InputType3, backend, Coords > &u_mask,
+ const Matrix< InputType2, backend, RIT, CIT, NIT > &A,
+ const Vector< InputType1, backend, Coords > &v,
+ const Vector< InputType4, backend, Coords > &v_mask,
+ const Semiring &semiring = Semiring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_semiring< Semiring >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ !grb::is_object< InputType4 >::value,
+ void >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cerr << "Selected backend does not implement mxv "
+ << "(doubly-masked, semiring)\n";
+#endif
+#ifndef NDEBUG
+ const bool selected_backend_does_not_support_doubly_masked_mxv_sr = false;
+ assert( selected_backend_does_not_support_doubly_masked_mxv_sr );
+#endif
+ (void) u;
+ (void) u_mask;
+ (void) A;
+ (void) v;
+ (void) v_mask;
+ (void) semiring;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Left-handed in-place doubly-masked sparse matrix times vector
+ * multiplication, \f$ u = u + vA \f$.
+ *
+ * A call to this function is exactly equivalent to calling
+ * - grb::mxv( u, u_mask, A, v, v_mask, semiring, phase )
+ * with the #descriptors::transpose_matrix flipped.
+ *
+ * See the documentation of #grb::mxv for the full semantics of this function.
+ * Like with #grb::mxv, aliases to this function exist that do not include
+ * masks:
+ * - grb::vxm( u, u_mask, v, A, semiring, phase );
+ * - grb::vxm( u, v, A, semiring, phase );
+ *
+ * Similarly, aliases to this function exist that take an additive commutative
+ * monoid and a multiplicative binary operator instead of a semiring.
+ *
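+ * \par Example
+ * For instance, assuming compatible containers and a semiring ring (names
+ * illustrative), the following two calls are semantically equivalent:
+ *
+ * \code
+ * grb::vxm( u, u_mask, v, v_mask, A, ring );
+ * grb::mxv< grb::descriptors::transpose_matrix >(
+ *     u, u_mask, A, v, v_mask, ring );
+ * \endcode
+ *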
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Semiring,
+ typename IOType, typename InputType1, typename InputType2,
+ typename InputType3, typename InputType4,
+ typename Coords, typename RIT, typename CIT, typename NIT,
+ enum Backend backend
+ >
+ RC vxm(
+ Vector< IOType, backend, Coords > &u,
+ const Vector< InputType3, backend, Coords > &u_mask,
+ const Vector< InputType1, backend, Coords > &v,
+ const Vector< InputType4, backend, Coords > &v_mask,
+ const Matrix< InputType2, backend, RIT, CIT, NIT > &A,
+ const Semiring &semiring = Semiring(),
+ const Phase &phase = EXECUTE,
+ typename std::enable_if<
+ grb::is_semiring< Semiring >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ !grb::is_object< InputType4 >::value &&
+ !grb::is_object< IOType >::value,
+ void >::type * = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cerr << "Selected backend does not implement doubly-masked grb::vxm\n";
+#endif
+#ifndef NDEBUG
+ const bool selected_backend_does_not_support_doubly_masked_vxm_sr = false;
+ assert( selected_backend_does_not_support_doubly_masked_vxm_sr );
+#endif
+ (void) u;
+ (void) u_mask;
+ (void) v;
+ (void) v_mask;
+ (void) A;
+ (void) semiring;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Executes an arbitrary element-wise user-defined function \a f on all
+ * nonzero elements of a given matrix \a A.
+ *
+ * The user-defined function is passed as a lambda which can capture whatever
+ * the user would like, including one or multiple grb::Vector instances, or
+ * multiple scalars. When capturing vectors, these should also be passed as
+ * additional arguments to this function so as to make sure those vectors are
+ * synchronised for access on all row and column indices corresponding to
+ * locally stored nonzeroes of \a A.
+ *
+ * Only the elements of a single matrix may be iterated upon.
+ *
+ * \note Rationale: while it is reasonable to expect an implementation to
+ *       be able to synchronise vector elements, it may be unreasonable to
+ *       expect that two different matrices can be jointly accessed via
+ *       arbitrary lambda functions.
+ *
+ * \warning The lambda shall only be executed on the data local to the user
+ * process calling this function! This is different from the various
+ *          fold functions, or grb::dot, in that the semantics of those
+ *          functions always result in a globally synchronised result. To
+ * achieve the same effect with user-defined lambdas, the users
+ * should manually prescribe how to combine the local results into
+ * global ones, for instance, by subsequent calls to
+ * grb::collectives.
+ *
+ * \note This is an addition to the GraphBLAS. It is alike user-defined
+ * operators, monoids, and semirings, except it allows execution on
+ * arbitrarily many inputs and arbitrarily many outputs.
+ *
+ * @tparam Func the user-defined lambda function type.
+ * @tparam DataType the type of the user-supplied matrix.
+ * @tparam backend the backend type of the user-supplied vector example.
+ *
+ * @param[in] f The user-supplied lambda. This lambda should only capture
+ * and reference vectors of the same length as either the row or
+ * column dimension length of \a A. The lambda function should
+ * prescribe the operations required to execute on a given
+ * reference to a matrix nonzero of \a A (of type \a DataType) at
+ * a given index \f$ (i,j) \f$. Captured GraphBLAS vectors can
+ * access corresponding elements via Vector::operator[] or
+ * Vector::operator(). It is illegal to access any element not at
+ * position \a i if the vector length is equal to the row
+ * dimension. It is illegal to access any element not at position
+ * \a j if the vector length is equal to the column dimension.
+ *                Vectors of length equal to neither the row nor the column
+ *                dimension may \em not be referenced, or undefined
+ *                behaviour will occur. The
+ * reference to the matrix nonzero is non \a const and may thus be
+ * modified. New nonzeroes may \em not be added through this lambda
+ * functionality. The function \a f must have the following
+ * signature:
+ *                (DataType &nz, const size_t i, const size_t j) .
+ * The GraphBLAS implementation decides which nonzeroes of \a A are
+ * dereferenced, and thus also decides the values \a i and \a j the
+ * user function is evaluated on.
+ * @param[in] A The matrix the lambda is to access the elements of.
+ *
+ * The remaining arguments should enumerate all vectors the lambda is to access
+ * elements of. Each such vector must be of the same length as \a nrows(A) or
+ * \a ncols(A). If this constraint is violated, #grb::MISMATCH shall be returned.
+ * If a given vector length equals \a nrows(A), the vector shall be synchronised
+ * for access on \a i. If the vector length equals \a ncols(A), the vector shall
+ * be synchronised for access on \a j. If \a A is square, the vectors will be
+ * synchronised for access on both \a i \em and \a j.
+ *
+ * \note These vectors are passed using a variadic argument list and so may
+ * contain any number of containers of type #grb::Vector, potentially with
+ * differing nonzero types, as separate arguments.
+ *
+ * \warning Using a #grb::Vector inside a lambda passed to this function while
+ * not passing that same vector into the variadic argument list will
+ * result in undefined behaviour.
+ *
+ * \warning Due to the constraints on \a f described above, it is illegal to
+ * capture some vector \a y and have the following line in the body
+ *          of \a f: x[i] += x[i+1] . Vectors can only be
+ *          dereferenced at position \a i and \a i alone, and similarly for
+ *          access using \a j. For square matrices, however, the following
+ *          code in the body is accepted: x[i] += x[j] .
+ *
+ * @return grb::SUCCESS When the lambda is successfully executed.
+ * @return grb::MISMATCH When two or more vectors passed into the variadic
+ * argument list are not of appropriate length.
+ *
+ * \warning Captured scalars will be local to the user process executing the
+ *          lambda. To retrieve a globally reduced result, an allreduce must
+ *          explicitly be called.
+ *
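+ * \par Example
+ * An illustrative sketch that scales every nonzero of \a A by the
+ * corresponding entry of a hypothetical row-scaling vector d of length
+ * \a nrows(A); d is both captured and passed as a variadic argument so
+ * that it is synchronised for access on \a i:
+ *
+ * \code
+ * grb::RC rc = grb::eWiseLambda(
+ *     [&d]( double &nz, const size_t i, const size_t j ) {
+ *         (void) j;      // the column index is unused in this example
+ *         nz *= d[ i ];  // scale the nonzeroes of row i
+ *     }, A, d );
+ * \endcode
+ *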
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ typename Func, typename DataType,
+ typename RIT, typename CIT, typename NIT,
+ Backend implementation = config::default_backend,
+ typename... Args
+ >
+ RC eWiseLambda(
+ const Func f,
+ const Matrix< DataType, implementation, RIT, CIT, NIT > &A,
+ Args...
+ ) {
+#ifdef _DEBUG
+ std::cerr << "Selected backend does not implement grb::eWiseLambda (matrices)\n";
+#endif
+#ifndef NDEBUG
+ const bool selected_backend_does_not_support_matrix_eWiseLambda = false;
+ assert( selected_backend_does_not_support_matrix_eWiseLambda );
+#endif
+ (void) f;
+ (void) A;
+ return UNSUPPORTED;
+ }
+
+ // default (non-)implementations follow:
+
+ /**
+ * Right-handed in-place masked sparse matrix--vector multiplication,
+ * \f$ u = u + Av \f$, over a given semiring.
+ *
+ * See the documentation of #grb::mxv for the full specification of this
+ * function.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
*/
template<
Descriptor descr = descriptors::no_operation,
@@ -247,7 +482,8 @@ namespace grb {
const Vector< InputType3, implementation, Coords > &mask,
const Matrix< InputType2, implementation, RIT, CIT, NIT > &A,
const Vector< InputType1, implementation, Coords > &v,
- const Ring &ring,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
typename std::enable_if<
grb::is_semiring< Ring >::value,
void >::type * = nullptr
@@ -259,20 +495,28 @@ namespace grb {
const bool backend_does_not_support_output_masked_mxv = false;
assert( backend_does_not_support_output_masked_mxv );
#endif
- (void)u;
- (void)mask;
- (void)A;
- (void)v;
- (void)ring;
+ (void) u;
+ (void) mask;
+ (void) A;
+ (void) v;
+ (void) ring;
+ (void) phase;
return UNSUPPORTED;
}
/**
- * A short-hand for an unmasked #grb::mxv.
+ * Right-handed in-place sparse matrix--vector multiplication,
+ * \f$ u = u + Av \f$, over a given semiring.
*
- * @see grb::mxv for the full documentation.
+ * See the documentation of #grb::mxv for the full specification of this
+ * function.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
*/
- template< Descriptor descr = descriptors::no_operation,
+ template<
+ Descriptor descr = descriptors::no_operation,
class Ring,
typename IOType, typename InputType1, typename InputType2,
typename Coords, typename RIT, typename CIT, typename NIT,
@@ -294,24 +538,24 @@ namespace grb {
const bool backend_does_not_support_mxv = false;
assert( backend_does_not_support_mxv );
#endif
- (void)u;
- (void)A;
- (void)v;
- (void)ring;
+ (void) u;
+ (void) A;
+ (void) v;
+ (void) ring;
return UNSUPPORTED;
}
/**
- * Left-handed sparse matrix times vector multiplication, \f$ u = vA \f$.
+ * Left-handed in-place masked sparse matrix--vector multiplication,
+ * \f$ u = u + vA \f$, over a given semiring.
*
- * If \a descr does not have #grb::descriptors::transpose_matrix defined, the
- * semantics and performance semantics of this function are exactly that of
- * grb::mxv with the #grb::descriptors::transpose_matrix set.
- * In the other case, the functional and performance semantics of this function
- * are exactly that of grb::mxv without the #grb::descriptors::transpose_matrix
- * set.
+ * See the documentation of #grb::vxm for the full specification of this
+ * function.
*
- * @see grb::mxv for the full documentation.
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
*/
template<
Descriptor descr = descriptors::no_operation,
@@ -326,7 +570,8 @@ namespace grb {
const Vector< InputType3, implementation, Coords > &mask,
const Vector< InputType1, implementation, Coords > &v,
const Matrix< InputType2, implementation, RIT, CIT, NIT > &A,
- const Ring &ring,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
typename std::enable_if<
grb::is_semiring< Ring >::value, void
>::type * = nullptr
@@ -338,18 +583,26 @@ namespace grb {
const bool selected_backend_does_not_support_output_masked_vxm = false;
assert( selected_backend_does_not_support_output_masked_vxm );
#endif
- (void)u;
- (void)mask;
- (void)v;
- (void)A;
- (void)ring;
+ (void) u;
+ (void) mask;
+ (void) v;
+ (void) A;
+ (void) ring;
+ (void) phase;
return UNSUPPORTED;
}
/**
- * A short-hand for an unmasked grb::vxm.
+ * Left-handed in-place sparse matrix--vector multiplication,
+ * \f$ u = u + vA \f$, over a given semiring.
+ *
+ * See the documentation of #grb::vxm for the full specification of this
+ * function.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
*
- * @see grb::vxm for the full documentation.
+ * @see perfSemantics
*/
template<
Descriptor descr = descriptors::no_operation,
@@ -362,7 +615,8 @@ namespace grb {
Vector< IOType, implementation, Coords > &u,
const Vector< InputType1, implementation, Coords > &v,
const Matrix< InputType2, implementation, RIT, CIT, NIT > &A,
- const Ring &ring,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
typename std::enable_if<
grb::is_semiring< Ring >::value, void
>::type * = nullptr
@@ -374,14 +628,26 @@ namespace grb {
const bool selected_backend_does_not_support_vxm = false;
assert( selected_backend_does_not_support_vxm );
#endif
- (void)u;
- (void)v;
- (void)A;
- (void)ring;
+ (void) u;
+ (void) v;
+ (void) A;
+ (void) ring;
+ (void) phase;
return UNSUPPORTED;
}
- /** TODO documentation */
+ /**
+ * Left-handed in-place doubly-masked sparse matrix--vector multiplication,
+ * \f$ u = u + vA \f$, over a given commutative additive monoid and any
+ * binary operator acting as multiplication.
+ *
+ * See the documentation of #grb::vxm for the full specification of this
+ * function.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
template<
Descriptor descr = descriptors::no_operation,
class AdditiveMonoid, class MultiplicativeOperator,
@@ -398,6 +664,7 @@ namespace grb {
const Matrix< InputType2, backend, RIT, CIT, NIT > &A,
const AdditiveMonoid &add = AdditiveMonoid(),
const MultiplicativeOperator &mul = MultiplicativeOperator(),
+ const Phase &phase = EXECUTE,
const typename std::enable_if<
grb::is_monoid< AdditiveMonoid >::value &&
grb::is_operator< MultiplicativeOperator >::value &&
@@ -416,17 +683,29 @@ namespace grb {
const bool selected_backed_does_not_support_doubly_masked_vxm = false;
assert( selected_backed_does_not_support_doubly_masked_vxm );
#endif
- (void)u;
- (void)mask;
- (void)v;
- (void)v_mask;
- (void)A;
- (void)add;
- (void)mul;
+ (void) u;
+ (void) mask;
+ (void) v;
+ (void) v_mask;
+ (void) A;
+ (void) add;
+ (void) mul;
+ (void) phase;
return UNSUPPORTED;
}
- /** TODO documentation */
+ /**
+ * Right-handed in-place doubly-masked sparse matrix--vector multiplication,
+ * \f$ u = u + Av \f$, over a given commutative additive monoid and any
+ * binary operator acting as multiplication.
+ *
+ * See the documentation of #grb::mxv for the full specification of this
+ * function.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
template<
Descriptor descr = descriptors::no_operation,
class AdditiveMonoid, class MultiplicativeOperator,
@@ -443,6 +722,7 @@ namespace grb {
const Vector< InputType4, backend, Coords > &v_mask,
const AdditiveMonoid &add = AdditiveMonoid(),
const MultiplicativeOperator &mul = MultiplicativeOperator(),
+ const Phase &phase = EXECUTE,
const typename std::enable_if<
grb::is_monoid< AdditiveMonoid >::value &&
grb::is_operator< MultiplicativeOperator >::value &&
@@ -461,17 +741,29 @@ namespace grb {
const bool selected_backed_does_not_support_doubly_masked_mxv = false;
assert( selected_backed_does_not_support_doubly_masked_mxv );
#endif
- (void)u;
- (void)mask;
- (void)A;
- (void)v;
- (void)v_mask;
- (void)add;
- (void)mul;
+ (void) u;
+ (void) mask;
+ (void) A;
+ (void) v;
+ (void) v_mask;
+ (void) add;
+ (void) mul;
+ (void) phase;
return UNSUPPORTED;
}
- /** TODO documentation */
+ /**
+ * Right-handed in-place masked sparse matrix--vector multiplication,
+ * \f$ u = u + Av \f$, over a given commutative additive monoid and any
+ * binary operator acting as multiplication.
+ *
+ * See the documentation of #grb::mxv for the full specification of this
+ * function.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
template<
Descriptor descr = descriptors::no_operation,
class AdditiveMonoid, class MultiplicativeOperator,
@@ -487,6 +779,7 @@ namespace grb {
const Vector< InputType1, backend, Coords > &v,
const AdditiveMonoid & add = AdditiveMonoid(),
const MultiplicativeOperator & mul = MultiplicativeOperator(),
+ const Phase &phase = EXECUTE,
const typename std::enable_if<
grb::is_monoid< AdditiveMonoid >::value &&
grb::is_operator< MultiplicativeOperator >::value &&
@@ -505,16 +798,28 @@ namespace grb {
const bool selected_backed_does_not_support_masked_monop_mxv = false;
assert( selected_backed_does_not_support_masked_monop_mxv );
#endif
- (void)u;
- (void)mask;
- (void)A;
- (void)v;
- (void)add;
- (void)mul;
+ (void) u;
+ (void) mask;
+ (void) A;
+ (void) v;
+ (void) add;
+ (void) mul;
+ (void) phase;
return UNSUPPORTED;
}
- /** TODO documentation */
+ /**
+ * Left-handed in-place sparse matrix--vector multiplication,
+ * \f$ u = u + vA \f$, over a given commutative additive monoid and any
+ * binary operator acting as multiplication.
+ *
+ * See the documentation of #grb::vxm for the full specification of this
+ * function.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
template<
Descriptor descr = descriptors::no_operation,
class AdditiveMonoid, class MultiplicativeOperator,
@@ -528,6 +833,7 @@ namespace grb {
const Matrix< InputType2, backend, RIT, CIT, NIT > &A,
const AdditiveMonoid &add = AdditiveMonoid(),
const MultiplicativeOperator &mul = MultiplicativeOperator(),
+ const Phase &phase = EXECUTE,
const typename std::enable_if<
grb::is_monoid< AdditiveMonoid >::value &&
grb::is_operator< MultiplicativeOperator >::value &&
@@ -545,15 +851,27 @@ namespace grb {
const bool selected_backed_does_not_support_monop_vxm = false;
assert( selected_backed_does_not_support_monop_vxm );
#endif
- (void)u;
- (void)v;
- (void)A;
- (void)add;
- (void)mul;
+ (void) u;
+ (void) v;
+ (void) A;
+ (void) add;
+ (void) mul;
+ (void) phase;
return UNSUPPORTED;
}
- /** TODO documentation */
+ /**
+ * Left-handed in-place masked sparse matrix--vector multiplication,
+ * \f$ u = u + vA \f$, over a given commutative additive monoid and any
+ * binary operator acting as multiplication.
+ *
+ * See the documentation of #grb::vxm for the full specification of this
+ * function.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
template<
Descriptor descr = descriptors::no_operation,
class AdditiveMonoid, class MultiplicativeOperator,
@@ -569,6 +887,7 @@ namespace grb {
const Matrix< InputType2, implementation, RIT, CIT, NIT > &A,
const AdditiveMonoid &add = AdditiveMonoid(),
const MultiplicativeOperator &mul = MultiplicativeOperator(),
+ const Phase &phase = EXECUTE,
typename std::enable_if<
grb::is_monoid< AdditiveMonoid >::value &&
grb::is_operator< MultiplicativeOperator >::value &&
@@ -585,16 +904,28 @@ namespace grb {
const bool selected_backed_does_not_support_masked_monop_vxm = false;
assert( selected_backed_does_not_support_masked_monop_vxm );
#endif
- (void)u;
- (void)mask;
- (void)v;
- (void)A;
- (void)add;
- (void)mul;
+ (void) u;
+ (void) mask;
+ (void) v;
+ (void) A;
+ (void) add;
+ (void) mul;
+ (void) phase;
return UNSUPPORTED;
}
- /** TODO documentation */
+ /**
+ * Right-handed in-place sparse matrix--vector multiplication,
+ * \f$ u = u + Av \f$, over a given commutative additive monoid and any
+ * binary operator acting as multiplication.
+ *
+ * See the documentation of #grb::vxm for the full specification of this
+ * function.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
template<
Descriptor descr = descriptors::no_operation,
class AdditiveMonoid, class MultiplicativeOperator,
@@ -608,6 +939,7 @@ namespace grb {
const Vector< InputType1, backend, Coords > &v,
const AdditiveMonoid &add = AdditiveMonoid(),
const MultiplicativeOperator &mul = MultiplicativeOperator(),
+ const Phase &phase = EXECUTE,
const typename std::enable_if<
grb::is_monoid< AdditiveMonoid >::value &&
grb::is_operator< MultiplicativeOperator >::value &&
@@ -624,125 +956,11 @@ namespace grb {
const bool selected_backed_does_not_support_monop_mxv = false;
assert( selected_backed_does_not_support_monop_mxv );
#endif
- (void)u;
- (void)A;
- (void)v;
- (void)add;
- (void)mul;
- return UNSUPPORTED;
- }
-
- /**
- * Executes an arbitrary element-wise user-defined function \a f on all
- * nonzero elements of a given matrix \a A.
- *
- * The user-defined function is passed as a lambda which can capture whatever
- * the user would like, including one or multiple grb::Vector instances, or
- * multiple scalars. When capturing vectors, these should also be passed as a
- * additional arguments to this functions so to make sure those vectors are
- * synchronised for access on all row- and column- indices corresponding to
- * locally stored nonzeroes of \a A.
- *
- * Only the elements of a single matrix may be iterated upon.
- *
- * \note Rationale: while it is reasonable to expect an implementation be able
- * to synchronise vector elements, it may be unreasonable to expect two
- * different matrices can be jointly accessed via arbitrary lambda
- * functions.
- *
- * \warning The lambda shall only be executed on the data local to the user
- * process calling this function! This is different from the various
- * fold functions, or grb::dot, in that the semantics of those
- * functions always result in globally synchronised result. To
- * achieve the same effect with user-defined lambdas, the users
- * should manually prescribe how to combine the local results into
- * global ones, for instance, by subsequent calls to
- * grb::collectives.
- *
- * \note This is an addition to the GraphBLAS. It is alike user-defined
- * operators, monoids, and semirings, except it allows execution on
- * arbitrarily many inputs and arbitrarily many outputs.
- *
- * @tparam Func the user-defined lambda function type.
- * @tparam DataType the type of the user-supplied matrix.
- * @tparam backend the backend type of the user-supplied vector example.
- *
- * @param[in] f The user-supplied lambda. This lambda should only capture
- * and reference vectors of the same length as either the row or
- * column dimension length of \a A. The lambda function should
- * prescribe the operations required to execute on a given
- * reference to a matrix nonzero of \a A (of type \a DataType) at
- * a given index \f$ (i,j) \f$. Captured GraphBLAS vectors can
- * access corresponding elements via Vector::operator[] or
- * Vector::operator(). It is illegal to access any element not at
- * position \a i if the vector length is equal to the row
- * dimension. It is illegal to access any element not at position
- * \a j if the vector length is equal to the column dimension.
- * Vectors of length neither equal to the column or row dimension
- * may \em not be referenced or undefined behaviour will occur. The
- * reference to the matrix nonzero is non \a const and may thus be
- * modified. New nonzeroes may \em not be added through this lambda
- * functionality. The function \a f must have the following
- * signature:
- * (DataType &nz, const size_t i, const size_t j) .
- * The GraphBLAS implementation decides which nonzeroes of \a A are
- * dereferenced, and thus also decides the values \a i and \a j the
- * user function is evaluated on.
- * @param[in] A The matrix the lambda is to access the elements of.
- * @param[in] args All vectors the lambda is to access elements of. Must be of
- * the same length as \a nrows(A) or \a ncols(A). If this
- * constraint is violated, grb::MISMATCH shall be returned. If
- * the vector length equals \a nrows(A), the vector shall be
- * synchronized for access on \a i. If the vector length equals
- * \a ncols(A), the vector shall be synchronized for access on
- * \a j. If \a A is square, the vectors will be synchronised for
- * access on both \a x and \a y. This is a variadic argument
- * and can contain any number of containers of type grb::Vector,
- * passed as though they were separate arguments.
- *
- * \warning Using a grb::Vector inside a lambda passed to this function while
- * not passing that same vector into \a args, will result in undefined
- * behaviour.
- *
- * \warning Due to the constraints on \a f described above, it is illegal to
- * capture some vector \a y and have the following line in the body
- * of \a f: x[i] += x[i+1] . Vectors can only be
- * dereferenced at position \a i and \a i alone, and similarly for
- * access using \a j. For square matrices, however, the following
- * code in the body is accepted, however: x[i] += x[j] .
- *
- * @return grb::SUCCESS When the lambda is successfully executed.
- * @return grb::MISMATCH When two or more vectors passed to \a args are not of
- * appropriate length.
- *
- * \warning Captured scalars will be local to the user process executing the
- * lambda. To retrieve the global dot product, an allreduce must
- * explicitly be called.
- *
- * @see Vector::operator[]()
- * @see Vector::operator()()
- * @see Vector::lambda_reference
- */
- template<
- typename Func, typename DataType,
- typename RIT, typename CIT, typename NIT,
- Backend implementation = config::default_backend,
- typename... Args
- >
- RC eWiseLambda(
- const Func f,
- const Matrix< DataType, implementation, RIT, CIT, NIT > &A,
- Args... /*args*/
- ) {
-#ifdef _DEBUG
- std::cerr << "Selected backend does not implement grb::eWiseLambda (matrices)\n";
-#endif
-#ifndef NDEBUG
- const bool selected_backend_does_not_support_matrix_eWiseLamba = false;
- assert( selected_backend_does_not_support_matrix_eWiseLamba );
-#endif
- (void)f;
- (void)A;
+ (void) u;
+ (void) A;
+ (void) v;
+ (void) add;
+ (void) mul;
+ (void) phase;
return UNSUPPORTED;
}
diff --git a/include/graphblas/base/blas3.hpp b/include/graphblas/base/blas3.hpp
index 02965eee4..2aab1be2a 100644
--- a/include/graphblas/base/blas3.hpp
+++ b/include/graphblas/base/blas3.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Defines the ALP/GraphBLAS level-3 API
+ *
* @author A. N. Yzelman
*/
@@ -28,10 +32,12 @@
#include "matrix.hpp"
#include "vector.hpp"
+
namespace grb {
/**
- * \defgroup BLAS3 The Level-3 Basic Linear Algebra Subroutines (BLAS)
+ * \defgroup BLAS3 Level-3 Primitives
+ * \ingroup GraphBLAS
*
* A collection of functions that allow GraphBLAS semirings to work on
* one or more two-dimensional sparse containers (i.e, sparse matrices).
@@ -40,9 +46,11 @@ namespace grb {
*/
/**
- * Unmaked sparse matrix--sparse matrix multiplication (SpMSpM).
+ * Unmasked and in-place sparse matrix--sparse matrix multiplication (SpMSpM),
+ * \f$ C += AB \f$.
*
* @tparam descr The descriptors under which to perform the computation.
+ * Optional; default is #grb::descriptors::no_operation.
* @tparam OutputType The type of elements in the output matrix.
* @tparam InputType1 The type of elements in the left-hand side input
* matrix.
@@ -50,23 +58,31 @@ namespace grb {
* matrix.
* @tparam Semiring The semiring under which to perform the
* multiplication.
- * @tparam Backend The backend that should perform the computation.
- *
- * @returns SUCCESS If the computation completed as intended.
- * @returns FAILED If the call was not not preceded by one to
- * #grb::resize( C, A, B ); \em and the current capacity of
- * \a C was insufficient to store the multiplication of \a A
- * and \a B. The contents of \a C shall be undefined (which
- * is why #FAILED is returned instead of #ILLEGAL-- this
- * error has side effects).
- *
- * @param[out] C The output matrix \f$ C = AB \f$ when the function returns
- * #SUCCESS.
- * @param[in] A The left-hand side input matrix \f$ A \f$.
- * @param[in] B The left-hand side input matrix \f$ B \f$.
- *
- * @param[in] ring (Optional.) The semiring under which the computation should
- * proceed.
+ *
+ * @param[in,out] C The matrix into which the multiplication \f$ AB \f$ is
+ * accumulated.
+ * @param[in] A The left-hand side input matrix \f$ A \f$.
+ * @param[in] B The right-hand side input matrix \f$ B \f$.
+ *
+ * @param[in] ring The semiring under which the computation should
+ * proceed.
+ * @param[in] phase The #grb::Phase the primitive should be executed with. This
+ * argument is optional; its default is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS If the computation completed as intended.
+ * @return #grb::FAILED If the capacity of \a C was insufficient to store the
+ * output of multiplying \a A and \a B. If this code is
+ * returned, \a C on output appears cleared.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE and an out-of-memory
+ *                        condition arose while resizing \a C.
+ *
+ * \note This specification does not account for #grb::TRY as that phase is
+ * still experimental. See its documentation for details.
+ *
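+ * \par Example
+ * An illustrative two-phase call sequence; the dimensions and a previously
+ * constructed plus-times semiring plusTimes over doubles are assumptions
+ * made for the sake of the example:
+ *
+ * \code
+ * grb::Matrix< double > C( m, n ), A( m, k ), B( k, n );
+ * // ... populate A and B ...
+ * // first ensure sufficient capacity of C, then execute the multiplication
+ * grb::RC rc = grb::mxm( C, A, B, plusTimes, grb::RESIZE );
+ * if( rc == grb::SUCCESS ) {
+ *     rc = grb::mxm( C, A, B, plusTimes, grb::EXECUTE );
+ * }
+ * \endcode
+ *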
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
*/
template<
Descriptor descr = descriptors::no_operation,
@@ -83,63 +99,87 @@ namespace grb {
const Phase &phase = EXECUTE
) {
#ifdef _DEBUG
- std::cerr << "Selected backend does not implement grb::mxm (semiring version)\n";
+ std::cerr << "Selected backend does not implement grb::mxm "
+ << "(semiring version)\n";
#endif
#ifndef NDEBUG
const bool selected_backend_does_not_support_mxm = false;
assert( selected_backend_does_not_support_mxm );
#endif
- (void)C;
- (void)A;
- (void)B;
- (void)ring;
- (void)phase;
+ (void) C;
+ (void) A;
+ (void) B;
+ (void) ring;
+ (void) phase;
// this is the generic stub implementation
return UNSUPPORTED;
}
/**
- * Interprets three vectors x, y, and z as a series of row coordinates,
- * column coordinates, and nonzeroes, respectively, and stores the thus
- * defined nonzeroes in a given output matrix A.
+ * The #grb::zip primitive merges three vectors into a matrix.
+ *
+ * Interprets three input vectors \a x, \a y, and \a z as a series of row
+ * coordinates, column coordinates, and nonzeroes, respectively. The
+ * thus-defined nonzeroes of a matrix are then stored in a given output
+ * matrix \a A.
+ *
+ * The vectors \a x, \a y, and \a z must have equal length, as well as the same
+ * number of nonzeroes. If the vectors are sparse, all vectors must have the
+ * same sparsity structure.
*
- * If this function does not return SUCCESS, A will have been cleared.
+ * \note A variant of this function takes only \a x and \a y; in that
+ *       variant, the output matrix \a A has void element types.
*
- * A must have been pre-allocated to store the nonzero pattern the three
- * given vectors x, y, and z encode, or ILLEGAL shall be returned.
+ * If this function does not return #grb::SUCCESS, the output \a A will have
+ * no contents on function exit.
*
- * \note A call to this function hence must be preceded by a successful
- * call to grb::resize( matrix, nnz );
+ * The matrix \a A must have been pre-allocated to store the nonzero pattern
+ * that the three given vectors \a x, \a y, and \a z encode, or otherwise this
+ * function returns #grb::FAILED.
*
- * @param[out] A The output matrix
- * @param[in] x A vector of row indices.
- * @param[in] y A vector of column indices.
- * @param[in] z A vector of nonzero values.
+ * \note To ensure that the capacity of \a A is sufficient, a successful call
+ *       to #grb::resize with #grb::nnz of \a x suffices. Alternatively, and
+ *       with the same effect, a successful call to this function with
+ *       \a phase equal to #grb::RESIZE instead of #grb::EXECUTE suffices
+ *       also.
*
- * If x, y, and z are sparse, they must have the exact same sparsity
- * structure.
+ * @param[out] A The output matrix.
+ * @param[in] x A vector of row indices.
+ * @param[in] y A vector of column indices.
+ * @param[in] z A vector of nonzero values.
+ * @param[in] phase The #grb::Phase in which the primitive is to proceed.
+ * Optional; the default is #grb::EXECUTE.
*
+ * @return #grb::SUCCESS If \a A was constructed successfully.
+ * @return #grb::MISMATCH If \a y or \a z does not match the size of \a x.
+ * @return #grb::ILLEGAL If \a y or \a z do not have the same number of
+ * nonzeroes as \a x.
+ * @return #grb::ILLEGAL If \a y or \a z has a different sparsity pattern from
+ * \a x.
+ * @return #grb::FAILED If the capacity of \a A was insufficient to store the
+ * given sparsity pattern and \a phase is #grb::EXECUTE.
+ * @return #grb::OUTOFMEM If the \a phase is #grb::RESIZE and \a A could not be
+ * resized to have sufficient capacity to complete this
+ * function due to out-of-memory conditions.
+ *
+ * \parblock
* \par Descriptors
*
* None allowed.
+ * \endparblock
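+ *
+ * \par Example
+ * An illustrative sketch; the number of nonzeroes nz and the matrix
+ * dimensions are assumptions made for the sake of the example:
+ *
+ * \code
+ * grb::Vector< size_t > x( nz ), y( nz );
+ * grb::Vector< double > z( nz );
+ * grb::Matrix< double > A( m, n );
+ * // ... fill x with row indices, y with column indices, z with values ...
+ * grb::RC rc = grb::resize( A, grb::nnz( x ) );
+ * if( rc == grb::SUCCESS ) {
+ *     rc = grb::zip( A, x, y, z );
+ * }
+ * \endcode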
*
- * @returns SUCCESS If A was constructed successfully.
- * @returns MISMATCH If y or z does not match the size of x.
- * @returns ILLEGAL If y or z do not have the same number of nonzeroes
- * as x.
- * @returns ILLEGAL If y or z has a different sparsity pattern from x.
- * @returns ILLEGAL If the capacity of A was insufficient to store the
- * given sparsity pattern.
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
*
- * @see grb::resize
+ * @see perfSemantics
*/
template<
Descriptor descr = descriptors::no_operation,
- typename OutputType, typename InputType1, typename InputType2, typename InputType3,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename InputType3, typename RIT, typename CIT, typename NIT,
Backend backend, typename Coords
>
RC zip(
- Matrix< OutputType, backend > &A,
+ Matrix< OutputType, backend, RIT, CIT, NIT > &A,
const Vector< InputType1, backend, Coords > &x,
const Vector< InputType2, backend, Coords > &y,
const Vector< InputType3, backend, Coords > &z,
@@ -150,7 +190,8 @@ namespace grb {
(void) z;
(void) phase;
#ifdef _DEBUG
- std::cerr << "Selected backend does not implement grb::zip (vectors into matrices, non-void)\n";
+ std::cerr << "Selected backend does not implement grb::zip (vectors into "
+ << "matrices, non-void)\n";
#endif
#ifndef NDEBUG
const bool selected_backend_does_not_support_zip_from_vectors_to_matrix
@@ -162,16 +203,25 @@ namespace grb {
}
/**
- * Specialisation of grb::zip for void output matrices.
+ * Merges two vectors into a void matrix.
+ *
+ * This is a specialisation of #grb::zip for pattern matrices. The two input
+ * vectors \a x and \a y represent coordinates of nonzeroes to be stored in
+ * \a A.
+ *
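+ * \par Example
+ * For instance, given coordinate vectors x and y as in the non-void variant
+ * (names illustrative):
+ *
+ * \code
+ * grb::Matrix< void > P( m, n );
+ * grb::RC rc = grb::zip( P, x, y );
+ * \endcode
+ *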
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
*/
template<
Descriptor descr = descriptors::no_operation,
typename InputType1, typename InputType2, typename InputType3,
- Backend backend,
- typename Coords
+ typename RIT, typename CIT, typename NIT,
+ Backend backend, typename Coords
>
RC zip(
- Matrix< void, backend > &A,
+ Matrix< void, backend, RIT, CIT, NIT > &A,
const Vector< InputType1, backend, Coords > &x,
const Vector< InputType2, backend, Coords > &y,
const Phase &phase = EXECUTE
@@ -180,7 +230,8 @@ namespace grb {
(void) y;
(void) phase;
#ifdef _DEBUG
- std::cerr << "Selected backend does not implement grb::zip (vectors into matrices, void)\n";
+ std::cerr << "Selected backend does not implement grb::zip (vectors into "
+ << "matrices, void)\n";
#endif
#ifndef NDEBUG
const bool selected_backend_does_not_support_zip_from_vectors_to_void_matrix
diff --git a/include/graphblas/base/collectives.hpp b/include/graphblas/base/collectives.hpp
index d9af31523..a77638fed 100644
--- a/include/graphblas/base/collectives.hpp
+++ b/include/graphblas/base/collectives.hpp
@@ -15,7 +15,12 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Specifies some basic collectives which may be used within a multi-process
+ * ALP program.
+ *
* @author A. N. Yzelman & J. M. Nash
* @date 20th of February, 2017
*/
@@ -27,6 +32,7 @@
#include
#include
+
namespace grb {
/**
@@ -39,219 +45,229 @@ namespace grb {
template< enum Backend implementation >
class collectives {
- private:
- /** Disallow creating an instance. */
- collectives() {}
+ private:
- public:
- /**
- * Schedules an allreduce operation of a single object of type IOType per
- * process. The allreduce shall be complete by the end of the call. This is a
- * collective graphBLAS operation. After the collective call finishes, each
- * user process will locally have available the allreduced value.
- *
- * Since this is a collective call, there are \a P values \a inout spread over
- * all user processes. Let these values be denoted by \f$ x_s \f$, with
- * \f$ s \in \{ 0, 1, \ldots, P-1 \}, \f$ such that \f$ x_s \f$ equals the
- * argument \a inout on input at the user process with ID \a s. Let
- * \f$ \pi:\ \{ 0, 1, \ldots, P-1 \} \to \{ 0, 1, \ldots, P-1 \} \f$ be a
- * bijection, some unknown permutation of the process ID. This permutation is
- * must be fixed for any given combination of GraphBLAS implementation and value
- * \a P. Let the binary operator \a op be denoted by \f$ \odot \f$.
- *
- * This function computes \f$ \odot_{i=0}^{P-1} x_{\pi(i)} \f$ and writes the
- * exact same result to \a inout at each of the \a P user processes.
- *
- * In summary, this means 1) this operation is coherent across all processes and
- * produces bit-wise equivalent output on all user processes, and 2) the result
- * is reproducible across different runs using the same input and \a P. Yet it
- * does \em not mean that the order of addition is fixed.
- *
- * Since each user process supplies but one value, there is no difference
- * between a reduce-to-the-left versus a reduce-to-the-right (see grb::reducel
- * and grb::reducer).
- *
- * @tparam descr The GraphBLAS descriptor.
- * Default is grb::descriptors::no_operation.
- * @tparam Operator Which operator to use for reduction.
- * @tparam IOType The type of the to-be reduced value.
- *
- * @param[in,out] inout On input: the value at the calling process to be
- * reduced. On output: the reduced value.
- * @param[in] op The associative operator to reduce by.
- *
- * \note If \op is commutative, the implementation free to employ a different
- * allreduce algorithm, as long as it is documented well enough so that
- * its cost can be quantified.
- *
- * @returns grb::SUCCESS When the operation succeeds as planned.
- * @returns grb::PANIC When the communication layer unexpectedly fails. When
- * this error code is returned, the library enters an
- * undefined state.
- *
- * \parblock
- * \par Valid descriptors:
- * -# grb::descriptors::no_operation
- * -# grb::descriptors::no_casting
- * Any other descriptors will be ignored.
- * \endparblock
- *
- * \parblock
- * \par Performance semantics:
- * -# Problem size N: \f$ P * \mathit{sizeof}(\mathit{IOType}) \f$
- * -# local work: \f$ N*Operator \f$ ;
- * -# transferred bytes: \f$ N \f$ ;
- * -# BSP cost: \f$ Ng + N*Operator + l \f$;
- * \endparblock
- */
- template< Descriptor descr = descriptors::no_operation, typename Operator, typename IOType >
- static RC allreduce( IOType & inout, const Operator op = Operator() ) {
- (void)inout;
- (void)op;
- return PANIC;
- }
+ /** Disallow creating an instance. */
+ collectives() {}
- /**
- * Schedules a reduce operation of a single object of type IOType per process.
- * The reduce shall be complete by the end of the call. This is a collective
- * graphBLAS operation. The BSP costs are as for the PlatformBSP #reduce.
- *
- * Since this is a collective call, there are \a P values \a inout spread over
- * all user processes. Let these values be denoted by \f$ x_s \f$, with
- * \f$ s \in \{ 0, 1, \ldots, P-1 \}, \f$ such that \f$ x_s \f$ equals the
- * argument \a inout on input at the user process with ID \a s. Let
- * \f$ \pi:\ \{ 0, 1, \ldots, P-1 \} \to \{ 0, 1, \ldots, P-1 \} \f$ be a
- * bijection, some unknown permutation of the process ID. This permutation is
- * must be fixed for any given combination of GraphBLAS implementation and value
- * \a P. Let the binary operator \a op be denoted by \f$ \odot \f$.
- *
- * This function computes \f$ \odot_{i=0}^{P-1} x_{\pi(i)} \f$ and writes the
- * result to \a inout at the user process with ID \a root.
- *
- * In summary, this the result is reproducible across different runs using the
- * same input and \a P. Yet it does \em not mean that the order of addition is
- * fixed.
- *
- * Since each user process supplies but one value, there is no difference
- * between a reduce-to-the-left versus a reduce-to-the-right (see grb::reducel
- * and grb::reducer).
- *
- * @tparam descr The GraphBLAS descriptor.
- * Default is grb::descriptors::no_operation.
- * @tparam Operator Which operator to use for reduction.
- * @tparam IOType The type of the to-be reduced value.
- *
- * @param[in,out] inout On input: the value at the calling process to be
- * reduced. On output at process \a root: the reduced value.
- * On output as non-root processes: same value as on input.
- * @param[in] op The associative operator to reduce by.
- * @param[in] root Which process should hold the reduced value. This
- * number must be larger or equal to zero, and must be
- * strictly smaller than the number of user processes
- * \a P.
- *
- * @return SUCCESS When the function completes successfully.
- * @return ILLEGAL When root is larger or equal than \a P. When this code is
- * returned, the state of the GraphBLAS shall be as though
- * this call was never made.
- * @return PANIC When an unmitigable error within the GraphBLAS occurs.
- * Upon returning this error, the GraphBLAS enters an
- * undefined state.
- *
- * \note If \op is commutative, the implementation free to employ a different
- * allreduce algorithm, as long as it is documented well enough so that
- * its cost can be quantified.
- *
- * \parblock
- * \par Performance semantics:
- * -# Problem size N: \f$ P * \mathit{sizeof}(\mathit{IOType}) \f$
- * -# local work: \f$ N*Operator \f$ ;
- * -# transferred bytes: \f$ N \f$ ;
- * -# BSP cost: \f$ Ng + N*Operator + l \f$;
- * \endparblock
- */
- template< Descriptor descr = descriptors::no_operation, typename Operator, typename IOType >
- static RC reduce( IOType & inout, const size_t root = 0, const Operator op = Operator() ) {
- (void)inout;
- (void)op;
- (void)root;
- return PANIC;
- }
- /**
- * Schedules a broadcast operation of a single object of type IOType per
- * process. The broadcast shall be complete by the end of the call. This is
- * a collective graphBLAS operation. The BSP costs are as for the PlatformBSP
- * #broadcast.
- *
- * @tparam IOType The type of the to-be broadcast value.
- *
- * @param[in,out] inout On input at process \a root: the value to be
- * broadcast.
- * On input at non-root processes: initial values are
- * ignored.
- * On output at process \a root: the input value remains
- * unchanged.
- * On output at non-root processes: the same value held
- * at process ID \a root.
- * @param[in] root The user process which is to send out the given input
- * value \a inout so that it becomes available at all
- * \a P user processes. This value must be larger or
- * equal to zero and must be smaller than the total
- * number of user processes \a P.
- *
- * @return SUCCESS On the successful completion of this function.
- * @return ILLEGAL When \a root is larger or equal to \a P. If this code is
- * returned, it shall be as though the call to this function
- * had never occurred.
- * return PANIC When the function fails and the library enters an
- * undefined state.
- *
- * \parblock
- * \par Performance semantics: serial
- * -# Problem size N: \f$ \mathit{sizeof}(\mathit{IOType}) \f$
- * -# local work: \f$ 0 \f$ ;
- * -# transferred bytes: \f$ NP \f$ ;
- * -# BSP cost: \f$ NPg + l \f$;
- * \endparblock
- *
- * \par Performance semantics: two phase
- * -# Problem size N: \f$ \mathit{sizeof}(\mathit{IOType}) \f$
- * -# local work: \f$ 0 \f$ ;
- * -# transferred bytes: \f$ 2N \f$ ;
- * -# BSP cost: \f$ 2(Ng + l) \f$;
- * \endparblock
- *
- * \par Performance semantics: two level tree
- * -# Problem size N: \f$ \mathit{sizeof}(\mathit{IOType}) \f$
- * -# local work: \f$ 0 \f$ ;
- * -# transferred bytes: \f$ 2\sqrt{P}N \f$ ;
- * -# BSP cost: \f$ 2(\sqrt{P}Ng + l) \f$;
- * \endparblock
- */
- template< typename IOType >
- static RC broadcast( IOType &inout, const size_t root = 0 ) {
- (void)inout;
- (void)root;
- return PANIC;
- }
+ public:
- /**
- * Broadcast on an array of \a IOType.
- *
- * The above documentation applies with \a size times sizeof(IOType)
- * substituted in.
- */
- template< Descriptor descr = descriptors::no_operation, typename IOType >
- static RC broadcast( IOType * inout, const size_t size, const size_t root = 0 ) {
- (void)inout;
- (void)size;
- (void)root;
- return PANIC;
- }
+ /**
+ * Schedules an allreduce operation of a single object of type IOType per
+ * process. The allreduce shall be complete by the end of the call. This is a
+ * collective graphBLAS operation. After the collective call finishes, each
+ * user process will locally have available the allreduced value.
+ *
+ * Since this is a collective call, there are \a P values \a inout spread over
+ * all user processes. Let these values be denoted by \f$ x_s \f$, with
+ * \f$ s \in \{ 0, 1, \ldots, P-1 \}, \f$ such that \f$ x_s \f$ equals the
+ * argument \a inout on input at the user process with ID \a s. Let
+ * \f$ \pi:\ \{ 0, 1, \ldots, P-1 \} \to \{ 0, 1, \ldots, P-1 \} \f$ be a
+ * bijection, some unknown permutation of the process ID. This permutation is
+ * must be fixed for any given combination of GraphBLAS implementation and value
+ * \a P. Let the binary operator \a op be denoted by \f$ \odot \f$.
+ *
+ * This function computes \f$ \odot_{i=0}^{P-1} x_{\pi(i)} \f$ and writes the
+ * exact same result to \a inout at each of the \a P user processes.
+ *
+ * In summary, this means 1) this operation is coherent across all processes and
+ * produces bit-wise equivalent output on all user processes, and 2) the result
+ * is reproducible across different runs using the same input and \a P. Yet it
+ * does \em not mean that the order of addition is fixed.
+ *
+ * Since each user process supplies but one value, there is no difference
+ * between a reduce-to-the-left versus a reduce-to-the-right (see grb::reducel
+ * and grb::reducer).
+ *
+ * @tparam descr The GraphBLAS descriptor.
+ * Default is grb::descriptors::no_operation.
+ * @tparam Operator Which operator to use for reduction.
+ * @tparam IOType The type of the to-be reduced value.
+ *
+ * @param[in,out] inout On input: the value at the calling process to be
+ * reduced. On output: the reduced value.
+ * @param[in] op The associative operator to reduce by.
+ *
+ * \note If \a op is commutative, the implementation is free to employ a
+ * different allreduce algorithm, as long as it is documented well
+ * enough so that its cost can be quantified.
+ *
+ * @returns grb::SUCCESS When the operation succeeds as planned.
+ * @returns grb::PANIC When the communication layer unexpectedly fails. When
+ * this error code is returned, the library enters an
+ * undefined state.
+ *
+ * \parblock
+ * \par Valid descriptors:
+ * -# grb::descriptors::no_operation
+ * -# grb::descriptors::no_casting
+ * Any other descriptors will be ignored.
+ * \endparblock
+ *
+ * \parblock
+ * \par Performance semantics:
+ * -# Problem size N: \f$ P * \mathit{sizeof}(\mathit{IOType}) \f$
+ * -# local work: \f$ N*Operator \f$ ;
+ * -# transferred bytes: \f$ N \f$ ;
+ * -# BSP cost: \f$ Ng + N*Operator + l \f$;
+ * \endparblock
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename Operator,
+ typename IOType
+ >
+ static RC allreduce( IOType &inout, const Operator op = Operator() ) {
+ (void) inout;
+ (void)op;
+ return PANIC;
+ }
+
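For concreteness, a minimal usage sketch follows. It is illustrative only (not
part of this patch) and assumes a backend that actually implements the
primitive, since this base version always returns PANIC, as well as the
standard grb::operators::add operator:

    // each user process contributes one local value; after the call, all
    // processes hold the bit-wise identical global sum
    double local = 1.0; // e.g., a locally computed partial result
    grb::RC rc = grb::collectives< grb::config::default_backend >::allreduce(
        local, grb::operators::add< double >() );
    // on grb::SUCCESS, local now equals the sum over all P processes
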
+ /**
+ * Schedules a reduce operation of a single object of type IOType per process.
+ * The reduce shall be complete by the end of the call. This is a collective
+ * graphBLAS operation. The BSP costs are as for the PlatformBSP #reduce.
+ *
+ * Since this is a collective call, there are \a P values \a inout spread over
+ * all user processes. Let these values be denoted by \f$ x_s \f$, with
+ * \f$ s \in \{ 0, 1, \ldots, P-1 \}, \f$ such that \f$ x_s \f$ equals the
+ * argument \a inout on input at the user process with ID \a s. Let
+ * \f$ \pi:\ \{ 0, 1, \ldots, P-1 \} \to \{ 0, 1, \ldots, P-1 \} \f$ be a
+ * bijection, some unknown permutation of the process ID. This permutation
+ * must be fixed for any given combination of GraphBLAS implementation and value
+ * \a P. Let the binary operator \a op be denoted by \f$ \odot \f$.
+ *
+ * This function computes \f$ \odot_{i=0}^{P-1} x_{\pi(i)} \f$ and writes the
+ * result to \a inout at the user process with ID \a root.
+ *
+ * In summary, the result is reproducible across different runs using the
+ * same input and \a P. Yet it does \em not mean that the order of addition is
+ * fixed.
+ *
+ * Since each user process supplies but one value, there is no difference
+ * between a reduce-to-the-left versus a reduce-to-the-right (see grb::reducel
+ * and grb::reducer).
+ *
+ * @tparam descr The GraphBLAS descriptor.
+ * Default is grb::descriptors::no_operation.
+ * @tparam Operator Which operator to use for reduction.
+ * @tparam IOType The type of the to-be reduced value.
+ *
+ * @param[in,out] inout On input: the value at the calling process to be
+ * reduced. On output at process \a root: the reduced value.
+ * On output as non-root processes: same value as on input.
+ * @param[in] op The associative operator to reduce by.
+ * @param[in] root Which process should hold the reduced value. This
+ * number must be larger or equal to zero, and must be
+ * strictly smaller than the number of user processes
+ * \a P.
+ *
+ * @return SUCCESS When the function completes successfully.
+ * @return ILLEGAL When \a root is larger or equal to \a P. When this code is
+ * returned, the state of the GraphBLAS shall be as though
+ * this call was never made.
+ * @return PANIC When an unmitigable error within the GraphBLAS occurs.
+ * Upon returning this error, the GraphBLAS enters an
+ * undefined state.
+ *
+ * \note If \a op is commutative, the implementation is free to employ a
+ * different reduce algorithm, as long as the performance semantics
+ * are documented so that its cost can be quantified.
+ *
+ * \parblock
+ * \par Performance semantics:
+ * -# Problem size N: \f$ P * \mathit{sizeof}(\mathit{IOType}) \f$
+ * -# local work: \f$ N*Operator \f$ ;
+ * -# transferred bytes: \f$ N \f$ ;
+ * -# BSP cost: \f$ Ng + N*Operator + l \f$;
+ * \endparblock
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename Operator,
+ typename IOType
+ >
+ static RC reduce(
+ IOType &inout,
+ const size_t root = 0,
+ const Operator op = Operator()
+ ) {
+ (void) inout;
+ (void) op;
+ (void) root;
+ return PANIC;
+ }
+
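A matching sketch for reduce, under the same assumptions as the allreduce
sketch above; note that only the root process observes the reduced value:

    double local = 1.0;
    grb::RC rc = grb::collectives< grb::config::default_backend >::reduce(
        local, 0, grb::operators::add< double >() );
    // at the process with ID 0, local now holds the global sum; at all
    // other processes it is unchanged. A root argument >= P would instead
    // yield grb::ILLEGAL.
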
+ /**
+ * Schedules a broadcast operation of a single object of type IOType per
+ * process. The broadcast shall be complete by the end of the call. This is
+ * a collective graphBLAS operation. The BSP costs are as for the PlatformBSP
+ * #broadcast.
+ *
+ * @tparam IOType The type of the to-be broadcast value.
+ *
+ * @param[in,out] inout On input at process \a root: the value to be
+ * broadcast.
+ * On input at non-root processes: initial values are
+ * ignored.
+ * On output at process \a root: the input value remains
+ * unchanged.
+ * On output at non-root processes: the same value held
+ * at process ID \a root.
+ * @param[in] root The user process which is to send out the given input
+ * value \a inout so that it becomes available at all
+ * \a P user processes. This value must be larger or
+ * equal to zero and must be smaller than the total
+ * number of user processes \a P.
+ *
+ * @return SUCCESS On the successful completion of this function.
+ * @return ILLEGAL When \a root is larger or equal to \a P. If this code is
+ * returned, it shall be as though the call to this function
+ * had never occurred.
+ * @return PANIC When the function fails and the library enters an
+ * undefined state.
+ *
+ * \parblock
+ * \par Performance semantics
+ * Backends should define performance semantics in terms of work and data
+ * movement, the latter both within and between user processes. Also the
+ * number of synchronisations between user processes must be quantified.
+ *
+ * Backends furthermore must indicate whether system calls may occur during a
+ * call to this primitive, indicate whether additional dynamic memory may be
+ * allocated (and if so, when it is freed), and quantify the required work
+ * space.
+ * \endparblock
+ */
+ template< typename IOType >
+ static RC broadcast( IOType &inout, const size_t root = 0 ) {
+ (void) inout;
+ (void) root;
+ return PANIC;
+ }
+
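The single-value broadcast admits a similar sketch (same assumptions; the
variable s is assumed to hold the calling user process ID):

    size_t seed = 0;
    if( s == 0 ) { seed = 42; } // only the root value matters on input
    grb::RC rc = grb::collectives< grb::config::default_backend >::broadcast(
        seed, 0 );
    // afterwards, every user process holds seed == 42
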
+ /**
+ * Broadcast on an array of \a IOType.
+ *
+ * The above documentation applies with \a size times sizeof(IOType)
+ * substituted in.
+ */
+ template< Descriptor descr = descriptors::no_operation, typename IOType >
+ static RC broadcast(
+ IOType * inout,
+ const size_t size,
+ const size_t root = 0
+ ) {
+ (void) inout;
+ (void) size;
+ (void) root;
+ return PANIC;
+ }
}; // end class ``collectives''
} // end namespace grb
#endif // end _H_GRB_COLL_BASE
+
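The array variant of broadcast takes a pointer plus an element count instead;
a hedged sketch (assumes #include <vector>):

    std::vector< double > buffer( 1024 ); // contents meaningful at root only
    grb::RC rc = grb::collectives< grb::config::default_backend >::broadcast(
        buffer.data(), buffer.size(), 0 );
    // all user processes now hold the 1024 elements that process 0 held
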
diff --git a/include/graphblas/base/config.hpp b/include/graphblas/base/config.hpp
index 353b4ed90..f7796c852 100644
--- a/include/graphblas/base/config.hpp
+++ b/include/graphblas/base/config.hpp
@@ -15,7 +15,12 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Defines both configuration parameters effective for all backends, as
+ * well as structured ways of passing backend-specific parameters.
+ *
* @author A. N. Yzelman
* @date 8th of August, 2016
*/
@@ -41,20 +46,43 @@
#define _GRB_BACKEND reference
#endif
-/**
- * The main GraphBLAS namespace.
- *
- * All GraphBLAS functions and objects are defined within.
- */
+
namespace grb {
- /** Contains compile-time configuration constants. */
+ /**
+ * Compile-time configuration constants as well as implementation details that
+ * are derived from such settings.
+ */
namespace config {
- /** The default backend to be selected for an end user. */
+ /**
+ * \defgroup config Configuration
+ *
+ * This module collects all configuration settings.
+ */
+
+ /**
+ * \defgroup commonConfig Common configuration settings
+ * \ingroup config
+ *
+ * Configuration elements contained in this group affect all backends.
+ *
+ * @{
+ */
+
+ /**
+ * \internal
+ * The default backend to be selected for an end user.
+ * \ingroup config
+ * \endinternal
+ */
static constexpr grb::Backend default_backend = _GRB_BACKEND;
- /** The cache line size, in bytes. */
+ /**
+ * Contains information about the target architecture cache line size.
+ *
+ * \ingroup config
+ */
class CACHE_LINE_SIZE {
private:
@@ -68,15 +96,22 @@ namespace grb {
public:
/**
+ * \internal
* @return The cache line size in bytes.
* @see grb::config::CACHE_LINE_SIZE::bytes
+ * \endinternal
*/
static constexpr size_t value() {
return bytes;
}
+
};
- /** The SIMD size, in bytes. */
+ /**
+ * The SIMD size, in bytes.
+ *
+ * \ingroup config
+ */
class SIMD_SIZE {
private:
@@ -90,8 +125,10 @@ namespace grb {
public:
/**
+ * \internal
* @return The SIMD size in bytes.
* @see grb::config::SIMD_SIZE::bytes
+ * \endinternal
*/
static constexpr size_t value() {
return bytes;
@@ -99,25 +136,34 @@ namespace grb {
};
- /** How many elements of a given data type fit into a SIMD register. */
+ /**
+ * \internal
+ * How many elements of a given data type fit into a SIMD register.
+ * \ingroup config
+ * \endinternal
+ */
template< typename T >
class SIMD_BLOCKSIZE {
public:
/**
+ * \internal
* Calculates the block size this operator should use.
*
* \warning This rounds down. If instances of T are too large, this could
* result in a zero value. See #value for a correction.
+ * \endinternal
*/
static constexpr size_t unsafe_value() {
return SIMD_SIZE::value() / sizeof( T );
}
/**
+ * \internal
* The maximum of one and the number of elements that fit into a single
* cache line.
+ * \endinternal
*/
static constexpr size_t value() {
return unsafe_value() > 0 ? unsafe_value() : 1;
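The rounding rule is easiest to see with concrete numbers. The following
stand-alone restatement is not part of this patch and assumes a 32-byte SIMD
width:

    #include <cstddef>

    constexpr std::size_t simd_bytes = 32; // assumed register width
    template< typename T >
    constexpr std::size_t blocksize() {
        return simd_bytes / sizeof( T ) > 0 ? simd_bytes / sizeof( T ) : 1;
    }
    static_assert( blocksize< double >() == 4, "4 x 8 = 32 bytes" );
    struct Large { char payload[ 48 ]; };
    static_assert( blocksize< Large >() == 1, "zero rounded up to one" );
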
@@ -126,63 +172,81 @@ namespace grb {
};
/**
+ * \internal
* How many hardware threads the operating system exposes.
*
* \warning On contemporary x86-based hardware, the reported number by
* value() will include that of each hyper-thread. This number
* thus does not necessarily equal the number of cores available.
+ *
+ * \ingroup config
+ * \endinternal
*/
class HARDWARE_THREADS {
public:
/**
- * Returns the number of online hardware threads as reported by the OS.
+ * \internal
+ * Returns the number of online hardware threads as reported by the
+ * operating system.
*
* \warning This is a UNIX system call.
*
* @returns The number of hardware threads currently online. The return
* type is specified by the UNIX standard.
+ * \endinternal
*/
static long value() {
return sysconf( _SC_NPROCESSORS_ONLN );
}
- };
+ };
- /** Benchmarking defaults. */
+ /**
+ * Benchmarking default configuration parameters.
+ *
+ * \ingroup config
+ */
class BENCHMARKING {
public:
- /** The default number of inner repititions. */
+ /** @returns The default number of inner repetitions. */
static constexpr size_t inner() {
return 1;
}
- /** The default number of outer repititions. */
+ /** @returns The default number of outer repetitions. */
static constexpr size_t outer() {
return 10;
}
};
- /** Memory defaults. */
+ /**
+ * Memory configuration parameters.
+ *
+ * \ingroup config
+ */
class MEMORY {
public:
- /** The private L1 data cache size, in bytes. */
+ /** @returns the private L1 data cache size, in bytes. */
static constexpr size_t l1_cache_size() {
return 32768;
}
- /** What is considered a lot of memory, in 2-log of bytes. */
+ /**
+ * @returns What is considered a lot of memory, in 2-log of bytes.
+ */
static constexpr size_t big_memory() {
return 31;
} // 2GB
/**
+ * \internal
* The memory speed under random accesses of 8-byte words.
*
* @returns The requested speed in MiB/s/process.
@@ -196,12 +260,18 @@ namespace grb {
* much between architectures. Nevertheless, for best results, these
* numbers are best set to benchmarked values on the deployment
* hardware.
+ *
+ * @note Preliminary experiments have not resulted in a decisive gain from
+ * using this parameter, and hence it is currently not used by any
+ * backend.
+ * \endinternal
*/
static constexpr double random_access_memspeed() {
return 147.298;
}
/**
+ * \internal
* The memory speed under a limited number of streams of uncached data.
*
* @returns The requested speed in MiB/s/process.
@@ -215,15 +285,22 @@ namespace grb {
* much between architectures. Nevertheless, for best results, these
* numbers are best set to benchmarked values on the deployment
* hardware.
+ *
+ * @note Preliminary experiments have not resulted in a decisive gain from
+ * using this parameter, and hence it is currently not used by any
+ * backend.
+ * \endinternal
*/
static constexpr double stream_memspeed() {
return 1931.264;
}
/**
+ * \internal
* Prints memory usage info to stdout, but only for big memory allocations.
*
* @returns true if and only if this function printed information to stdout.
+ * \endinternal
*/
static bool report(
const std::string prefix, const std::string action,
@@ -268,13 +345,82 @@ namespace grb {
};
/**
- * Configuration parameters that may depend on the backend.
+ * Collects a series of implementation choices corresponding to some given
+ * \a backend.
+ *
+ * These implementation choices are useful for \em compositional backends;
+ * i.e., backends that rely on a nested sub-backend for functionality. To
+ * facilitate composability, backends are required to provide the functions
+ * specified herein.
+ *
+ * \note An example are the #grb::BSP1D and #grb::hybrid backends, that both
+ * share the exact same code, relying on either the #grb::reference or
+ * the #grb::reference_omp backend, respectively.
+ *
+ * \note The user documentation does not list all required fields; for a
+ * complete overview, see the developer documentation instead.
+ *
+ * The default class declaration is left empty to ensure that no backend
+ * implicitly relies on global defaults. Every backend therefore must
+ * specialise this class and implement the specified functions.
*
- * Empty by default so to ensure no-one implicitly relies on implicit
- * defaults.
+ * \warning Portable ALP user code does not rely on the implementation details
+ * gathered in this class.
+ *
+ * \note For properties of a backend that may (also) affect ALP user code,
+ * see #grb::Properties.
+ *
+ * The user documentation only documents the settings that could be useful to
+ * modify.
+ *
+ * \warning Modifying the documented functions should be done with care.
+ *
+ * \warning Any such modification typically requires rebuilding the ALP
+ * library itself.
+ *
+ * \note For viewing all implementation choices, please see the developer
+ * documentation.
+ *
+ * \ingroup config
*/
- template< grb::Backend implementation = default_backend >
- class IMPLEMENTATION {};
+ template< grb::Backend backend = default_backend >
+ class IMPLEMENTATION {
+#ifdef __DOXYGEN__
+ public:
+
+ /**
+ * Defines how private memory regions are allocated.
+ *
+ * @returns how a memory region that will not be accessed by threads other
+ * than the allocating thread, should be allocated.
+ */
+ static constexpr ALLOC_MODE defaultAllocMode();
+
+ /**
+ * Defines how shared memory regions are allocated.
+ *
+ * @returns how a memory region that may be accessed by threads other than
+ * the allocating thread, should be allocated.
+ */
+ static constexpr ALLOC_MODE sharedAllocMode();
+
+ /**
+ * \internal
+ * @returns whether the selected backend implements vectors as having fixed
+ * capacities. This is \em not a configuration choice for most backends,
+ * but rather a fixed consequence of design choices.
+ *
+ * \note The only legal fixed capacity a functional ALP/GraphBLAS backend
+ * may provide is one that is equal to its size.
+ *
+ * \note A backend backed by a sparse accumulator (SPA) will typically have
+ * fixed vector capacities, whereas one based on sets or other types
+ * of tree structures will typically have dynamic vector capacities.
+ * \endinternal
+ */
+ static constexpr bool fixedVectorCapacities();
+#endif
+ };
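As a hedged illustration of this compositional contract, a backend
specialisation could take the following shape; the member return values here
are invented for the example, and the ALLOC_MODE enum values are assumed:

    namespace grb { namespace config {
        template<>
        class IMPLEMENTATION< reference > {
            public:
                static constexpr ALLOC_MODE defaultAllocMode() {
                    return ALLOC_MODE::ALIGNED; // enum value assumed
                }
                static constexpr ALLOC_MODE sharedAllocMode() {
                    return ALLOC_MODE::INTERLEAVED; // enum value assumed
                }
                static constexpr bool fixedVectorCapacities() {
                    return true; // e.g., for a SPA-backed backend
                }
        };
    } }
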
/**
* What data type should be used to store row indices.
@@ -284,6 +430,8 @@ namespace grb {
*
* \note The data type for indices of general arrays is not configurable. This
* set of implementations use size_t for those.
+ *
+ * \ingroup config
*/
typedef unsigned int RowIndexType;
@@ -295,6 +443,8 @@ namespace grb {
*
* \note The data type for indices of general arrays is not configurable. This
* set of implementations use size_t for those.
+ *
+ * \ingroup config
*/
typedef unsigned int ColIndexType;
@@ -306,6 +456,8 @@ namespace grb {
*
* \note The data type for indices of general arrays is not configurable. This
* set of implementations use size_t for those.
+ *
+ * \ingroup config
*/
typedef size_t NonzeroIndexType;
@@ -317,6 +469,8 @@ namespace grb {
*
* \note The data type for indices of general arrays is not configurable. This
* set of implementations use size_t for those.
+ *
+ * \ingroup config
*/
typedef unsigned int VectorIndexType;
diff --git a/include/graphblas/base/exec.hpp b/include/graphblas/base/exec.hpp
index 19e800e80..fefb10132 100644
--- a/include/graphblas/base/exec.hpp
+++ b/include/graphblas/base/exec.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Specifies the #grb::Launcher functionalities.
+ *
* @author A. N. Yzelman
* @date 17th of April, 2017
*/
@@ -28,27 +32,31 @@
#include
#include
+
#ifndef _GRB_NO_STDIO
-#include
+ #include
#endif
+
namespace grb {
/**
- * The various ways in which the #Launcher can be used
- * to execute a GraphBLAS program.
+ * The various ways in which the #grb::Launcher can be used to execute an
+ * ALP program.
*
* \warning An implementation may require different linker commands
- * when using different modes. This is OK, since a call to
- * the #Launcher is required to be quite different
- * depending on which mode is used. The portability is in
- * the GraphBLAS program being launched-- that one should
- * never change depending on whichever mode it is used.
+ * when using different modes.
+ *
+ * \warning Depending on the mode given to #grb::Launcher, the parameters
+ * required for the exec function may differ.
+ *
+ * \note However, the ALP program is unaware of which mode the launcher
+ * employs, and will not have to change.
*/
enum EXEC_MODE {
/**
- * Automatic mode. The #Launcher can spawn user processes
+ * Automatic mode. The #grb::Launcher can spawn user processes
* which will execute a given program.
*/
AUTOMATIC = 0,
@@ -56,176 +64,268 @@ namespace grb {
/**
* Manual mode. The user controls \a nprocs user processes
* which together should execute a given program, by, for
- * example, using the #Launcher.
+ * example, using the #grb::Launcher.
*/
MANUAL,
/**
* When running from an MPI program. The user controls
* \a nprocs MPI programs, which, together, should execute
- * a given GraphBLAS program.
+ * a given ALP program.
*/
FROM_MPI
};
/**
- * Allows an auxiliary program to run any GraphBLAS program. Input data may be
- * passed through a user-defined type. Output data will be retrieved via the
- * same type. For implementations that support multiple user processes, the
- * caller may explicitly set the process ID and total number of user processes.
+ * A group of user processes that together execute ALP programs.
*
- * The intended use is to `just call' grb::exec which should, in its most
- * trivial form, compile regardless of which backend is selected.
+ * Allows an application to run any ALP program. Input data may be passed
+ * through a user-defined type. Output data will be retrieved via the same
+ * type.
*
- * @tparam mode Which #EXEC_MODE the Launcher should adhere to.
- * @tparam implementation Which GraphBLAS implementation is to be used.
+ * For backends that support multiple user processes, the caller may
+ * explicitly set the process ID and total number of user processes.
+ *
+ * The intended use is to `just call' the exec function, which should be
+ * accepted by any backend.
+ *
+ * @tparam mode Which #EXEC_MODE the Launcher should adhere to.
+ * @tparam backend Which backend is to be used.
*/
- template< enum EXEC_MODE mode, enum Backend implementation >
+ template< enum EXEC_MODE mode, enum Backend backend >
class Launcher {
public :
/**
- * Constructs a new Launcher. This constructor is a collective
- * call; all \a nprocs processes that form a single Launcher
- * group must make a call to this constructor at roughly the
- * same time. There is an implementation-defined time-out for
- * the creation of a Launcher group.
- *
- * @param[in] process_id The user process ID of the calling process.
- * The value must be larger or equal to 0. This
- * value must be strictly smaller than \a nprocs.
- * This value must be unique to the calling
- * process within this collective call across
- * \em all \a nprocs user processes. This number
- * \em must be strictly smaller than \a nprocs.
- * Optional: the default is 0.
- * @param[in] nprocs The total number of user processes making a
- * collective call to this function. Optional: the
- * default is 1.
- * @param[in] hostname The hostname of one of the user processes.
- * Optional: the default is `localhost'.
- * @param[in] port A free port number at \a hostname. This port
- * will be used for TCP connections to \a hostname
- * if and only if \a nprocs is larger than one.
- * Optional: the default value is `0'.
- *
- * @throws invalid_argument If #nprocs is zero.
- * @throws invalid_argument If #process_id is greater than or
- * equal to \a nprocs.
- *
- * \note An implementation may define further constraints on
- * the input arguments, such as, obviously, on \a hostname
- * and \a port, but also on \a nprocs and, as a result, on
- * \a process_id.
- */
- Launcher( const size_t process_id = 0, // user process ID
- const size_t nprocs = 1, // total number of user processes
- const std::string hostname = "localhost", // one of the user process hostnames
- const std::string port = "0" // a free port at hostname
- ) { // standard does not specify any constrants on hostname and port
- // so accept (and ignore) anything
- (void)hostname; (void)port;
+ * Constructs a new #grb::Launcher. This constructor is a collective call;
+ * all \a nprocs processes that form a single launcher group must make a
+ * simultaneous call to this constructor.
+ *
+ * There is an implementation-defined time-out for the creation of a launcher
+ * group.
+ *
+ * @param[in] process_id The user process ID of the calling process.
+ * The value must be larger or equal to 0. This
+ * value must be strictly smaller than \a nprocs.
+ * This value must be unique to the calling
+ * process within this collective call across
+ * \em all \a nprocs user processes. This number
+ * \em must be strictly smaller than \a nprocs.
+ * Optional: the default is 0.
+ * @param[in] nprocs The total number of user processes making a
+ * collective call to this function. Optional: the
+ * default is 1.
+ * @param[in] hostname The hostname of one of the user processes.
+ * Optional: the default is `localhost'.
+ * @param[in] port A free port number at \a hostname. This port
+ * will be used for TCP connections to \a hostname
+ * if and only if \a nprocs is larger than one.
+ * Optional: the default value is `0'.
+ *
+ * @throws invalid_argument If \a nprocs is zero.
+ * @throws invalid_argument If \a process_id is greater than or equal to
+ * \a nprocs.
+ *
+ * \note An implementation or backend may define further constraints on the
+ * input arguments, such as, obviously, on \a hostname and \a port, but
+ * also on \a nprocs and, as a result, on \a process_id.
+ *
+ * \note The most obvious is that backends supporting only one user process
+ * must not accept \a nprocs larger than 1.
+ *
+ * All aforementioned default values shall always be legal.
+ */
+ Launcher(
+ const size_t process_id = 0,
+ const size_t nprocs = 1,
+ const std::string hostname = "localhost",
+ const std::string port = "0"
+ ) {
+ // spec does not specify any constraints on hostname and port
+ // so accept (and ignore) anything
+ (void) hostname; (void) port;
#ifndef _GRB_NO_EXCEPTIONS
// sanity checks on process_id and nprocs
- if( nprocs == 0 ) { throw std::invalid_argument( "Total number of user "
- "processes must be "
- "strictly larger than "
- "zero." ); }
- if( process_id >= nprocs ) {
- throw std::invalid_argument( "Process ID must be strictly smaller than "
- "total number of user processes." );
- }
+ if( nprocs == 0 ) {
+ throw std::invalid_argument( "Total number of user processes must be "
+ "strictly larger than zero." );
+ }
+ if( process_id >= nprocs ) {
+ throw std::invalid_argument( "Process ID must be strictly smaller than "
+ "total number of user processes." );
+ }
#endif
-} // namespace grb
+ }
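For example, a manual-mode launcher group over P processes could be set up as
follows; the hostname and port are placeholders, s denotes the calling process
ID, and the backend template argument is assumed to default to the
compile-time selected backend:

    // executed by each of the P cooperating processes, with 0 <= s < P
    grb::Launcher< grb::MANUAL > launcher( s, P, "node0", "7777" );
    // with nprocs == 0 or s >= P, std::invalid_argument would be thrown
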
-/**
- * Executes the given GraphBLAS program. This function, depending on whether
- * GraphBLAS is compiled in automatic or in manual mode, will either
- * \em spawn the maximum number of available user processes or will connect
- * exactly \a nprocs existing processes, respectively, to execute the given
- * \a grb_program.
- *
- * This is a collective function call.
- *
- * @tparam T The type of the data to pass to the GraphBLAS program.
- * @tparam U The type of the output data to pass back to the user.
- *
- * @param[in] grb_program User GraphBLAS program to be executed.
- * @param[in] data_in Input data of user-defined type \a T.
- * When in automatic mode, the data will only be
- * available at user process 0 only. When in
- * manual mode, the data will be available to
- * this user process (with the below given
- * \a process_id) only.
- * @param[out] data_out Output data of user-defined type \a U. The output
- * data should be available at user process with ID
- * zero.
- * @param[in] broadcast Whether the input should be broadcast from user
- * process 0 to all other user processes. Optional;
- * the default value is \a false.
- *
- * @return SUCCESS If the execution proceeded as intended.
- * @return PANIC If an unrecoverable error was encountered while trying to
- * execute the given GraphBLAS program.
- *
- * \warning An implementation can define further constraints on the validity
- * of input arguments. The most obvious is that implementations
- * supporting only one user process will not accept \a nprocs larger
- * than 1.
- *
- * All aforementioned default values shall always be legal.
- */
-template< typename T, typename U >
-RC exec( void ( *grb_program )( const T &, U & ), // user GraphBLAS program
- const T & data_in,
- U & data_out, // input & output data
- const bool broadcast = false ) const {
- (void)grb_program;
- (void)data_in;
- (void)data_out;
- (void)broadcast;
- // stub implementation, should be overridden by specialised implementation,
- // so return error code
- return PANIC;
-}
+ /**
+ * Executes a given ALP program using the user processes encapsulated by this
+ * launcher group.
+ *
+ * Calling this function, depending on whether the automatic or manual/MPI
+ * mode was selected, will either \em spawn the maximum number of available
+ * user processes and \em then execute the given program, \em or it will
+ * employ the given processes that are managed by the user application and
+ * used to construct this launcher instance to execute the given
+ * \a alp_program.
+ *
+ * This is a collective function call-- all processes in the launcher group
+ * must make a simultaneous call to this function and must do so using
+ * consistent arguments.
+ *
+ * @tparam T The type of the data to pass to the ALP program as input.
+ * @tparam U The type of the output data to pass back to the caller.
+ *
+ * @param[in] alp_program The user program to be executed.
+ * @param[in] data_in Input data of user-defined type \a T.
+ *
+ * When in automatic mode and \a broadcast is false, the data will
+ * only be available at the user process with ID 0. When in automatic mode
+ * and \a broadcast is true, the data will be available at all user
+ * processes. When in manual mode, the data will be available to this user
+ * process only, with "this process" corresponding to the process that calls
+ * this function.
+ *
+ * @param[out] data_out Output data of user-defined type \a U. The output
+ * data should be available at user process with ID
+ * zero.
+ * @param[in] broadcast Whether the input should be broadcast from user
+ * process 0 to all other user processes. Optional;
+ * the default value is \a false.
+ *
+ * @return #grb::SUCCESS If the execution proceeded as intended.
+ * @return #grb::PANIC If an unrecoverable error was encountered while
+ * attempting to execute, attempting to terminate, or
+ * while executing, the given program.
+ *
+ * \warning Even if #grb::SUCCESS is returned, an algorithm may fail to
+ * achieve its intended result-- for example, an iterative solver
+ * may fail to converge. A good programming pattern ensures that
+ * \a U either a) is an error code for the algorithm used (e.g.,
+ * #grb::RC), or b) contains such an error code.
+ */
+ template< typename T, typename U >
+ RC exec(
+ void ( *alp_program )( const T &, U & ),
+ const T &data_in,
+ U &data_out,
+ const bool broadcast = false
+ ) const {
+ (void) alp_program;
+ (void) data_in;
+ (void) data_out;
+ (void) broadcast;
+ // stub implementation, should be overridden by specialised backend,
+ // so return error code
+ return PANIC;
+ }
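A hedged end-to-end sketch of the intended use follows; the program and its
input type are invented, but the pattern of returning an error code through
\a U matches the advice above:

    struct Input { size_t n; };

    void myAlpProgram( const Input &in, grb::RC &out ) {
        grb::Vector< double > x( in.n ); // any ALP computation
        out = grb::set( x, 1.0 );
    }

    int main() {
        grb::Launcher< grb::AUTOMATIC > launcher; // default backend assumed
        Input in; in.n = 1000;
        grb::RC out = grb::PANIC;
        if( launcher.exec( &myAlpProgram, in, out, true ) != grb::SUCCESS ) {
            return 1; // the launch itself failed
        }
        return out == grb::SUCCESS ? 0 : 2; // the algorithm's own status
    }
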
-/**
- * Variable size version of the above function.
- *
- * @param[in] broadcast Whether the input should be broadcast from user
- * process 0 to all other user processes. Optional;
- * the default value is \a false. This will let user
- * processes with ID larger than zero allocate
- * \a in_size bytes of memory into which the data at
- * process 0 will be copied.
- *
- * \todo more documentation
- */
-template< typename U >
-RC exec( void ( *grb_program )( const void *, const size_t, U & ), const void * data_in, const size_t in_size, U & data_out, const bool broadcast = false ) const {
- (void)grb_program;
- (void)data_in;
- (void)in_size;
- (void)data_out;
- (void)broadcast;
- return PANIC;
-}
+ /**
+ * Executes a given ALP program using the user processes encapsulated by this
+ * launcher group.
+ *
+ * In this variant of exec, \a data_in is of a variable byte size,
+ * instead of a fixed POD type. If \a broadcast is true and the
+ * launcher is instantiated using the #grb::AUTOMATIC mode, all bytes are
+ * broadcast to all user processes.
+ *
+ * @param[in] alp_program The user program to be executed.
+ * @param[in] data_in Pointer to raw input byte data.
+ * @param[in] in_size The number of bytes the input data consists of.
+ * @param[out] data_out Output data of user-defined type \a U. The output
+ * data should be available at user process with ID
+ * zero.
+ * @param[in] broadcast Whether the input should be broadcast from user
+ * process 0 to all other user processes. Optional;
+ * the default value is \a false.
+ *
+ * @return #grb::SUCCESS If the execution proceeded as intended.
+ * @return #grb::PANIC If an unrecoverable error was encountered while
+ * attempting to execute, attempting to terminate, or
+ * while executing, the given program.
+ *
+ * For more details, see the other version of this function.
+ */
+ template< typename U >
+ RC exec(
+ void ( *alp_program )( const void *, const size_t, U & ),
+ const void * data_in,
+ const size_t in_size,
+ U &data_out,
+ const bool broadcast = false
+ ) const {
+ (void) alp_program;
+ (void) data_in;
+ (void) in_size;
+ (void) data_out;
+ (void) broadcast;
+ return PANIC;
+ }
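The byte-sized variant is called analogously; a short sketch, where
myByteProgram is a hypothetical function with signature
void( const void *, const size_t, grb::RC & ):

    const char config_blob[] = "alpha=0.85"; // raw input bytes
    grb::RC out = grb::PANIC;
    grb::RC rc = launcher.exec(
        &myByteProgram, config_blob, sizeof( config_blob ), out, true );
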
-/**
- * Releases all GraphBLAS resources. After a call to this function, no
- * GraphBLAS library functions may be called any longer.
- *
- * @return SUCCESS A call to this function may never fail.
- */
-static RC finalize() {
- return PANIC;
-}
-}
-; // end class `Launcher'
+ /**
+ * Releases all ALP resources.
+ *
+ * After a call to this function, no further ALP programs may be launched
+ * using the #grb::Launcher or #grb::Benchmarker. The use of #grb::init and
+ * #grb::finalize will likewise no longer be accepted.
+ *
+ * \warning #grb::init and #grb::finalize are deprecated.
+ *
+ * \internal
+ * \todo Remove the above comments once #grb::init and #grb::finalize are
+ * moved to an internal namespace.
+ * \endinternal
+ *
+ * After a call to this function, the only way to once again run ALP programs
+ * is to use the #grb::Launcher from a new process.
+ *
+ * \warning Therefore, use this function with care and preferably only just
+ * before exiting the process.
+ *
+ * A well-behaving program calls this function, or
+ * #grb::Benchmarker::finalize, exactly once before its process terminates,
+ * or just after the guaranteed last invocation of an ALP program.
+ *
+ * @return #grb::SUCCESS The resources have successfully and permanently been
+ * released.
+ * @return #grb::PANIC An unrecoverable error has been encountered and the
+ * user program is encouraged to exit as quickly as
+ * possible. The state of the ALP library has become
+ * undefined and should no longer be used.
+ *
+ * \note In the terminology of the Message Passing Interface (MPI), this
+ * function is the ALP equivalent of MPI_Finalize().
+ *
+ * \note In #grb::AUTOMATIC mode when using a parallel backend that uses MPI
+ * to auto-parallelise the ALP computations, MPI is never explicitly
+ * exposed to the user application. This use case necessitates the
+ * specification of this function.
+ *
+ * \note Thus, and in particular, an ALP program launched in #grb::AUTOMATIC
+ * mode while using the #grb::BSP1D or the #grb::hybrid backends with
+ * ALP compiled using LPF that in turn is configured to use an
+ * MPI-based engine, should make sure to call this function before
+ * program exit.
+ *
+ * \note An application that launches ALP programs in #grb::FROM_MPI mode
+ * must still call this function, even though a proper such application
+ * makes its own call to MPI_Finalize(). This does \em not
+ * induce improper behaviour since calling this function using a
+ * launcher instance in #grb::FROM_MPI mode translates, from an MPI
+ * perspective, to a no-op.
+ *
+ * \internal This is the base implementation that should be specialised by
+ * each backend separately.
+ */
+ static RC finalize() {
+ return PANIC;
+ }
+
+ }; // end class `Launcher'
} // end namespace ``grb''
#endif // end _H_GRB_EXEC_BASE
+
diff --git a/include/graphblas/base/init.hpp b/include/graphblas/base/init.hpp
index ec6ca2529..285f9962b 100644
--- a/include/graphblas/base/init.hpp
+++ b/include/graphblas/base/init.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Specifies the #grb::init and #grb::finalize functionalities.
+ *
* @author A. N. Yzelman
* @date 24th of January, 2017
*/
@@ -33,75 +37,85 @@ namespace grb {
/**
* Initialises the calling user process.
*
- * \deprecated Please use grb::Launcher instead. This primitive will be
+ * \deprecated Please use #grb::Launcher instead. This primitive will be
 * removed from version 1.0 onwards.
*
+ * @tparam backend Which GraphBLAS backend this call to init initialises.
+ *
+ * By default, the backend that is selected by the user at compile-time is
+ * used. If no backend was selected, #grb::reference is assumed.
+ *
+ * @param[in] s The ID of this user process.
+ * @param[in] P The total number of user processes.
+ *
* If the backend supports multiple user processes, the user can invoke this
* function with \a P equal to one or higher; if the backend supports only a
* single user process, then \a P must equal one.
+ *
* The value for the user process ID \a s must be larger or equal to zero and
* must be strictly smaller than \a P. If \a P > 1, each user process must
* call this function collectively, each user process should pass the same
* value for \a P, and each user process should pass a unique value for \a s
* amongst all \a P collective calls made.
*
+ * @param[in] implementation_data Any implementation-defined data structure
+ * required for successful completion of this
+ * call.
+ *
* An implementation may define that additional data is required for a call to
* this function to complete successfully. Such data may be passed via the
* final argument to this function, \a implementation_data.
*
* If the implementation does not support multiple user processes, then a
- * value for \a implementation_data shall not be required. In parcticular, a
+ * value for \a implementation_data shall not be required. In particular, a
* call to this function with an empty parameter list shall then be legal
* and infer the following default arguments: zero for \a s, one for \a P,
* and \a NULL for \a implementation_data. When such an implementation is
- * requested to initialise multiple user processes, the grb::UNSUPPORTED
- * error code shall be returned.
- *
- * A call to this function must be matched with a call to grb::finalize().
- * After a successful call to this function, a new call to grb::init() without
- * first calling grb::finalize() shall incur undefined behaviour. The
- * construction of GraphBLAS containers without a preceding successful call
- * to grb::init() will result in invalid GraphBLAS objects. Any valid
- * GraphBLAS containers will become invalid after a call to grb::finalize().
- * Any use of GraphBLAS functions on invalid containers will result in
- * undefined behaviour.
- *
- * @tparam backend Which GraphBLAS backend this call to init initialises.
+ * requested to initialise multiple user processes, then #grb::UNSUPPORTED
+ * shall be returned.
*
- * @param[in] s The ID of this user process.
- * @param[in] P The total number of user processes.
- * @param[in] implementation_data Any implementation-defined data structure
- * required for successful completion of this
- * call.
+ * A call to this function must be matched with a call to #grb::finalize.
+ * After a successful call to this function, a new call to #grb::init without
+ * first calling #grb::finalize shall incur undefined behaviour. The
+ * construction of ALP/GraphBLAS containers without a preceding successful call
+ * to #grb::init will result in undefined behaviour. Any valid GraphBLAS
+ * containers will become invalid after a call to #grb::finalize.
*
+ * \internal
* \note For a pure MPI implementation, for instance, \a implementation_data
* may be a pointer to the MPI communicator corresponding to these user
* processes.
*
- * \note The implementations based on PlatformBSP require direct passing of
- * the \a bsp_t corresponding to the BSP context of the user processes;
- * this is legal since the PlatformBSP specification defines the
- * \a bsp_t type as a void pointer.
+ * \note The implementations based on LPF require direct passing of the
+ * \a lpf_t corresponding to the BSP context of the user processes;
+ * this is legal since the LPF defines the \a lpf_t type as a void
+ * pointer.
+ * \endinternal
*
* @return SUCCESS If the initialisation was successful.
* @return UNSUPPORTED When the implementation does not support multiple
- * user processes (\a P larger than 1). After a call to
- * this function exits with this error code the library
- * state shall be as though the call never were made.
- * @return PANIC If this function fails, the state of this GraphBLAS
- * implementation becomes undefined.
+ * user processes while the given \a P was larger than 1.
+ * @return PANIC If returned, the state of the ALP library becomes
+ * undefined.
+ *
+ * After a call to this function that exits with a non-SUCCESS and non-PANIC
+ * error code, the program shall behave as though the call were never made.
*
* \note There is no argument checking. If \a s is larger or equal to \a P,
* undefined behaviour occurs. If \a implementation_data was invalid
* or corrupted, undefined behaviour occurs.
*
+ * \internal
+ * \todo Define #grb::ILLEGAL to be returned if \f$ s \geq P \f$.
+ * \endinternal
+ *
* \par Performance semantics
- * None. Implementations are encouraged to specify the complexity of
- * their implementation of this function in terms of \a P.
+ * Implementations and backends must specify the complexity of this
+ * function in terms of \a P.
*
* \note Compared to the GraphBLAS C specification, this function lacks a
* choice whether to execute in `blocking' or `non-blocking' mode. With
- * ALP/GraphBLAS, the backend controls whether execution proceeds in a
+ * ALP, the selected backend controls whether execution proceeds in a
* non-blocking manner or not. Thus selecting a blocking backend for
* compilation results in the application of blocking semantics, while
* selecting a non-blocking backend results in the application of non-
@@ -110,16 +124,22 @@ namespace grb {
* valid implementation of a non-blocking mode. Therefore, this
* specification will still yield a valid C API implementation when
* properly wrapping around a blocking ALP/GraphBLAS backend.
- * \note This specification allows for grb::init() to be called multiple
- * times from the same process and the same thread. The parameters \a s
- * and \a P (and \a implementation_data) may differ each time. Each
- * (repeated) call must of course meet all the above requirements.
+ * \note This specification allows for #grb::init to be called multiple times
+ * from the same process and the same thread. The parameters \a s and
+ * \a P (and \a implementation_data) may differ each time. Each
+ * (repeated) call must of course continue to meet all the above
+ * requirements.
* \note The GraphBLAS C API does not have the notion of user processes. We
* believe this notion is necessary to properly integrate into parallel
* frameworks, and also to affect proper and efficient parallel I/O.
*
* \warning This primitive has been deprecated since version 0.5. Please update
- * your code to use the grb::Launcher instead.
+ * your code to use the #grb::Launcher instead.
+ *
+ * \internal The implementation will be retained after deprecation has been
+ * pushed through, as the #grb::Launcher depends on it. However, the
+ * #grb::init and #grb::finalize must then be moved into the
+ * #grb::internal namespace.
*/
template< enum Backend backend = config::default_backend >
RC init( const size_t s, const size_t P, void * const implementation_data ) {
@@ -132,42 +152,42 @@ namespace grb {
/**
* Initialises the calling user process.
*
- * \deprecated Please use grb::Launcher instead. This primitive will be
+ * \deprecated Please use #grb::Launcher instead. This primitive will be
 * removed from version 1.0 onwards.
*
* This variant takes no input arguments. It will assume a single user process
* exists; i.e., the call is equivalent to one to #grb::init with \a s zero
- * and \a P one.
+ * and \a P one (and \a implementation_data NULL ).
*
* @tparam backend The backend implementation to initialise.
*
* @return SUCCESS If the initialisation was successful.
- * @return PANIC If this function fails, the state of this GraphBLAS
- * implementation becomes undefined.
+ * @return PANIC If returned, the state of the ALP library becomes
+ * undefined.
*
* \warning This primitive has been deprecated since version 0.5. Please update
- * your code to use the grb::Launcher instead.
+ * your code to use the #grb::Launcher instead.
*/
template< enum Backend backend = config::default_backend >
RC init() {
- return grb::init< backend >( 0, 1, NULL );
+ return grb::init< backend >( 0, 1, nullptr );
}
/**
- * Finalises an ALP/GraphBLAS context opened by the last call to grb::init().
+ * Finalises an ALP/GraphBLAS context opened by the last call to #grb::init.
*
- * \deprecated Please use grb::Launcher instead. This primitive will be
+ * \deprecated Please use #grb::Launcher instead. This primitive will be
 * removed from version 1.0 onwards.
*
* This function must be called collectively and must follow a call to
- * grb::init(). After successful execution of this function, a new call
- * to grb::init() may be made.
+ * #grb::init. After successful execution of this function, a new call to
+ * #grb::init may be made. (This function is re-entrant.)
*
* After a call to this function, any ALP/GraphBLAS objects that remain in
* scope become invalid.
*
* \warning Invalid ALP/GraphBLAS containers will remain invalid no matter if a
- * next call to grb::init() is made.
+ * next call to #grb::init is made.
*
* @tparam backend Which ALP/GraphBLAS backend to finalise.
*
@@ -176,15 +196,15 @@ namespace grb {
* implementation becomes undefined. This means none of its
* functions should be called during the remainder program
* execution; in particular this means a new call to
- * grb::init() will not remedy the situaiton.
+ * #grb::init will not remedy the situation.
*
* \par Performance semantics
* None. Implementations are encouraged to specify the complexity of
* their implementation of this function in terms of the parameter
- * \a P the matching call to grb::init() was called with.
+ * \a P the matching call to #grb::init was called with.
*
* \warning This primitive has been deprecated since version 0.5. Please update
- * your code to use the grb::Launcher instead.
+ * your code to use the #grb::Launcher instead.
*/
template< enum Backend backend = config::default_backend >
RC finalize() {
diff --git a/include/graphblas/base/internalops.hpp b/include/graphblas/base/internalops.hpp
index 1ec5ce508..668534da9 100644
--- a/include/graphblas/base/internalops.hpp
+++ b/include/graphblas/base/internalops.hpp
@@ -54,85 +54,94 @@ namespace grb {
class argmin {
static_assert( std::is_integral< IType >::value,
- "Argmin operator may only be constructed using integral index "
- "types." );
-
- public:
- /** Alias to the left-hand input data type. */
- typedef std::pair< IType, VType > left_type;
-
- /** Alias to the right-hand input data type. */
- typedef std::pair< IType, VType > right_type;
-
- /** Alias to the output data type. */
- typedef std::pair< IType, VType > result_type;
-
- /** Whether this operator has an inplace foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an inplace foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = true;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = true;
-
- /**
- * Out-of-place application of the operator.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- */
- static void apply(
- const left_type * __restrict__ const a,
- const right_type * __restrict__ const b,
- result_type * __restrict__ const c
- ) {
- if( a->second < b->second ) {
- *c = *a;
- } else {
- *c = *b;
+ "Argmin operator may only be constructed using integral index types." );
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef std::pair< IType, VType > left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef std::pair< IType, VType > right_type;
+
+ /** Alias to the output data type. */
+ typedef std::pair< IType, VType > result_type;
+
+ /** Whether this operator has an inplace foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an inplace foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of the operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ if( a->second < b->second ) {
+ *c = *a;
+ } else {
+ *c = *b;
+ }
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ if( a->second < c->second ) {
+ c->first = a->first;
+ c->second = a->second;
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ if( b->second <= c->second ) {
+ c->first = b->first;
+ c->second = b->second;
+ }
}
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- if( a->second < c->second ) {
- c->first = a->first;
- c->second = a->second;
- }
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
- if( b->second <= c->second ) {
- c->first = b->first;
- c->second = b->second;
- }
- }
+
};
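+
+ /*
+  * A minimal usage sketch of the above operator; instantiating it with
+  * IType = size_t and VType = double is an illustrative assumption:
+  *
+  *   std::pair< size_t, double > a = { 3, 1.5 }, b = { 7, 0.5 }, c;
+  *   argmin< size_t, double >::apply( &a, &b, &c );
+  *   // c now equals { 7, 0.5 }: the entry with the smaller value wins
+  *
+  * Note the tie-breaking behaviour: apply and foldr compare using <, while
+  * foldl compares using <=; in all three cases a tie hence resolves to the
+  * right-hand operand.
+  */
+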
/**
@@ -148,85 +157,92 @@ namespace grb {
class argmax {
static_assert( std::is_integral< IType >::value,
- "Argmin operator may only be constructed using integral index "
- "types." );
-
- public:
- /** Alias to the left-hand input data type. */
- typedef std::pair< IType, VType > left_type;
-
- /** Alias to the right-hand input data type. */
- typedef std::pair< IType, VType > right_type;
-
- /** Alias to the output data type. */
- typedef std::pair< IType, VType > result_type;
-
- /** Whether this operator has an inplace foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an inplace foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = true;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = true;
-
- /**
- * Out-of-place application of the operator.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- */
- static void apply(
- const left_type * __restrict__ const a,
- const right_type * __restrict__ const b,
- result_type * __restrict__ const c
- ) {
- if( a->second > b->second ) {
- *c = *a;
- } else {
- *c = *b;
+ "Argmin operator may only be constructed using integral index types." );
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef std::pair< IType, VType > left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef std::pair< IType, VType > right_type;
+
+ /** Alias to the output data type. */
+ typedef std::pair< IType, VType > result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of the operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ if( a->second > b->second ) {
+ *c = *a;
+ } else {
+ *c = *b;
+ }
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ if( a->second > c->second ) {
+ c->first = a->first;
+ c->second = a->second;
+ }
}
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- if( a->second > c->second ) {
- c->first = a->first;
- c->second = a->second;
- }
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
- if( b->second >= c->second ) {
- c->first = b->first;
- c->second = b->second;
- }
- }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ if( b->second >= c->second ) {
+ c->first = b->first;
+ c->second = b->second;
+ }
+ }
+
};
/**
@@ -256,73 +272,88 @@ namespace grb {
* @tparam IN2 The right-hand input data type.
* @tparam OUT The output data type.
*/
- template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
class left_assign {
- public:
- /** Alias to the left-hand input data type. */
- typedef IN1 left_type;
-
- /** Alias to the right-hand input data type. */
- typedef IN2 right_type;
-
- /** Alias to the output data type. */
- typedef OUT result_type;
-
- /** Whether this operator has an inplace foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an inplace foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = true;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = false;
-
- /**
- * Out-of-place application of the addition c = a.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- */
- static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
- (void)b;
- *c = static_cast< result_type >( *a );
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- *c = static_cast< result_type >( *a );
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
- (void)b;
- (void)c;
- }
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = false;
+
+ /**
+ * Out-of-place application of the assignment c = a.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ (void) b;
+ *c = static_cast< result_type >( *a );
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ *c = static_cast< result_type >( *a );
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ (void) b;
+ (void) c;
+ }
+
};
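+
+ /*
+  * A minimal usage sketch, under the illustrative assumption that this
+  * operator is instantiated as left_assign< int, int, int >:
+  *
+  *   int a = 1, b = 2, c = 0;
+  *   left_assign< int, int, int >::apply( &a, &b, &c ); // c == 1
+  *
+  * The operator thus acts as a "first"-style projection: apply and foldr
+  * overwrite the output with the left-hand argument, while foldl reduces
+  * to a no-op since c = op( c, b ) = c already holds.
+  */
+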
/**
@@ -352,73 +383,90 @@ namespace grb {
* @tparam IN2 The right-hand input data type.
* @tparam OUT The output data type.
*/
- template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
class right_assign {
- public:
- /** Alias to the left-hand input data type. */
- typedef IN1 left_type;
-
- /** Alias to the right-hand input data type. */
- typedef IN2 right_type;
-
- /** Alias to the output data type. */
- typedef OUT result_type;
-
- /** Whether this operator has an inplace foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an inplace foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = true;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = false;
-
- /**
- * Out-of-place application of the addition c = a.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- */
- static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
- (void)a;
- *c = *b;
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- (void)a;
- (void)c;
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
- *c = static_cast< result_type >( *b );
- }
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = false;
+
+ /**
+ * Out-of-place application of the assignment c = b.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ (void) a;
+ *c = *b;
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ (void) a;
+ (void) c;
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ *c = static_cast< result_type >( *b );
+ }
+
};
/**
@@ -429,77 +477,94 @@ namespace grb {
*
* If \f$ x \f$ does not evaluate true the operator shall have no effect.
*/
- template< typename D1, typename D2, typename D3, enum Backend implementation = config::default_backend >
+ template<
+ typename D1, typename D2, typename D3,
+ enum Backend implementation = config::default_backend
+ >
class left_assign_if {
- public:
- /** Alias to the left-hand input data type. */
- typedef D1 left_type;
-
- /** Alias to the right-hand input data type. */
- typedef D2 right_type;
-
- /** Alias to the output data type. */
- typedef D3 result_type;
-
- /** Whether this operator has an inplace foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an inplace foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = true;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = true;
-
- /**
- * Out-of-place application of the addition c = a.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- */
- static void apply( const D1 * __restrict__ const a, const D2 * __restrict__ const b, D3 * __restrict__ const c ) {
- if( static_cast< const bool >( *b ) ) {
- *c = *a;
- }
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const D1 * __restrict__ const a, D3 * __restrict__ const c ) {
- if( static_cast< const bool >( *c ) ) {
- *c = static_cast< D3 >( *a );
- }
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( D3 * __restrict__ const c, const D2 * __restrict__ const b ) {
- if( static_cast< bool >( *b ) ) {
- *c = static_cast< D3 >( static_cast< D1 >( *c ) );
- }
- }
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef D1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef D2 right_type;
+
+ /** Alias to the output data type. */
+ typedef D3 result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of the conditional assignment c = a, which
+ * is performed only if b evaluates true.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ */
+ static void apply(
+ const D1 * __restrict__ const a,
+ const D2 * __restrict__ const b,
+ D3 * __restrict__ const c
+ ) {
+ if( static_cast< const bool >( *b ) ) {
+ *c = *a;
+ }
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const D1 * __restrict__ const a,
+ D3 * __restrict__ const c
+ ) {
+ if( static_cast< const bool >( *c ) ) {
+ *c = static_cast< D3 >( *a );
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ D3 * __restrict__ const c,
+ const D2 * __restrict__ const b
+ ) {
+ if( static_cast< bool >( *b ) ) {
+ *c = static_cast< D3 >( static_cast< D1 >( *c ) );
+ }
+ }
+
};
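+
+ /*
+  * A minimal usage sketch of the above masked assignment, under the
+  * illustrative instantiation left_assign_if< double, bool, double >:
+  *
+  *   double a = 3.5, c = 0.0;
+  *   bool mask = true;
+  *   left_assign_if< double, bool, double >::apply( &a, &mask, &c );
+  *   // c == 3.5; had mask been false, c would have been left untouched
+  */
+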
/**
@@ -510,77 +575,94 @@ namespace grb {
*
* If \f$ x \f$ does not evaluate true the operator shall have no effect.
*/
- template< typename D1, typename D2, typename D3, enum Backend implementation = config::default_backend >
+ template<
+ typename D1, typename D2, typename D3,
+ enum Backend implementation = config::default_backend
+ >
class right_assign_if {
- public:
- /** Alias to the left-hand input data type. */
- typedef D1 left_type;
-
- /** Alias to the right-hand input data type. */
- typedef D2 right_type;
-
- /** Alias to the output data type. */
- typedef D3 result_type;
-
- /** Whether this operator has an inplace foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an inplace foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = true;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = true;
-
- /**
- * Out-of-place application of the addition c = a.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- */
- static void apply( const D1 * __restrict__ const a, const D2 * __restrict__ const b, D3 * __restrict__ const c ) {
- if( static_cast< const bool >( *a ) ) {
- *c = *b;
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef D1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef D2 right_type;
+
+ /** Alias to the output data type. */
+ typedef D3 result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of the conditional assignment c = b, which
+ * is performed only if a evaluates true.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ */
+ static void apply(
+ const D1 * __restrict__ const a,
+ const D2 * __restrict__ const b,
+ D3 * __restrict__ const c
+ ) {
+ if( static_cast< const bool >( *a ) ) {
+ *c = *b;
+ }
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const D1 * __restrict__ const a,
+ D3 * __restrict__ const c
+ ) {
+ if( static_cast< const bool >( *a ) ) {
+ *c = static_cast< D3 >( static_cast< D2 >( *c ) );
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ D3 * __restrict__ const c,
+ const D2 * __restrict__ const b
+ ) {
+ if( static_cast< bool >( *c ) ) {
+ *c = static_cast< D3 >( *b );
+ }
}
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const D1 * __restrict__ const a, D3 * __restrict__ const c ) {
- if( static_cast< const bool >( *a ) ) {
- *c = static_cast< D3 >( static_cast< D2 >( *c ) );
- }
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( D3 * __restrict__ const c, const D2 * __restrict__ const b ) {
- if( static_cast< bool >( *c ) ) {
- *c = static_cast< D3 >( *b );
- }
- }
+
};
/**
@@ -603,92 +685,102 @@ namespace grb {
* @tparam OUT The output data type.
*/
// [Example Base Operator Implementation]
- template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
class add {
- public:
- /** Alias to the left-hand input data type. */
- typedef IN1 left_type;
-
- /** Alias to the right-hand input data type. */
- typedef IN2 right_type;
-
- /** Alias to the output data type. */
- typedef OUT result_type;
-
- /** Whether this operator has an inplace foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an inplace foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = true;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = true;
-
- /**
- * Out-of-place application of the addition c = a + b.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- *
- * \warning Passing invalid pointers will result in UB.
- */
- static void apply( const left_type * __restrict__ const a,
- const right_type * __restrict__ const b,
- result_type * __restrict__ const c
- ) {
- GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // this is a (too) broad suppression--
- // see internal issue 306 for rationale
- *c = *a + *b;
- GRB_UTIL_RESTORE_WARNINGS
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- *
- * \warning Passing invalid pointers will result in UB.
- */
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // this is a (too) broad suppression--
- // see internal issue 306 for rationale
- *c += *a;
- GRB_UTIL_RESTORE_WARNINGS
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- *
- * \warning Passing invalid pointers will result in UB.
- */
- static void foldl(
- result_type * __restrict__ const c,
- const right_type * __restrict__ const b
- ) {
- GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // this is a (too) broad suppression--
- // see internal issue 306 for rationale
- *c += *b;
- GRB_UTIL_RESTORE_WARNINGS
- }
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of the addition c = a + b.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * \warning Passing invalid pointers will result in UB.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // this is a (too) broad suppression--
+ // see internal issue 306 for rationale
+ *c = *a + *b;
+ GRB_UTIL_RESTORE_WARNINGS
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ *
+ * \warning Passing invalid pointers will result in UB.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // this is a (too) broad suppression--
+ // see internal issue 306 for rationale
+ *c += *a;
+ GRB_UTIL_RESTORE_WARNINGS
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ *
+ * \warning Passing invalid pointers will result in UB.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // this is a (too) broad suppression--
+ // see internal issue 306 for rationale
+ *c += *b;
+ GRB_UTIL_RESTORE_WARNINGS
+ }
+
};
// [Example Base Operator Implementation]
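+
+ /*
+  * A minimal usage sketch of the example operator above, under the
+  * illustrative instantiation add< int, int, int >:
+  *
+  *   int a = 1, b = 2, c = 0;
+  *   add< int, int, int >::apply( &a, &b, &c ); // c == 3
+  *   add< int, int, int >::foldr( &a, &c );     // c == 4
+  *   add< int, int, int >::foldl( &c, &b );     // c == 6
+  *
+  * The __restrict__ qualifiers require that the pointer arguments of a
+  * single call do not alias; the in-place folds take the output as one of
+  * the two operands by construction, so only two distinct addresses are
+  * ever involved.
+  */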
@@ -707,1131 +799,1898 @@ namespace grb {
* explicit definition as a GraphBLAS operator with the #is_associative and
* #is_commutative fields, and others, set as required.
*
- * @tparam IN1 The left-hand input data type.
- * @tparam IN2 The right-hand input data type.
- * @tparam OUT The output data type.
+ * @tparam IN1 The left-hand input data type.
+ * @tparam IN2 The right-hand input data type.
+ * @tparam OUT The output data type.
+ */
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
+ class mul {
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of the multiplication c = a * b.
+ *
+ * @param[in] a Pointer to the left-hand side input. Must be initialised.
+ * @param[in] b Pointer to the right-hand side input. Must be initialised.
+ * @param[out] c Pointer to where to compute the output.
+ *
+ * \warning All pointers must be valid or UB occurs.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // this is a (too) broad suppression--
+ // see internal issue 306 for rationale
+ *c = *a * *b;
+ GRB_UTIL_RESTORE_WARNINGS
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ *c *= *a;
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ *c *= *b;
+ }
+
+ };
+
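+ /*
+  * A minimal mixed-domain usage sketch; mul< int, double, double > is an
+  * illustrative assumption:
+  *
+  *   int a = 3;
+  *   double b = 0.5, c;
+  *   mul< int, double, double >::apply( &a, &b, &c ); // c == 1.5
+  *
+  * The left_type, right_type, and result_type aliases expose exactly these
+  * three template domains to generic code.
+  */
+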
+ /**
+ * Standard max operator.
+ *
+ * Assumes native availability of < on the given data types, or assumes
+ * the relevant operators are properly overloaded.
+ *
+ * Non-standard or non-matching data types, or non-standard (overloaded) <
+ * operators, should be used with caution and may necessitate an explicit
+ * definition as a GraphBLAS operator with the #is_associative and
+ * #is_commutative fields, and others, set as required.
+ *
+ * @tparam IN1 The left-hand input data type.
+ * @tparam IN2 The right-hand input data type.
+ * @tparam OUT The output data type.
+ */
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
+ class max {
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of the max operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c = \max\{a,b\} \f$.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ if( *a < *b ) {
+ *c = static_cast< OUT >( *b );
+ } else {
+ *c = static_cast< OUT >( *a );
+ }
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ if( *a > *c ) {
+ *c = *a;
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ if( *b > *c ) {
+ *c = *b;
+ }
+ }
+
+ };
+
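+ /*
+  * A minimal sketch of an in-place reduction over the above operator,
+  * under the illustrative instantiation max< double, double, double >:
+  *
+  *   double data[ 3 ] = { 1.0, 4.0, 2.0 };
+  *   double running = data[ 0 ];
+  *   for( size_t i = 1; i < 3; ++i ) {
+  *       max< double, double, double >::foldl( &running, &data[ i ] );
+  *   }
+  *   // running == 4.0
+  */
+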
+ /**
+ * Standard min operator.
+ *
+ * Assumes native availability of > on the given data types, or assumes
+ * the relevant operators are properly overloaded.
+ *
+ * Non-standard or non-matching data types, or non-standard (overloaded) >
+ * operators, should be used with caution and may necessitate an explicit
+ * definition as a GraphBLAS operator with the #is_associative and
+ * #is_commutative fields, and others, set as required.
+ *
+ * @tparam IN1 The left-hand input data type.
+ * @tparam IN2 The right-hand input data type.
+ * @tparam OUT The output data type.
+ */
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
+ class min {
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of the min operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c = \min\{a,b\} \f$.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ if( *a > *b ) {
+ *c = static_cast< OUT >( *b );
+ } else {
+ *c = static_cast< OUT >( *a );
+ }
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ if( *a < *c ) {
+ *c = *a;
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ if( *b < *c ) {
+ *c = *b;
+ }
+ }
+
+ };
+
+ /**
+ * Standard numerical subtraction operator.
+ *
+ * Assumes native availability of - on the given data types, or assumes
+ * that the relevant operators are properly overloaded.
+ *
+ * Non-standard or non-matching data types, or non-standard (overloaded) -
+ * operators, should be used with caution and may necessitate an explicit
+ * definition as a GraphBLAS operator with the #is_associative and
+ * #is_commutative fields, and others, set as required.
+ *
+ * @tparam IN1 The left-hand input data type.
+ * @tparam IN2 The right-hand input data type.
+ * @tparam OUT The output data type.
+ */
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
+ class substract {
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = false;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = false;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c = a - b \f$.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ *c = *a - *b;
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ *c = *a - *c;
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ *c -= *b;
+ }
+
+ };
+
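+ /*
+  * Because subtraction is not commutative, the fold direction matters. A
+  * minimal sketch, under the illustrative instantiation
+  * substract< int, int, int >:
+  *
+  *   int a = 10, c = 3;
+  *   substract< int, int, int >::foldr( &a, &c ); // c == a - c == 7
+  *
+  *   int d = 10, b = 3;
+  *   substract< int, int, int >::foldl( &d, &b ); // d == d - b == 7
+  *
+  * foldr hence takes its first argument as the minuend, while foldl takes
+  * the in-place argument as the minuend.
+  */
+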
+ /**
+ * Standard numerical division operator.
+ *
+ * Assumes native availability of / on the given data types, or assumes
+ * that the relevant operators are properly overloaded.
+ *
+ * Non-standard or non-matching data types, or non-standard (overloaded) /
+ * operators, should be used with caution and may necessitate an explicit
+ * definition as a GraphBLAS operator with the #is_associative and
+ * #is_commutative fields, and others, set as required.
+ *
+ * @tparam IN1 The left-hand input data type.
+ * @tparam IN2 The right-hand input data type.
+ * @tparam OUT The output data type.
+ */
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
+ class divide {
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = false;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = false;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c = a/b \f$.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ *c = *a / *b;
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ *c = *a / *c;
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ *c /= *b;
+ }
+
+ };
+
+ /**
+ * Non-standard numerical division where the inputs are switched.
+ *
+ * I.e., if the left input is \f$ l \f$ and the right input is \f$ r \f$,
+ * then this operator computes \f$ r / l \f$.
+ *
+ * Assumes native availability of / on the given data types, or assumes
+ * that the relevant operators are properly overloaded.
+ *
+ * Non-standard or non-matching data types, or non-standard (overloaded) /
+ * operators, should be used with caution and may necessitate an explicit
+ * definition as a GraphBLAS operator with the #is_associative and
+ * #is_commutative fields, and others, set as required.
+ *
+ * @tparam IN1 The left-hand input data type.
+ * @tparam IN2 The right-hand input data type.
+ * @tparam OUT The output data type.
+ */
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
+ class divide_reverse {
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = false;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = false;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c = b/a \f$.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ *c = *b / *a;
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ *c /= *a;
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ *c = *b / *c;
+ }
+
+ };
+
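+ /*
+  * A minimal sketch contrasting the reversed fold directions, under the
+  * illustrative instantiation divide_reverse< double, double, double >:
+  *
+  *   double a = 2.0, c = 8.0;
+  *   divide_reverse< double, double, double >::foldr( &a, &c );
+  *   // c == op( a, c ) == c / a == 4.0
+  *
+  *   double d = 2.0, b = 8.0;
+  *   divide_reverse< double, double, double >::foldl( &d, &b );
+  *   // d == op( d, b ) == b / d == 4.0
+  */
+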
+ /**
+ * The equals operator.
+ *
+ * Assumes that the == operator for the given input types is defined.
+ */
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
+ class equal {
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c \f$ will be set to
+ * static_cast< OUT >( true ) if \f$ a == b \f$, and to
+ * static_cast< OUT >( false ) otherwise.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ if( *a == *b ) {
+ *c = static_cast< OUT >( true );
+ } else {
+ *c = static_cast< OUT >( false );
+ }
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ if( *a == *c ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ if( *b == *c ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
+ }
+
+ };
+
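+ /*
+  * The output domain need not be bool. A minimal sketch, under the
+  * illustrative instantiation equal< double, double, int >:
+  *
+  *   double a = 1.0, b = 1.0;
+  *   int c;
+  *   equal< double, double, int >::apply( &a, &b, &c ); // c == 1
+  *
+  * The result is static_cast< OUT >( true ) or static_cast< OUT >( false ),
+  * i.e., 1 or 0 for integral output types.
+  */
+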
+ /**
+ * Standard not-equals operator.
+ *
+ * Assumes that the != operator is defined on the given input types.
+ */
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
+ class not_equal {
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c \f$ will be set to
+ * static_cast< OUT >( true ) if \f$ a \neq b \f$, and to
+ * static_cast< OUT >( false ) otherwise.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // this is a (too) broad suppression--
+ // see internal issue 306 for rationale
+ if( *a != *b ) {
+ *c = static_cast< OUT >( true );
+ } else {
+ *c = static_cast< OUT >( false );
+ }
+ GRB_UTIL_RESTORE_WARNINGS
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ if( *a != *c ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ if( *b != *c ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
+ }
+
+ };
+
+ /**
+ * A non-standard operator that returns any input that evaluates to
+ * true when cast to a bool, \em or, in case no input
+ * evaluates true, returns any input.
+ *
+ * In case the input and output types are bool, this operator
+ * corresponds to the classical logical or.
+ */
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
+ class any_or {
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c \f$ will be set to \f$ a \f$ if
+ * \f$ a \f$ evaluates true, to \f$ b \f$ if only \f$ b \f$ evaluates
+ * true, and to \f$ a \f$ otherwise.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ if( *a ) {
+ *c = static_cast< OUT >( *a );
+ } else if( *b ) {
+ *c = static_cast< OUT >( *b );
+ } else {
+ assert( !( *a ) );
+ *c = static_cast< OUT >( *a );
+ }
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ if( *a ) {
+ *c = static_cast< result_type >( *a );
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ if( *b ) {
+ *c = static_cast< result_type >( *b );
+ }
+ }
+
+ };
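+
+ /*
+  * Illustrative sketch with hypothetical operands: any_or propagates the
+  * first input that evaluates true, so on non-bool types it keeps the
+  * original value instead of collapsing it to a boolean:
+  *
+  *   int a = 0, b = 7, c;
+  *   grb::operators::any_or< int, int, int >::apply( &a, &b, &c );
+  *   // c now holds 7, the only input that evaluates true
+  */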
+
+ /**
+ * The logical-or operator, \f$ x \lor y \f$.
+ *
+ * Assumes that the || operator is defined on the given input types.
+ */
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
+ class logical_or {
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c = a \lor b \f$.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ if( *a || *b ) {
+ *c = static_cast< OUT >( true );
+ } else {
+ *c = static_cast< OUT >( false );
+ }
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ if( *a || *c ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ if( *b || *c ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
+ }
+
+ };
+
+ /**
+ * The logical-and operator, \f$ x \land y \f$.
+ *
+ * Assumes that the && operator is defined for the given input types.
+ */
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
+ class logical_and {
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c = a \land b \f$.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ if( *a && *b ) {
+ *c = static_cast< OUT >( true );
+ } else {
+ *c = static_cast< OUT >( false );
+ }
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ if( *a && *c ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // this is a (too) broad suppression--
+ // see docs/Suppressions.md
+ if( *b && *c ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
+ GRB_UTIL_RESTORE_WARNINGS
+ }
+
+ };
+
+ /**
+ * Absolute difference operator, \f$ |x-y| \f$.
+ *
+ * Assumes that the - and < operators are defined for the given input
+ * types.
+ */
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
+ class abs_diff {
+
+ public:
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = false;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c = |a-b| \f$.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ if( *a < *b ) {
+ *c = static_cast< OUT >( *b - *a );
+ } else {
+ *c = static_cast< OUT >( *a - *b );
+ }
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ if( *a < *c ) {
+ *c -= *a;
+ } else {
+ *c = static_cast< OUT >( *a - *c );
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ if( *b < *c ) {
+ *c -= *b;
+ } else {
+ *c = static_cast< OUT >( *b - *c );
+ }
+ }
+
+ };
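+
+ /*
+  * Illustrative sketch with hypothetical operands: the in-place foldl
+  * overwrites its first argument with the absolute difference:
+  *
+  *   double c = 2.5;
+  *   const double b = 4.0;
+  *   grb::operators::abs_diff< double, double, double >::foldl( &c, &b );
+  *   // c now holds 1.5, i.e., |2.5 - 4.0|
+  */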
+
+ /**
+ * The ReLU operator as commonly used in machine learning, here
+ * interpreted as a binary operator.
+ *
+ * The inputs to this binary function are assumed to be the threshold value
+ * and the input signal.
+ *
+ * ReLU is in fact functionally equal to #grb::operators::max.
+ */
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
+ class relu {
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c = ReLU\{a,b\} = \begin{cases}
+ * a \text{, if } a>b \\
+ * b \text{, otherwise}
+ * \end{cases}\f$.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ if( *a < *b ) {
+ *c = static_cast< OUT >( *b );
+ } else {
+ *c = static_cast< OUT >( *a );
+ }
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ if( *a > *c ) {
+ *c = *a;
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ if( *b > *c ) {
+ *c = *b;
+ }
+ }
+
+ };
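+
+ /*
+  * Illustrative sketch with hypothetical operands: with the threshold as
+  * left input, apply computes the classical ReLU of the signal:
+  *
+  *   double threshold = 0.0, signal = -3.1, out;
+  *   grb::operators::relu< double, double, double >::apply(
+  *       &threshold, &signal, &out );
+  *   // out now holds 0.0, i.e., max( 0.0, -3.1 )
+  */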
+
+ /**
+ * Square difference operator: \f$ (x-y)^2 \f$.
+ *
+ * Assumes that the - and * operators are defined on the given input types.
+ */
+ template<
+ typename D1, typename D2, typename D3,
+ enum Backend implementation = config::default_backend
+ >
+ class square_diff {
+
+ public:
+
+ typedef D1 left_type;
+ typedef D2 right_type;
+ typedef D3 result_type;
+
+ static constexpr bool has_foldl = true;
+ static constexpr bool has_foldr = true;
+ static constexpr bool is_associative = false;
+ static constexpr bool is_commutative = true;
+
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ *c = ( *a - *b ) * ( *a - *b );
+ }
+
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ *c = ( *a - *c ) * ( *a - *c );
+ }
+
+ static void foldl(
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ *c = ( *c - *b ) * ( *c - *b );
+ }
+
+ };
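+
+ /*
+  * Illustrative sketch with hypothetical operands. Note that, unlike the
+  * operators above, this foldl takes the read-only operand first:
+  *
+  *   double b = 1.0, c = 4.0;
+  *   grb::operators::square_diff< double, double, double >::foldl( &b, &c );
+  *   // c now holds 9.0, i.e., ( 4.0 - 1.0 ) squared
+  */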
+
+ /**
+ * Zips two inputs into a pair.
+ *
+ * @tparam IN1 Left operand type.
+ * @tparam IN2 Right operand type.
+ *
+ * The result type is fixed at std::pair< IN1, IN2 >.
+ *
+ * May be used together with argmin and other operators defined on pairs.
+ */
+ template<
+ typename IN1, typename IN2,
+ enum Backend implementation = config::default_backend
+ >
+ class zip {
+
+ public:
+
+ typedef IN1 left_type;
+ typedef IN2 right_type;
+ typedef std::pair< IN1, IN2 > result_type;
+
+ static constexpr bool has_foldl = false;
+ static constexpr bool has_foldr = false;
+ static constexpr bool is_associative = false;
+ static constexpr bool is_commutative = false;
+
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ *c = std::make_pair( *a, *b );
+ }
+
+ };
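+
+ /*
+  * Illustrative sketch with hypothetical operands: zip packs an index and
+  * a value into a single pair element, e.g. as input to reductions over
+  * pairs such as argmin:
+  *
+  *   size_t i = 3; double v = 1.5;
+  *   std::pair< size_t, double > p;
+  *   grb::operators::zip< size_t, double >::apply( &i, &v, &p );
+  *   // p now holds ( 3, 1.5 )
+  */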
+
+ /**
+ * Whether the first elements of two given pairs compare equal.
+ */
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
+ class equal_first {
+
+ public:
+
+ typedef IN1 left_type;
+ typedef IN2 right_type;
+ typedef OUT result_type;
+
+ static constexpr bool has_foldl = false;
+ static constexpr bool has_foldr = false;
+ static constexpr bool is_associative = false;
+ static constexpr bool is_commutative = false;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c \f$ will be set to
+ * static_cast< OUT >( true ) if the first elements of \a a and \a b
+ * compare equal, and to static_cast< OUT >( false ) otherwise.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ if( a->first == b->first ) {
+ *c = static_cast< OUT >( true );
+ } else {
+ *c = static_cast< OUT >( false );
+ }
+ }
+
+ };
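+
+ /*
+  * Illustrative sketch with hypothetical operands: equal_first compares
+  * pair keys only, ignoring the values:
+  *
+  *   std::pair< size_t, double > a( 3, 1.5 ), b( 3, 2.5 );
+  *   bool c;
+  *   grb::operators::equal_first<
+  *       std::pair< size_t, double >, std::pair< size_t, double >, bool
+  *   >::apply( &a, &b, &c );
+  *   // c now holds true: the keys match even though the values differ
+  */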
+
+ /**
+ * The less-than operator.
+ *
+ * Assumes that the < operator for the given input types is defined.
*/
template<
typename IN1, typename IN2, typename OUT,
enum Backend implementation = config::default_backend
>
- class mul {
-
- public:
-
- /** Alias to the left-hand input data type. */
- typedef IN1 left_type;
-
- /** Alias to the right-hand input data type. */
- typedef IN2 right_type;
-
- /** Alias to the output data type. */
- typedef OUT result_type;
-
- /** Whether this operator has an in-place foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an in-place foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = true;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = true;
-
- /**
- * Out-of-place application of the multiplication c = a * b.
- *
- * @param[in] a Pointer to the left-hand side input. Must be initialised.
- * @param[in] b Pointer to the right-hand side input. Must be initialised.
- * @param[out] c Pointer to where to compute the output.
- *
- * \warning All pointers must be valid or UB occurs.
- */
- static void apply(
+ class lt {
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = false;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = false;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c \f$ will be set to
+ * static_cast< OUT >( true ) if \f$ a < b \f$, and to
+ * static_cast< OUT >( false ) otherwise.
+ */
+ static void apply(
const left_type * __restrict__ const a,
const right_type * __restrict__ const b,
result_type * __restrict__ const c
- ) {
- GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // this is a (too) broad suppression--
- // see internal issue 306 for rationale
- *c = *a * *b;
- GRB_UTIL_RESTORE_WARNINGS
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- *c *= *a;
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
- *c *= *b;
- }
- };
+ ) {
+ if( *a < *b ) {
+ *c = static_cast< OUT >( true );
+ } else {
+ *c = static_cast< OUT >( false );
+ }
+ }
- /**
- * Standard max operator.
- *
- * Assumes native availability of < on the given data types, or assumes
- * the relevant operators are properly overloaded.
- *
- * Non-standard or non-matching data types, or non-standard (overloaded) <
- * operators, should be used with caution and may necessitate an explicit
- * definition as a GraphBLAS operator with the #is_associative and
- * #is_commutative fields, and others, set as required.
- *
- * @tparam IN1 The left-hand input data type.
- * @tparam IN2 The right-hand input data type.
- * @tparam OUT The output data type.
- */
- template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
- class max {
- public:
- /** Alias to the left-hand input data type. */
- typedef IN1 left_type;
-
- /** Alias to the right-hand input data type. */
- typedef IN2 right_type;
-
- /** Alias to the output data type. */
- typedef OUT result_type;
-
- /** Whether this operator has an in-place foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an in-place foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = true;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = true;
-
- /**
- * Out-of-place application of the max operator.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- *
- * At the end of the operation, \f$ c = \max\{a,b\} \f$.
- */
- static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
- if( *a < *b ) {
- *c = static_cast< OUT >( *b );
- } else {
- *c = static_cast< OUT >( *a );
- }
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- if( *a > *c ) {
- *c = *a;
- }
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
- if( *b > *c ) {
- *c = *b;
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ if( *a < *c ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ if( *b < *c ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
}
- }
+
};
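+
+ /*
+  * Illustrative sketch with hypothetical operands: lt casts the result of
+  * the comparison to the requested output type:
+  *
+  *   double a = 1.0, b = 2.0;
+  *   int c;
+  *   grb::operators::lt< double, double, int >::apply( &a, &b, &c );
+  *   // c now holds 1, since a < b
+  */
+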
/**
- * Standard min operator.
- *
- * Assumes native availability of > on the given data types, or assumes
- * the relevant operators are properly overloaded.
- *
- * Non-standard or non-matching data types, or non-standard (overloaded) >
- * operators, should be used with caution and may necessitate an explicit
- * definition as a GraphBLAS operator with the #is_associative and
- * #is_commutative fields, and others, set as required.
+ * The greater-than operator.
*
- * @tparam IN1 The left-hand input data type.
- * @tparam IN2 The right-hand input data type.
- * @tparam OUT The output data type.
+ * Assumes that the > operator for the given input types is defined.
*/
- template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
- class min {
- public:
- /** Alias to the left-hand input data type. */
- typedef IN1 left_type;
-
- /** Alias to the right-hand input data type. */
- typedef IN2 right_type;
-
- /** Alias to the output data type. */
- typedef OUT result_type;
-
- /** Whether this operator has an in-place foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an in-place foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = true;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = true;
-
- /**
- * Out-of-place application of the min operator.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- *
- * At the end of the operation, \f$ c = \min\{a,b\} \f$.
- */
- static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
- if( *a > *b ) {
- *c = static_cast< OUT >( *b );
- } else {
- *c = static_cast< OUT >( *a );
- }
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- if( *a < *c ) {
- *c = *a;
- }
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
- if( *b < *c ) {
- *c = *b;
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
+ class gt {
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = false;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = false;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c \f$ will be set to
+ * static_cast< OUT >( true ) if \f$ a > b \f$, and to
+ * static_cast< OUT >( false ) otherwise.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ if( *a > *b ) {
+ *c = static_cast< OUT >( true );
+ } else {
+ *c = static_cast< OUT >( false );
+ }
}
- }
- };
-
- /** \todo add documentation */
- template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
- class substract {
- public:
- /** Alias to the left-hand input data type. */
- typedef IN1 left_type;
-
- /** Alias to the right-hand input data type. */
- typedef IN2 right_type;
-
- /** Alias to the output data type. */
- typedef OUT result_type;
-
- /** Whether this operator has an in-place foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an in-place foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = false;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = false;
-
- /**
- * Out-of-place application of this operator.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- *
- * At the end of the operation, \f$ c = \min\{a,b\} \f$.
- */
- static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
- *c = *a - *b;
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- *c = *a - *c;
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
- *c -= *b;
- }
- };
-
- /** \todo add documentation */
- template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
- class divide {
- public:
- /** Alias to the left-hand input data type. */
- typedef IN1 left_type;
-
- /** Alias to the right-hand input data type. */
- typedef IN2 right_type;
-
- /** Alias to the output data type. */
- typedef OUT result_type;
-
- /** Whether this operator has an in-place foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an in-place foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = false;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = false;
-
- /**
- * Out-of-place application of this operator.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- *
- * At the end of the operation, \f$ c = a/b \f$.
- */
- static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
- *c = *a / *b;
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- *c = *a / *c;
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
- *c /= *b;
- }
- };
-
- /** \todo add documentation */
- template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
- class divide_reverse {
- public:
- /** Alias to the left-hand input data type. */
- typedef IN1 left_type;
-
- /** Alias to the right-hand input data type. */
- typedef IN2 right_type;
-
- /** Alias to the output data type. */
- typedef OUT result_type;
-
- /** Whether this operator has an in-place foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an in-place foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = false;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = false;
-
- /**
- * Out-of-place application of this operator.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- *
- * At the end of the operation, \f$ c = b/a \f$.
- */
- static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
- *c = *b / *a;
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- *c /= *a;
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
- *c = *b / *c;
- }
- };
-
- /** \todo add documentation */
- template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
- class equal {
- public:
- /** Alias to the left-hand input data type. */
- typedef IN1 left_type;
-
- /** Alias to the right-hand input data type. */
- typedef IN2 right_type;
-
- /** Alias to the output data type. */
- typedef OUT result_type;
-
- /** Whether this operator has an in-place foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an in-place foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = true;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = true;
-
- /**
- * Out-of-place application of this operator.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- *
- * At the end of the operation, \f$ c = \min\{a,b\} \f$.
- */
- static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
- if( *a == *b ) {
- *c = static_cast< OUT >( true );
- } else {
- *c = static_cast< OUT >( false );
- }
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- if( *a == *c ) {
- *c = static_cast< result_type >( true );
- } else {
- *c = static_cast< result_type >( false );
- }
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
- if( *b == *c ) {
- *c = static_cast< result_type >( true );
- } else {
- *c = static_cast< result_type >( false );
- }
- }
- };
-
- /** \todo add documentation */
- template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
- class not_equal {
- public:
- /** Alias to the left-hand input data type. */
- typedef IN1 left_type;
-
- /** Alias to the right-hand input data type. */
- typedef IN2 right_type;
-
- /** Alias to the output data type. */
- typedef OUT result_type;
-
- /** Whether this operator has an in-place foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an in-place foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = true;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = true;
-
- /**
- * Out-of-place application of this operator.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- *
- * At the end of the operation, \f$ c = \min\{a,b\} \f$.
- */
- static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
- GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // this is a (too) broad suppression--
- // see internal issue 306 for rationale
- if( *a != *b ) {
- *c = static_cast< OUT >( true );
- } else {
- *c = static_cast< OUT >( false );
- }
- GRB_UTIL_RESTORE_WARNINGS
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- if( *a != *c ) {
- *c = static_cast< result_type >( true );
- } else {
- *c = static_cast< result_type >( false );
- }
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
- if( *b != *c ) {
- *c = static_cast< result_type >( true );
- } else {
- *c = static_cast< result_type >( false );
- }
- }
- };
- /** \todo add documentation */
- template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
- class any_or {
- public:
- /** Alias to the left-hand input data type. */
- typedef IN1 left_type;
-
- /** Alias to the right-hand input data type. */
- typedef IN2 right_type;
-
- /** Alias to the output data type. */
- typedef OUT result_type;
-
- /** Whether this operator has an in-place foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an in-place foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = true;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = true;
-
- /**
- * Out-of-place application of this operator.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- *
- * At the end of the operation, \f$ c = \min\{a,b\} \f$.
- */
- static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
- if( *a ) {
- *c = static_cast< OUT >( *a );
- } else if( *b ) {
- *c = static_cast< OUT >( *b );
- } else {
- assert( ! ( *a ) );
- *c = static_cast< OUT >( *a );
- }
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- if( *a ) {
- *c = static_cast< result_type >( *a );
- }
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
- if( *b ) {
- *c = static_cast< result_type >( *b );
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ if( *a > *c ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
}
- }
- };
- /** \todo add documentation */
- template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
- class logical_or {
- public:
- /** Alias to the left-hand input data type. */
- typedef IN1 left_type;
-
- /** Alias to the right-hand input data type. */
- typedef IN2 right_type;
-
- /** Alias to the output data type. */
- typedef OUT result_type;
-
- /** Whether this operator has an in-place foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an in-place foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = true;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = true;
-
- /**
- * Out-of-place application of this operator.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- *
- * At the end of the operation, \f$ c = \min\{a,b\} \f$.
- */
- static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
- printf( "Hello from mul\n" );
- if( *a || *b ) {
- *c = static_cast< OUT >( true );
- } else {
- *c = static_cast< OUT >( false );
- }
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- if( *a || *c ) {
- *c = static_cast< result_type >( true );
- } else {
- *c = static_cast< result_type >( false );
- }
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
- if( *b || *c ) {
- *c = static_cast< result_type >( true );
- } else {
- *c = static_cast< result_type >( false );
- }
- }
- };
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ if( *b > *c ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
+ }
- /** \todo add documentation */
- template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
- class logical_and {
- public:
- /** Alias to the left-hand input data type. */
- typedef IN1 left_type;
-
- /** Alias to the right-hand input data type. */
- typedef IN2 right_type;
-
- /** Alias to the output data type. */
- typedef OUT result_type;
-
- /** Whether this operator has an in-place foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an in-place foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = true;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = true;
-
- /**
- * Out-of-place application of this operator.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- *
- * At the end of the operation, \f$ c = \min\{a,b\} \f$.
- */
- static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
- if( *a && *b ) {
- *c = static_cast< OUT >( true );
- } else {
- *c = static_cast< OUT >( false );
- }
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- if( *a && *c ) {
- *c = static_cast< result_type >( true );
- } else {
- *c = static_cast< result_type >( false );
- }
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
- if( *b && *c ) {
- *c = static_cast< result_type >( true );
- } else {
- *c = static_cast< result_type >( false );
- }
- }
};
- /** \todo add documentation */
- template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
- class abs_diff {
+ /**
+ * The less-than-or-equal operator.
+ *
+ * Assumes that the <= operator for the given input types is defined.
+ */
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
+ class leq {
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = false;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = false;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c \f$ will be set to
+ * static_cast< OUT >( true ) if \f$ a \leq b \f$, and to
+ * static_cast< OUT >( false ) otherwise.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ if( *a <= *b ) {
+ *c = static_cast< OUT >( true );
+ } else {
+ *c = static_cast< OUT >( false );
+ }
+ }
- public:
- /** Alias to the left-hand input data type. */
- typedef IN1 left_type;
-
- /** Alias to the right-hand input data type. */
- typedef IN2 right_type;
-
- /** Alias to the output data type. */
- typedef OUT result_type;
-
- /** Whether this operator has an in-place foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an in-place foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = false;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = true;
-
- /**
- * Out-of-place application of this operator.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- *
- * At the end of the operation, \f$ c = \min\{a,b\} \f$.
- */
- static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
- if( *a < *b ) {
- *c = static_cast< OUT >( *b - *a );
- } else {
- *c = static_cast< OUT >( *a - *b );
- }
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- if( *a < *c ) {
- *c -= *a;
- } else {
- *c = static_cast< OUT >( *a - *c );
- }
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
- if( *b < *c ) {
- *c -= *b;
- } else {
- *c = static_cast< OUT >( *b - *c );
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ if( *a <= *c ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
}
- }
- };
- /** \todo add documentation */
- template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
- class relu {
- public:
- /** Alias to the left-hand input data type. */
- typedef IN1 left_type;
-
- /** Alias to the right-hand input data type. */
- typedef IN2 right_type;
-
- /** Alias to the output data type. */
- typedef OUT result_type;
-
- /** Whether this operator has an in-place foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an in-place foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = true;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = true;
-
- /**
- * Out-of-place application of this operator.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- *
- * At the end of the operation, \f$ c = ReLU\{a,b\} = \begin{cases}
- * a \text{, if } a>b \\
- * b \text{, otherwise}
- * \end{cases}\f$.
- */
- static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
- if( *a < *b ) {
- *c = static_cast< OUT >( *b );
- } else {
- *c = static_cast< OUT >( *a );
- }
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- if( *a > *c ) {
- *c = *a;
- }
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
- if( *b > *c ) {
- *c = *b;
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ if( *c <= *b ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
}
- }
- };
- template< typename D1, typename D2, typename D3, enum Backend implementation = config::default_backend >
- class square_diff {
- public:
- typedef D1 left_type;
- typedef D2 right_type;
- typedef D3 result_type;
-
- static constexpr bool has_foldl = true;
- static constexpr bool has_foldr = true;
- static constexpr bool is_associative = false;
- static constexpr bool is_commutative = true;
-
- static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
- *c = ( *a - *b ) * ( *a - *b );
- }
-
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- *c = ( *a - *c ) * ( *a - *c );
- }
-
- static void foldl( const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
- *c = ( *c - *b ) * ( *c - *b );
- }
};
/**
- * left operand of type IN1,
- * right operand of type IN2
- * result of type std::pair< IN1, IN2 >
+ * The greater-than-or-equal operator.
*
- * for use together with argmin
+ * Assumes that the >= operator for the given input types is defined.
*/
- template< typename IN1, typename IN2, enum Backend implementation = config::default_backend >
- class zip {
- public:
- typedef IN1 left_type;
- typedef IN2 right_type;
- typedef std::pair< IN1, IN2 > result_type;
-
- static constexpr bool has_foldl = false;
- static constexpr bool has_foldr = false;
- static constexpr bool is_associative = false;
- static constexpr bool is_commutative = false;
-
- static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
- *c = std::make_pair( *a, *b );
- }
- };
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
+ class geq {
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = false;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = false;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c \f$ will be set to
+ * static_cast< OUT >( true ) if \f$ a \geq b \f$, and to
+ * static_cast< OUT >( false ) otherwise.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ if( *a >= *b ) {
+ *c = static_cast< OUT >( true );
+ } else {
+ *c = static_cast< OUT >( false );
+ }
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ if( *a >= *c ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ if( *c >= *b ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
+ }
- /**
- * compares the first argument of a pair
- */
- template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
- class equal_first {
- public:
- typedef IN1 left_type;
-
- typedef IN2 right_type;
-
- typedef OUT result_type;
-
- static constexpr bool has_foldl = false;
- static constexpr bool has_foldr = false;
- static constexpr bool is_associative = false;
- static constexpr bool is_commutative = false;
-
- /**
- * Out-of-place application of this operator.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- *
- * At the end of the operation, \f$ c = a->first == b->first \f$.
- */
- static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
- if( a->first == b->first ) {
- *c = static_cast< OUT >( true );
- } else {
- *c = static_cast< OUT >( false );
- }
- }
};
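
For reference, the semantics the new geq operator encodes can be exercised in isolation. The following is a minimal standalone sketch, not the library class: geq_sketch and the main() harness are hypothetical, and it assumes the documented fold convention that foldr computes \f$ c = (a \geq c) \f$ while foldl computes \f$ c = (c \geq b) \f$.

	#include <cassert>

	// Hypothetical stand-in mirroring the geq semantics added above.
	template< typename IN1, typename IN2, typename OUT >
	struct geq_sketch {
		static void apply( const IN1 &a, const IN2 &b, OUT &c ) {
			c = static_cast< OUT >( a >= b );
		}
		static void foldr( const IN1 &a, OUT &c ) {
			// c doubles as the right operand: c := ( a >= c )
			c = static_cast< OUT >( a >= static_cast< IN2 >( c ) );
		}
		static void foldl( OUT &c, const IN2 &b ) {
			// c doubles as the left operand: c := ( c >= b )
			c = static_cast< OUT >( static_cast< IN1 >( c ) >= b );
		}
	};

	int main() {
		bool c = false;
		geq_sketch< double, double, bool >::apply( 3.0, 2.0, c );
		assert( c );                                          // 3.0 >= 2.0
		geq_sketch< double, double, bool >::foldl( c, 2.0 );
		assert( !c );                                         // 1.0 >= 2.0 fails
		return 0;
	}

Because geq is neither associative nor commutative, folding through it is order-sensitive, which is why both trait flags above are set to false.
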
/**
@@ -1847,67 +2706,75 @@ namespace grb {
template< typename OP, enum Backend implementation = config::default_backend >
class OperatorBase {
- protected:
- /** The block size that should be used during map-like operations. */
- static constexpr size_t blocksize = grb::utils::static_min( grb::config::SIMD_BLOCKSIZE< typename OP::left_type >::value(),
- grb::utils::static_min( grb::config::SIMD_BLOCKSIZE< typename OP::right_type >::value(), grb::config::SIMD_BLOCKSIZE< typename OP::result_type >::value() ) );
-
- /** The left-hand side input domain. */
- typedef typename OP::left_type D1;
-
- /** The right-hand side input domain. */
- typedef typename OP::right_type D2;
-
- /** The output domain. */
- typedef typename OP::result_type D3;
-
- public:
- /** @return Whether this operator is mathematically associative. */
- static constexpr bool is_associative() {
- return OP::is_associative;
- }
-
- /** @return Whether this operator is mathematically commutative. */
- static constexpr bool is_commutative() {
- return OP::is_commutative;
- }
-
- /**
- * Straightforward application of this operator. Computes \f$ x \odot y \f$
- * and stores the result in \a z.
- *
- * @tparam InputType1 The type of the input parameter \a x.
- * @tparam InputType2 The type of the input parameter \a y.
- * @tparam OutputType The type of the output parameter \a z.
- *
- * \warning If \a InputType1 does not match \a D! \em or \a InputType2 does
- * not match \a D2 \em or \a OutputType does not match \a D3, then
- * the input will be cast into temporary variables of the correct
- * types, while the output will be cast from a temporary variable,
- *
- * \note Best performance is thus only guaranteed when all domains match.
- *
- * @param[in] x The left-hand side input.
- * @param[in] y The right-hand side input.
- * @param[out] z The output element.
- */
- template< typename InputType1, typename InputType2, typename OutputType >
- static void apply( const InputType1 & x, const InputType2 & y, OutputType & z ) {
- const D1 a = static_cast< D1 >( x );
- const D2 b = static_cast< D2 >( y );
- D3 temp;
- OP::apply( &a, &b, &temp );
- z = static_cast< OutputType >( temp );
- }
-
- /**
- * This is the high-performance version of apply() in the sense that no
- * casting is required. This version will be automatically caled whenever
- * possible.
- */
- static void apply( const D1 & x, const D2 & y, D3 & out ) {
- OP::apply( &x, &y, &out );
- }
+ protected:
+
+ /** The block size that should be used during map-like operations. */
+ static constexpr size_t blocksize = grb::utils::static_min(
+ grb::config::SIMD_BLOCKSIZE< typename OP::left_type >::value(),
+ grb::utils::static_min(
+ grb::config::SIMD_BLOCKSIZE< typename OP::right_type >::value(),
+ grb::config::SIMD_BLOCKSIZE< typename OP::result_type >::value()
+ )
+ );
+
+ /** The left-hand side input domain. */
+ typedef typename OP::left_type D1;
+
+ /** The right-hand side input domain. */
+ typedef typename OP::right_type D2;
+
+ /** The output domain. */
+ typedef typename OP::result_type D3;
+
+ public:
+
+ /** @return Whether this operator is mathematically associative. */
+ static constexpr bool is_associative() {
+ return OP::is_associative;
+ }
+
+ /** @return Whether this operator is mathematically commutative. */
+ static constexpr bool is_commutative() {
+ return OP::is_commutative;
+ }
+
+ /**
+ * Straightforward application of this operator. Computes \f$ x \odot y \f$
+ * and stores the result in \a z.
+ *
+ * @tparam InputType1 The type of the input parameter \a x.
+ * @tparam InputType2 The type of the input parameter \a y.
+ * @tparam OutputType The type of the output parameter \a z.
+ *
+ * \warning If \a InputType1 does not match \a D1 \em or \a InputType2 does
+ * not match \a D2 \em or \a OutputType does not match \a D3, then
+ * the input will be cast into temporary variables of the correct
+ * types, while the output will be cast from a temporary variable.
+ *
+ * \note Best performance is thus only guaranteed when all domains match.
+ *
+ * @param[in] x The left-hand side input.
+ * @param[in] y The right-hand side input.
+ * @param[out] z The output element.
+ */
+ template< typename InputType1, typename InputType2, typename OutputType >
+ static void apply( const InputType1 & x, const InputType2 & y, OutputType & z ) {
+ const D1 a = static_cast< D1 >( x );
+ const D2 b = static_cast< D2 >( y );
+ D3 temp;
+ OP::apply( &a, &b, &temp );
+ z = static_cast< OutputType >( temp );
+ }
+
+ /**
+ * This is the high-performance version of apply() in the sense that no
+ * casting is required. This version will be automatically called whenever
+ * possible.
+ */
+ static void apply( const D1 & x, const D2 & y, D3 & out ) {
+ OP::apply( &x, &y, &out );
+ }
+
};
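
The two apply overloads are easiest to see side by side in a self-contained sketch; add_raw and base_sketch below are illustrative names, not library types. The non-template overload is an exact match whenever the arguments are already in the operator's domains and thus wins overload resolution, skipping all temporaries.

	#include <cassert>

	// Illustrative raw operator with fixed domains: int x int -> long.
	struct add_raw {
		typedef int  left_type;
		typedef int  right_type;
		typedef long result_type;
		static void apply( const int *a, const int *b, long *c ) {
			*c = static_cast< long >( *a ) + *b;
		}
	};

	template< typename OP >
	struct base_sketch {
		typedef typename OP::left_type   D1;
		typedef typename OP::right_type  D2;
		typedef typename OP::result_type D3;

		// mismatched domains: cast in, apply, cast out
		template< typename In1, typename In2, typename Out >
		static void apply( const In1 &x, const In2 &y, Out &z ) {
			const D1 a = static_cast< D1 >( x );
			const D2 b = static_cast< D2 >( y );
			D3 tmp;
			OP::apply( &a, &b, &tmp );
			z = static_cast< Out >( tmp );
		}

		// exact domains: forwards pointers directly, no temporaries
		static void apply( const D1 &x, const D2 &y, D3 &z ) {
			OP::apply( &x, &y, &z );
		}
	};

	int main() {
		double z = 0.0;
		base_sketch< add_raw >::apply( 1.5, 2.5, z ); // casts truncate: 1 + 2
		assert( z == 3.0 );
		long w = 0;
		base_sketch< add_raw >::apply( 1, 2, w );     // exact-domain fast path
		assert( w == 3 );
		return 0;
	}

The silent truncation in the first call (1.5 becomes 1) is exactly the casting behaviour the \warning blocks above caution against.
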
/**
@@ -1936,156 +2803,170 @@ namespace grb {
* @see Operator for full details.
* @see OperatorBase for additional functions exposed to the final operator.
*/
- template< typename OP, typename guard = void, enum Backend implementation = config::default_backend >
+ template<
+ typename OP, typename guard = void,
+ enum Backend implementation = config::default_backend
+ >
class OperatorFR : public OperatorBase< OP > {
- public:
- /**
- * Emulated in-place application of this operator on two data elements.
- *
- * Computes \f$ x \odot y \f$ and writes the result into \f$ y \f$.
- *
- * We wish to call this in-place variant internally for brevity. However,
- * if \a OP has no in-place variant, then we must cache the previous
- * value of the output element or otherwise we will breach the
- * __restrict__ contract of OP::apply.
- * The caller must ensure the appropriate domains and casting behaviour
- * is applicable. Note that a user is never to call these functions
- * explicitly.
- *
- * @tparam InputType The type of the parameter \a x.
- * @tparam IOType The type of the parameter \a y.
- *
- * \warning Additional casting and use of temporary variables may occur
- * when \a InputType does not match \a D1 \em or \a IOType
- * does not match \a D3.
- *
- * \note This implementation relies on apply().
- *
- * @param[in] x The value that is to be applied to \a y.
- * @param[in,out] y The value \a x is to be applied against.
- */
- template< typename InputType, typename IOType >
- static void foldr( const InputType & x, IOType & y ) {
- typedef typename OperatorBase< OP >::D2 D2;
- const D2 cache = static_cast< D2 >( y );
- OperatorBase< OP >::apply( x, cache, y );
- }
-
- /**
- * Out-of-place element-wise foldr function. Calculates
- * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
- * \f$ x_i \odot z_i \f$ and stores the result into
- * \f$ z_i \f$.
- *
- * @tparam InputType The type of elements in \a x.
- * @tparam IOType The type of elements in \a z.
- *
- * @param x The left-hand side input data.
- * @param z Where \a x shall be mapped into.
- * @param n How many data elements \a x and \a z contain.
- *
- * This version requires three buffers, streams \a x once,
- * and streams \a z twice (once for reading, once for
- * writing.
- */
- template< typename InputType, typename IOType >
- static void eWiseFoldrAA( const InputType * __restrict__ const x, IOType * __restrict__ const z, const size_t n ) {
- // local buffers
- typedef typename OperatorBase< OP >::D1 D1;
- typedef typename OperatorBase< OP >::D2 D2;
- typedef typename OperatorBase< OP >::D3 D3;
- D1 left_buffer[ OperatorBase< OP >::blocksize ];
- D2 right_buffer[ OperatorBase< OP >::blocksize ];
- D3 result_buffer[ OperatorBase< OP >::blocksize ];
-
- // blockwise application
- size_t i = 0;
- while( i + OperatorBase< OP >::blocksize <= n ) {
- // load into buffers
- for( size_t b = 0; b < OperatorBase< OP >::blocksize; ++i, ++b ) {
- left_buffer[ b ] = static_cast< D1 >( x[ i ] );
- right_buffer[ b ] = static_cast< D2 >( z[ i ] );
- }
-
- // rewind source and output
- i -= OperatorBase< OP >::blocksize;
-
- // operate within buffer
- for( size_t b = 0; b < OperatorBase< OP >::blocksize; ++b ) {
- OP::apply( &( left_buffer[ b ] ), &( right_buffer[ b ] ), &( result_buffer[ b ] ) );
- }
-
- // write back result
- for( size_t b = 0; b < OperatorBase< OP >::blocksize; ++i, ++b ) {
- z[ i ] = static_cast< IOType >( result_buffer[ b ] );
- }
- }
-
- // direct application for remainder
- for( ; i < n; ++i ) {
- left_buffer[ 0 ] = static_cast< D1 >( x[ i ] );
- right_buffer[ 0 ] = static_cast< D2 >( z[ i ] );
- OP::apply( left_buffer, right_buffer, result_buffer );
- z[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
- }
- }
-
- /**
- * Out-of-place element-wise foldr function. Calculates
- * \f$ \forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
- * \f$ x \odot z_i \f$ and stores the result into
- * \f$ z_i \f$.
- *
- * @tparam InputType The type of elements in \a x.
- * @tparam IOType The type of elements in \a z.
- *
- * @param x The left-hand side input value.
- * @param z Where \a x shall be mapped into.
- * @param n How many data elements \a z contains.
- *
- * This version requires two buffers and streams \a z
- * twice (once for reading, once for writing).
- */
- template< typename InputType, typename IOType >
- static void eWiseFoldrSA( const InputType x, IOType * __restrict__ const z, const size_t n ) {
- // local buffers
- typedef typename OperatorBase< OP >::D1 D1;
- typedef typename OperatorBase< OP >::D2 D2;
- typedef typename OperatorBase< OP >::D3 D3;
- const D1 left_buffer = x; // this is actually mandatory in case x is a temporary
- D2 right_buffer[ OperatorBase< OP >::blocksize ];
- D3 result_buffer[ OperatorBase< OP >::blocksize ];
+ public:
+ /**
+ * Emulated in-place application of this operator on two data elements.
+ *
+ * Computes \f$ x \odot y \f$ and writes the result into \f$ y \f$.
+ *
+ * We wish to call this in-place variant internally for brevity. However,
+ * if \a OP has no in-place variant, then we must cache the previous
+ * value of the output element or otherwise we will breach the
+ * __restrict__ contract of OP::apply.
+ * The caller must ensure the appropriate domains and casting behaviour
+ * is applicable. Note that a user is never to call these functions
+ * explicitly.
+ *
+ * @tparam InputType The type of the parameter \a x.
+ * @tparam IOType The type of the parameter \a y.
+ *
+ * \warning Additional casting and use of temporary variables may occur
+ * when \a InputType does not match \a D1 \em or \a IOType
+ * does not match \a D3.
+ *
+ * \note This implementation relies on apply().
+ *
+ * @param[in] x The value that is to be applied to \a y.
+ * @param[in,out] y The value \a x is to be applied against.
+ */
+ template< typename InputType, typename IOType >
+ static void foldr( const InputType & x, IOType & y ) {
+ typedef typename OperatorBase< OP >::D2 D2;
+ const D2 cache = static_cast< D2 >( y );
+ OperatorBase< OP >::apply( x, cache, y );
+ }
- // blockwise application
- size_t i = 0;
- while( i + OperatorBase< OP >::blocksize <= n ) {
- // load into buffers
- for( size_t b = 0; b < OperatorBase< OP >::blocksize; ++i, ++b ) {
- right_buffer[ b ] = static_cast< D2 >( z[ i ] );
+ /**
+ * Out-of-place element-wise foldr function. Calculates
+ * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
+ * \f$ x_i \odot z_i \f$ and stores the result into
+ * \f$ z_i \f$.
+ *
+ * @tparam InputType The type of elements in \a x.
+ * @tparam IOType The type of elements in \a z.
+ *
+ * @param x The left-hand side input data.
+ * @param z Where \a x shall be mapped into.
+ * @param n How many data elements \a x and \a z contain.
+ *
+ * This version requires three buffers, streams \a x once,
+ * and streams \a z twice (once for reading, once for
+ * writing).
+ */
+ template< typename InputType, typename IOType >
+ static void eWiseFoldrAA(
+ const InputType * __restrict__ const x,
+ IOType * __restrict__ const z,
+ const size_t n
+ ) {
+ // local buffers
+ typedef typename OperatorBase< OP >::D1 D1;
+ typedef typename OperatorBase< OP >::D2 D2;
+ typedef typename OperatorBase< OP >::D3 D3;
+ D1 left_buffer[ OperatorBase< OP >::blocksize ];
+ D2 right_buffer[ OperatorBase< OP >::blocksize ];
+ D3 result_buffer[ OperatorBase< OP >::blocksize ];
+
+ // blockwise application
+ size_t i = 0;
+ while( i + OperatorBase< OP >::blocksize <= n ) {
+ // load into buffers
+ for( size_t b = 0; b < OperatorBase< OP >::blocksize; ++i, ++b ) {
+ left_buffer[ b ] = static_cast< D1 >( x[ i ] );
+ right_buffer[ b ] = static_cast< D2 >( z[ i ] );
+ }
+
+ // rewind source and output
+ i -= OperatorBase< OP >::blocksize;
+
+ // operate within buffer
+ for( size_t b = 0; b < OperatorBase< OP >::blocksize; ++b ) {
+ OP::apply( &( left_buffer[ b ] ), &( right_buffer[ b ] ),
+ &( result_buffer[ b ] ) );
+ }
+
+ // write back result
+ for( size_t b = 0; b < OperatorBase< OP >::blocksize; ++i, ++b ) {
+ z[ i ] = static_cast< IOType >( result_buffer[ b ] );
+ }
}
- // rewind source and output
- i -= OperatorBase< OP >::blocksize;
+ // direct application for remainder
+ for( ; i < n; ++i ) {
+ left_buffer[ 0 ] = static_cast< D1 >( x[ i ] );
+ right_buffer[ 0 ] = static_cast< D2 >( z[ i ] );
+ OP::apply( left_buffer, right_buffer, result_buffer );
+ z[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
+ }
+ }
- // operate within buffer
- for( size_t b = 0; b < OperatorBase< OP >::blocksize; ++b ) {
- OP::apply( &left_buffer, &( right_buffer[ b ] ), &( result_buffer[ b ] ) );
+ /**
+ * Out-of-place element-wise foldr function. Calculates
+ * \f$ \forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
+ * \f$ x \odot z_i \f$ and stores the result into
+ * \f$ z_i \f$.
+ *
+ * @tparam InputType The type of elements in \a x.
+ * @tparam IOType The type of elements in \a z.
+ *
+ * @param x The left-hand side input value.
+ * @param z Where \a x shall be mapped into.
+ * @param n How many data elements \a z contains.
+ *
+ * This version requires two buffers and streams \a z
+ * twice (once for reading, once for writing).
+ */
+ template< typename InputType, typename IOType >
+ static void eWiseFoldrSA(
+ const InputType x, IOType * __restrict__ const z,
+ const size_t n
+ ) {
+ // local buffers
+ typedef typename OperatorBase< OP >::D1 D1;
+ typedef typename OperatorBase< OP >::D2 D2;
+ typedef typename OperatorBase< OP >::D3 D3;
+ const D1 left_buffer = x; // copying is mandatory in case x is a temporary
+ D2 right_buffer[ OperatorBase< OP >::blocksize ];
+ D3 result_buffer[ OperatorBase< OP >::blocksize ];
+
+ // blockwise application
+ size_t i = 0;
+ while( i + OperatorBase< OP >::blocksize <= n ) {
+ // load into buffers
+ for( size_t b = 0; b < OperatorBase< OP >::blocksize; ++i, ++b ) {
+ right_buffer[ b ] = static_cast< D2 >( z[ i ] );
+ }
+
+ // rewind source and output
+ i -= OperatorBase< OP >::blocksize;
+
+ // operate within buffer
+ for( size_t b = 0; b < OperatorBase< OP >::blocksize; ++b ) {
+ OP::apply( &left_buffer, &( right_buffer[ b ] ),
+ &( result_buffer[ b ] ) );
+ }
+
+ // write back result
+ for( size_t b = 0; b < OperatorBase< OP >::blocksize; ++i, ++b ) {
+ z[ i ] = static_cast< IOType >( result_buffer[ b ] );
+ }
}
- // write back result
- for( size_t b = 0; b < OperatorBase< OP >::blocksize; ++i, ++b ) {
- z[ i ] = static_cast< IOType >( result_buffer[ b ] );
+ // direct application for remainder
+ for( ; i < n; ++i ) {
+ right_buffer[ 0 ] = static_cast< D2 >( z[ i ] );
+ OP::apply( &left_buffer, right_buffer, result_buffer );
+ z[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
}
}
- // direct application for remainder
- for( ; i < n; ++i ) {
- right_buffer[ 0 ] = static_cast< D2 >( z[ i ] );
- OP::apply( &left_buffer, right_buffer, result_buffer );
- z[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
- }
- }
};
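
The cache in the emulated foldr above exists to honour the __restrict__ qualifiers on OP::apply: passing &y as both the right-hand input and the output would alias two __restrict__ pointers, which is undefined. A condensed sketch of the same pattern follows, with the hypothetical minus_raw standing in for an operator without a native foldr:

	#include <cassert>

	struct minus_raw {
		typedef double left_type;
		typedef double right_type;
		typedef double result_type;
		static void apply(
			const double * __restrict__ a,
			const double * __restrict__ b,
			double * __restrict__ c
		) {
			*c = *a - *b;
		}
	};

	// Emulated in-place foldr: y := x - y, without aliasing OP::apply arguments.
	template< typename OP >
	void foldr_emulated(
		const typename OP::left_type &x, typename OP::result_type &y
	) {
		const typename OP::right_type cache = y; // snapshot the right operand
		OP::apply( &x, &cache, &y );             // three distinct addresses
	}

	int main() {
		double y = 3.0;
		foldr_emulated< minus_raw >( 10.0, y );
		assert( y == 7.0 );
		return 0;
	}

The snapshot costs one copy per element, which is why the specialisations below prefer a native in-place fold whenever the operator provides one.
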
/**
@@ -2105,223 +2986,244 @@ namespace grb {
* @see OperatorBase for additional functions exposed to the final operator.
*/
template< typename OP >
- class OperatorFR< OP, typename std::enable_if< OP::has_foldr && std::is_same< typename OP::right_type, typename OP::result_type >::value >::type > : public OperatorBase< OP > {
-
- private:
- typedef typename OperatorBase< OP >::D1 D1;
- typedef typename OperatorBase< OP >::D3 D3;
- static constexpr size_t blocksize = OperatorBase< OP >::blocksize;
-
- public:
- /**
- * In-place application of this operator on two data elements.
- *
- * Computes \f$ x \odot y \f$ and writes the result into \f$ y \f$.
- *
- * \note This variant is only called when the underlying raw operator
- * supports in-place operations.
- *
- * The caller must ensure the appropriate domains and casting behaviour
- * is applicable. Note that a user is never to call these functions
- * explicitly.
- *
- * @param[in] x The value that is to be applied to \a y.
- * @param[in,out] y The value \a x is to be applied against.
- */
- static void foldr( const D1 & x, D3 & y ) {
- OP::foldr( &x, &y );
- }
-
- /**
- * In-place element-wise foldr function. Calculates
- * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
- * \f$ x \odot z_i \f$ and stores the result into \f$ z_i \f$.
- *
- * @tparam InputType The type of \a x.
- * @tparam IOType The type of elements in \a z.
- *
- * @param[in] x The left-hand side input value.
- * @param[in,out] z Where \a x shall be mapped into.
- * @param[in] n How many data elements \a z contains.
- *
- * This implementation requires one buffers only. It streams \a z twice,
- * once for reading, once for writing. This function should vectorise.
- */
- template< typename InputType, typename IOType >
- static void eWiseFoldrSA( const InputType x, IOType * __restrict__ const z, const size_t n ) {
- // local buffers
- const D1 left_buffer = static_cast< D1 >( x );
- D3 result_buffer[ blocksize ];
-
- // blockwise application
- size_t i = 0;
- while( i + blocksize <= n ) {
- // load into buffers
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- result_buffer[ b ] = static_cast< D3 >( z[ i ] );
- }
-
- // rewind source and output
- i -= blocksize;
-
- // operate within buffer
- for( size_t b = 0; b < blocksize; ++b ) {
- OP::foldr( &left_buffer, &( result_buffer[ b ] ) );
- }
-
- // write back result
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- z[ i ] = static_cast< IOType >( result_buffer[ b ] );
- }
- }
-
- // direct application for remainder
- for( ; i < n; ++i ) {
- result_buffer[ 0 ] = static_cast< D3 >( z[ i ] );
- OP::foldr( &left_buffer, result_buffer );
- z[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
- }
- }
-
- /**
- * In-place element-wise foldr function. Calculates
- * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
- * \f$ x_i \odot z_i \f$ and stores the result into \f$ z_i \f$.
- *
- * @tparam InputType The type of elements in \a x.
- * @tparam IOType The type of elements in \a z.
- *
- * @param[in] x The left-hand side input data.
- * @param[in,out] z Where \a x shall be mapped into.
- * @param[in] n How many data elements \a x and \a z contain.
- *
- * This implementation requires two buffers only. It streams \a x once,
- * while streaming \a z twice (once for reading, once for writing). This
- * function should vectorise.
- */
- template< typename InputType, typename IOType >
- static void eWiseFoldrAA( const InputType * __restrict__ const x, IOType * __restrict__ const z, const size_t n ) {
- // local buffers
- D1 left_buffer[ blocksize ];
- D3 result_buffer[ blocksize ];
-
- // blockwise application
- size_t i = 0;
- while( i + blocksize <= n ) {
- // load into buffers
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- left_buffer[ b ] = static_cast< D1 >( x[ i ] );
- result_buffer[ b ] = static_cast< D3 >( z[ i ] );
- }
-
- // rewind source and output
- i -= blocksize;
-
- // operate within buffer
- for( size_t b = 0; b < blocksize; ++b ) {
- OP::foldr( &( left_buffer[ b ] ), &( result_buffer[ b ] ) );
- }
-
- // write back result
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- z[ i ] = static_cast< IOType >( result_buffer[ b ] );
- }
- }
-
- // direct application for remainder
- for( ; i < n; ++i ) {
- left_buffer[ 0 ] = static_cast< D1 >( x[ i ] );
- result_buffer[ 0 ] = static_cast< D3 >( z[ i ] );
- OP::foldr( left_buffer, result_buffer );
- z[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
- }
- }
-
- /**
- * In-place element-wise apply function. Calculates
- * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
- * \f$ z_i = x_i \odot y_i \f$.
- *
- * @tparam InputType1 The type of elements in \a x.
- * @tparam InputType2 The type of elements in \a y.
- * @tparam OutputType The type of elements in \a z.
- *
- * If \a InputType2 and \a D3 are not the same, then the existing data in
- * \a y is cast to \a D3 prior to application of this in-place operator.
- * If \a InputType1 and \a D1 are not the same, then the existing data in
- * \a x are cast to \a D1 prior to application of this in-place operator.
- * If \a OutputType and \a D3 are not the same, then the results of
- * applying this operator are cast to \a OutputType prior to writing back
- * the results.
- *
- * \warning The first casting behaviour may not be what you want. The two
- * other casting behaviours are allowed by the GraphBLAS unless
- * the grb::descriptor::no_casting is given.
- *
- * \note By default, this GraphBLAS implementation will only use this
- * code when \a D2 matches \a D3 and OP::has_foldr is \a true.
- *
- * This implementation relies on an in-place foldr().
- *
- * @param[in] x The left-hand side input data. The memory range starting
- * at \a x and ending at \a x + n (exclusive) may not
- * overlap with the memory area starting at \a z and ending
- * at \a z + n (exclusive).
- * @param[in] y The right-hand side input data. The memory range starting
- * at \a y and ending at \a y + n (exclusive) may not
- * overlap with the memory area starting at \a z and ending
- * at \a z + n.
- * @param[out] z Where the map of \a x into \a y must be stored. This
- * pointer is restricted in the sense that its memory may
- * never overlap with those pointed to by \a x or \y, as
- * detailed above.
- * @param[in] n How many data elements \a x, \a y, and \a z contain.
- */
- template< typename InputType1, typename InputType2, typename OutputType >
- static void eWiseApply( const InputType1 * x, const InputType2 * y, OutputType * __restrict__ z, const size_t n ) {
-#ifdef _DEBUG
-#ifdef D_GRB_NO_STDIO
- std::cout << "In OperatorFR::eWiseApply\n";
-#endif
-#endif
- // NOTE: this variant is only active when the computation can be done using two buffers only
+ class OperatorFR<
+ OP,
+ typename std::enable_if<
+ OP::has_foldr &&
+ std::is_same< typename OP::right_type, typename OP::result_type >::value
+ >::type
+ > : public OperatorBase< OP > {
- // local buffers
- D1 left_buffer[ blocksize ];
- D3 result_buffer[ blocksize ];
+ private:
- // blockwise application
- size_t i = 0;
- while( i + blocksize <= n ) {
+ typedef typename OperatorBase< OP >::D1 D1;
+ typedef typename OperatorBase< OP >::D3 D3;
+ static constexpr size_t blocksize = OperatorBase< OP >::blocksize;
+
+ public:
+
+ /**
+ * In-place application of this operator on two data elements.
+ *
+ * Computes \f$ x \odot y \f$ and writes the result into \f$ y \f$.
+ *
+ * \note This variant is only called when the underlying raw operator
+ * supports in-place operations.
+ *
+ * The caller must ensure the appropriate domains and casting behaviour
+ * is applicable. Note that a user is never to call these functions
+ * explicitly.
+ *
+ * @param[in] x The value that is to be applied to \a y.
+ * @param[in,out] y The value \a x is to be applied against.
+ */
+ static void foldr( const D1 & x, D3 & y ) {
+ OP::foldr( &x, &y );
+ }
- // load into buffers
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- left_buffer[ b ] = static_cast< D1 >( x[ i ] );
- result_buffer[ b ] = static_cast< D3 >( y[ i ] );
+ /**
+ * In-place element-wise foldr function. Calculates
+ * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
+ * \f$ x \odot z_i \f$ and stores the result into \f$ z_i \f$.
+ *
+ * @tparam InputType The type of \a x.
+ * @tparam IOType The type of elements in \a z.
+ *
+ * @param[in] x The left-hand side input value.
+ * @param[in,out] z Where \a x shall be mapped into.
+ * @param[in] n How many data elements \a z contains.
+ *
+ * This implementation requires one buffer only. It streams \a z twice,
+ * once for reading, once for writing. This function should vectorise.
+ */
+ template< typename InputType, typename IOType >
+ static void eWiseFoldrSA(
+ const InputType x, IOType * __restrict__ const z,
+ const size_t n
+ ) {
+ // local buffers
+ const D1 left_buffer = static_cast< D1 >( x );
+ D3 result_buffer[ blocksize ];
+
+ // blockwise application
+ size_t i = 0;
+ while( i + blocksize <= n ) {
+ // load into buffers
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ result_buffer[ b ] = static_cast< D3 >( z[ i ] );
+ }
+
+ // rewind source and output
+ i -= blocksize;
+
+ // operate within buffer
+ for( size_t b = 0; b < blocksize; ++b ) {
+ OP::foldr( &left_buffer, &( result_buffer[ b ] ) );
+ }
+
+ // write back result
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ z[ i ] = static_cast< IOType >( result_buffer[ b ] );
+ }
}
- // rewind source and output
- i -= blocksize;
+ // direct application for remainder
+ for( ; i < n; ++i ) {
+ result_buffer[ 0 ] = static_cast< D3 >( z[ i ] );
+ OP::foldr( &left_buffer, result_buffer );
+ z[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
+ }
+ }
- // operate within buffer
- for( size_t b = 0; b < blocksize; ++b ) {
- OP::foldr( &( left_buffer[ b ] ), &( result_buffer[ b ] ) );
+ /**
+ * In-place element-wise foldr function. Calculates
+ * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
+ * \f$ x_i \odot z_i \f$ and stores the result into \f$ z_i \f$.
+ *
+ * @tparam InputType The type of elements in \a x.
+ * @tparam IOType The type of elements in \a z.
+ *
+ * @param[in] x The left-hand side input data.
+ * @param[in,out] z Where \a x shall be mapped into.
+ * @param[in] n How many data elements \a x and \a z contain.
+ *
+ * This implementation requires two buffers only. It streams \a x once,
+ * while streaming \a z twice (once for reading, once for writing). This
+ * function should vectorise.
+ */
+ template< typename InputType, typename IOType >
+ static void eWiseFoldrAA(
+ const InputType * __restrict__ const x,
+ IOType * __restrict__ const z,
+ const size_t n
+ ) {
+ // local buffers
+ D1 left_buffer[ blocksize ];
+ D3 result_buffer[ blocksize ];
+
+ // blockwise application
+ size_t i = 0;
+ while( i + blocksize <= n ) {
+ // load into buffers
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ left_buffer[ b ] = static_cast< D1 >( x[ i ] );
+ result_buffer[ b ] = static_cast< D3 >( z[ i ] );
+ }
+
+ // rewind source and output
+ i -= blocksize;
+
+ // operate within buffer
+ for( size_t b = 0; b < blocksize; ++b ) {
+ OP::foldr( &( left_buffer[ b ] ), &( result_buffer[ b ] ) );
+ }
+
+ // write back result
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ z[ i ] = static_cast< IOType >( result_buffer[ b ] );
+ }
}
- // write back result
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- z[ i ] = static_cast< OutputType >( result_buffer[ b ] );
+ // direct application for remainder
+ for( ; i < n; ++i ) {
+ left_buffer[ 0 ] = static_cast< D1 >( x[ i ] );
+ result_buffer[ 0 ] = static_cast< D3 >( z[ i ] );
+ OP::foldr( left_buffer, result_buffer );
+ z[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
}
}
- // direct application for remainder
- for( ; i < n; ++i ) {
- left_buffer[ 0 ] = static_cast< typename OP::left_type >( x[ i ] );
- result_buffer[ 0 ] = static_cast< typename OP::result_type >( y[ i ] );
- OP::foldr( left_buffer, result_buffer );
- z[ i ] = static_cast< OutputType >( result_buffer[ 0 ] );
+ /**
+ * In-place element-wise apply function. Calculates
+ * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
+ * \f$ z_i = x_i \odot y_i \f$.
+ *
+ * @tparam InputType1 The type of elements in \a x.
+ * @tparam InputType2 The type of elements in \a y.
+ * @tparam OutputType The type of elements in \a z.
+ *
+ * If \a InputType2 and \a D3 are not the same, then the existing data in
+ * \a y is cast to \a D3 prior to application of this in-place operator.
+ * If \a InputType1 and \a D1 are not the same, then the existing data in
+ * \a x are cast to \a D1 prior to application of this in-place operator.
+ * If \a OutputType and \a D3 are not the same, then the results of
+ * applying this operator are cast to \a OutputType prior to writing back
+ * the results.
+ *
+ * \warning The first casting behaviour may not be what you want. The two
+ * other casting behaviours are allowed by the GraphBLAS unless
+ * the grb::descriptor::no_casting is given.
+ *
+ * \note By default, this GraphBLAS implementation will only use this
+ * code when \a D2 matches \a D3 and OP::has_foldr is \a true.
+ *
+ * This implementation relies on an in-place foldr().
+ *
+ * @param[in] x The left-hand side input data. The memory range starting
+ * at \a x and ending at \a x + n (exclusive) may not
+ * overlap with the memory area starting at \a z and ending
+ * at \a z + n (exclusive).
+ * @param[in] y The right-hand side input data. The memory range starting
+ * at \a y and ending at \a y + n (exclusive) may not
+ * overlap with the memory area starting at \a z and ending
+ * at \a z + n.
+ * @param[out] z Where the map of \a x into \a y must be stored. This
+ * pointer is restricted in the sense that its memory may
+ * never overlap with those pointed to by \a x or \a y, as
+ * detailed above.
+ * @param[in] n How many data elements \a x, \a y, and \a z contain.
+ */
+ template< typename InputType1, typename InputType2, typename OutputType >
+ static void eWiseApply(
+ const InputType1 * x,
+ const InputType2 * y,
+ OutputType * __restrict__ z,
+ const size_t n
+ ) {
+#ifdef _DEBUG
+ #ifndef D_GRB_NO_STDIO
+ std::cout << "In OperatorFR::eWiseApply\n";
+ #endif
+#endif
+ // NOTE: this variant is only active when the computation can be done using two buffers only
+
+ // local buffers
+ D1 left_buffer[ blocksize ];
+ D3 result_buffer[ blocksize ];
+
+ // blockwise application
+ size_t i = 0;
+ while( i + blocksize <= n ) {
+
+ // load into buffers
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ left_buffer[ b ] = static_cast< D1 >( x[ i ] );
+ result_buffer[ b ] = static_cast< D3 >( y[ i ] );
+ }
+
+ // rewind source and output
+ i -= blocksize;
+
+ // operate within buffer
+ for( size_t b = 0; b < blocksize; ++b ) {
+ OP::foldr( &( left_buffer[ b ] ), &( result_buffer[ b ] ) );
+ }
+
+ // write back result
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ z[ i ] = static_cast< OutputType >( result_buffer[ b ] );
+ }
+ }
+
+ // direct application for remainder
+ for( ; i < n; ++i ) {
+ left_buffer[ 0 ] = static_cast< typename OP::left_type >( x[ i ] );
+ result_buffer[ 0 ] = static_cast< typename OP::result_type >( y[ i ] );
+ OP::foldr( left_buffer, result_buffer );
+ z[ i ] = static_cast< OutputType >( result_buffer[ 0 ] );
+ }
}
- }
+
};
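
All element-wise routines in this hierarchy share one blockwise skeleton; the following compact standalone rendition uses an illustrative blocksize of 4 and a max fold rather than the SIMD-derived constant and a library operator:

	#include <cassert>
	#include <cstddef>

	static constexpr std::size_t blocksize = 4; // illustrative only

	// z[i] := max( x[i], z[i] ), blockwise with a scalar remainder loop
	void eWiseFoldrAA_sketch(
		const double * __restrict__ const x,
		double * __restrict__ const z,
		const std::size_t n
	) {
		double left[ blocksize ], result[ blocksize ];
		std::size_t i = 0;
		while( i + blocksize <= n ) {
			for( std::size_t b = 0; b < blocksize; ++i, ++b ) { // load
				left[ b ] = x[ i ];
				result[ b ] = z[ i ];
			}
			i -= blocksize;                                     // rewind
			for( std::size_t b = 0; b < blocksize; ++b ) {      // operate
				if( left[ b ] > result[ b ] ) { result[ b ] = left[ b ]; }
			}
			for( std::size_t b = 0; b < blocksize; ++i, ++b ) { // write back
				z[ i ] = result[ b ];
			}
		}
		for( ; i < n; ++i ) {                                   // remainder
			if( x[ i ] > z[ i ] ) { z[ i ] = x[ i ]; }
		}
	}

	int main() {
		const double x[ 6 ] = { 1, 5, 2, 7, 0, 9 };
		double z[ 6 ] = { 4, 4, 4, 4, 4, 4 };
		eWiseFoldrAA_sketch( x, z, 6 );
		assert( z[ 0 ] == 4 && z[ 1 ] == 5 && z[ 3 ] == 7 && z[ 5 ] == 9 );
		return 0;
	}

Rewinding i after the load pass lets the same index drive both the load and write-back loops, so only the fixed-size buffers need stay cache-resident while the inner operate loop remains free of loads and stores that could inhibit vectorisation.
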
/**
@@ -2350,156 +3252,170 @@ namespace grb {
* @see OperatorBase for additional functions exposed to the resulting
* operator.
*/
- template< typename OP, typename guard = void, enum Backend implementation = config::default_backend >
+ template<
+ typename OP, typename guard = void,
+ enum Backend implementation = config::default_backend
+ >
class OperatorFL : public OperatorFR< OP > {
- private:
- public:
- typedef typename OperatorBase< OP >::D1 D1;
- typedef typename OperatorBase< OP >::D2 D2;
- typedef typename OperatorBase< OP >::D3 D3;
- static constexpr size_t blocksize = OperatorBase< OP >::blocksize;
-
- /**
- * Emulated in-place application of this operator on two data elements.
- *
- * Computes \f$ x \odot y \f$ and writes the result into \f$ x \f$.
- *
- * We wish to call this in-place variant internally for brevity. However,
- * if \a OP has no in-place variant, then we must cache the previous
- * value of the output element or otherwise we will breach the
- * __restrict__ contract of OP::apply.
- * The caller must ensure the appropriate domains and casting behaviour
- * is applicable. Note that a user is never to call these functions
- * explicitly.
- *
- * @tparam InputType The type of the parameter \a x.
- * @tparam IOType The type of the parameter \a y.
- *
- * \warning Additional casting and use of temporary variables may occur
- * when \a InputType does not match \a D2 \em or \a IOType
- * does not match \a D3.
- *
- * \note This implementation relies on apply().
- *
- * @param[in,out] x The value \a y is to be applied against.
- * @param[in] y The value that is to be applied to \a x.
- */
- template< typename InputType, typename IOType >
- static void foldl( IOType & x, const InputType & y ) {
- const D1 cache = static_cast< D1 >( x );
- OperatorBase< OP >::apply( cache, y, x );
- }
-
- /**
- * Out-of-place element-wise foldl function. Calculates
- * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
- * \f$ x_i \odot y \f$ and stores the result into \f$ x_i \f$.
- *
- * @tparam IOType The type of elements in \a x.
- * @tparam InputType The type of \a y.
- *
- * @param[in, out] x At function entry, the left-hand side input data.
- * At function exit, the output data as defined above.
- * @param[in] y The right-hand side input value.
- * @param[in] n How many data elements \a x contains.
- *
- * This version requires two buffers and streams \a x twice (once for
- * reading, once for writing). This function should vectorise its
- * out-of-place operations.
- */
- template< typename IOType, typename InputType >
- static void eWiseFoldlAS( IOType * __restrict__ const x, const InputType y, const size_t n ) {
- // local buffers
- D1 left_buffer[ blocksize ];
- const D2 right_buffer = y;
- D3 result_buffer[ blocksize ];
-
- // blockwise application
- size_t i = 0;
- while( i + blocksize <= n ) {
- // load into buffers
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- left_buffer[ b ] = static_cast< D1 >( x[ i ] );
- }
-
- // rewind source and output
- i -= blocksize;
-
- // operate within buffer
- for( size_t b = 0; b < blocksize; ++b ) {
- OP::apply( &( left_buffer[ b ] ), &right_buffer, &( result_buffer[ b ] ) );
- }
-
- // write back result
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- x[ i ] = static_cast< IOType >( result_buffer[ b ] );
- }
- }
-
- // direct application for remainder
- for( ; i < n; ++i ) {
- left_buffer[ 0 ] = static_cast< D1 >( x[ i ] );
- OP::apply( left_buffer, &right_buffer, result_buffer );
- x[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
- }
- }
-
- /**
- * Out-of-place element-wise foldl function. Calculates
- * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
- * \f$ x_i \odot y_i \f$ and stores the result into \f$ x_i \f$.
- *
- * @tparam IOType The type of elements in \a x.
- * @tparam InputType The type of elements in \a y.
- *
- * @param[in, out] x At function entry, the left-hand side input data.
- * At function exit, the output data as defined above.
- * @param[in] y The right-hand side input.
- * @param[in] n How many data elements \a x and \a y contain.
- *
- * This version requires three buffers, streams \a y once, and streams
- * \a x twice (once for reading, once for writing). This function should
- * vectorise its out-of-place operations.
- */
- template< typename IOType, typename InputType >
- static void eWiseFoldlAA( IOType * __restrict__ const x, const InputType * __restrict__ const y, const size_t n ) {
- // local buffers
- D1 left_buffer[ blocksize ];
- D2 right_buffer[ blocksize ];
- D3 result_buffer[ blocksize ];
-
- // blockwise application
- size_t i = 0;
- while( i + blocksize <= n ) {
- // load into buffers
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- left_buffer[ b ] = static_cast< D1 >( x[ i ] );
- right_buffer[ b ] = static_cast< D2 >( y[ i ] );
- }
-
- // rewind source and output
- i -= blocksize;
-
- // operate within buffer
- for( size_t b = 0; b < blocksize; ++b ) {
- OP::apply( &( left_buffer[ b ] ), &( right_buffer[ b ] ), &( result_buffer[ b ] ) );
- }
-
- // write back result
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- x[ i ] = static_cast< IOType >( result_buffer[ b ] );
- }
- }
-
- // direct application for remainder
- for( ; i < n; ++i ) {
- left_buffer[ 0 ] = static_cast< D1 >( x[ i ] );
- right_buffer[ 0 ] = static_cast< D2 >( y[ i ] );
- OP::apply( left_buffer, right_buffer, result_buffer );
- x[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
- }
- }
+ public:
+
+ typedef typename OperatorBase< OP >::D1 D1;
+ typedef typename OperatorBase< OP >::D2 D2;
+ typedef typename OperatorBase< OP >::D3 D3;
+ static constexpr size_t blocksize = OperatorBase< OP >::blocksize;
+
+ /**
+ * Emulated in-place application of this operator on two data elements.
+ *
+ * Computes \f$ x \odot y \f$ and writes the result into \f$ x \f$.
+ *
+ * We wish to call this in-place variant internally for brevity. However,
+ * if \a OP has no in-place variant, then we must cache the previous
+ * value of the output element, as otherwise we would breach the
+ * __restrict__ contract of OP::apply.
+ * The caller must ensure the appropriate domains and casting behaviour
+ * is applicable. Note that a user is never to call these functions
+ * explicitly.
+ *
+ * @tparam IOType The type of the parameter \a x.
+ * @tparam InputType The type of the parameter \a y.
+ *
+ * \warning Additional casting and use of temporary variables may occur
+ * when \a InputType does not match \a D2 \em or \a IOType
+ * does not match \a D3.
+ *
+ * \note This implementation relies on apply().
+ *
+ * @param[in,out] x The value \a y is to be applied against.
+ * @param[in] y The value that is to be applied to \a x.
+ */
+ template< typename InputType, typename IOType >
+ static void foldl( IOType &x, const InputType &y ) {
+ const D1 cache = static_cast< D1 >( x );
+ OperatorBase< OP >::apply( cache, y, x );
+ }
+
+ /**
+ * Out-of-place element-wise foldl function. Calculates
+ * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
+ * \f$ x_i \odot y \f$ and stores the result into \f$ x_i \f$.
+ *
+ * @tparam IOType The type of elements in \a x.
+ * @tparam InputType The type of \a y.
+ *
+ * @param[in, out] x At function entry, the left-hand side input data.
+ * At function exit, the output data as defined above.
+ * @param[in] y The right-hand side input value.
+ * @param[in] n How many data elements \a x contains.
+ *
+ * This version requires two buffers and streams \a x twice (once for
+ * reading, once for writing). This function should vectorise its
+ * out-of-place operations.
+ */
+ template< typename IOType, typename InputType >
+ static void eWiseFoldlAS(
+ IOType * __restrict__ const x,
+ const InputType y,
+ const size_t n
+ ) {
+ // local buffers
+ D1 left_buffer[ blocksize ];
+ const D2 right_buffer = y;
+ D3 result_buffer[ blocksize ];
+
+ // blockwise application
+ size_t i = 0;
+ while( i + blocksize <= n ) {
+ // load into buffers
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ left_buffer[ b ] = static_cast< D1 >( x[ i ] );
+ }
+
+ // rewind source and output
+ i -= blocksize;
+
+ // operate within buffer
+ for( size_t b = 0; b < blocksize; ++b ) {
+ OP::apply( &( left_buffer[ b ] ), &right_buffer,
+ &( result_buffer[ b ] ) );
+ }
+
+ // write back result
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ x[ i ] = static_cast< IOType >( result_buffer[ b ] );
+ }
+ }
+
+ // direct application for remainder
+ for( ; i < n; ++i ) {
+ left_buffer[ 0 ] = static_cast< D1 >( x[ i ] );
+ OP::apply( left_buffer, &right_buffer, result_buffer );
+ x[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
+ }
+ }
+
+ /**
+ * Out-of-place element-wise foldl function. Calculates
+ * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
+ * \f$ x_i \odot y_i \f$ and stores the result into \f$ x_i \f$.
+ *
+ * @tparam IOType The type of elements in \a x.
+ * @tparam InputType The type of elements in \a y.
+ *
+ * @param[in, out] x At function entry, the left-hand side input data.
+ * At function exit, the output data as defined above.
+ * @param[in] y The right-hand side input.
+ * @param[in] n How many data elements \a x and \a y contain.
+ *
+ * This version requires three buffers, streams \a y once, and streams
+ * \a x twice (once for reading, once for writing). This function should
+ * vectorise its out-of-place operations.
+ */
+ template< typename IOType, typename InputType >
+ static void eWiseFoldlAA(
+ IOType * __restrict__ const x,
+ const InputType * __restrict__ const y,
+ const size_t n
+ ) {
+ // local buffers
+ D1 left_buffer[ blocksize ];
+ D2 right_buffer[ blocksize ];
+ D3 result_buffer[ blocksize ];
+
+ // blockwise application
+ size_t i = 0;
+ while( i + blocksize <= n ) {
+ // load into buffers
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ left_buffer[ b ] = static_cast< D1 >( x[ i ] );
+ right_buffer[ b ] = static_cast< D2 >( y[ i ] );
+ }
+
+ // rewind source and output
+ i -= blocksize;
+
+ // operate within buffer
+ for( size_t b = 0; b < blocksize; ++b ) {
+ OP::apply( &( left_buffer[ b ] ), &( right_buffer[ b ] ),
+ &( result_buffer[ b ] ) );
+ }
+
+ // write back result
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ x[ i ] = static_cast< IOType >( result_buffer[ b ] );
+ }
+ }
+
+ // direct application for remainder
+ for( ; i < n; ++i ) {
+ left_buffer[ 0 ] = static_cast< D1 >( x[ i ] );
+ right_buffer[ 0 ] = static_cast< D2 >( y[ i ] );
+ OP::apply( left_buffer, right_buffer, result_buffer );
+ x[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
+ }
+ }
+
};
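
For a non-commutative operator the two fold directions genuinely differ; a small sketch with an illustrative subtraction contrasts foldl, which stores \f$ x \odot y \f$ into \a x, with foldr, which stores \f$ x \odot y \f$ into \a y. In each direction the operand that doubles as output is cached first, mirroring the emulation above.

	#include <cassert>

	// x := x - y (emulated foldl: the left operand doubles as output)
	void foldl_minus( double &x, const double y ) {
		const double cache = x;
		x = cache - y;
	}

	// y := x - y (emulated foldr: the right operand doubles as output)
	void foldr_minus( const double x, double &y ) {
		const double cache = y;
		y = x - cache;
	}

	int main() {
		double a = 10.0, b = 3.0;
		foldl_minus( a, 3.0 );  // a = 10 - 3 = 7
		assert( a == 7.0 );
		foldr_minus( 10.0, b ); // b = 10 - 3 = 7
		assert( b == 7.0 );
		return 0;
	}
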
/**
@@ -2517,139 +3433,154 @@ namespace grb {
* operator.
*/
template< typename OP >
- class OperatorFL< OP, typename std::enable_if< OP::has_foldl && std::is_same< typename OP::left_type, typename OP::result_type >::value >::type > : public OperatorFR< OP > {
-
- private:
- public:
- typedef typename OperatorBase< OP >::D2 D2;
- typedef typename OperatorBase< OP >::D3 D3;
- static constexpr size_t blocksize = OperatorBase< OP >::blocksize;
-
- /**
- * In-place application of this operator on two data elements.
- *
- * Computes \f$ x \odot y \f$ and writes the result into \f$ x \f$.
- *
- * \note This variant is only called when the underlying raw operator
- * supports in-place operations.
- *
- * The caller must ensure the appropriate domains and casting behaviour
- * is applicable. Note that a user is never to call these functions
- * explicitly.
- *
- * @param[in,out] x The value \a y is to be applied against.
- * @param[in] y The value that is to be applied to \a x.
- */
- static void foldl( D3 & x, const D2 & y ) {
- OP::foldl( &x, &y );
- }
-
- /**
- * In-place element-wise foldl function. Calculates
- * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
- * \f$ x_i \odot y_i \f$ and stores the result into \f$ x_i \f$.
- *
- * @tparam IOType The type of elements in \a x.
- * @tparam InputType The type of elements in \a y.
- *
- * @param[in,out] x At function extry: the left-hand side input data.
- * At function exit: the result data.
- * @param[in] y The right-hand side input data.
- * @param[in] n How many data elements \a x and \a y contain.
- *
- * This implementation requires two buffers only. It streams \a y once,
- * while streaming \a x twice (once for reading, once for writing). This
- * function should vectorise.
- */
- template< typename InputType, typename IOType >
- static void eWiseFoldlAA( IOType * __restrict__ const x, const InputType * __restrict__ const y, const size_t n ) {
- // local buffers
- D2 right_buffer[ blocksize ];
- D3 result_buffer[ blocksize ];
-
- // blockwise application
- size_t i = 0;
- while( i + blocksize <= n ) {
- // load into buffers
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- right_buffer[ b ] = static_cast< D2 >( y[ i ] );
- result_buffer[ b ] = static_cast< D3 >( x[ i ] );
- }
-
- // rewind source and output
- i -= blocksize;
-
- // operate within buffer
- for( size_t b = 0; b < blocksize; ++b ) {
- OP::foldl( &( result_buffer[ b ] ), &( right_buffer[ b ] ) );
- }
-
- // write back result
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- x[ i ] = static_cast< IOType >( result_buffer[ b ] );
- }
- }
-
- // direct application for remainder
- for( ; i < n; ++i ) {
- right_buffer[ 0 ] = static_cast< D2 >( y[ i ] );
- result_buffer[ 0 ] = static_cast< D3 >( x[ i ] );
- OP::foldl( result_buffer, right_buffer );
- x[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
- }
- }
-
- /**
- * In-place element-wise foldl function. Calculates
- * \f$ \forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
- * \f$ x_i \odot y \f$ and stores the result into \f$ x_i \f$.
- *
- * @tparam IOType The type of elements in \a x.
- * @tparam InputType The type of \a y.
- *
- * @param[in,out] x At function extry: the left-hand side input data.
- * At function exit: the result data.
- * @param[in] y The right-hand side input value.
- * @param[in] n How many data elements \a x contains.
- *
- * This implementation requires one buffers only. It streams \a x twice
- * (once for reading, once for writing). This function should vectorise.
- */
- template< typename InputType, typename IOType >
- static void eWiseFoldlAS( IOType * __restrict__ const x, const InputType y, const size_t n ) {
- // local buffers
- const D2 right_buffer = static_cast< D2 >( y );
- D3 result_buffer[ blocksize ];
-
- // blockwise application
- size_t i = 0;
- while( i + blocksize <= n ) {
- // load into buffers
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- result_buffer[ b ] = static_cast< D3 >( x[ i ] );
- }
-
- // rewind source and output
- i -= blocksize;
-
- // operate within buffer
- for( size_t b = 0; b < blocksize; ++b ) {
- OP::foldl( &( result_buffer[ b ] ), &right_buffer );
- }
-
- // write back result
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- x[ i ] = static_cast< IOType >( result_buffer[ b ] );
- }
- }
-
- // direct application for remainder
- for( ; i < n; ++i ) {
- result_buffer[ 0 ] = static_cast< D3 >( x[ i ] );
- OP::foldl( result_buffer, &right_buffer );
- x[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
- }
- }
+ class OperatorFL<
+ OP,
+ typename std::enable_if<
+ OP::has_foldl &&
+ std::is_same< typename OP::left_type, typename OP::result_type >::value
+ >::type
+ > : public OperatorFR< OP > {
+
+ public:
+
+ typedef typename OperatorBase< OP >::D2 D2;
+ typedef typename OperatorBase< OP >::D3 D3;
+ static constexpr size_t blocksize = OperatorBase< OP >::blocksize;
+
+ /**
+ * In-place application of this operator on two data elements.
+ *
+ * Computes \f$ x \odot y \f$ and writes the result into \f$ x \f$.
+ *
+ * \note This variant is only called when the underlying raw operator
+ * supports in-place operations.
+ *
+ * The caller must ensure the appropriate domains and casting behaviour
+ * is applicable. Note that a user is never to call these functions
+ * explicitly.
+ *
+ * @param[in,out] x The value \a y is to be applied against.
+ * @param[in] y The value that is to be applied to \a x.
+ */
+ static void foldl( D3 &x, const D2 &y ) {
+ OP::foldl( &x, &y );
+ }
+
+ /**
+ * In-place element-wise foldl function. Calculates
+ * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
+ * \f$ x_i \odot y_i \f$ and stores the result into \f$ x_i \f$.
+ *
+ * @tparam IOType The type of elements in \a x.
+ * @tparam InputType The type of elements in \a y.
+ *
+ * @param[in,out] x At function entry: the left-hand side input data.
+ * At function exit: the result data.
+ * @param[in] y The right-hand side input data.
+ * @param[in] n How many data elements \a x and \a y contain.
+ *
+ * This implementation requires two buffers only. It streams \a y once,
+ * while streaming \a x twice (once for reading, once for writing). This
+ * function should vectorise.
+ */
+ template< typename InputType, typename IOType >
+ static void eWiseFoldlAA(
+ IOType * __restrict__ const x,
+ const InputType * __restrict__ const y,
+ const size_t n
+ ) {
+ // local buffers
+ D2 right_buffer[ blocksize ];
+ D3 result_buffer[ blocksize ];
+
+ // blockwise application
+ size_t i = 0;
+ while( i + blocksize <= n ) {
+ // load into buffers
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ right_buffer[ b ] = static_cast< D2 >( y[ i ] );
+ result_buffer[ b ] = static_cast< D3 >( x[ i ] );
+ }
+
+ // rewind source and output
+ i -= blocksize;
+
+ // operate within buffer
+ for( size_t b = 0; b < blocksize; ++b ) {
+ OP::foldl( &( result_buffer[ b ] ), &( right_buffer[ b ] ) );
+ }
+
+ // write back result
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ x[ i ] = static_cast< IOType >( result_buffer[ b ] );
+ }
+ }
+
+ // direct application for remainder
+ for( ; i < n; ++i ) {
+ right_buffer[ 0 ] = static_cast< D2 >( y[ i ] );
+ result_buffer[ 0 ] = static_cast< D3 >( x[ i ] );
+ OP::foldl( result_buffer, right_buffer );
+ x[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
+ }
+ }
+
+ /**
+ * In-place element-wise foldl function. Calculates
+ * \f$ \forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
+ * \f$ x_i \odot y \f$ and stores the result into \f$ x_i \f$.
+ *
+ * @tparam IOType The type of elements in \a x.
+ * @tparam InputType The type of \a y.
+ *
+ * @param[in,out] x At function entry: the left-hand side input data.
+ * At function exit: the result data.
+ * @param[in] y The right-hand side input value.
+ * @param[in] n How many data elements \a x contains.
+ *
+ * This implementation requires one buffer only. It streams \a x twice
+ * (once for reading, once for writing). This function should vectorise.
+ */
+ template< typename InputType, typename IOType >
+ static void eWiseFoldlAS(
+ IOType * __restrict__ const x,
+ const InputType y,
+ const size_t n
+ ) {
+ // local buffers
+ const D2 right_buffer = static_cast< D2 >( y );
+ D3 result_buffer[ blocksize ];
+
+ // blockwise application
+ size_t i = 0;
+ while( i + blocksize <= n ) {
+ // load into buffers
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ result_buffer[ b ] = static_cast< D3 >( x[ i ] );
+ }
+
+ // rewind source and output
+ i -= blocksize;
+
+ // operate within buffer
+ for( size_t b = 0; b < blocksize; ++b ) {
+ OP::foldl( &( result_buffer[ b ] ), &right_buffer );
+ }
+
+ // write back result
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ x[ i ] = static_cast< IOType >( result_buffer[ b ] );
+ }
+ }
+
+ // direct application for remainder
+ for( ; i < n; ++i ) {
+ result_buffer[ 0 ] = static_cast< D3 >( x[ i ] );
+ OP::foldl( result_buffer, &right_buffer );
+ x[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
+ }
+ }
+
};
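
The in-place variants above are selected purely via SFINAE on the operator's traits. A minimal standalone sketch of that dispatch follows; the trait names mirror this patch, while FL, inplace_op, and outofplace_op are illustrative:

	#include <type_traits>

	struct inplace_op {
		typedef int left_type;
		typedef int right_type;
		typedef int result_type;
		static constexpr bool has_foldl = true;
		static void foldl( int *c, const int *b ) { *c += *b; }
	};

	struct outofplace_op {
		typedef int  left_type;
		typedef int  right_type;
		typedef long result_type; // left_type != result_type: must emulate
		static constexpr bool has_foldl = true;
		static void apply( const int *a, const int *b, long *c ) { *c = *a + *b; }
	};

	// primary template: fall back to emulating foldl through apply
	template< typename OP, typename guard = void >
	struct FL {
		static constexpr bool in_place = false;
	};

	// picked only when a native, domain-matched foldl exists
	template< typename OP >
	struct FL< OP, typename std::enable_if<
		OP::has_foldl &&
		std::is_same< typename OP::left_type, typename OP::result_type >::value
	>::type > {
		static constexpr bool in_place = true;
	};

	int main() {
		static_assert( FL< inplace_op >::in_place, "native foldl expected" );
		static_assert( !FL< outofplace_op >::in_place, "emulation expected" );
		return 0;
	}

When the condition fails, std::enable_if has no ::type member, substitution into the specialisation fails silently, and the primary template (the emulating base) is used instead.
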
/**
@@ -2677,7 +3608,10 @@ namespace grb {
* @see OperatorBase for additional functions exposed to the resulting
* operator.
*/
- template< typename OP, typename guard = void, enum Backend implementation = config::default_backend >
+ template<
+ typename OP, typename guard = void,
+ enum Backend implementation = config::default_backend
+ >
class OperatorNoFR : public OperatorFL< OP > {};
/**
@@ -2699,102 +3633,115 @@ namespace grb {
* operator.
*/
template< typename OP >
- class OperatorNoFR< OP, typename std::enable_if< OP::has_foldl && ! ( OP::has_foldr ) && std::is_same< typename OP::left_type, typename OP::result_type >::value >::type > :
- public OperatorFL< OP > {
-
- private:
- public:
- typedef typename OperatorBase< OP >::D2 D2;
- typedef typename OperatorBase< OP >::D3 D3;
- static constexpr size_t blocksize = OperatorBase< OP >::blocksize;
-
- /**
- * In-place element-wise apply function. Calculates
- * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
- * \f$ z_i = x_i \odot y_i \f$.
- *
- * @tparam InputType1 The type of elements in \a x.
- * @tparam InputType2 The type of elements in \a y.
- * @tparam OutputType The type of elements in \a z.
- *
- * If the \a InputType1 and \a D3 are not the same, then the existing data
- * in \a x is cast to \a D3 prior to application of this operator.
- * If \a InputType2 and \a D2 are not the same, then the existing data in
- * \a y is cast to \a D2 prior to application of this operator.
- * If \a OutputType and \a D3 are not the same, then the result of
- * applications of this operator are cast to \a OutputType prior to
- * writing it back to \a z.
- *
- * \warning The first casting behaviour may not be what you want. The two
- * other casting behaviours are allowed by the GraphBLAS unless
- * the grb::descriptor::no_casting is given.
- *
- * \note By default, this GraphBLAS implementation will only use this
- * code when \a D1 matches \a D3 and OP::has_foldr is \a true.
- * However, this implementation will never be enabled if \a D2
- * equals \a D3 and OP::has_foldl is \a true.
- *
- * This implementation relies on an in-place foldl().
- *
- * @param[in] x The left-hand side input data. The memory range starting
- * at \a x and ending at \a x + n (exclusive) may not
- * overlap with the memory area starting at \a z and ending
- * at \a z + n (exclusive).
- * @param[in] y The right-hand side input data. The memory range starting
- * at \a y and ending at \a y + n (exclusive) may not
- * overlap with the memory area starting at \a z and ending
- * at \a z + n.
- * @param[out] z Where the map of \a x into \a y must be stored. This
- * pointer is restricted in the sense that its memory may
- * never overlap with those pointed to by \a x or \y, as
- * detailed above.
- * @param[in] n How many data elements \a x, \a y, and \a z contain.
- */
- template< typename InputType1, typename InputType2, typename OutputType >
- static void eWiseApply( const InputType1 * x, const InputType2 * y, OutputType * __restrict__ z, const size_t n ) {
-#ifdef _DEBUG
-#ifdef D_GRB_NO_STDIO
- std::cout << "In OperatorNoFR::eWiseApply\n";
-#endif
-#endif
- // NOTE: this variant is only active when the computation can be done using two buffers only
-
- // local buffers
- D2 right_buffer[ blocksize ];
- D3 result_buffer[ blocksize ];
-
- // blockwise application
- size_t i = 0;
- while( i + blocksize <= n ) {
-
- // load into buffers
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- right_buffer[ b ] = static_cast< D2 >( y[ i ] );
- result_buffer[ b ] = static_cast< D3 >( x[ i ] );
- }
+ class OperatorNoFR<
+ OP,
+ typename std::enable_if<
+ OP::has_foldl &&
+ !(OP::has_foldr) &&
+ std::is_same< typename OP::left_type, typename OP::result_type >::value
+ >::type
+ > : public OperatorFL< OP > {
- // rewind source and output
- i -= blocksize;
+ public:
- // operate within buffer
- for( size_t b = 0; b < blocksize; ++b ) {
- OP::foldl( &( result_buffer[ b ] ), &( right_buffer[ b ] ) );
+ typedef typename OperatorBase< OP >::D2 D2;
+ typedef typename OperatorBase< OP >::D3 D3;
+ static constexpr size_t blocksize = OperatorBase< OP >::blocksize;
+
+ /**
+ * In-place element-wise apply function. Calculates
+ * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
+ * \f$ z_i = x_i \odot y_i \f$.
+ *
+ * @tparam InputType1 The type of elements in \a x.
+ * @tparam InputType2 The type of elements in \a y.
+ * @tparam OutputType The type of elements in \a z.
+ *
+ * If the \a InputType1 and \a D3 are not the same, then the existing data
+ * in \a x is cast to \a D3 prior to application of this operator.
+ * If \a InputType2 and \a D2 are not the same, then the existing data in
+ * \a y is cast to \a D2 prior to application of this operator.
+ * If \a OutputType and \a D3 are not the same, then the result of
+ * applications of this operator are cast to \a OutputType prior to
+ * writing it back to \a z.
+ *
+ * \warning The first casting behaviour may not be what you want. The two
+ * other casting behaviours are allowed by the GraphBLAS unless
+ * the grb::descriptor::no_casting is given.
+ *
+	 * \note By default, this GraphBLAS implementation will only use this
+	 *       code when \a D1 matches \a D3 and OP::has_foldl is \a true.
+	 *       However, this implementation will never be enabled if \a D2
+	 *       equals \a D3 and OP::has_foldr is \a true.
+ *
+ * This implementation relies on an in-place foldl().
+ *
+ * @param[in] x The left-hand side input data. The memory range starting
+ * at \a x and ending at \a x + n (exclusive) may not
+ * overlap with the memory area starting at \a z and ending
+ * at \a z + n (exclusive).
+ * @param[in] y The right-hand side input data. The memory range starting
+ * at \a y and ending at \a y + n (exclusive) may not
+ * overlap with the memory area starting at \a z and ending
+	 *              at \a z + n (exclusive).
+ * @param[out] z Where the map of \a x into \a y must be stored. This
+ * pointer is restricted in the sense that its memory may
+	 *               never overlap with those pointed to by \a x or \a y, as
+ * detailed above.
+ * @param[in] n How many data elements \a x, \a y, and \a z contain.
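+	 *
+	 * \par Example
+	 * An illustrative sketch only; \a MyAdd is a hypothetical operator type
+	 * that matches this specialisation (foldl available, no foldr, and left
+	 * domain equal to the result domain) and adds two doubles:
+	 * \code
+	 * const double x[ 2 ] = { 1.0, 2.0 };
+	 * const double y[ 2 ] = { 3.0, 4.0 };
+	 * double z[ 2 ];
+	 * OperatorNoFR< MyAdd >::eWiseApply( x, y, z, 2 );
+	 * // z now reads { 4.0, 6.0 }
+	 * \endcode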
+ */
+ template< typename InputType1, typename InputType2, typename OutputType >
+ static void eWiseApply(
+ const InputType1 * x,
+ const InputType2 * y,
+ OutputType * __restrict__ z,
+ const size_t n
+ ) {
+#ifdef _DEBUG
+ #ifndef D_GRB_NO_STDIO
+ std::cout << "In OperatorNoFR::eWiseApply\n";
+ #endif
+#endif
+ // NOTE: this variant is only active when the computation can be done
+ // using two buffers only
+
+ // local buffers
+ D2 right_buffer[ blocksize ];
+ D3 result_buffer[ blocksize ];
+
+ // blockwise application
+ size_t i = 0;
+ while( i + blocksize <= n ) {
+
+ // load into buffers
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ right_buffer[ b ] = static_cast< D2 >( y[ i ] );
+ result_buffer[ b ] = static_cast< D3 >( x[ i ] );
+ }
+
+ // rewind source and output
+ i -= blocksize;
+
+ // operate within buffer
+ for( size_t b = 0; b < blocksize; ++b ) {
+ OP::foldl( &( result_buffer[ b ] ), &( right_buffer[ b ] ) );
+ }
+
+ // write back result
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ z[ i ] = static_cast< OutputType >( result_buffer[ b ] );
+ }
}
- // write back result
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- z[ i ] = static_cast< OutputType >( result_buffer[ b ] );
+ // direct application for remainder
+ for( ; i < n; ++i ) {
+ right_buffer[ 0 ] = static_cast< D2 >( y[ i ] );
+ result_buffer[ 0 ] = static_cast< D3 >( x[ i ] );
+ OP::foldl( result_buffer, right_buffer );
+ z[ i ] = static_cast< OutputType >( result_buffer[ 0 ] );
}
}
- // direct application for remainder
- for( ; i < n; ++i ) {
- right_buffer[ 0 ] = static_cast< D2 >( y[ i ] );
- result_buffer[ 0 ] = static_cast< D3 >( x[ i ] );
- OP::foldl( result_buffer, right_buffer );
- z[ i ] = static_cast< OutputType >( result_buffer[ 0 ] );
- }
- }
};
/**
@@ -2832,7 +3779,10 @@ namespace grb {
* @see OperatorBase for additional functions exposed to the resulting
* operator.
*/
- template< typename OP, typename guard = void, enum Backend implementation = config::default_backend >
+ template<
+ typename OP, typename guard = void,
+ enum Backend implementation = config::default_backend
+ >
class OperatorNoFRFL : public OperatorNoFR< OP > {};
/**
@@ -2862,105 +3812,119 @@ namespace grb {
*/
template< typename OP >
class OperatorNoFRFL< OP,
- typename std::enable_if< ( ! ( OP::has_foldl ) || ! ( std::is_same< typename OP::left_type, typename OP::result_type >::value ) ) &&
- ( ! ( OP::has_foldr ) || ! ( std::is_same< typename OP::right_type, typename OP::result_type >::value ) ) >::type > : public OperatorNoFR< OP > {
-
- private:
- public:
- typedef typename OperatorBase< OP >::D1 D1;
- typedef typename OperatorBase< OP >::D2 D2;
- typedef typename OperatorBase< OP >::D3 D3;
- static constexpr size_t blocksize = OperatorBase< OP >::blocksize;
-
- /** \anchor OperatorNoFRFLeWiseApply
- *
- * Standard out-of-place element-wise apply function. Calculates
- * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
- * \f$ z_i = x_i \odot y_i \f$.
- *
- * This is the non-public variant that operates on raw arrays.
- *
- * @tparam InputType1 The type of elements in \a x.
- * @tparam InputType2 The type of elements in \a y.
- * @tparam OutputType The type of elements in \a z.
- *
- * If \a InputType1 and \a D1 are not the same, then the existing data in
- * \a x will be cast to \a D1 prior to application of this operator.
- * If \a InputType2 and \a D2 are not the same, then the existing data in
- * \a y will be cast to \a D2 prior to application of this operator.
- * If \a OutputType and \a D3 are not the same, then the results of
- * applications of this operator are cast to \a OutputType prior to
- * writing them back to \a z.
- *
- * \note The GraphBLAS can explicitly control all \em three of this
- * casting behaviours via grb::descriptors::no_casting.
- *
- * \warning With the in-place variants of this code, unwanted behaviour
- * cannot be prevented by use of grb::descriptors::no_casting.
- * Therefore the current implementation only calls the in-place
- * variants when \a D1 equals \a D3 (for foldl-based in-place),
- * or when \a D2 equals \a D3 (for foldr-based ones).
- *
- * @param[in] x The left-hand side input data. The memory range starting
- * at \a x and ending at \a x + n (exclusive) may not
- * overlap with the memory area starting at \a z and ending
- * at \a z + n (exclusive).
- * @param[in] y The right-hand side input data. The memory range starting
- * at \a y and ending at \a y + n (exclusive) may not
- * overlap with the memory area starting at \a z and ending
- * at \a z + n.
- * @param[out] z Where the map of \a x into \a y must be stored. This
- * pointer is restricted in the sense that its memory may
- * never overlap with those pointed to by \a x or \y, as
- * detailed above.
- * @param[in] n How many data elements \a x, \a y, and \a z contain.
- */
- template< typename InputType1, typename InputType2, typename OutputType >
- static void eWiseApply( const InputType1 * x, const InputType2 * y, OutputType * __restrict__ z, const size_t n ) {
+ typename std::enable_if< (
+ !(OP::has_foldl) ||
+ !(std::is_same< typename OP::left_type, typename OP::result_type >::value)
+ ) && (
+ !(OP::has_foldr) ||
+ !(std::is_same< typename OP::right_type, typename OP::result_type >::value)
+ )
+ >::type
+ > : public OperatorNoFR< OP > {
+
+ public:
+
+ typedef typename OperatorBase< OP >::D1 D1;
+ typedef typename OperatorBase< OP >::D2 D2;
+ typedef typename OperatorBase< OP >::D3 D3;
+ static constexpr size_t blocksize = OperatorBase< OP >::blocksize;
+
+ /** \anchor OperatorNoFRFLeWiseApply
+ *
+ * Standard out-of-place element-wise apply function. Calculates
+ * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
+ * \f$ z_i = x_i \odot y_i \f$.
+ *
+ * This is the non-public variant that operates on raw arrays.
+ *
+ * @tparam InputType1 The type of elements in \a x.
+ * @tparam InputType2 The type of elements in \a y.
+ * @tparam OutputType The type of elements in \a z.
+ *
+ * If \a InputType1 and \a D1 are not the same, then the existing data in
+ * \a x will be cast to \a D1 prior to application of this operator.
+ * If \a InputType2 and \a D2 are not the same, then the existing data in
+ * \a y will be cast to \a D2 prior to application of this operator.
+ * If \a OutputType and \a D3 are not the same, then the results of
+ * applications of this operator are cast to \a OutputType prior to
+ * writing them back to \a z.
+ *
+ * \note The GraphBLAS can explicitly control all \em three of this
+ * casting behaviours via grb::descriptors::no_casting.
+ *
+ * \warning With the in-place variants of this code, unwanted behaviour
+ * cannot be prevented by use of grb::descriptors::no_casting.
+ * Therefore the current implementation only calls the in-place
+ * variants when \a D1 equals \a D3 (for foldl-based in-place),
+ * or when \a D2 equals \a D3 (for foldr-based ones).
+ *
+ * @param[in] x The left-hand side input data. The memory range starting
+ * at \a x and ending at \a x + n (exclusive) may not
+ * overlap with the memory area starting at \a z and ending
+ * at \a z + n (exclusive).
+ * @param[in] y The right-hand side input data. The memory range starting
+ * at \a y and ending at \a y + n (exclusive) may not
+ * overlap with the memory area starting at \a z and ending
+	 *              at \a z + n (exclusive).
+ * @param[out] z Where the map of \a x into \a y must be stored. This
+ * pointer is restricted in the sense that its memory may
+	 *               never overlap with those pointed to by \a x or \a y, as
+ * detailed above.
+ * @param[in] n How many data elements \a x, \a y, and \a z contain.
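+	 *
+	 * \par Example
+	 * An illustrative sketch only; \a MyOp is a hypothetical operator type
+	 * that matches this specialisation, i.e., one for which neither
+	 * in-place variant applies:
+	 * \code
+	 * const int x[ 2 ] = { 1, 2 };
+	 * const int y[ 2 ] = { 3, 4 };
+	 * double z[ 2 ];
+	 * OperatorNoFRFL< MyOp >::eWiseApply( x, y, z, 2 );
+	 * \endcode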
+ */
+ template< typename InputType1, typename InputType2, typename OutputType >
+ static void eWiseApply(
+ const InputType1 * x,
+ const InputType2 * y,
+ OutputType * __restrict__ z,
+ const size_t n
+ ) {
#ifdef _DEBUG
-#ifdef D_GRB_NO_STDIO
- std::cout << "In OperatorNoFRFL::eWiseApply\n";
-#endif
+ #ifndef D_GRB_NO_STDIO
+ std::cout << "In OperatorNoFRFL::eWiseApply\n";
+ #endif
#endif
- // NOTE: this variant is only active when the computation can NOT be done using two buffers only
-
- // local buffers
- D1 left_buffer[ blocksize ];
- D2 right_buffer[ blocksize ];
- D3 result_buffer[ blocksize ];
-
- // blockwise application
- size_t i = 0;
- while( i + blocksize <= n ) {
-
- // load into buffers
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- left_buffer[ b ] = static_cast< D1 >( x[ i ] );
- right_buffer[ b ] = static_cast< D2 >( y[ i ] );
- }
-
- // rewind source and output
- i -= blocksize;
-
- // operate within buffer
- for( size_t b = 0; b < blocksize; ++b ) {
- OP::apply( &( left_buffer[ b ] ), &( right_buffer[ b ] ), &( result_buffer[ b ] ) );
+				// NOTE: this variant is only active when the computation can NOT
+				// be done using two buffers only
+
+ // local buffers
+ D1 left_buffer[ blocksize ];
+ D2 right_buffer[ blocksize ];
+ D3 result_buffer[ blocksize ];
+
+ // blockwise application
+ size_t i = 0;
+ while( i + blocksize <= n ) {
+
+ // load into buffers
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ left_buffer[ b ] = static_cast< D1 >( x[ i ] );
+ right_buffer[ b ] = static_cast< D2 >( y[ i ] );
+ }
+
+ // rewind source and output
+ i -= blocksize;
+
+ // operate within buffer
+ for( size_t b = 0; b < blocksize; ++b ) {
+ OP::apply( &( left_buffer[ b ] ), &( right_buffer[ b ] ),
+ &( result_buffer[ b ] ) );
+ }
+
+ // write back result
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ z[ i ] = static_cast< OutputType >( result_buffer[ b ] );
+ }
}
- // write back result
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- z[ i ] = static_cast< OutputType >( result_buffer[ b ] );
+ // direct application for remainder
+ for( ; i < n; ++i ) {
+ left_buffer[ 0 ] = static_cast< D1 >( x[ i ] );
+ right_buffer[ 0 ] = static_cast< D2 >( y[ i ] );
+ OP::apply( left_buffer, right_buffer, result_buffer );
+ z[ i ] = static_cast< OutputType >( result_buffer[ 0 ] );
}
}
- // direct application for remainder
- for( ; i < n; ++i ) {
- left_buffer[ 0 ] = static_cast< D1 >( x[ i ] );
- right_buffer[ 0 ] = static_cast< D2 >( y[ i ] );
- OP::apply( left_buffer, right_buffer, result_buffer );
- z[ i ] = static_cast< OutputType >( result_buffer[ 0 ] );
- }
- }
};
/**
@@ -3085,108 +4049,117 @@ namespace grb {
* \snippet ops.hpp Operator Type Traits
* \endparblock
*/
- template< typename OP, enum Backend implementation = config::default_backend >
+ template<
+ typename OP,
+ enum Backend implementation = config::default_backend
+ >
class Operator : public OperatorNoFRFL< OP > {
- private:
- public:
- /** The maximum block size when vectorising this operation. */
- static constexpr size_t blocksize = OperatorBase< OP >::blocksize;
-
- /** The left-hand side input domain of this operator. */
- typedef typename OperatorBase< OP >::D1 D1;
-
- /** The right-hand side input domain of this operator. */
- typedef typename OperatorBase< OP >::D2 D2;
-
- /** The output domain of this operator. */
- typedef typename OperatorBase< OP >::D3 D3;
-
- /**
- * Reduces a vector of type \a InputType into a value in \a IOType
- * by repeated application of this operator. The \a IOType is cast
- * into \a D3 prior reduction. The \a InputType is cast into \a D1
- * during reduction. The final result is cast to IOType after
- * reduction. The reduction happens `right-to-left'.
- *
- * This implementation relies on the \a foldr, whether it be an
- * true in-place or emulated version.
- *
- * @param[in,out] out On input, the initial value to be used for
- * reduction. On output, all elements of \a x
- * have been applied to \a out.
- * @param[in] x A vector of size \a n with elements of type \a left_type.
- * @param[in] n A positive integer (can be 0).
- */
- template< typename IOType, typename InputType >
- static void foldrArray( const InputType * __restrict__ const x, IOType & out, const size_t n ) {
- // prepare scalar buffer
- D3 reduced = static_cast< D3 >( out );
- // prepare vectorisation buffer
- D1 left_buffer[ blocksize ];
- // blockwise application
- size_t i = n - 1;
- while( i - blocksize + 1 < n ) {
- // load into buffer
- for( size_t b = 0; b < blocksize; --i, ++b ) {
- left_buffer[ b ] = static_cast< D1 >( x[ i ] );
- }
- // do reduce
- for( size_t b = 0; b < blocksize; ++b ) {
- OP::foldr( &( left_buffer[ b ] ), &reduced );
- }
- }
- // direct application for remainder
- for( ; i < n; --i ) {
- left_buffer[ 0 ] = static_cast< D1 >( x[ i ] );
- OP::foldr( left_buffer, &reduced );
- }
- // write out
- out = static_cast< IOType >( reduced );
- }
-
- /**
- * Reduces a vector of type \a InputType into a value in \a IOType
- * by repeated application of this operator. The \a IOType is cast
- * into \a D3 prior reduction. The \a InputType is cast into \a D2
- * during reduction. The final result is cast to IOType after
- * reduction. The reduction happens `left-to-right'.
- *
- * This implementation relies on the \a foldr, whether it be an
- * true in-place or emulated version.
- *
- * @param[in,out] out On input, the initial value to be used for
- * reduction. On output, all elements of \a x
- * have been applied to \a out.
- * @param[in] x A vector of size \a n with elements of type \a left_type.
- * @param[in] n A positive integer (can be 0).
- */
- template< typename IOType, typename InputType >
- static void foldlArray( IOType & out, const InputType * __restrict__ const x, const size_t n ) {
- // prepare scalar buffer
- D3 reduced = static_cast< D3 >( out );
- // prepare vectorisation buffer
- D2 right_buffer[ blocksize ];
- // blockwise application
- size_t i = 0;
- while( i + blocksize <= n ) {
- // load into buffer
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- right_buffer[ b ] = static_cast< D2 >( x[ i ] );
- }
- // do reduce
- for( size_t b = 0; b < blocksize; ++b ) {
- OP::foldl( &reduced, &( right_buffer[ b ] ) );
- }
- }
- // direct application for remainder
- for( ; i < n; ++i ) {
- right_buffer[ 0 ] = static_cast< D2 >( x[ i ] );
- OP::foldl( &reduced, right_buffer );
- }
- // write out
- out = static_cast< IOType >( reduced );
- }
+ public:
+
+ /** The maximum block size when vectorising this operation. */
+ static constexpr size_t blocksize = OperatorBase< OP >::blocksize;
+
+ /** The left-hand side input domain of this operator. */
+ typedef typename OperatorBase< OP >::D1 D1;
+
+ /** The right-hand side input domain of this operator. */
+ typedef typename OperatorBase< OP >::D2 D2;
+
+ /** The output domain of this operator. */
+ typedef typename OperatorBase< OP >::D3 D3;
+
+ /**
+ * Reduces a vector of type \a InputType into a value in \a IOType
+ * by repeated application of this operator. The \a IOType is cast
+		 * into \a D3 prior to reduction. The \a InputType is cast into \a D1
+		 * during reduction. The final result is cast to \a IOType after
+ * reduction. The reduction happens `right-to-left'.
+ *
+		 * This implementation relies on \a foldr, whether it be a true
+		 * in-place or an emulated version.
+ *
+ * @param[in,out] out On input, the initial value to be used for
+ * reduction. On output, all elements of \a x
+ * have been applied to \a out.
+		 * @param[in] x A vector of size \a n with elements of type \a InputType.
+		 * @param[in] n A nonnegative integer (may be 0).
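+		 *
+		 * \par Example
+		 * An illustrative sketch only; \a MyAdd is a hypothetical operator
+		 * type that adds two doubles:
+		 * \code
+		 * const double x[ 4 ] = { 1.0, 2.0, 3.0, 4.0 };
+		 * double out = 0.0;
+		 * Operator< MyAdd >::foldrArray( x, out, 4 );
+		 * // out now reads 10.0
+		 * \endcode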
+ */
+ template< typename IOType, typename InputType >
+ static void foldrArray(
+ const InputType * __restrict__ const x,
+ IOType &out,
+ const size_t n
+ ) {
+ // prepare scalar buffer
+ D3 reduced = static_cast< D3 >( out );
+ // prepare vectorisation buffer
+ D1 left_buffer[ blocksize ];
+ // blockwise application
+ size_t i = n - 1;
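+				// note: i and both loop guards below rely on unsigned wrap-around.
+				// If n is zero, or once fewer than blocksize elements remain,
+				// i - blocksize + 1 wraps to a value >= n and the blockwise loop
+				// exits; likewise, the remainder loop ends once i wraps past zero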
+ while( i - blocksize + 1 < n ) {
+ // load into buffer
+ for( size_t b = 0; b < blocksize; --i, ++b ) {
+ left_buffer[ b ] = static_cast< D1 >( x[ i ] );
+ }
+ // do reduce
+ for( size_t b = 0; b < blocksize; ++b ) {
+ OP::foldr( &( left_buffer[ b ] ), &reduced );
+ }
+ }
+ // direct application for remainder
+ for( ; i < n; --i ) {
+ left_buffer[ 0 ] = static_cast< D1 >( x[ i ] );
+ OP::foldr( left_buffer, &reduced );
+ }
+ // write out
+ out = static_cast< IOType >( reduced );
+ }
+
+ /**
+ * Reduces a vector of type \a InputType into a value in \a IOType
+ * by repeated application of this operator. The \a IOType is cast
+		 * into \a D3 prior to reduction. The \a InputType is cast into \a D2
+		 * during reduction. The final result is cast to \a IOType after
+ * reduction. The reduction happens `left-to-right'.
+ *
+		 * This implementation relies on \a foldl, whether it be a true
+		 * in-place or an emulated version.
+ *
+ * @param[in,out] out On input, the initial value to be used for
+ * reduction. On output, all elements of \a x
+ * have been applied to \a out.
+		 * @param[in] x A vector of size \a n with elements of type \a InputType.
+		 * @param[in] n A nonnegative integer (may be 0).
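+		 *
+		 * \par Example
+		 * An illustrative sketch only; \a MyAdd is a hypothetical operator
+		 * type that adds two doubles:
+		 * \code
+		 * const double x[ 4 ] = { 1.0, 2.0, 3.0, 4.0 };
+		 * double out = 0.0;
+		 * Operator< MyAdd >::foldlArray( out, x, 4 );
+		 * // out now reads 10.0
+		 * \endcode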
+ */
+ template< typename IOType, typename InputType >
+ static void foldlArray(
+ IOType &out, const InputType * __restrict__ const x, const size_t n
+ ) {
+ // prepare scalar buffer
+ D3 reduced = static_cast< D3 >( out );
+ // prepare vectorisation buffer
+ D2 right_buffer[ blocksize ];
+ // blockwise application
+ size_t i = 0;
+ while( i + blocksize <= n ) {
+ // load into buffer
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ right_buffer[ b ] = static_cast< D2 >( x[ i ] );
+ }
+ // do reduce
+ for( size_t b = 0; b < blocksize; ++b ) {
+ OP::foldl( &reduced, &( right_buffer[ b ] ) );
+ }
+ }
+ // direct application for remainder
+ for( ; i < n; ++i ) {
+ right_buffer[ 0 ] = static_cast< D2 >( x[ i ] );
+ OP::foldl( &reduced, right_buffer );
+ }
+ // write out
+ out = static_cast< IOType >( reduced );
+ }
};
} // namespace internal
diff --git a/include/graphblas/base/io.hpp b/include/graphblas/base/io.hpp
index 4eb1a80fd..c0ed7e1cc 100644
--- a/include/graphblas/base/io.hpp
+++ b/include/graphblas/base/io.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Specifies all I/O primitives for use with ALP/GraphBLAS containers.
+ *
* @author A. N. Yzelman
* @date 21st of February, 2017
*/
@@ -41,7 +45,8 @@
namespace grb {
/**
- * \defgroup IO Data Ingestion and Extraction.
+ * \defgroup IO Data Ingestion and Extraction
+ * \ingroup GraphBLAS
*
* Provides functions for putting user data into opaque ALP/GraphBLAS
* containers, provides functions for extracting data from such containers,
@@ -58,7 +63,7 @@ namespace grb {
* Sometimes it is desired to have direct access to ALP/GraphBLAS memory
* area, and to have that memory available even after the ALP/GraphBLAS context
* has been destroyed. This functionality is provided by the concept of
- * pinned containers such as provided by #PinnedVector.
+ * pinned containers such as provided by #grb::PinnedVector.
*
* Containers may be instantiated with default or given requested capacities.
* Implementations may reserve a higher capacity, but must allocate at least
@@ -400,6 +405,7 @@ namespace grb {
* A call to this function shall always succeed and shall never throw
* exceptions.
*
+ * \parblock
* \par Performance semantics.
* A call to this function:
* -# completes in \f$ \Theta(1) \f$ work.
@@ -546,21 +552,8 @@ namespace grb {
*
* @return grb::SUCCESS This function cannot fail.
*
- * \parblock
- * \par Performance semantics.
- * The backend must:
- * -# define cost in terms of work
- * -# define intra-process data movement costs
- * -# define inter-process data movement costs
- * -# define inter-process synchronisation requirements
- * -# define memory storage requirements and may define
- * this in terms of \a new_nz.
- * -# define whether system calls may be made and in particular whether
- * dynamic memory management may occor.
- * \endparblock
- *
- * \warning Calling clear shall not clear any dynamically allocated
- * memory associated with \a x.
+	 * \warning Calling clear need not free any dynamically allocated memory
+	 *          associated with \a x; indeed, none of the present backends free it.
*
* \note Even #grb::resize may or may not free dynamically allocated memory
* associated with \a x-- depending on the memory usage semantics defined
@@ -568,6 +561,14 @@ namespace grb {
*
* \note Only the destruction of \a x would ensure all corresponding memory is
* freed, for all backends.
+ *
+ * \parblock
+ * \par Performance semantics.
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ * \endparblock
+ *
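+	 * \par Example
+	 * A minimal usage sketch:
+	 * \code
+	 * grb::Vector< double > x( 10 );
+	 * grb::setElement( x, 3.14, 5 ); // x now holds one nonzero
+	 * grb::clear( x );               // nnz( x ) is zero again; size stays 10
+	 * \endcode
+	 *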
*/
template< typename DataType, Backend backend, typename Coords >
RC clear( Vector< DataType, backend, Coords > &x ) noexcept {
@@ -595,20 +596,7 @@ namespace grb {
* dimensions (i.e., row and column sizes) as well as the nonzero capacity
* remains unchanged.
*
- * @return grb::SUCCESS This function cannot fail.
- *
- * \parblock
- * \par Performance semantics.
- * The backend must:
- * -# define cost in terms of work
- * -# define intra-process data movement costs
- * -# define inter-process data movement costs
- * -# define inter-process synchronisation requirements
- * -# define memory storage requirements and may define
- * this in terms of \a new_nz.
- * -# define whether system calls may be made and in particular whether
- * dynamic memory management may occor.
- * \endparblock
+ * @return #grb::SUCCESS This function cannot fail.
*
* \warning Calling clear may not clear any dynamically allocated
* memory associated with \a A.
@@ -619,6 +607,13 @@ namespace grb {
*
* \note Only the destruction of \a A would ensure all corresponding memory is
* freed, for all backends.
+ *
+ * \parblock
+ * \par Performance semantics.
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ * \endparblock
*/
template<
typename InputType, Backend backend,
@@ -691,27 +686,24 @@ namespace grb {
*
* \parblock
* \par Performance semantics.
- * The backend must:
- * -# define cost in terms of work
- * -# define intra-process data movement costs
- * -# define inter-process data movement costs
- * -# define inter-process synchronisation requirements
- * -# define memory storage requirements and may define
- * this in terms of \a new_nz.
- * -# define whether system calls may be made and in particular whether
- * dynamic memory management may occor.
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
* \endparblock
*
- * \warning For most implementations, this function will indeed imply system
- * calls, as well as \f$ \Theta( \mathit{new\_nz} ) \f$ work and data
- * movement costs. It is thus to be considered an expensive function,
- * and should be used sparingly and only when absolutely necessary.
+ * \warning For most implementations, this function will imply system calls, as
+ * well as \f$ \Theta( \mathit{new\_nz} ) \f$ work and data movement
+ * costs. It is thus to be considered an expensive function, and
+ * should be used sparingly and only when absolutely necessary.
*/
template<
typename InputType,
Backend backend, typename Coords
>
- RC resize( Vector< InputType, backend, Coords > &x, const size_t new_nz ) noexcept {
+ RC resize(
+ Vector< InputType, backend, Coords > &x,
+ const size_t new_nz
+ ) noexcept {
#ifndef NDEBUG
const bool should_not_call_base_vector_resize = false;
#endif
@@ -777,12 +769,9 @@ namespace grb {
*
* \parblock
* \par Performance semantics.
- * -# the backend must define cost in terms of work
- * -# the backend must define intra-process data movement costs
- * -# the backend must define inter-process data movement costs
- * -# the backend must define memory storage requirements and may define
- * this in terms of \a new_nz.
- * -# the backend must define whether system calls may be made.
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
* \endparblock
*
* \warning For useful backends, this function will indeed imply system calls
@@ -827,9 +816,9 @@ namespace grb {
* to the size of \a x.
* @param[in] val The value to set each element of \a x to.
* @param[in] phase Which #grb::Phase the operation is requested. Optional;
- * the default is #grb::Phase::EXECUTE.
+ * the default is #grb::EXECUTE.
*
- * In #grb::Phase::RESIZE mode:
+ * In #grb::RESIZE mode:
*
* @returns #grb::OUTOFMEM When \a x could not be resized to hold the
* requested output, and the current capacity was
@@ -837,17 +826,17 @@ namespace grb {
* @returns #grb::SUCCESS When the capacity of \a x was resized to guarantee
* the output of this operation can be contained.
*
- * In #grb::Phase::EXECUTE mode:
+ * In #grb::EXECUTE mode:
*
* @returns #grb::FAILED When \a x did not have sufficient capacity. The
* vector \a x on exit shall be cleared.
* @returns #grb::SUCCESS When the call completes successfully.
*
- * In #grb::Phase::TRY mode (experimental and may not be supported):
+ * In #grb::TRY mode (experimental and may not be supported):
*
* @returns #grb::FAILED When \a x did not have sufficient capacity. The
* vector \a x on exit will have contents defined as
- * described for #grb::Phase::TRY.
+ * described for #grb::TRY.
* @returns #grb::SUCCESS When the call completes successfully.
*
* When \a descr includes grb::descriptors::no_casting and if \a T does not
@@ -855,12 +844,9 @@ namespace grb {
*
* \parblock
* \par Performance semantics
- * A backend must define, for each phase:
- * -# cost in terms of work
- * -# intra-process data movement costs
- * -# inter-process data movement costs
- * -# memory storage requirements and may define this in terms of \a new_nz.
- * -# whether system calls may be made.
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
* \endparblock
*/
template<
@@ -909,14 +895,14 @@ namespace grb {
* evaluated depends on the given \a desc.
* @param[in] val The value to set elements of \a x to.
* @param[in] phase Which #grb::Phase the operation is requested. Optional;
- * the default is #grb::Phase::EXECUTE.
+ * the default is #grb::EXECUTE.
*
* \warning An empty \a mask, meaning #grb::size( \a mask ) is zero, shall
* be interpreted as though no mask argument was given. In particular,
* any descriptors pertaining to the interpretation of \a mask shall
* be ignored.
*
- * In #grb::Phase::RESIZE mode:
+ * In #grb::RESIZE mode:
*
* @returns #grb::OUTOFMEM When \a x could not be resized to hold the
* requested output, and the current capacity was
@@ -924,17 +910,17 @@ namespace grb {
* @returns #grb::SUCCESS When the capacity of \a x was resized to guarantee
* the output of this operation can be contained.
*
- * In #grb::Phase::EXECUTE mode:
+ * In #grb::EXECUTE mode:
*
* @returns #grb::FAILED When \a x did not have sufficient capacity. The
* vector \a x on exit shall be cleared.
* @returns #grb::SUCCESS When the call completes successfully.
*
- * In #grb::Phase::TRY mode (experimental and may not be supported):
+ * In #grb::TRY mode (experimental and may not be supported):
*
* @returns #grb::FAILED When \a x did not have sufficient capacity. The
* vector \a x on exit will have contents defined as
- * described for #grb::Phase::TRY.
+ * described for #grb::TRY.
* @returns #grb::SUCCESS When the call completes successfully.
*
* When \a descr includes grb::descriptors::no_casting and if \a T does not
@@ -942,13 +928,9 @@ namespace grb {
*
* \parblock
* \par Performance semantics
- * A backend must define, for each phase:
- * -# cost in terms of work;
- * -# intra-process data movement costs;
- * -# inter-process data movement costs;
- * -# inter-process synchronisation costs;
- * -# memory storage requirements and may define this in terms of \a new_nz;
- * -# whether system calls may be made.
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
* \endparblock
*/
template<
@@ -956,7 +938,8 @@ namespace grb {
typename DataType, typename MaskType, typename T,
Backend backend, typename Coords
>
- RC set( Vector< DataType, reference, Coords > &x,
+ RC set(
+		Vector< DataType, backend, Coords > &x,
const Vector< MaskType, backend, Coords > &mask,
const T val,
const Phase &phase = EXECUTE,
@@ -979,14 +962,6 @@ namespace grb {
* Sets the content of a given vector \a x to be equal to that of another given
* vector \a y.
*
- * Unmasked variant.
- *
- * \parblock
- * \par Accepted descriptors
- * -# grb::descriptors::no_operation
- * -# grb::descriptors::no_casting
- * \endparblock
- *
* @tparam descr The descriptor of the operation.
* @tparam OutputType The type of each element in the output vector.
* @tparam InputType The type of each element in the input vector.
@@ -996,34 +971,32 @@ namespace grb {
*
* The vector \a x may not be the same as \a y.
*
+ * @param[in] phase Which #grb::Phase the operation is requested. Optional;
+ * the default is #grb::EXECUTE.
+ *
+ * \parblock
+ * \par Accepted descriptors
+ * -# grb::descriptors::no_operation
+ * -# grb::descriptors::no_casting
+ *
* When \a descr includes grb::descriptors::no_casting and if \a InputType
* does not match \a OutputType, the code shall not compile.
+ * \endparblock
*
* \parblock
* \par Performance semantics
- * A call to this function
- * -# consists of \f$ \Theta(n) \f$ work;
- * -# moves \f$ \Theta(n) \f$ bytes of memory;
- * -# does not allocate nor free any dynamic memory;
- * -# shall not make any system calls.
- * \endparblock
- *
- * @see grb::foldl.
- * @see grb::foldr.
- * @see grb::operators::left_assign.
- * @see grb::operators::right_assign.
- * @see grb::setElement.
+ * Each backend must define performance semantics for this primitive.
*
- * \todo Revise specification regarding recent changes on phases, performance
- * semantics, and capacities.
+ * @see perfSemantics
+ * \endparblock
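+	 *
+	 * \par Example
+	 * A sketch that copies one vector into another:
+	 * \code
+	 * grb::Vector< double > x( 10 ), y( 10 );
+	 * grb::setElement( y, 2.5, 7 );
+	 * grb::set( x, y ); // x now holds the same single nonzero as y
+	 * \endcode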
*/
template<
Descriptor descr = descriptors::no_operation,
typename OutputType, typename InputType,
Backend backend, typename Coords
>
- RC set( Vector<
- OutputType, backend, Coords > &x,
+ RC set(
+ Vector< OutputType, backend, Coords > &x,
const Vector< InputType, backend, Coords > &y,
const Phase &phase = EXECUTE
) {
@@ -1041,50 +1014,50 @@ namespace grb {
* Sets the content of a given vector \a x to be equal to that of
* another given vector \a y.
*
- * Masked variant.
+	 * If for an entry with index \a i the corresponding \a mask entry
+	 * evaluates false, then that entry shall not be copied into \a x.
*
* The vector \a x may not equal \a y.
*
- * @tparam descr The descriptor of the operation.
+ * @tparam descr The descriptor of the operation. Optional; default
+ * value is #grb::descriptors::no_operation.
* @tparam OutputType The type of each element in the output vector.
* @tparam MaskType The type of each element in the mask vector.
* @tparam InputType The type of each element in the input vector.
*
- * \parblock
- * \par Accepted descriptors
- * -# grb::descriptors::no_operation
- * -# grb::descriptors::no_casting
- * -# grb::descriptors::invert_mask
- * -# grb::descriptors::structural_mask
- * \endparblock
- *
* @param[in,out] x The vector to be set.
* @param[in] mask The output mask.
- * @param[in] y The source vector.
+	 * @param[in]     y    The source vector. May not equal \a x.
+ * @param[in] phase Which #grb::Phase the operation is requested. Optional;
+ * the default is #grb::EXECUTE.
*
- * When \a descr includes grb::descriptors::no_casting and if \a InputType
+ * \parblock
+ * \par Accepted descriptors
+ * - #grb::descriptors::no_operation,
+ * - #grb::descriptors::no_casting,
+ * - #grb::descriptors::dense,
+ * - #grb::descriptors::invert_mask,
+ * - #grb::descriptors::structural, and
+ * - #grb::descriptors::structural_complement.
+ *
+ * When \a descr includes #grb::descriptors::no_casting and if \a InputType
* does not match \a OutputType, the code shall not compile.
+ * \endparblock
*
* \parblock
* \par Performance semantics
- * A call to this function
- * -# consists of \f$ \Theta( \min\{ nnz( mask ), nnz( y ) \} ) \f$ work;
- * -# moves \f$ \Theta( \min\{ nnz( mask ), nnz( y ) \} ) \f$ bytes of memory;
- * -# does not allocate nor free any dynamic memory;
- * -# shall not make any system calls.
- * If grb::descriptors::invert_mask is given, then \f$ nnz( mask ) \f$ in the
- * above shall be considered equal to \f$ nnz( y ) \f$.
- * \endparblock
+ * Each backend must define performance semantics for this primitive.
*
- * \todo Revise specification regarding recent changes on phases, performance
- * semantics, and capacities.
+ * @see perfSemantics
+ * \endparblock
*/
template<
Descriptor descr = descriptors::no_operation,
typename OutputType, typename MaskType, typename InputType,
Backend backend, typename Coords
>
- RC set( Vector< OutputType, backend, Coords > &x,
+ RC set(
+ Vector< OutputType, backend, Coords > &x,
const Vector< MaskType, backend, Coords > &mask,
const Vector< InputType, backend, Coords > &y,
const Phase &phase = EXECUTE,
@@ -1115,45 +1088,45 @@ namespace grb {
* The parameter \a i may not be greater or equal than the size of \a x.
*
* @tparam descr The descriptor to be used during evaluation of this
- * function.
+ * function. Optional; the default descriptor is
+ * #grb::descriptors::no_operation.
* @tparam DataType The type of the elements of \a x.
* @tparam T The type of the value to be set.
*
* @param[in,out] x The vector to be modified.
* @param[in] val The value \f$ x_i \f$ should read after function exit.
* @param[in] i The index of the element of \a x to set.
+ * @param[in] phase Which #grb::Phase the operation is requested. Optional;
+ * the default is #grb::EXECUTE.
*
- * @return grb::SUCCESS Upon successful execution of this operation.
- * @return grb::MISMATCH If \a i is greater or equal than the dimension of
- * \a x.
+ * @return #grb::SUCCESS Upon successful execution of this operation.
+ * @return #grb::MISMATCH If \a i is greater or equal than the dimension of
+ * \a x.
*
* \parblock
* \par Accepted descriptors
- * -# grb::descriptors::no_operation
- * -# grb::descriptors::no_casting
+ * - #grb::descriptors::no_operation,
+ * - #grb::descriptors::no_casting,
+ * - #grb::descriptors::dense.
* \endparblock
*
- * When \a descr includes grb::descriptors::no_casting and if \a T does not
+ * When \a descr includes #grb::descriptors::no_casting and if \a T does not
* match \a DataType, the code shall not compile.
*
* \parblock
* \par Performance semantics
- * A call to this function
- * -# consists of \f$ \Theta(1) \f$ work;
- * -# moves \f$ \Theta(1) \f$ bytes of memory;
- * -# does not allocate nor free any dynamic memory;
- * -# shall not make any system calls.
- * \endparblock
+ * Each backend must define performance semantics for this primitive.
*
- * \todo Revise specification regarding recent changes on phases, performance
- * semantics, and capacities.
+ * @see perfSemantics
+ * \endparblock
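+	 *
+	 * \par Example
+	 * A minimal usage sketch:
+	 * \code
+	 * grb::Vector< double > x( 10 );
+	 * grb::RC rc = grb::setElement( x, 3.14, 7 );
+	 * // on success, x now reads 3.14 at index 7; other entries are untouched
+	 * \endcode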
*/
template<
Descriptor descr = descriptors::no_operation,
typename DataType, typename T,
Backend backend, typename Coords
>
- RC setElement( Vector< DataType, backend, Coords > &x,
+ RC setElement(
+ Vector< DataType, backend, Coords > &x,
const T val,
const size_t i,
const Phase &phase = EXECUTE,
@@ -1234,56 +1207,36 @@ namespace grb {
* of elements. Any pre-existing nonzeroes that do not overlap with any nonzero
* between \a ind_start and \a ind_end will remain unchanged.
*
+ * @return #grb::SUCCESS When ingestion has completed successfully.
+	 * @return #grb::ILLEGAL When a nonzero has an index larger than
+	 *                       #grb::size( \a x ).
+ * @return #grb::PANIC If an unmitigable error has occured during ingestion.
+ *
* \parblock
- * \par Performance semantics:
- * A call to this function
- * -# comprises \f$ \mathcal{O}( n ) \f$ work where \a n is the number of
- * elements pointed to by the given iterator pairs. This work may be
- * distributed over multiple user processes.
- * -# results in at most \f$ n \mathit{sizeof}( T ) +
- * n \mathit{sizeof}( U ) +
- * n \mathit{sizeof}( \mathit{InputType} ) +
- * 2 n \mathit{sizeof}( \mathit{bool} ) \f$
- * bytes of data movement, where \a T and \a U are the underlying data
- * types of the input iterators. These costs may be distributed over
- * multiple user processes.
- * -# inter-process communication costs are \f$ \mathcal{O}(n) g + l \f$.
- * -# if the capacity of this vector is not large enough to hold \a n
- * elements, a call to this function may allocate
- * \f$ \mathcal{O}( n ) \f$
- * new bytes of memory which \em may be distributed over multiple user
- * processes.
- * -# if the capacity of this vector is not large enough to hold \a n
- * elements, a call to this function may result in system calls at any of
- * the user processes.
- * -# If the IOMode is sequential, then the work and data movement costs are
- * incurred per user process and will not be distributed. In this
- * case the inter-process communication costs will, however, be zero.
- * -# if the IOMode is parallel, then a good implementation under a uniformly
- * randomly distributed input incurs an inter-process communication cost
- * of expected value \f$ n/p g + l \f$. The best-case inter-process cost
- * is \f$ (p-1)g + l \f$.
- * \endparblock
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
*
- * @returns grb::SUCCESS When ingestion has completed successfully.
- * @returns grb::ILLEGAL When a nonzero has an index larger than grb::size(x).
- * @returns grb::PANIC If an unmitigable error has occured during ingestion.
+ * @see perfSemantics
+ * \endparblock
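+	 *
+	 * \par Example
+	 * A sketch that ingests three nonzeroes with unique indices:
+	 * \code
+	 * std::vector< size_t > indices{ 0, 3, 7 };
+	 * std::vector< double > values{ 1.0, 2.0, 3.0 };
+	 * grb::Vector< double > x( 10 );
+	 * grb::RC rc = grb::buildVectorUnique( x,
+	 *     indices.begin(), indices.end(),
+	 *     values.begin(), values.end(),
+	 *     grb::SEQUENTIAL );
+	 * \endcode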
*/
- template< Descriptor descr = descriptors::no_operation,
+ template<
+ Descriptor descr = descriptors::no_operation,
typename InputType,
class Merger = operators::right_assign< InputType >,
typename fwd_iterator1, typename fwd_iterator2,
Backend backend, typename Coords
>
- RC buildVectorUnique( Vector< InputType, backend, Coords > &x,
+ RC buildVectorUnique(
+ Vector< InputType, backend, Coords > &x,
fwd_iterator1 ind_start, const fwd_iterator1 ind_end,
fwd_iterator2 val_start, const fwd_iterator2 val_end,
const IOMode mode
) {
- return buildVector< descr | descriptors::no_duplicates >( x,
+ return buildVector< descr | descriptors::no_duplicates >(
+ x,
ind_start, ind_end,
val_start, val_end,
- mode );
+ mode
+ );
}
/**
@@ -1291,8 +1244,15 @@ namespace grb {
*
* Invalidates any prior existing content. Disallows different nonzeroes
* to have the same row and column coordinates; input must consist out of
- * unique triples. See #buildMatrix for an alternate function that does
- * not have these restrictions-- at the cost of lower performance.
+ * unique triples.
+ *
+ * \internal
+ * See #buildMatrix for an alternate function that does not have these
+ * restrictions-- at the cost of lower performance.
+ *
+ * \todo Re-enable the above for public documentation once the non-unique
+ * buildMatrix variant has been implemented.
+ * \endinternal
*
* \warning Calling this function with duplicate input coordinates will
* lead to undefined behaviour.
@@ -1306,58 +1266,49 @@ namespace grb {
* @tparam fwd_iterator3 The type of the nonzero value iterator.
* @tparam length_type The type of the number of elements in each iterator.
*
- * The iterators will only be used to read from, never to assign to.
+ * @param[out] A Where to store the given nonzeroes.
*
* @param[in] I A forward iterator to \a cap row indices.
* @param[in] J A forward iterator to \a cap column indices.
* @param[in] V A forward iterator to \a cap nonzero values.
- * @param[in] nz The number of items pointed to by \a I, \a J, \em and \a V.
- *
- * @return grb::MISMATCH -# when an element from \a I dereferences to a value
- * larger than the row dimension of this matrix, or
- * -# when an element from \a J dereferences to a value
- * larger than the column dimension of this matrix.
- * When this error code is returned the state of this
- * container will be as though this function was never
- * called; however, the given forward iterators may
- * have been copied and the copied iterators may have
- * incurred multiple increments and dereferences.
- * @return grb::OVERFLW When the internal data type used for storing the
- * number of nonzeroes is not large enough to store
- * the number of nonzeroes the user wants to assign.
- * When this error code is returned the state of this
- * container will be as though this function was never
- * called; however, the given forward iterators may
- * have been copied and the copied iterators may have
- * incurred multiple increments and dereferences.
- * @return grb::SUCCESS When the function completes successfully.
*
- * \parblock
- * \par Performance semantics.
- * -# This function contains
- * \f$ \Theta(\mathit{nz})+\mathcal{O}(m+n)) \f$ amount of work.
- * -# This function may dynamically allocate
- * \f$ \Theta(\mathit{nz})+\mathcal{O}(m+n)) \f$ bytes of memory.
- * -# A call to this function will use \f$ \mathcal{O}(m+n) \f$ bytes
- * of memory beyond the memory in use at the function call entry.
- * -# This function will copy each input forward iterator at most
- * \em once; the three input iterators \a I, \a J, and \a V thus
- * may have exactly one copyeach, meaning that all input may be
- * traversed only once.
- * -# Each of the at most three iterator copies will be incremented
- * at most \f$ \mathit{nz} \f$ times.
- * -# Each position of the each of the at most three iterator copies
- * will be dereferenced exactly once.
- * -# This function moves
- * \f$ \Theta(\mathit{nz})+\mathcal{O}(m+n)) \f$ bytes of data.
- * -# This function will likely make system calls.
- * \endparblock
+ * @param[in] I_end A forward iterator in end position relative to \a I.
+ * @param[in] J_end A forward iterator in end position relative to \a J.
+ * @param[in] V_end A forward iterator in end position relative to \a V.
+ *
+ * The iterators will only be used to read from, never to assign to.
+ *
+ * @param[in] mode Whether the input should happen in #grb::SEQUENTIAL or in
+ * the #grb::PARALLEL mode.
+ *
+ * In the below, let \a nz denote the number of items pointed to by the
+ * iterator pair \a I, \a I_end. This number should match the number of
+ * elements in \a J, \a J_end \em and \a V, \a V_end.
+ *
+ * @return #grb::SUCCESS When the function completes successfully.
+ * @return #grb::MISMATCH When an element from \a I dereferences to a value
+ * larger than the row dimension of this matrix, or
+ * when an element from \a J dereferences to a value
+ * larger than the column dimension of this matrix.
+ * When this error code is returned the state of this
+ * container will be as though this function was never
+ * called; however, the given forward iterators may
+ * have been copied and the copied iterators may have
+ * incurred multiple increments and dereferences.
+ * @return #grb::OVERFLW When the internal data type used for storing the
+ * number of nonzeroes is not large enough to store
+ * the number of nonzeroes the user wants to assign.
+ * When this error code is returned the state of this
+ * container will be as though this function was never
+ * called; however, the given forward iterators may
+ * have been copied and the copied iterators may have
+ * incurred multiple increments and dereferences.
*
* \warning This is an expensive function. Use sparingly and only when
* absolutely necessary.
*
* \note Streaming input can be implemented by supplying buffered
- * iterators to this GraphBLAS implementation.
+ * iterators to ALP.
*
* \note The functionality herein described is exactly that of buildMatrix,
* though with stricter input requirements. These requirements allow
@@ -1366,6 +1317,13 @@ namespace grb {
* \note No masked version of this variant is provided. The use of masks in
* matrix construction is costly and the user is referred to the
* costly buildMatrix() function instead.
+ *
+ * \parblock
+ * \par Performance semantics.
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ * \endparblock
*/
template<
Descriptor descr = descriptors::no_operation,
@@ -1484,6 +1442,8 @@ namespace grb {
* \a end.
* @param[in] start Iterator pointing to the first nonzero to be added.
* @param[in] end Iterator pointing past the last nonzero to be added.
+ * @param[in] mode Whether the input should happen in #grb::SEQUENTIAL or in
+ * the #grb::PARALLEL mode.
*/
template<
Descriptor descr = descriptors::no_operation,
diff --git a/include/graphblas/base/matrix.hpp b/include/graphblas/base/matrix.hpp
index 2744a7434..343729d17 100644
--- a/include/graphblas/base/matrix.hpp
+++ b/include/graphblas/base/matrix.hpp
@@ -15,9 +15,13 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Specifies the ALP/GraphBLAS matrix container.
+ *
* @author A. N. Yzelman
- * @date 10th of August
+ * @date 10th of August, 2016
*/
#ifndef _H_GRB_MATRIX_BASE
@@ -190,15 +194,9 @@ namespace grb {
*
* \parblock
* \par Performance semantics.
- * Implementations must define cost semantics across the following
- * dimensions:
- * -# work;
- * -# intra-process data movement;
- * -# inter-process data movement;
- * -# inter-process synchronisations;
- * -# memory usage; and
- * -# whether system calls, in particular dynamic memory management calls,
- * could occur.
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
* \endparblock
*
* \warning Avoid the use of this constructor within performance critical
@@ -251,25 +249,21 @@ namespace grb {
*
* \parblock
* \par Performance semantics.
- * Implementations must define cost semantics across the following
- * dimensions:
- * -# work;
- * -# intra-process data movement;
- * -# inter-process data movement;
- * -# inter-process synchronisations;
- * -# memory usage; and
- * -# whether system calls, in particular dynamic memory management calls,
- * could occur.
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
* \endparblock
*
* \warning Avoid the use of this constructor within performance critical
* code sections.
*/
- Matrix( const Matrix<
- D, implementation,
- RowIndexType, ColIndexType, NonzeroIndexType > &other
+ Matrix(
+ const Matrix<
+ D, implementation,
+ RowIndexType, ColIndexType, NonzeroIndexType
+ > &other
) {
- (void)other;
+ (void) other;
}
/**
@@ -281,18 +275,13 @@ namespace grb {
*
* \parblock
* \par Performance semantics.
- * This constructor:
- * -# entails \f$ \Theta(1) \f$ amount of work;
- * -# moves \f$ \Theta(1) \f$ bytes of data within its user process;
- * -# moves \f$ 0 \f$ bytes of data between user processes;
- * -# shall \em not require synchronisations between user processes;
- * -# inherit the memory usage of \a other;
- * -# will \em not make system calls and in particular will not free
- * nor allocate dynamic memory.
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
* \endparblock
*/
Matrix( self_type &&other ) {
- (void)other;
+ (void) other;
}
/**
@@ -306,16 +295,9 @@ namespace grb {
*
* \parblock
* \par Performance semantics.
- * This constructor:
- * -# entails \f$ \Theta(1) \f$ amount of work;
- * -# moves \f$ \Theta(1) \f$ bytes of data within its user process;
- * -# moves \f$ 0 \f$ bytes of data between user processes;
- * -# shall \em not require synchronisations between user processes;
- * -# inherit the memory usage of \a other;
- * -# will \em not make system calls and in particular will not free
- * nor allocate dynamic memory.
- *
- * Additionally, the backend-specific cost of the matrix destructor apply.
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
* \endparblock
*/
self_type& operator=( self_type &&other ) noexcept {
@@ -328,15 +310,9 @@ namespace grb {
*
* \parblock
* \par Performance semantics.
- * Beckends must define cost semantics across the following
- * dimensions:
- * -# work;
- * -# intra-process data movement;
- * -# inter-process data movement;
- * -# inter-process synchronisations;
- * -# memory usage; and
- * -# whether system calls, in particular dynamic memory management calls,
- * could occur.
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
* \endparblock
*
* \warning Avoid calling destructors from within performance critical
@@ -358,15 +334,9 @@ namespace grb {
*
* \parblock
* \par Performance semantics.
- * Beckends must define cost semantics across the following
- * dimensions:
- * -# work;
- * -# intra-process data movement;
- * -# inter-process data movement;
- * -# inter-process synchronisations;
- * -# memory usage; and
- * -# whether system calls, in particular dynamic memory management calls,
- * could occur.
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
* \endparblock
*
* \note This function may make use of a const_iterator that is buffered,
@@ -394,15 +364,9 @@ namespace grb {
*
* \parblock
* \par Performance semantics.
- * Beckends must define cost semantics across the following
- * dimensions:
- * -# work;
- * -# intra-process data movement;
- * -# inter-process data movement;
- * -# inter-process synchronisations;
- * -# memory usage; and
- * -# whether system calls, in particular dynamic memory management calls,
- * could occur.
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
* \endparblock
*
* \note Even if cbegin() returns a buffered const_iterator that may require
diff --git a/include/graphblas/base/pinnedvector.hpp b/include/graphblas/base/pinnedvector.hpp
index 724e5712f..d13ffa022 100644
--- a/include/graphblas/base/pinnedvector.hpp
+++ b/include/graphblas/base/pinnedvector.hpp
@@ -18,7 +18,7 @@
/**
* @file
*
- * Contains the API for the PinnedVector class.
+ * Contains the specification for #grb::PinnedVector.
*
* @author A. N. Yzelman
*/
@@ -36,41 +36,40 @@
namespace grb {
- /** \addtogroup IO
+ /**
+ * Provides a mechanism to access ALP containers from outside of an ALP
+ * context.
*
- * Provides a mechanism to access GraphBLAS containers from outside of any
- * GraphBLAS context.
- *
- * An instance of \a PinnedVector caches a container's data and returns it
+ * An instance of #grb::PinnedVector caches a container's data and returns it
* to the user. The user can refer to the returned data until such time the
- * \a PinnedVector's instance is destroyed, regardless of whether a call to
+ * instance of #grb::PinnedVector is destroyed, regardless of whether a call to
* #grb::finalize occurs, and regardless whether the ALP/GraphBLAS program
* executed through the #grb::Launcher had already returned.
*
* The original container may not be modified or any derived instance of
* \a PinnedVector shall become invalid.
*
- * \note It would be strange if a GraphBLAS container a pinned vector is
+ * \note It would be strange if an ALP/GraphBLAS container a pinned vector is
* derived from persists-- pinned vectors are designed to be used
* precisely when the original container no longer is in scope.
* Therefore this last remark on invalidation should not matter.
*
- * The PinnedVector abstracts a container over nonzeroes. A nonzero is a pair
- * of indices and values. One may query for the number of nonzeroes and use
- * 1. #getNonzeroValue to retrieve a nonzero value, or
- * 2. #getNonzeroIndex to retrieve a nonzero index.
+ * The #grb::PinnedVector abstracts a read-only container of nonzeroes. A
+	 * nonzero is a pair of an index and a value. One may query for the number of
+ * nonzeroes and use
+ * 1. getNonzeroValue to retrieve a nonzero value, or
+ * 2. getNonzeroIndex to retrieve a nonzero index.
*
- * An instance of the PinnedVector cannot modify the underlying nonzero
- * structure nor its values.
+ * An instance of #grb::PinnedVector cannot modify the underlying nonzero
+ * structure nor can it modify its values.
*
* \note A performant implementation in fact does \em not copy the container
- * data, but provides a mechanism to access the underlying GraphBLAS
- * memory whenever it is possible to do so. This memory should remain
- * valid even after a call to grb::finalize() is made, and for as long
- * as the \a PinnedVector instance remains valid.
+ * data, but provides a mechanism to access the underlying ALP memory
+ * whenever it is possible to do so. This memory should remain valid
+ * even after a call to #grb::Launcher::exec has completed, and for as
+ * long as the #grb::PinnedVector instance remains valid.
*
- * \note Some implementations may not retain a raw vector. In this case, a
- * copy is unavoidable.
+ * \ingroup IO
*/
template< typename IOType, enum Backend implementation >
class PinnedVector {
@@ -78,8 +77,7 @@ namespace grb {
private :
/**
- * \internal Dummy false bool with a descriptive name for assertion
- * failures.
+ * Dummy false bool with a descriptive name for assertion failures.
*/
static const constexpr bool
function_was_not_implemented_in_the_selected_backend = false;
@@ -88,25 +86,27 @@ namespace grb {
public :
/**
- * Pins a given \a vector to a single memory pointer. The pointer
- * shall remain valid as long as the lifetime of this instance.
- * The given \a vector must be in unpinned state or an exception
- * will be thrown.
- * Pinning may or may not require a memory copy, depending on the
- * GraphBLAS implementation. If it does not, then destroying this
- * instance or calling #free on this vector may or may not result
- * in memory deallocation, depending on whether the underlying
- * vector still exists or not.
+ * Pins the contents of a given \a vector.
+ *
+ * A successfully constructed #grb::PinnedVector shall remain valid until it
+ * is destroyed, regardless of whether the ALP context in which the original
+ * \a vector appears has been destroyed.
+ *
+ * Pinning may or may not require a memory copy, depending on the ALP
+ * implementation and backend. If it does not, then destroying this
+			 * instance \em may result in memory deallocation. Deallocation \em must
+			 * only occur if this pinned vector happens to be the last remaining
+			 * reference to the original \a vector.
*
- * If one user process calls this constructor, \em all user
- * processes must do so-- this is a collective call. All member
- * functions as well as the default destructor are \em not
- * collective.
+ * If one user process calls this constructor, \em all user processes must do
+ * so and with the same arguments-- this is a collective call.
+ *
+ * All member functions of this class are \em not collective.
*
* @param[in] vector The vector to pin the memory of.
- * @param[in] mode The grb::IOMode.
+ * @param[in] mode The #grb::IOMode.
*
- * The \a mode argument is \em optional; its default is PARALLEL.
+ * The \a mode argument is \em optional. The default is #grb::PARALLEL.
*
* \parblock
* \par Performance semantics (#IOMode::SEQUENTIAL):
@@ -135,8 +135,8 @@ namespace grb {
const Vector< IOType, implementation, Coord > &vector,
const IOMode mode
) {
- (void)vector;
- (void)mode;
+ (void) vector;
+ (void) mode;
assert( function_was_not_implemented_in_the_selected_backend );
}
@@ -155,6 +155,8 @@ namespace grb {
}
/**
+ * Destroys a #grb::PinnedVector instance.
+ *
* Destroying a pinned vector will only remove the underlying vector data if
* and only if:
* 1) the original grb::Vector has been destroyed;
@@ -216,7 +218,7 @@ namespace grb {
* optional.
*
* A nonzero is a tuple of an index and nonzero value. A pinned vector holds
- * #nonzeroes() nonzeroes. Therefore, \a k must be less than #nonzeroes().
+ * #nonzeroes nonzeroes. Therefore, \a k must be less than #nonzeroes.
*
* @return The requested value.
*
@@ -231,7 +233,7 @@ namespace grb {
inline OutputType getNonzeroValue(
const size_t k, const OutputType one = OutputType()
) const noexcept {
- (void)k;
+ (void) k;
assert( function_was_not_implemented_in_the_selected_backend );
return one;
}
@@ -249,14 +251,13 @@ namespace grb {
* specification of #getNonzeroValue.
*
* \note By providing this variant, implementations may avoid the
- * requirement thatensure that that \a IOType must be default-
- * constructable.
+ * requirement that \a IOType must be default-constructable.
*/
inline IOType getNonzeroValue(
const size_t k
) const noexcept {
IOType ret;
- (void)k;
+ (void) k;
assert( function_was_not_implemented_in_the_selected_backend );
return ret;
}
@@ -267,7 +268,7 @@ namespace grb {
* @param[in] k The nonzero ID to return the index of.
*
* A nonzero is a tuple of an index and nonzero value. A pinned vector holds
- * #nonzeroes() nonzeroes. Therefore, \a k must be less than #nonzeroes().
+ * #nonzeroes nonzeroes. Therefore, \a k must be less than #nonzeroes.
*
* @return The requested index.
*
@@ -281,7 +282,7 @@ namespace grb {
inline size_t getNonzeroIndex(
const size_t k
) const noexcept {
- (void)k;
+ (void) k;
assert( function_was_not_implemented_in_the_selected_backend );
return std::numeric_limits< size_t >::max();
}
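To make the above specification concrete, the following is a minimal usage sketch (not part of this diff). It assumes the automatic launcher mode, a default-constructible and move-assignable #grb::PinnedVector, and illustrative names and values: an ALP program pins one of its vectors, and the caller reads out the nonzeroes after the program has returned.

	#include <iostream>
	#include <graphblas.hpp>

	void alpProgram( const size_t &n, grb::PinnedVector< double > &out ) {
		grb::Vector< double > x( n );
		(void) grb::setElement( x, 3.14, n / 2 );
		// pinning is collective; the pinned data stays valid after exec returns
		out = grb::PinnedVector< double >( x, grb::PARALLEL );
	}

	int main() {
		grb::Launcher< grb::AUTOMATIC > launcher;
		grb::PinnedVector< double > pinned;
		const size_t n = 10;
		if( launcher.exec( &alpProgram, n, pinned, true ) != grb::SUCCESS ) {
			return 1;
		}
		for( size_t k = 0; k < pinned.nonzeroes(); ++k ) {
			std::cout << pinned.getNonzeroIndex( k ) << ": "
				<< pinned.getNonzeroValue( k ) << "\n";
		}
		return 0;
	}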
diff --git a/include/graphblas/base/properties.hpp b/include/graphblas/base/properties.hpp
index a1b497031..01a649203 100644
--- a/include/graphblas/base/properties.hpp
+++ b/include/graphblas/base/properties.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Provides a mechanism for inspecting properties of various backends
+ *
* @author A. N. Yzelman
* @date 5th of May 2017
*/
@@ -29,40 +33,71 @@
namespace grb {
/**
- * Collection of various properties on the given GraphBLAS backend.
+ * Collection of various properties on the given ALP/GraphBLAS \a backend.
*
- * @tparam implementation The implementation of which to access its properties.
+ * @tparam backend The backend of which to access its properties.
*
* The properties collected here are meant to be compile-time constants that
- * provide insight in what features the back-end supports.
+	 * provide insight into what features the given \a backend supports. ALP user
+	 * code may rely on the properties specified herein. All ALP backends must
+	 * define all properties specified here.
+ *
+	 * The default template class shall be empty in order to ensure that
+	 * implementing backends specialise this class, while also making sure that
+	 * no backend may accidentally, implicitly, and erroneously propagate global
+	 * defaults.
+ *
+ * \ingroup backends
*/
- template< enum Backend implementation >
+ template< enum Backend backend >
class Properties {
+#ifdef __DOXYGEN__
+
public:
/**
- * Whether a non-GraphBLAS object captured by a lambda-function and passed to
- * grb::eWiseLambda can be written to.
- *
- * If the implementation backend is fully Single Program, Multiple Data
- * (SPMD), then this is expected to be legal and result in user-process local
- * updates. This function would thus return \a true.
+			 * Whether a scalar non-ALP/GraphBLAS object may be captured and written
+			 * to by a lambda function that is passed to #grb::eWiseLambda.
*
- * If the implementaiton backend is parallel but supports only a single user
- * processes, i.e., for a \em data-centric backend, writing to a shared
- * object results in race conditions and thus is technically impossible. This
- * function would thus return \a false.
+			 * Typically, if the \a backend is shared-memory parallel, this property
+			 * would read false. Purely Single Program, Multiple Data (SPMD) backends
+			 * over distributed memory, as well as simple sequential backends, would
+			 * have this property read true.
*
- * @return A boolean \a true if and only if capturing a non-GraphBLAS object
- * inside a lambda-function for write access, and passing it to
- * grb::eWiseLambda would yield valid user process local results. If
- * not, \a false is returned instead.
+			 * Notably, hybrid SPMD + OpenMP backends (e.g., #grb::hybrid) are not pure
+			 * SPMD and as such would have this property read false.
*
* @see grb::eWiseLambda()
*/
- static constexpr bool writableCaptured = true;
+ static constexpr const bool writableCaptured = true;
+ /**
+ * Whether the given \a backend supports blocking execution or is, instead,
+ * non-blocking.
+ *
+ * In blocking execution mode, any ALP/GraphBLAS primitive, when it returns,
+ * is guaranteed to have completed the requested computation.
+ *
+			 * If a given \a backend has this property set to true, then the
+			 * #isNonblockingExecution property must read false, and vice versa.
+ */
+ static constexpr const bool isBlockingExecution = true;
+
+ /**
+ * Whether the given \a backend is non-blocking or is, instead, blocking.
+ *
+ * In non-blocking execution mode, any ALP/GraphBLAS primitive, on return,
+ * \em may in fact \em not have completed the requested computation.
+ *
+ * Non-blocking execution thus allows for the lazy evaluation of an ALP
+ * code, which, in turn, allows for cross-primitive optimisations to be
+ * automatically applied.
+ *
+			 * If a given \a backend has this property set to true, then the
+			 * #isBlockingExecution property must read false, and vice versa.
+ */
+ static constexpr const bool isNonblockingExecution = !isBlockingExecution;
+#endif
};
} // namespace grb
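As a usage sketch (not part of this diff), user code may inspect these properties at compile time. Here, grb::config::default_backend is assumed to name the compiled default backend, and that backend is assumed to implement the properties specified above:

	#include <graphblas.hpp>

	template< grb::Backend backend = grb::config::default_backend >
	void checkExecutionMode() {
		// exactly one of the two execution-mode properties may hold
		static_assert( grb::Properties< backend >::isBlockingExecution !=
				grb::Properties< backend >::isNonblockingExecution,
			"a backend must be either blocking or non-blocking" );
		if( grb::Properties< backend >::writableCaptured ) {
			// captured scalars may be written to from within a grb::eWiseLambda
		}
	}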
diff --git a/include/graphblas/base/spmd.hpp b/include/graphblas/base/spmd.hpp
index 49b95b90d..1955c8199 100644
--- a/include/graphblas/base/spmd.hpp
+++ b/include/graphblas/base/spmd.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Exposes facilities for direct SPMD programming
+ *
* @author A. N. Yzelman
* @date 28th of April, 2017
*/
@@ -32,52 +36,67 @@
#include "config.hpp"
+
namespace grb {
- /** \todo documentation */
+ /**
+	 * For backends that support multiple user processes, this class defines some
+	 * basic primitives to support SPMD programming.
+	 *
+	 * All backends must implement this interface, including backends that do not
+	 * support multiple user processes. The interface defined herein hence admits
+	 * trivial implementations for single-user-process backends.
+ */
template< Backend implementation >
class spmd {
- public:
-
- /** @return The number of user processes in this GraphBLAS run. */
- static inline size_t nprocs() noexcept {
- return 0;
- }
-
- /** @return The user process ID. */
- static inline size_t pid() noexcept {
- return SIZE_MAX;
- }
-
- /**
- * Calls a PlatformBSP \a bsp_sync.
- *
- * @param[in] msgs_in The maximum number of messages to be received across
- * \em all user processes. Default is zero.
- * @param[in] msgs_out The maximum number of messages to be sent across
- * \em all user processes. Default is zero.
- *
- * If both \a msgs_in and \a msgs_out are zero, the values will be
- * automatically inferred. This requires a second call to the PlatformBSP
- * \a bsp_sync primitive, thus increasing the latency by at least \f$ l \f$.
- *
- * If the values for \a msgs_in or \a msgs_out are underestimated, undefined
- * behaviour will occur. If this is not the case but one or more are instead
- * \a over estimated, this call will succeed as normal.
- *
- * @return grb::SUCCESS When all queued communication is executed succesfully.
- * @return grb::PANIC When an unrecoverable error occurs. When this value is
- * returned, the library enters an undefined state.
- */
- static enum RC sync( const size_t msgs_in = 0, const size_t msgs_out = 0 ) noexcept {
- (void)msgs_in;
- (void)msgs_out;
- return PANIC;
- }
+ public:
+
+ /** @return The number of user processes in this ALP run. */
+ static inline size_t nprocs() noexcept {
+ return 0;
+ }
+
+ /** @return The ID of this user process. */
+ static inline size_t pid() noexcept {
+ return SIZE_MAX;
+ }
+
+ /**
+ * \internal
+ * Provides functionalities similar to the LPF primitive \a lpf_sync,
+			 * Provides functionality similar to the LPF primitive \a lpf_sync,
+ *
+ * @param[in] msgs_in The maximum number of messages to be received across
+ * \em all user processes. Default is zero.
+ * @param[in] msgs_out The maximum number of messages to be sent across
+ * \em all user processes. Default is zero.
+ *
+ * If both \a msgs_in and \a msgs_out are zero, the values will be
+			 * automatically inferred. This requires a second call to the underlying
+			 * \a lpf_sync primitive, thus increasing the latency by at least \f$ l \f$.
+ *
+ * If the values for \a msgs_in or \a msgs_out are underestimated, undefined
+ * behaviour will occur. If this is not the case but one or more are instead
+ * \a over estimated, this call will succeed as normal.
+ *
+			 * @return grb::SUCCESS When all queued communication is executed successfully.
+ * @return grb::PANIC When an unrecoverable error occurs. When this value is
+ * returned, the library enters an undefined state.
+ *
+			 * \todo If this API is to be exposed, a mechanism for initiating
+			 *       messages should be exposed as well.
+ * \endinternal
+ */
+ static enum RC sync( const size_t msgs_in = 0, const size_t msgs_out = 0 ) noexcept {
+ (void) msgs_in;
+ (void) msgs_out;
+ return PANIC;
+ }
}; // end class ``spmd''
} // namespace grb
#endif // end _H_GRB_BASE_SPMD
+
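A minimal sketch (not part of this diff) of this interface in use from within an ALP program; grb::config::default_backend is assumed to name the compiled default backend:

	#include <iostream>
	#include <graphblas.hpp>

	void helloSPMD() {
		const size_t P = grb::spmd< grb::config::default_backend >::nprocs();
		const size_t s = grb::spmd< grb::config::default_backend >::pid();
		// single-user-process backends trivially report P = 1 and s = 0
		std::cout << "User process " << s << " out of " << P << "\n";
	}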
diff --git a/include/graphblas/base/vector.hpp b/include/graphblas/base/vector.hpp
index 3d3e2c2e5..c00ca6e53 100644
--- a/include/graphblas/base/vector.hpp
+++ b/include/graphblas/base/vector.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Specifies the ALP/GraphBLAS vector container.
+ *
* @author A. N. Yzelman
* @date 10th of August, 2016
*/
@@ -33,6 +37,7 @@
#include
#include
+
namespace grb {
/**
@@ -143,14 +148,14 @@ namespace grb {
public :
/** Standard equals operator. */
- bool operator==( const const_iterator & other ) const {
- (void)other;
+ bool operator==( const const_iterator &other ) const {
+ (void) other;
return false;
}
/** @returns The negation of operator==(). */
- bool operator!=( const const_iterator & other ) const {
- (void)other;
+ bool operator!=( const const_iterator &other ) const {
+ (void) other;
return true;
}
@@ -219,8 +224,8 @@ namespace grb {
* code sections.
*/
Vector( const size_t n, const size_t nz ) {
- (void)n;
- (void)nz;
+ (void) n;
+ (void) nz;
}
/**
@@ -228,7 +233,7 @@ namespace grb {
* above where \a nz is to taken equal to \a n.
*/
Vector( const size_t n ) {
- (void)n;
+ (void) n;
}
/**
@@ -252,7 +257,7 @@ namespace grb {
* \endparblock
*/
Vector( Vector< D, implementation, C > &&x ) noexcept {
- (void)x;
+ (void) x;
}
/**
@@ -270,8 +275,10 @@ namespace grb {
* -# this move assignment moves \f$ \Theta(1) \f$ data only.
* \endparblock
*/
- Vector< D, implementation, C >& operator=( Vector< D, implementation, C > &&x ) noexcept {
- (void)x;
+ Vector< D, implementation, C >& operator=(
+ Vector< D, implementation, C > &&x
+ ) noexcept {
+ (void) x;
return *this;
}
@@ -330,7 +337,10 @@ namespace grb {
* hence possibly causing its implicitly called constructor to
* allocate dynamic memory.
*/
- const_iterator cbegin() const {}
+ const_iterator cbegin() const {
+ const_iterator ret;
+ return ret;
+ }
/**
* Same as cbegin().
@@ -338,7 +348,11 @@ namespace grb {
* is no overloaded version of this function that returns a non-const
* iterator.
*/
- const_iterator begin() const {}
+ const_iterator begin() const {
+ const_iterator ret;
+ return ret;
+ }
+
//@}
//@{
@@ -363,7 +377,10 @@ namespace grb {
* specification disallows the same to happen for the construction of
* an iterator in end position.
*/
- const_iterator cend() const {}
+ const_iterator cend() const {
+ const_iterator ret;
+ return ret;
+ }
/**
* Same as cend().
@@ -371,7 +388,10 @@ namespace grb {
* is no overloaded version of this function that returns a non-const
* iterator.
*/
- const_iterator end() const {}
+ const_iterator end() const {
+ const_iterator ret;
+ return ret;
+ }
//@}
/**
@@ -457,12 +477,20 @@ namespace grb {
* @see grb::buildVector for the GraphBLAS standard dispatcher to this
* function.
*/
- template< Descriptor descr = descriptors::no_operation, class Accum = typename operators::right_assign< D, D, D >, typename fwd_iterator = const D * __restrict__ >
- RC build( const Accum & accum, const fwd_iterator start, const fwd_iterator end, fwd_iterator npos ) {
- (void)accum;
- (void)start;
- (void)end;
- (void)npos;
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Accum = typename operators::right_assign< D, D, D >,
+ typename fwd_iterator = const D * __restrict__
+ >
+ RC build(
+ const Accum &accum,
+ const fwd_iterator start, const fwd_iterator end,
+ fwd_iterator npos
+ ) {
+ (void) accum;
+ (void) start;
+ (void) end;
+ (void) npos;
return PANIC;
}
@@ -564,18 +592,25 @@ namespace grb {
* @see grb::buildVector for the GraphBLAS standard dispatcher to this
* function.
*/
- template< Descriptor descr = descriptors::no_operation,
+ template<
+ Descriptor descr = descriptors::no_operation,
class Accum = operators::right_assign< D, D, D >,
typename ind_iterator = const size_t * __restrict__,
typename nnz_iterator = const D * __restrict__,
- class Dup = operators::right_assign< D, D, D > >
- RC build( const Accum & accum, const ind_iterator ind_start, const ind_iterator ind_end, const nnz_iterator nnz_start, const nnz_iterator nnz_end, const Dup & dup = Dup() ) {
- (void)accum;
- (void)ind_start;
- (void)ind_end;
- (void)nnz_start;
- (void)nnz_end;
- (void)dup;
+ class Dup = operators::right_assign< D, D, D >
+ >
+ RC build(
+ const Accum &accum,
+ const ind_iterator ind_start, const ind_iterator ind_end,
+ const nnz_iterator nnz_start, const nnz_iterator nnz_end,
+ const Dup &dup = Dup()
+ ) {
+ (void) accum;
+ (void) ind_start;
+ (void) ind_end;
+ (void) nnz_start;
+ (void) nnz_end;
+ (void) dup;
return PANIC;
}
@@ -682,26 +717,30 @@ namespace grb {
* @see grb::buildVector for the GraphBLAS standard dispatcher to this
* function.
*/
- template< Descriptor descr = descriptors::no_operation,
+ template<
+ Descriptor descr = descriptors::no_operation,
typename mask_type,
class Accum,
typename ind_iterator = const size_t * __restrict__,
typename nnz_iterator = const D * __restrict__,
- class Dup = operators::right_assign< D, typename nnz_iterator::value_type, D > >
- RC build( const Vector< mask_type, implementation, C > mask,
- const Accum & accum,
+ class Dup = operators::right_assign< D, typename nnz_iterator::value_type, D >
+ >
+ RC build(
+ const Vector< mask_type, implementation, C > &mask,
+ const Accum &accum,
const ind_iterator ind_start,
const ind_iterator ind_end,
const nnz_iterator nnz_start,
const nnz_iterator nnz_end,
- const Dup & dup = Dup() ) {
- (void)mask;
- (void)accum;
- (void)ind_start;
- (void)ind_end;
- (void)nnz_start;
- (void)nnz_end;
- (void)dup;
+ const Dup &dup = Dup()
+ ) {
+ (void) mask;
+ (void) accum;
+ (void) ind_start;
+ (void) ind_end;
+ (void) nnz_start;
+ (void) nnz_end;
+ (void) dup;
return PANIC;
}
@@ -730,8 +769,8 @@ namespace grb {
* \endparblock
*/
template< typename T >
- RC size( T & size ) const {
- (void)size;
+ RC size( T &size ) const {
+ (void) size;
return PANIC;
}
@@ -760,8 +799,8 @@ namespace grb {
* \endparblock
*/
template< typename T >
- RC nnz( T & nnz ) const {
- (void)nnz;
+ RC nnz( T &nnz ) const {
+ (void) nnz;
return PANIC;
}
@@ -825,22 +864,36 @@ namespace grb {
* #lambda_reference.
*/
template< class Monoid >
- lambda_reference operator()( const size_t i, const Monoid & monoid = Monoid() ) {
- (void)i;
- (void)monoid;
+ lambda_reference operator()(
+ const size_t i, const Monoid &monoid = Monoid()
+ ) {
+ (void) i;
+ (void) monoid;
return PANIC;
}
/**
- * Returns a lambda reference to an element of this vector. The user
- * ensures that the requested reference only corresponds to a pre-existing
- * nonzero in this vector, or undefined behaviour will occur .
+ * Returns a lambda reference to an element of this vector.
+ *
+ * \warning This functionality may only be used within the body of a lambda
+ * function that is passed into #grb::eWiseLambda.
+ *
+ * The user must ensure that the requested reference only corresponds to a
+ * pre-existing nonzero in this vector.
+ *
+ * \warning Requesting a nonzero entry at a coordinate where no nonzero
+ * exists results in undefined behaviour.
*
* A lambda reference to an element of this vector is only valid when used
* inside a lambda function evaluated via grb::eWiseLambda. The lambda
- * function is called for specific indices only-- that is, the GraphBLAS
- * implementation decides at which elements to dereference this container.
- * Outside this scope the returned reference incurs undefined behaviour.
+ * function is called for specific indices only-- that is, ALP/GraphBLAS
+ * decides at which elements to dereference this container.
+ *
+ * If such a lambda function dereferences multiple vectors, then the sparsity
+ * structure of the first vector passed as an argument to #grb::eWiseLambda
+ * after the lambda function defines at which indices the vectors will be
+ * referenced. The user must ensure that all vectors dereferenced indeed have
+ * nonzeroes at every location this "leading vector" has a nonzero.
*
* \warning In particular, for the given index \a i by the lambda function,
* it shall be \em illegal to refer to indices relative to that
@@ -848,59 +901,32 @@ namespace grb {
* cetera.
*
* \note As a consequence, this function cannot be used to perform stencil
- * or halo based operations.
- *
- * If a previously non-existing entry of the vector is requested, undefined
- * behaviour will occur. Functions that are defined to work with references
- * of this kind, such as grb::eWiseLambda, define exactly which elements are
- * dereferenced.
- *
- * \warning In parallel contexts the use of a returned lambda reference
- * outside the context of an eWiseLambda will incur at least one of
- * the following ill effects: it may
- * -# fail outright,
- * -# work on stale data,
- * -# work on incorrect data, or
- * -# incur high communication costs to guarantee correctness.
- * In short, such usage causes undefined behaviour. Implementers are
- * \em not advised to provide GAS-like functionality through this
- * interface, as it invites bad programming practices and bad
- * algorithm design decisions. This operator is instead intended to
- * provide for generic BLAS1-type operations only.
+ * or halo type operations.
*
- * \note For I/O, use the iterator retrieved via cbegin() instead of
- * relying on a lambda_reference.
- *
- * @param[in] i Which element to return a lambda reference of.
- * @param[in] ring Under which generalised semiring to interpret the
- * requested \f$ i \f$th element of this vector.
+ * \note For I/O purposes, use the iterator retrieved via cbegin()
+ * instead of relying on a lambda_reference.
*
- * \note The \a ring is required to be able to interpret a sparse vector. A
- * user who is sure this vector is dense, or otherwise is able to
- * ensure that the a lambda_reference will only be requested at
- * elements where nonzeroes already exists, may refer to
- * Vector::operator[],
+ * @param[in] i Which element to return a lambda reference of.
*
* @return A lambda reference to the element \a i of this vector.
*
* \par Example.
- * See grb::eWiseLambda() for a practical and useful example.
- *
- * \warning There is no similar concept in the official GraphBLAS specs.
+ * See #grb::eWiseLambda for a practical and useful example.
*
* @see lambda_reference For more details on the returned reference type.
- * @see grb::eWiseLambda For one legal way in which to use the returned
- * #lambda_reference.
+ * @see #grb::eWiseLambda For one way to use the returned #lambda_reference.
*/
lambda_reference operator[]( const size_t i ) {
- (void)i;
- #ifndef _GRB_NO_EXCEPTIONS
- throw std::runtime_error( "Requesting lambda reference of unimplemented "
- "Vector backend." );
- #endif
+ (void) i;
+ #ifndef _GRB_NO_EXCEPTIONS
+ throw std::runtime_error(
+ "Requesting lambda reference of unimplemented Vector backend."
+ );
+ #endif
}
-}
-;
-}
+ };
+
+} // end namespace ``grb''
#endif // _H_GRB_VECTOR_BASE
+
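The leading-vector rule documented above may be illustrated with a short sketch (not part of this diff; the helper name is hypothetical and both vectors are assumed dense, so that every visited index has a nonzero in both):

	#include <graphblas.hpp>

	grb::RC axpyLike( grb::Vector< double > &x, grb::Vector< double > &y ) {
		// x is the leading vector: it defines which indices i are visited
		return grb::eWiseLambda( [&x,&y]( const size_t i ) {
				// operator[] is only legal inside an eWiseLambda body
				x[ i ] += 2.0 * y[ i ];
			}, x, y );
	}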
diff --git a/include/graphblas/benchmark.hpp b/include/graphblas/benchmark.hpp
index b0187a0b9..ccace7979 100644
--- a/include/graphblas/benchmark.hpp
+++ b/include/graphblas/benchmark.hpp
@@ -28,13 +28,19 @@
// include specialisations
#ifdef _GRB_WITH_REFERENCE
-#include "graphblas/reference/benchmark.hpp"
+ #include "graphblas/reference/benchmark.hpp"
+#endif
+#ifdef _GRB_WITH_HYPERDAGS
+ #include "graphblas/hyperdags/benchmark.hpp"
+#endif
+#ifdef _GRB_WITH_NONBLOCKING
+ #include "graphblas/nonblocking/benchmark.hpp"
#endif
#ifdef _GRB_WITH_BANSHEE
-#include "graphblas/banshee/benchmark.hpp"
+ #include "graphblas/banshee/benchmark.hpp"
#endif
#ifdef _GRB_WITH_LPF
-#include "graphblas/bsp1d/benchmark.hpp"
+ #include "graphblas/bsp1d/benchmark.hpp"
#endif
#ifdef _GRB_BACKEND
@@ -45,3 +51,4 @@ namespace grb {
#endif
#endif // end ``_H_GRB_BENCH''
+
diff --git a/include/graphblas/blas0.hpp b/include/graphblas/blas0.hpp
index b0967a322..751b2cf14 100644
--- a/include/graphblas/blas0.hpp
+++ b/include/graphblas/blas0.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Defines the ALP/GraphBLAS level-0 API
+ *
* @author A. N. Yzelman
* @date 5th of December 2016
*/
@@ -55,10 +59,12 @@
"************************************************************************" \
"**********************\n" );
+
namespace grb {
/**
- * \defgroup BLAS0 The Level-0 Basic Linear Algebra Subroutines (BLAS)
+ * \defgroup BLAS0 Level-0 Primitives
+ * \ingroup GraphBLAS
*
* A collection of functions that let GraphBLAS operators work on
* zero-dimensional containers, i.e., on scalars.
@@ -165,11 +171,13 @@ namespace grb {
* @see grb::operators::internal::Operator for a discussion on when foldr and
* foldl successfully generate in-place code.
*/
- template< Descriptor descr = descriptors::no_operation,
+ template<
+ Descriptor descr = descriptors::no_operation,
class OP,
typename InputType1, typename InputType2, typename OutputType
>
- static enum RC apply( OutputType &out,
+ static enum RC apply(
+ OutputType &out,
const InputType1 &x,
const InputType2 &y,
const OP &op = OP(),
@@ -178,7 +186,7 @@ namespace grb {
!grb::is_object< InputType1 >::value &&
!grb::is_object< InputType2 >::value &&
!grb::is_object< OutputType >::value,
- void >::type * = NULL
+ void >::type * = nullptr
) {
// static sanity check
NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) || (
@@ -272,15 +280,26 @@ namespace grb {
* @see grb::operators::internal Operator for a discussion on fold-right
* capable operators and on stateful operators.
*/
- template< Descriptor descr = descriptors::no_operation, class OP, typename InputType, typename IOType >
- static RC foldr( const InputType & x,
- IOType & y,
- const OP & op = OP(),
- const typename std::enable_if< grb::is_operator< OP >::value && ! grb::is_object< InputType >::value && ! grb::is_object< IOType >::value, void >::type * = NULL ) {
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP, typename InputType, typename IOType
+ >
+ static RC foldr(
+ const InputType &x,
+ IOType &y,
+ const OP &op = OP(),
+ const typename std::enable_if<
+ grb::is_operator< OP >::value &&
+ !grb::is_object< InputType >::value &&
+ !grb::is_object< IOType >::value, void
+ >::type * = nullptr
+ ) {
// static sanity check
- NO_CAST_ASSERT( ( ! ( descr & descriptors::no_casting ) ||
- ( std::is_same< InputType, typename OP::D1 >::value && std::is_same< IOType, typename OP::D2 >::value && std::is_same< IOType, typename OP::D3 >::value ) ),
- "grb::foldr (BLAS level 0)",
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || (
+ std::is_same< InputType, typename OP::D1 >::value &&
+ std::is_same< IOType, typename OP::D2 >::value &&
+ std::is_same< IOType, typename OP::D3 >::value
+ ) ), "grb::foldr (BLAS level 0)",
"Argument value types do not match operator domains while no_casting "
"descriptor was set" );
@@ -364,8 +383,13 @@ namespace grb {
* @see grb::operators::internal Operator for a discussion on fold-right
* capable operators and on stateful operators.
*/
- template< Descriptor descr = descriptors::no_operation, class OP, typename InputType, typename IOType >
- static RC foldl( IOType &x,
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP,
+ typename InputType, typename IOType
+ >
+ static RC foldl(
+ IOType &x,
const InputType &y,
const OP &op = OP(),
const typename std::enable_if< grb::is_operator< OP >::value &&
@@ -410,46 +434,89 @@ namespace grb {
* @tparam Enabled Controls, through SFINAE, whether the use of the
* #use_index descriptor is allowed at all.
*/
- template< grb::Descriptor descr, typename OutputType, typename D, typename Enabled = void >
+ template<
+ grb::Descriptor descr,
+ typename OutputType, typename D,
+ typename Enabled = void
+ >
class ValueOrIndex;
/* Version where use_index is allowed. */
template< grb::Descriptor descr, typename OutputType, typename D >
- class ValueOrIndex< descr, OutputType, D, typename std::enable_if< std::is_arithmetic< OutputType >::value && ! std::is_same< D, void >::value >::type > {
- private:
- static constexpr const bool use_index = descr & grb::descriptors::use_index;
- static_assert( use_index || std::is_convertible< D, OutputType >::value, "Cannot convert to the requested output type" );
-
- public:
- static OutputType getFromArray( const D * __restrict__ const x, const std::function< size_t( size_t ) > & src_local_to_global, const size_t index ) noexcept {
- if( use_index ) {
- return static_cast< OutputType >( src_local_to_global( index ) );
- } else {
- return static_cast< OutputType >( x[ index ] );
+ class ValueOrIndex<
+ descr,
+ OutputType, D,
+ typename std::enable_if<
+ std::is_arithmetic< OutputType >::value &&
+ !std::is_same< D, void >::value
+ >::type
+ > {
+
+ private:
+
+ static constexpr const bool use_index = descr & grb::descriptors::use_index;
+
+ static_assert( use_index || std::is_convertible< D, OutputType >::value,
+ "Cannot convert to the requested output type" );
+
+
+ public:
+
+ static OutputType getFromArray(
+ const D * __restrict__ const x,
+ const std::function< size_t( size_t ) > &src_local_to_global,
+ const size_t index
+ ) noexcept {
+ if( use_index ) {
+ return static_cast< OutputType >( src_local_to_global( index ) );
+ } else {
+ return static_cast< OutputType >( x[ index ] );
+ }
}
- }
- static OutputType getFromScalar( const D &x, const size_t index ) noexcept {
- if( use_index ) {
- return static_cast< OutputType >( index );
- } else {
- return static_cast< OutputType >( x );
+
+ static OutputType getFromScalar( const D &x, const size_t index ) noexcept {
+ if( use_index ) {
+ return static_cast< OutputType >( index );
+ } else {
+ return static_cast< OutputType >( x );
+ }
}
- }
+
};
/* Version where use_index is not allowed. */
template< grb::Descriptor descr, typename OutputType, typename D >
- class ValueOrIndex< descr, OutputType, D, typename std::enable_if< ! std::is_arithmetic< OutputType >::value && ! std::is_same< OutputType, void >::value >::type > {
- static_assert( ! ( descr & descriptors::use_index ), "use_index descriptor given while output type is not numeric" );
- static_assert( std::is_convertible< D, OutputType >::value, "Cannot convert input to the given output type" );
-
- public:
- static OutputType getFromArray( const D * __restrict__ const x, const std::function< size_t( size_t ) > &, const size_t index ) noexcept {
- return static_cast< OutputType >( x[ index ] );
- }
- static OutputType getFromScalar( const D &x, const size_t ) noexcept {
- return static_cast< OutputType >( x );
- }
+ class ValueOrIndex<
+ descr,
+ OutputType, D,
+ typename std::enable_if<
+ !std::is_arithmetic< OutputType >::value &&
+ !std::is_same< OutputType, void >::value
+ >::type
+ > {
+
+ static_assert( !(descr & descriptors::use_index),
+ "use_index descriptor given while output type is not numeric" );
+
+ static_assert( std::is_convertible< D, OutputType >::value,
+ "Cannot convert input to the given output type" );
+
+ public:
+
+ static OutputType getFromArray(
+ const D * __restrict__ const x,
+ const std::function< size_t( size_t ) > &,
+ const size_t index
+ ) noexcept {
+ return static_cast< OutputType >( x[ index ] );
+ }
+
+ static OutputType getFromScalar(
+ const D &x, const size_t
+ ) noexcept {
+ return static_cast< OutputType >( x );
+ }
+
};
/**
@@ -472,32 +539,69 @@ namespace grb {
* operator version is used instead.
*/
- template< bool identity_left, typename OutputType, typename InputType, template< typename > class Identity, typename Enabled = void >
+ template<
+ bool identity_left,
+ typename OutputType, typename InputType,
+ template< typename > class Identity,
+ typename Enabled = void
+ >
class CopyOrApplyWithIdentity;
/* The cast-and-assign version */
- template< bool identity_left, typename OutputType, typename InputType, template< typename > class Identity >
- class CopyOrApplyWithIdentity< identity_left, OutputType, InputType, Identity, typename std::enable_if< std::is_convertible< InputType, OutputType >::value >::type > {
- public:
- template< typename Operator >
- static void set( OutputType & out, const InputType & in, const Operator & ) {
- out = static_cast< OutputType >( in );
- }
+ template<
+ bool identity_left,
+ typename OutputType, typename InputType,
+ template< typename > class Identity
+ >
+ class CopyOrApplyWithIdentity<
+ identity_left,
+ OutputType, InputType,
+ Identity,
+ typename std::enable_if<
+ std::is_convertible< InputType, OutputType >::value
+ >::type
+ > {
+
+ public:
+
+ template< typename Operator >
+ static void set( OutputType &out, const InputType &in, const Operator & ) {
+ out = static_cast< OutputType >( in );
+ }
+
};
/* The operator with identity version */
- template< bool identity_left, typename OutputType, typename InputType, template< typename > class Identity >
- class CopyOrApplyWithIdentity< identity_left, OutputType, InputType, Identity, typename std::enable_if< ! std::is_convertible< InputType, OutputType >::value >::type > {
- public:
- template< typename Operator >
- static void set( OutputType & out, const InputType & in, const Operator & op ) {
- const auto identity = identity_left ? Identity< typename Operator::D1 >::value() : Identity< typename Operator::D2 >::value();
- if( identity_left ) {
- (void)grb::apply( out, identity, in, op );
- } else {
- (void)grb::apply( out, in, identity, op );
+ template<
+ bool identity_left,
+ typename OutputType, typename InputType,
+ template< typename > class Identity
+ >
+ class CopyOrApplyWithIdentity<
+ identity_left,
+ OutputType, InputType,
+ Identity,
+ typename std::enable_if<
+ !std::is_convertible< InputType, OutputType >::value
+ >::type
+ > {
+
+ public:
+
+ template< typename Operator >
+ static void set(
+ OutputType &out, const InputType &in, const Operator &op
+ ) {
+ const auto identity = identity_left ?
+ Identity< typename Operator::D1 >::value() :
+ Identity< typename Operator::D2 >::value();
+ if( identity_left ) {
+ (void) grb::apply( out, identity, in, op );
+ } else {
+ (void) grb::apply( out, in, identity, op );
+ }
}
- }
+
};
} // namespace internal
@@ -507,3 +611,4 @@ namespace grb {
#undef NO_CAST_ASSERT
#endif // end ``_H_GRB_BLAS0''
+
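A small sketch (not part of this diff) of the level-0 primitives acting on scalars; the values are illustrative, and grb::operators::add and grb::operators::mul are assumed available from the standard ALP operator set:

	#include <cassert>
	#include <graphblas.hpp>

	void scalarExample() {
		double out = 0.0;
		const double a = 1.5, b = 2.5;
		// out = a + b
		grb::RC rc = grb::apply( out, a, b, grb::operators::add< double >() );
		// out = out * a, folded in-place from the left
		if( rc == grb::SUCCESS ) {
			rc = grb::foldl( out, a, grb::operators::mul< double >() );
		}
		assert( rc == grb::SUCCESS );
		assert( out == 6.0 ); // (1.5 + 2.5) * 1.5, exact in IEEE arithmetic
	}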
diff --git a/include/graphblas/blas1.hpp b/include/graphblas/blas1.hpp
index 9b796bee7..e28c9e8ad 100644
--- a/include/graphblas/blas1.hpp
+++ b/include/graphblas/blas1.hpp
@@ -28,6 +28,12 @@
#ifdef _GRB_WITH_REFERENCE
#include <graphblas/reference/blas1.hpp>
#endif
+#ifdef _GRB_WITH_HYPERDAGS
+ #include <graphblas/hyperdags/blas1.hpp>
+#endif
+#ifdef _GRB_WITH_NONBLOCKING
+ #include "graphblas/nonblocking/blas1.hpp"
+#endif
#ifdef _GRB_WITH_BANSHEE
#include <graphblas/banshee/blas1.hpp>
#endif
diff --git a/include/graphblas/blas2.hpp b/include/graphblas/blas2.hpp
index e44d311a1..2a0b1338e 100644
--- a/include/graphblas/blas2.hpp
+++ b/include/graphblas/blas2.hpp
@@ -33,6 +33,12 @@
#ifdef _GRB_WITH_REFERENCE
#include <graphblas/reference/blas2.hpp>
#endif
+#ifdef _GRB_WITH_HYPERDAGS
+ #include <graphblas/hyperdags/blas2.hpp>
+#endif
+#ifdef _GRB_WITH_NONBLOCKING
+ #include "graphblas/nonblocking/blas2.hpp"
+#endif
#ifdef _GRB_WITH_BANSHEE
#include <graphblas/banshee/blas2.hpp>
#endif
diff --git a/include/graphblas/blas3.hpp b/include/graphblas/blas3.hpp
index 3b485851f..6ed90264b 100644
--- a/include/graphblas/blas3.hpp
+++ b/include/graphblas/blas3.hpp
@@ -28,10 +28,17 @@
// now include all specialisations contained in the backend directories:
#ifdef _GRB_WITH_REFERENCE
-#include <graphblas/reference/blas3.hpp>
+ #include <graphblas/reference/blas3.hpp>
+#endif
+#ifdef _GRB_WITH_HYPERDAGS
+ #include <graphblas/hyperdags/blas3.hpp>
+#endif
+#ifdef _GRB_WITH_NONBLOCKING
+ #include "graphblas/nonblocking/blas3.hpp"
#endif
#ifdef _GRB_WITH_LPF
-#include <graphblas/bsp1d/blas3.hpp>
+ #include <graphblas/bsp1d/blas3.hpp>
#endif
#endif // end _H_GRB_BLAS3
+
diff --git a/include/graphblas/bsp/collectives.hpp b/include/graphblas/bsp/collectives.hpp
index 6c1e28db3..098f7f738 100644
--- a/include/graphblas/bsp/collectives.hpp
+++ b/include/graphblas/bsp/collectives.hpp
@@ -98,14 +98,16 @@ namespace grb {
* This function may place an alloc of \f$ P\mathit{sizeof}(IOType) \f$ bytes
* if the internal buffer was not sufficiently large.
*/
- template< Descriptor descr = descriptors::no_operation, typename Operator, typename IOType >
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename Operator, typename IOType
+ >
static RC allreduce( IOType &inout, const Operator &op = Operator() ) {
// this is the serial algorithm only
// TODO internal issue #19
#ifdef _DEBUG
- std::cout << "Entered grb::collectives< BSP1D >::allreduce with "
- "inout = "
- << inout << " and op = " << &op << std::endl;
+ std::cout << "Entered grb::collectives< BSP1D >::allreduce with inout = "
+ << inout << " and op = " << &op << std::endl;
#endif
// static sanity check
@@ -303,6 +305,17 @@ namespace grb {
* On output at non-root processes: the value at root.
*
* \parblock
+ * \par Performance semantics: common
+		 * Whether system calls will happen depends on the LPF engine compiled
+		 * with, as does whether buffer space proportional to the payload size is
+		 * required. In principle, when using a fabric like InfiniBand together
+		 * with the LPF ibverbs engine, the intended IB zero-copy behaviour is
+		 * attained.
+		 *
+		 * None of the below variants, in any backend, shall result in dynamic
+		 * memory allocations.
+ * \endparblock
+ *
+ * \parblock
* \par Performance semantics: serial
* -# Problem size N: \f$ \mathit{sizeof}(\mathit{IOType}) \f$
* -# local work: \f$ 0 \f$ ;
@@ -310,20 +323,21 @@ namespace grb {
* -# BSP cost: \f$ NPg + l \f$;
* \endparblock
*
- * \par Performance semantics: two hase
+ * \parblock
+ * \par Performance semantics: two phase
* -# Problem size N: \f$ \mathit{sizeof}(\mathit{IOType}) \f$
* -# local work: \f$ 0 \f$ ;
* -# transferred bytes: \f$ 2N \f$ ;
* -# BSP cost: \f$ 2(Ng + l) \f$;
* \endparblock
*
+ * \parblock
* \par Performance semantics: two level tree
* -# Problem size N: \f$ \mathit{sizeof}(\mathit{IOType}) \f$
* -# local work: \f$ 0 \f$ ;
* -# transferred bytes: \f$ 2\sqrt{P}N \f$ ;
* -# BSP cost: \f$ 2(\sqrt{P}Ng + l) \f$;
* \endparblock
- *
*/
template< typename IOType >
static RC broadcast( IOType & inout, const lpf_pid_t root = 0 ) {
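A sketch (not part of this diff) of these collectives in use, assuming an LPF-enabled build and the BSP1D backend:

	#include <graphblas.hpp>

	grb::RC reduceThenShare( double &value ) {
		// after this call, every user process holds the global sum
		grb::RC rc = grb::collectives< grb::BSP1D >::allreduce(
			value, grb::operators::add< double >() );
		if( rc != grb::SUCCESS ) { return rc; }
		// every user process now receives the value held at the root (PID 0)
		return grb::collectives< grb::BSP1D >::broadcast( value, 0 );
	}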
diff --git a/include/graphblas/bsp/config.hpp b/include/graphblas/bsp/config.hpp
index 907dd659e..5672673e8 100644
--- a/include/graphblas/bsp/config.hpp
+++ b/include/graphblas/bsp/config.hpp
@@ -27,30 +27,38 @@
#include
+
namespace grb {
+
namespace config {
/**
* Lightweight Parallel Foundations defaults.
*/
class LPF {
- public:
- /**
- * Return the default number of memory registrations used by GraphBLAS.
- */
- static constexpr size_t regs() {
- return 500;
- }
-
- /**
- * Return the default maximum h relation expressed in the number of messages
- * (instead of bytes) used by GraphBLAS.
- */
- static constexpr size_t maxh() {
- return 200;
- }
+
+ public:
+
+ /**
+ * Return the default number of memory registrations used by GraphBLAS.
+ */
+ static constexpr size_t regs() {
+ return 500;
+ }
+
+ /**
+ * Return the default maximum h relation expressed in the number of messages
+ * (instead of bytes) used by GraphBLAS.
+ */
+ static constexpr size_t maxh() {
+ return 200;
+ }
+
};
+
} // namespace config
+
} // namespace grb
#endif
+
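These defaults are compile-time constants and may be read directly; a trivial sketch (not part of this diff):

	#include <cstddef>
	#include <graphblas/bsp/config.hpp>

	constexpr std::size_t default_regs = grb::config::LPF::regs(); // 500
	constexpr std::size_t default_maxh = grb::config::LPF::maxh(); // 200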
diff --git a/include/graphblas/bsp1d/benchmark.hpp b/include/graphblas/bsp1d/benchmark.hpp
index c88e83f67..2fdb91ac5 100644
--- a/include/graphblas/bsp1d/benchmark.hpp
+++ b/include/graphblas/bsp1d/benchmark.hpp
@@ -31,6 +31,7 @@
#include "exec.hpp"
+
namespace grb {
namespace internal {
@@ -332,15 +333,23 @@ namespace grb {
public:
- Benchmarker( const MPI_Comm comm = MPI_COMM_WORLD ) : Launcher< FROM_MPI, BSP1D >( comm ) {}
+ Benchmarker( const MPI_Comm comm = MPI_COMM_WORLD ) :
+ Launcher< FROM_MPI, BSP1D >( comm )
+ {}
template< typename U >
- RC exec( void ( *grb_program )( const void *, const size_t, U & ),
+ RC exec(
+ void ( *grb_program )( const void *, const size_t, U & ),
const void * data_in, const size_t in_size,
U &data_out,
const size_t inner, const size_t outer,
const bool broadcast = false
) const {
+ // check arguments
+ if( in_size > 0 && data_in == nullptr ) {
+ return ILLEGAL;
+ }
+
// prepare packed input
struct internal::packedBenchmarkerInput input;
input.blob = data_in;
@@ -354,7 +363,8 @@ namespace grb {
lpf_args_t args;
fargs[ 0 ] = reinterpret_cast< lpf_func_t >( benchmark< U > );
fargs[ 1 ] = reinterpret_cast< lpf_func_t >( grb_program );
- args = { &input, sizeof( struct internal::packedBenchmarkerInput ),
+ args = {
+ &input, sizeof( struct internal::packedBenchmarkerInput ),
&data_out, sizeof( U ),
fargs, 2
};
@@ -373,8 +383,9 @@ namespace grb {
}
template< typename T, typename U >
- RC exec( void ( *grb_program )( const T &, U & ), // user GraphBLAS program
- const T & data_in, U &data_out, // input & output data
+ RC exec(
+ void ( *grb_program )( const T &, U & ), // user program
+ const T &data_in, U &data_out, // input & output data
const size_t inner, const size_t outer,
const bool broadcast = false
) {
@@ -420,7 +431,8 @@ namespace grb {
public:
- Benchmarker( const size_t process_id = 0, // user process ID
+ Benchmarker(
+ const size_t process_id = 0, // user process ID
const size_t nprocs = 1, // total number of user processes
const std::string hostname = "localhost", // one of the process' hostnames
const std::string port = "0", // a free port at hostname
@@ -430,12 +442,18 @@ namespace grb {
) {}
template< typename U >
- enum RC exec( void ( *grb_program )( const void *, const size_t, U & ),
+ enum RC exec(
+ void ( *grb_program )( const void *, const size_t, U & ),
const void * data_in, const size_t in_size,
U &data_out,
const size_t inner, const size_t outer,
const bool broadcast = false
) const {
+ // check input arguments
+ if( in_size > 0 && data_in == nullptr ) {
+ return ILLEGAL;
+ }
+
// prepare packed input
struct internal::packedBenchmarkerInput input;
input.blob = data_in;
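A sketch (not part of this diff) of the typed benchmarking interface in automatic mode; Input, Output, and myProgram are hypothetical:

	#include <graphblas.hpp>
	#include <graphblas/benchmark.hpp>

	struct Input { size_t n; };
	struct Output { grb::RC rc; };

	void myProgram( const Input &in, Output &out ) {
		grb::Vector< double > x( in.n );
		out.rc = grb::set( x, 1.0 );
	}

	int main() {
		grb::Benchmarker< grb::AUTOMATIC > bench;
		Input in; in.n = 1000;
		Output out;
		// each measurement times 10 inner calls; statistics are over 5 outer runs
		if( bench.exec( &myProgram, in, out, 10, 5, true ) != grb::SUCCESS ) {
			return 1;
		}
		return out.rc == grb::SUCCESS ? 0 : 2;
	}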
diff --git a/include/graphblas/bsp1d/blas1.hpp b/include/graphblas/bsp1d/blas1.hpp
index 51d25a96e..7455a4679 100644
--- a/include/graphblas/bsp1d/blas1.hpp
+++ b/include/graphblas/bsp1d/blas1.hpp
@@ -289,6 +289,52 @@ namespace grb {
return foldl< descr >( x, y, empty_mask, monoid );
}
+ /** No implementation notes. */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Operator,
+ typename IOType, typename Coords, typename InputType
+ >
+ RC foldr(
+ const InputType &alpha,
+ Vector< IOType, BSP1D, Coords > &y,
+ const Operator &op,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< InputType >::value &&
+ grb::is_operator< Operator >::value, void
+ >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+			std::is_same< InputType, typename Operator::D1 >::value ), "grb::foldr",
+			"called with an input scalar value type that does not match the first "
+			"domain of the given operator" );
+		NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+			std::is_same< IOType, typename Operator::D2 >::value ), "grb::foldr",
+			"called with an I/O value type that does not match the second domain of "
+			"the given operator" );
+		NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+			std::is_same< IOType, typename Operator::D3 >::value ), "grb::foldr",
+			"called with an I/O value type that does not match the third domain of "
+			"the given operator" );
+
+ // dynamic checks
+ const size_t n = size( y );
+ if( (descr & descriptors::dense) ) {
+ if( nnz( y ) < n ) {
+ return ILLEGAL;
+ }
+ }
+
+ // nonzero structure remains unchanged, so just dispatch
+ RC ret = foldr< descr >( alpha, internal::getLocal( y ), op, phase );
+ assert( ret == SUCCESS );
+ if( ret != SUCCESS ) {
+ ret = PANIC;
+ }
+ return ret;
+ }
+
/** \internal No implementation notes. */
template<
Descriptor descr = descriptors::no_operation, class Monoid,
@@ -329,6 +375,7 @@ namespace grb {
) {
return SUCCESS;
}
+
// simply delegate to reference implementation will yield correct result
RC ret = foldr< descr >( alpha, internal::getLocal( y ), monoid, phase );
if( !config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() ) {
@@ -347,7 +394,7 @@ namespace grb {
ret == FAILED
) {
const RC subrc = internal::updateNnz( y );
- if( subrc != SUCCESS ) { ret = FAILED; }
+ if( subrc != SUCCESS ) { ret = PANIC; }
}
}
@@ -373,7 +420,7 @@ namespace grb {
NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
std::is_same< IOType, typename Operator::D2 >::value ), "grb::foldr",
"called with an I/O value type that does not match the second domain of "
- "the given operator " );
+ "the given operator" );
NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
std::is_same< InputType, typename Operator::D1 >::value ), "grb::foldr",
"called with an input vector value type that does not match the first "
@@ -429,7 +476,7 @@ namespace grb {
NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
std::is_same< IOType, typename Operator::D1 >::value ), "grb::foldl",
"called with an I/O value type that does not match the first domain of "
- "the given operator " );
+ "the given operator" );
NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
std::is_same< InputType, typename Operator::D2 >::value ), "grb::foldl",
"called with an input vector value type that does not match the second "
@@ -440,9 +487,10 @@ namespace grb {
"the given operator" );
// dynamic checks
- if( nnz( x ) < size( x ) ) {
- // note: this illegal no matter whether the dense descriptor is given
- return ILLEGAL;
+ if( descr & descriptors::dense ) {
+ if( nnz( x ) < size( x ) ) {
+ return ILLEGAL;
+ }
}
// nonzero structure remains unchanged, so just dispatch
@@ -456,7 +504,8 @@ namespace grb {
/** No implementation notes. */
template<
- Descriptor descr = descriptors::no_operation, class Monoid,
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
typename IOType, typename Coords, typename InputType
>
RC foldl(
@@ -520,6 +569,166 @@ namespace grb {
return ret;
}
+ /** No implementation notes. */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Operator,
+ typename IOType, typename MaskType,
+ typename Coords, typename InputType
+ >
+ RC foldl(
+ Vector< IOType, BSP1D, Coords > &x,
+ Vector< MaskType, BSP1D, Coords > &mask,
+ const InputType &beta,
+ const Operator &op,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< InputType >::value &&
+ grb::is_operator< Operator >::value, void
+ >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< IOType, typename Operator::D1 >::value ), "grb::foldl",
+ "called with an I/O value type that does not match the first domain of "
+ "the given operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType, typename Operator::D2 >::value ), "grb::foldl",
+			"called with an input scalar value type that does not match the second "
+ "domain of the given operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< IOType, typename Operator::D3 >::value ), "grb::foldl",
+ "called with an I/O value type that does not match the third domain of "
+ "the given operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< MaskType, bool >::value ), "grb::foldl",
+ "called with a mask value type that is not Boolean" );
+
+ // check trivial dispatch
+ if( size( mask ) == 0 ) {
+ return foldl< descr >( x, beta, op, phase );
+ }
+
+ // dynamic checks
+ const size_t n = size( x );
+ if( size( mask ) != n ) {
+ return MISMATCH;
+ }
+ if( (descr & descriptors::dense) ) {
+ if( nnz( x ) < n ) {
+ return ILLEGAL;
+ }
+ if( nnz( mask ) < n ) {
+ return ILLEGAL;
+ }
+ }
+
+ // nonzero structure remains unchanged, so just dispatch
+ RC ret = foldl< descr >( internal::getLocal( x ), internal::getLocal( mask ),
+ beta, op, phase );
+ assert( ret == SUCCESS );
+ if( ret != SUCCESS ) {
+ ret = PANIC;
+ }
+ return ret;
+ }
+
+ /** No implementation notes. */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename IOType, typename MaskType,
+ typename Coords, typename InputType
+ >
+ RC foldl(
+ Vector< IOType, BSP1D, Coords > &x,
+ Vector< MaskType, BSP1D, Coords > &mask,
+ const InputType &beta,
+ const Monoid &monoid,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< InputType >::value &&
+ grb::is_monoid< Monoid >::value, void
+ >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< IOType, typename Monoid::D1 >::value ), "grb::foldl",
+ "called with an I/O value type that does not match the first domain of "
+ "the given monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType, typename Monoid::D2 >::value ), "grb::foldl",
+			"called with an input scalar value type that does not match the second "
+ "domain of the given monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< IOType, typename Monoid::D3 >::value ), "grb::foldl",
+ "called with an I/O value type that does not match the third domain of "
+ "the given monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< MaskType, bool >::value ), "grb::foldl",
+ "called with a mask value type that is not Boolean" );
+
+ // check trivial dispatch
+ if( size( mask ) == 0 ) {
+ return foldl< descr >( x, beta, monoid, phase );
+ }
+
+ // dynamic checks
+ const size_t n = size( x );
+ if( size( mask ) != n ) {
+ return MISMATCH;
+ }
+ if( descr & descriptors::dense ) {
+ if( nnz( x ) < n ) {
+ return ILLEGAL;
+ }
+ if( nnz( mask ) < n ) {
+ return ILLEGAL;
+ }
+ }
+
+ // check for trivial resize
+ if( config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() &&
+ phase == RESIZE
+ ) {
+ return SUCCESS;
+ }
+
+ // delegate
+ RC ret = foldl< descr >( internal::getLocal( x ), internal::getLocal( mask ),
+ beta, monoid, phase );
+ if( !config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() ) {
+ if( collectives< BSP1D >::allreduce(
+ ret, grb::operators::any_or< RC >()
+ ) != SUCCESS ) {
+ return PANIC;
+ }
+ }
+
+ // handle try and execute
+ if( phase != RESIZE ) {
+ assert( phase == EXECUTE || phase == TRY );
+ if( ret == SUCCESS ) {
+ if( nnz( mask ) == n &&
+ (descr & descriptors::structural) &&
+ !(descr & descriptors::invert_mask)
+ ) {
+ internal::setDense( x );
+ } else if( nnz( mask ) == 0 && (descr & descriptors::invert_mask) ) {
+ internal::setDense( x );
+ } else {
+ const RC subrc = internal::updateNnz( x );
+ if( subrc != SUCCESS ) { ret = PANIC; }
+ }
+ } else if( ret == FAILED ) {
+ assert( phase == TRY );
+ const RC subrc = internal::updateNnz( x );
+ if( subrc != SUCCESS ) { ret = PANIC; }
+ }
+ }
+
+ // done
+ return ret;
+ }
+
/**
* \internal Number of nonzeroes in \a x cannot change, hence no
* synchronisation required.
@@ -542,7 +751,7 @@ namespace grb {
NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
std::is_same< IOType, typename Operator::D1 >::value ), "grb::foldl",
"called with an I/O value type that does not match the first domain of "
- "the given operator " );
+ "the given operator" );
NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
std::is_same< InputType, typename Operator::D2 >::value ), "grb::foldl",
"called with an input vector value type that does not match the second "
@@ -657,57 +866,57 @@ namespace grb {
return ret;
}
- /** \internal No communication necessary, output is guaranteed dense. */
+		/** \internal No implementation notes. */
template<
- Descriptor descr = descriptors::no_operation,
- class Operator,
- typename OutputType, typename InputType1, typename InputType2,
+ Descriptor descr = descriptors::no_operation, class OP,
+ typename IOType, typename MaskType, typename InputType,
typename Coords
>
- RC eWiseApply(
- Vector< OutputType, BSP1D, Coords > &z,
- const Vector< InputType1, BSP1D, Coords > &x,
- const InputType2 beta,
- const Operator &op,
+ RC foldl(
+ Vector< IOType, BSP1D, Coords > &x,
+ const Vector< MaskType, BSP1D, Coords > &m,
+ const Vector< InputType, BSP1D, Coords > &y,
+ const OP &op = OP(),
const Phase &phase = EXECUTE,
- const typename std::enable_if< !grb::is_object< OutputType >::value &&
- !grb::is_object< InputType1 >::value &&
- !grb::is_object< InputType2 >::value &&
- grb::is_operator< Operator >::value, void
- >::type * const = nullptr
+ const typename std::enable_if< grb::is_operator< OP >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType >::value, void
+ >::type * = nullptr
) {
-#ifdef _DEBUG
- std::cerr << "In BSP1D unmasked eWiseApply (operator-based), "
- "[T1]<-[T2]<-T3\n";
-#endif
-
- // static checks
+ // static sanity checks
NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
- std::is_same< InputType1, typename Operator::D1 >::value ),
- "grb::eWiseApply",
- "called with a left-hand input vector value type that does not match the "
- "first domain of the given operator " );
+ std::is_same< typename OP::D1, IOType >::value ),
+ "grb::foldl",
+ "called with a vector x of a type that does not match the first domain "
+ "of the given operator" );
NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
- std::is_same< InputType2, typename Operator::D2 >::value ),
- "grb::eWiseApply",
- "called with a right-hand input vector value type that does not match the second "
- "domain of the given operator" );
+ std::is_same< typename OP::D2, InputType >::value ),
+ "grb::foldl",
+ "called on a vector y of a type that does not match the second domain "
+ "of the given operator" );
NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
- std::is_same< OutputType, typename Operator::D3 >::value ),
- "grb::eWiseApply",
- "called with an output value type that does not match the third domain of "
- "the given operator" );
+ std::is_same< typename OP::D3, IOType >::value ),
+ "grb::foldl",
+ "called on a vector x of a type that does not match the third domain "
+ "of the given operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ),
+ "grb::foldl",
+			"called with a mask that does not have boolean entries" );
- // dynamic checks
- const size_t n = size( z );
- if( size( x ) != n ) {
- return MISMATCH;
+ // catch empty mask
+ if( size( m ) == 0 ) {
+ return foldl< descr >( x, y, op, phase );
}
- if( nnz( x ) < n ) {
- return ILLEGAL;
+
+ // dynamic sanity checks
+ const size_t n = size( x );
+ if( n != size( y ) || n != size( m ) ) {
+ return MISMATCH;
}
- // catch trivial resize
+ // handle trivial resize phase
if( config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() &&
phase == RESIZE
) {
@@ -715,8 +924,12 @@ namespace grb {
}
// delegate
- RC ret = eWiseApply< descr >( internal::getLocal( z ),
- internal::getLocal( x ), beta, op, phase );
+ RC ret = foldl< descr >(
+ internal::getLocal( x ), internal::getLocal( m ),
+ internal::getLocal( y ),
+ op, phase
+ );
+
if( !config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() ) {
if( collectives< BSP1D >::allreduce(
ret, grb::operators::any_or< RC >()
@@ -725,20 +938,12 @@ namespace grb {
}
}
- // handle try and execute
- if( phase == TRY ) {
- if( ret == SUCCESS || ret == FAILED ) {
- const RC subrc = internal::updateNnz( z );
- if( subrc != SUCCESS ) {
- if( ret == SUCCESS ) { ret = subrc; }
- else { ret = PANIC; }
- }
- }
- } else if( phase == EXECUTE ) {
+ // handle try and execute phases
+ if( phase != RESIZE ) {
if( ret == SUCCESS ) {
- internal::setDense( z );
+ ret = internal::updateNnz( x );
} else if( ret == FAILED ) {
- const RC subrc = internal::updateNnz( z );
+ const RC subrc = internal::updateNnz( x );
if( subrc != SUCCESS ) { ret = PANIC; }
}
}
@@ -747,7 +952,473 @@ namespace grb {
return ret;
}
- /** \internal No communication necessary, output is guaranteed dense. */
+ /** \internal No implementation notes. */
+ template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename IOType, typename MaskType, typename InputType,
+ typename Coords
+ >
+ RC foldl(
+ Vector< IOType, BSP1D, Coords > &x,
+ const Vector< MaskType, BSP1D, Coords > &m,
+ const Vector< InputType, BSP1D, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< grb::is_monoid< Monoid >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType >::value, void
+ >::type * = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D1, IOType >::value ),
+ "grb::foldl",
+ "called with a vector x of a type that does not match the first domain "
+ "of the given operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D2, InputType >::value ),
+ "grb::foldl",
+ "called on a vector y of a type that does not match the second domain "
+ "of the given operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D3, IOType >::value ),
+ "grb::foldl",
+ "called on a vector x of a type that does not match the third domain "
+ "of the given operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ),
+ "grb::foldl",
+ "called with a mask that does not have boolean entries" );
+
+ // catch empty mask
+ if( size( m ) == 0 ) {
+ return foldl< descr >( x, y, monoid, phase );
+ }
+
+ // dynamic sanity checks
+ const size_t n = size( x );
+ if( n != size( y ) || n != size( m ) ) {
+ return MISMATCH;
+ }
+
+ // handle trivial resize phase
+ if( config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() &&
+ phase == RESIZE
+ ) {
+ return SUCCESS;
+ }
+
+ // delegate
+ RC ret = foldl< descr >(
+ internal::getLocal( x ), internal::getLocal( m ),
+ internal::getLocal( y ),
+ monoid, phase
+ );
+
+ if( !config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() ) {
+ if( collectives< BSP1D >::allreduce(
+ ret, grb::operators::any_or< RC >()
+ ) != SUCCESS ) {
+ return PANIC;
+ }
+ }
+
+ // handle try and execute phases
+ if( phase != RESIZE ) {
+ if( ret == SUCCESS ) {
+ ret = internal::updateNnz( x );
+ } else if( ret == FAILED ) {
+ const RC subrc = internal::updateNnz( x );
+ if( subrc != SUCCESS ) { ret = PANIC; }
+ }
+ }
+
+ // done
+ return ret;
+ }
+
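+ /*
+  * Usage sketch for the masked foldl variants above (illustrative only;
+  * assumes double-valued vectors and the standard plus monoid; the names
+  * x, m, y, and n are placeholders):
+  *
+  *   grb::Vector< double > x( n ), y( n );
+  *   grb::Vector< bool > m( n );
+  *   grb::Monoid<
+  *     grb::operators::add< double >, grb::identities::zero
+  *   > plusMonoid;
+  *   // x[ i ] = x[ i ] + y[ i ] wherever m[ i ] evaluates true
+  *   grb::RC rc = grb::foldl( x, m, y, plusMonoid );
+  */
+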
+ /** \internal No communication necessary, output is guaranteed dense. */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Operator,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, BSP1D, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Operator &op,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< Operator >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cerr << "In BSP1D unmasked eWiseApply (operator-based), "
+ "[T1]<-T2<-T3\n";
+#endif
+ // static checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType1, typename Operator::D1 >::value ),
+ "grb::eWiseApply",
+ "called with a left-hand input scalar type that does not match the "
+ "first domain of the given operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType2, typename Operator::D2 >::value ),
+ "grb::eWiseApply",
+ "called with a right-hand input scalar type that does not match the second "
+ "domain of the given operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< OutputType, typename Operator::D3 >::value ),
+ "grb::eWiseApply",
+ "called with an output value type that does not match the third domain of "
+ "the given operator" );
+
+ // dynamic checks
+ const size_t n = size( z );
+ if( (descr & descriptors::dense) && nnz( z ) != n ) {
+ return ILLEGAL;
+ }
+ if( capacity( z ) < n && phase == EXECUTE ) {
+ return FAILED;
+ }
+
+ // catch trivial resize
+ if( config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() &&
+ phase == RESIZE
+ ) {
+ return SUCCESS;
+ }
+
+ // delegate to set
+ OutputType temp;
+ RC ret = apply< descr >( temp, alpha, beta, op );
+ ret = ret ? ret : set< descr >( z, temp, phase );
+
+ // done
+ return ret;
+ }
+
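+ /*
+  * Usage sketch for the scalar-scalar eWiseApply above (illustrative only;
+  * assumes a double-valued output vector z and the standard multiplication
+  * operator):
+  *
+  *   grb::operators::mul< double > times;
+  *   // sets every entry of z to 3.0 * 1.5
+  *   grb::RC rc = grb::eWiseApply( z, 3.0, 1.5, times );
+  */
+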
+ /** \internal Delegates to masked set. */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Operator,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, BSP1D, Coords > &z,
+ const Vector< MaskType, BSP1D, Coords > &mask,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Operator &op,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< Operator >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cerr << "In BSP1D masked eWiseApply (operator-based), "
+ "[T1]<-T2<-T3\n";
+#endif
+ // static checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType1, typename Operator::D1 >::value ),
+ "grb::eWiseApply",
+ "called with a left-hand input scalar type that does not match the "
+ "first domain of the given operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType2, typename Operator::D2 >::value ),
+ "grb::eWiseApply",
+ "called with a right-hand input scalar type that does not match the second "
+ "domain of the given operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< OutputType, typename Operator::D3 >::value ),
+ "grb::eWiseApply",
+ "called with an output value type that does not match the third domain of "
+ "the given operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ), "grb::eWiseApply",
+ "called with an output mask value type that is not bool" );
+
+ // dynamic checks
+ const size_t n = size( z );
+ if( (descr & descriptors::dense) && nnz( mask ) != n ) {
+ return ILLEGAL;
+ }
+ if( (descr & descriptors::dense) && nnz( z ) != n ) {
+ return ILLEGAL;
+ }
+ if( size( mask ) != n ) {
+ return MISMATCH;
+ }
+ if( capacity( z ) < n && phase == EXECUTE ) {
+ return FAILED;
+ }
+
+ // catch trivial resize
+ if( config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() &&
+ phase == RESIZE
+ ) {
+ return SUCCESS;
+ }
+
+ // delegate to set
+ OutputType temp;
+ RC ret = apply< descr >( temp, alpha, beta, op );
+ ret = ret ? ret : set< descr >( z, mask, temp, phase );
+
+ // done
+ return ret;
+ }
+
+ /** \internal No communication necessary, output is guaranteed dense. */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename OutputType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, BSP1D, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Monoid &monoid,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cerr << "In BSP1D unmasked eWiseApply (monoid-based), "
+ "[T1]<-T2<-T3\n";
+#endif
+ // static checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType1, typename Monoid::D1 >::value ),
+ "grb::eWiseApply",
+ "called with a left-hand input scalar type that does not match the "
+ "first domain of the given monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType2, typename Monoid::D2 >::value ),
+ "grb::eWiseApply",
+ "called with a right-hand input scalar type that does not match the second "
+ "domain of the given monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< OutputType, typename Monoid::D3 >::value ),
+ "grb::eWiseApply",
+ "called with an output value type that does not match the third domain of "
+ "the given monoid" );
+
+ // dynamic checks
+ const size_t n = size( z );
+ if( (descr & descriptors::dense) && nnz( z ) != n ) {
+ return ILLEGAL;
+ }
+ if( capacity( z ) < n && phase == EXECUTE ) {
+ return FAILED;
+ }
+
+ // catch trivial resize
+ if( config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() &&
+ phase == RESIZE
+ ) {
+ return SUCCESS;
+ }
+
+ // delegate to set
+ OutputType temp;
+ RC ret = apply< descr >( temp, alpha, beta, monoid.getOperator() );
+ ret = ret ? ret : set< descr >( z, temp, phase );
+
+ // done
+ return ret;
+ }
+
+ /** \internal Delegates to masked set. */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, BSP1D, Coords > &z,
+ const Vector< MaskType, BSP1D, Coords > &mask,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Monoid &monoid,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cerr << "In BSP1D masked eWiseApply (monoid-based), "
+ "[T1]<-T2<-T3\n";
+#endif
+ // static checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType1, typename Monoid::D1 >::value ),
+ "grb::eWiseApply",
+ "called with a left-hand input scalar type that does not match the "
+ "first domain of the given monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType2, typename Monoid::D2 >::value ),
+ "grb::eWiseApply",
+ "called with a right-hand input scalar type that does not match the second "
+ "domain of the given monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< OutputType, typename Monoid::D3 >::value ),
+ "grb::eWiseApply",
+ "called with an output value type that does not match the third domain of "
+ "the given monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ), "grb::eWiseApply",
+ "called with an output mask value type that is not bool" );
+
+ // dynamic checks
+ const size_t n = size( z );
+ if( (descr & descriptors::dense) && nnz( mask ) != n ) {
+ return ILLEGAL;
+ }
+ if( (descr & descriptors::dense) && nnz( z ) != n ) {
+ return ILLEGAL;
+ }
+ if( size( mask ) != n ) {
+ return MISMATCH;
+ }
+ if( capacity( z ) < n && phase == EXECUTE ) {
+ return FAILED;
+ }
+
+ // catch trivial resize
+ if( config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() &&
+ phase == RESIZE
+ ) {
+ return SUCCESS;
+ }
+
+ // delegate to set
+ OutputType temp;
+ RC ret = apply< descr >( temp, alpha, beta, monoid.getOperator() );
+ ret = ret ? ret : set< descr >( z, mask, temp, phase );
+
+ // done
+ return ret;
+ }
+
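+ /*
+  * Usage sketch for the masked scalar-scalar eWiseApply variants above
+  * (illustrative only; z, mask, and plusMonoid are placeholders):
+  *
+  *   // writes 3.0 + 1.5 to z wherever mask evaluates true
+  *   grb::RC rc = grb::eWiseApply( z, mask, 3.0, 1.5, plusMonoid );
+  */
+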
+ /** \internal No communication necessary, output is guaranteed dense. */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Operator,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, BSP1D, Coords > &z,
+ const Vector< InputType1, BSP1D, Coords > &x,
+ const InputType2 beta,
+ const Operator &op,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< Operator >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cerr << "In BSP1D unmasked eWiseApply (operator-based), "
+ "[T1]<-[T2]<-T3\n";
+#endif
+
+ // static checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType1, typename Operator::D1 >::value ),
+ "grb::eWiseApply",
+ "called with a left-hand input vector value type that does not match the "
+ "first domain of the given operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType2, typename Operator::D2 >::value ),
+ "grb::eWiseApply",
+ "called with a right-hand input vector value type that does not match the second "
+ "domain of the given operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< OutputType, typename Operator::D3 >::value ),
+ "grb::eWiseApply",
+ "called with an output value type that does not match the third domain of "
+ "the given operator" );
+
+ // dynamic checks
+ const size_t n = size( z );
+ if( size( x ) != n ) {
+ return MISMATCH;
+ }
+ if( nnz( x ) < n ) {
+ return ILLEGAL;
+ }
+
+ // catch trivial resize
+ if( config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() &&
+ phase == RESIZE
+ ) {
+ return SUCCESS;
+ }
+
+ // delegate
+ RC ret = eWiseApply< descr >( internal::getLocal( z ),
+ internal::getLocal( x ), beta, op, phase );
+ if( !config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() ) {
+ if( collectives< BSP1D >::allreduce(
+ ret, grb::operators::any_or< RC >()
+ ) != SUCCESS ) {
+ return PANIC;
+ }
+ }
+
+ // handle try and execute
+ if( phase == TRY ) {
+ if( ret == SUCCESS || ret == FAILED ) {
+ const RC subrc = internal::updateNnz( z );
+ if( subrc != SUCCESS ) {
+ if( ret == SUCCESS ) { ret = subrc; }
+ else { ret = PANIC; }
+ }
+ }
+ } else if( phase == EXECUTE ) {
+ if( ret == SUCCESS ) {
+ internal::setDense( z );
+ } else if( ret == FAILED ) {
+ const RC subrc = internal::updateNnz( z );
+ if( subrc != SUCCESS ) { ret = PANIC; }
+ }
+ }
+
+ // done
+ return ret;
+ }
+
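+ /*
+  * Usage sketch for the above, including the two-phase idiom this backend
+  * supports (illustrative only; z, x, and plus are placeholders):
+  *
+  *   grb::RC rc = grb::eWiseApply( z, x, 2.0, plus, grb::RESIZE );
+  *   if( rc == grb::SUCCESS ) {
+  *     rc = grb::eWiseApply( z, x, 2.0, plus, grb::EXECUTE );
+  *   }
+  */
+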
+ /** \internal No communication necessary, output is guaranteed dense. */
template<
Descriptor descr = descriptors::no_operation,
class Operator,
@@ -760,22 +1431,22 @@ namespace grb {
const Vector< InputType2, BSP1D, Coords > &y,
const Operator &op,
const Phase &phase = EXECUTE,
- const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
!grb::is_object< InputType1 >::value &&
!grb::is_object< InputType2 >::value &&
grb::is_operator< Operator >::value,
void >::type * const = nullptr
) {
#ifdef _DEBUG
- std::cerr << "In BSP1D unmasked eWiseApply (operator-based), "
- "[T1]<-T2<-[T3]\n";
+ std::cerr << "In BSP1D unmasked eWiseApply (operator-based), T1]<-T2<-[T3]\n";
#endif
// static checks
NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
std::is_same< InputType1, typename Operator::D1 >::value ),
"grb::eWiseApply",
"called with a left-hand input vector value type that does not match the "
- "first domain of the given operator " );
+ "first domain of the given operator" );
NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
std::is_same< InputType2, typename Operator::D2 >::value ),
"grb::eWiseApply",
@@ -852,7 +1523,8 @@ namespace grb {
const Vector< InputType2, BSP1D, Coords > &y,
const Operator &op,
const Phase &phase = EXECUTE,
- const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
!grb::is_object< InputType1 >::value &&
!grb::is_object< InputType2 >::value &&
grb::is_operator< Operator >::value,
@@ -867,7 +1539,7 @@ namespace grb {
std::is_same< InputType1, typename Operator::D1 >::value ),
"grb::eWiseApply",
"called with a left-hand input vector value type that does not match the "
- "first domain of the given operator " );
+ "first domain of the given operator" );
NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
std::is_same< InputType2, typename Operator::D2 >::value ),
"grb::eWiseApply",
@@ -966,7 +1638,8 @@ namespace grb {
const Vector< InputType2, BSP1D, Coords > &y,
const Operator &op,
const Phase &phase = EXECUTE,
- const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
!grb::is_object< MaskType >::value &&
!grb::is_object< InputType1 >::value &&
!grb::is_object< InputType2 >::value &&
@@ -982,7 +1655,7 @@ namespace grb {
std::is_same< InputType1, typename Operator::D1 >::value ),
"grb::eWiseApply",
"called with a left-hand input vector value type that does not match the "
- "first domain of the given operator " );
+ "first domain of the given operator" );
NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
std::is_same< InputType2, typename Operator::D2 >::value ),
"grb::eWiseApply",
@@ -1066,7 +1739,8 @@ namespace grb {
const InputType2 beta,
const Operator &op,
const Phase &phase = EXECUTE,
- const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
!grb::is_object< MaskType >::value &&
!grb::is_object< InputType1 >::value &&
!grb::is_object< InputType2 >::value &&
@@ -1082,7 +1756,7 @@ namespace grb {
std::is_same< InputType1, typename Operator::D1 >::value ),
"grb::eWiseApply",
"called with a left-hand input vector value type that does not match the "
- "first domain of the given operator " );
+ "first domain of the given operator" );
NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
std::is_same< InputType2, typename Operator::D2 >::value ),
"grb::eWiseApply",
@@ -1167,7 +1841,8 @@ namespace grb {
const Vector< InputType2, BSP1D, Coords > &y,
const Operator &op,
const Phase &phase = EXECUTE,
- const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
!grb::is_object< MaskType >::value &&
!grb::is_object< InputType1 >::value &&
!grb::is_object< InputType2 >::value &&
@@ -1183,7 +1858,7 @@ namespace grb {
std::is_same< InputType1, typename Operator::D1 >::value ),
"grb::eWiseApply",
"called with a left-hand input vector value type that does not match the "
- "first domain of the given operator " );
+ "first domain of the given operator" );
NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
std::is_same< InputType2, typename Operator::D2 >::value ),
"grb::eWiseApply",
@@ -1276,7 +1951,8 @@ namespace grb {
const InputType2 beta,
const Monoid &monoid,
const Phase &phase = EXECUTE,
- const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
!grb::is_object< InputType1 >::value &&
!grb::is_object< InputType2 >::value &&
grb::is_monoid< Monoid >::value,
@@ -1305,7 +1981,9 @@ namespace grb {
// check if can delegate to dense variant
const size_t n = size( z );
- if( (descr & descriptors::dense) || nnz( x ) == n ) {
+ if( (descr & descriptors::dense) || (
+ nnz( x ) == n && nnz( z ) == n
+ ) ) {
return eWiseApply< descr | descriptors::dense >(
z, x, beta, monoid.getOperator(), phase
);
@@ -1316,7 +1994,7 @@ namespace grb {
return MISMATCH;
}
if( descr & descriptors::dense ) {
- if( nnz( x ) < n ) {
+ if( nnz( x ) < n || nnz( z ) < n ) {
return ILLEGAL;
}
}
@@ -1365,7 +2043,8 @@ namespace grb {
const Vector< InputType2, BSP1D, Coords > &y,
const Monoid &monoid,
const Phase &phase = EXECUTE,
- const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
!grb::is_object< InputType1 >::value &&
!grb::is_object< InputType2 >::value &&
grb::is_monoid< Monoid >::value,
@@ -1394,7 +2073,9 @@ namespace grb {
// check if can delegate to dense variant
const size_t n = size( z );
- if( (descr & descriptors::dense) || nnz( y ) == n ) {
+ if( (descr & descriptors::dense) || (
+ nnz( y ) == n && nnz( z ) == n
+ ) ) {
return eWiseApply< descr | descriptors::dense >(
z, alpha, y, monoid.getOperator(), phase
);
@@ -1405,7 +2086,7 @@ namespace grb {
return MISMATCH;
}
if( descr & descriptors::dense ) {
- if( nnz( y ) < n ) {
+ if( nnz( y ) < n || nnz( z ) < n ) {
return ILLEGAL;
}
}
@@ -1456,7 +2137,8 @@ namespace grb {
const Vector< InputType2, BSP1D, Coords > &y,
const Monoid &monoid,
const Phase &phase = EXECUTE,
- const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
!grb::is_object< InputType1 >::value &&
!grb::is_object< InputType2 >::value &&
grb::is_monoid< Monoid >::value,
@@ -1485,7 +2167,9 @@ namespace grb {
// check if we can delegate to dense variant
const size_t n = size( z );
- if( (descr & descriptors::dense) || (nnz( x ) == n && nnz( y ) == n) ) {
+ if( (descr & descriptors::dense) || (
+ nnz( x ) == n && nnz( y ) == n && nnz( z ) == n
+ ) ) {
return eWiseApply< descr | descriptors::dense >(
z, x, y, monoid.getOperator(), phase
);
@@ -1556,7 +2240,8 @@ namespace grb {
const Vector< InputType2, BSP1D, Coords > &y,
const Monoid &monoid,
const Phase &phase = EXECUTE,
- const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
!grb::is_object< MaskType >::value &&
!grb::is_object< InputType1 >::value &&
!grb::is_object< InputType2 >::value &&
@@ -1601,7 +2286,7 @@ namespace grb {
return MISMATCH;
}
if( descr & descriptors::dense ) {
- if( nnz( y ) < n || nnz( mask ) < n ) {
+ if( nnz( y ) < n || nnz( mask ) < n || nnz( z ) < n ) {
return ILLEGAL;
}
}
@@ -1656,7 +2341,8 @@ namespace grb {
const InputType2 beta,
const Monoid &monoid,
const Phase &phase = EXECUTE,
- const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
!grb::is_object< MaskType >::value &&
!grb::is_object< InputType1 >::value &&
!grb::is_object< InputType2 >::value &&
@@ -1707,6 +2393,9 @@ namespace grb {
if( nnz( x ) < n ) {
return ILLEGAL;
}
+ if( nnz( z ) < n ) {
+ return ILLEGAL;
+ }
}
// handle trivial resize phase
@@ -1759,7 +2448,8 @@ namespace grb {
const Vector< InputType2, BSP1D, Coords > &y,
const Monoid &monoid,
const Phase &phase = EXECUTE,
- const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
!grb::is_object< MaskType >::value &&
!grb::is_object< InputType1 >::value &&
!grb::is_object< InputType2 >::value &&
@@ -1807,7 +2497,7 @@ namespace grb {
return MISMATCH;
}
if( descr & descriptors::dense ) {
- if( nnz( x ) < n || nnz( y ) < n ) {
+ if( nnz( x ) < n || nnz( y ) < n || nnz( z ) < n ) {
return ILLEGAL;
}
if( nnz( mask ) < n ) {
@@ -2220,94 +2910,406 @@ namespace grb {
);
}
- /**
- * \internal Does not require communication.
- *
- * \warning This function has been deprecated since version 0.5. If required,
- * consider instead a sequence of grb::foldl using the additive
- * monoid, followed by a call to grb::eWiseMul.
- */
+ /**
+ * \internal Does not require communication.
+ *
+ * \warning This function has been deprecated since version 0.5. If required,
+ * consider instead a sequence of grb::foldl using the additive
+ * monoid, followed by a call to grb::eWiseMul.
+ */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename InputType3,
+ typename OutputType, typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, BSP1D, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Vector< InputType3, BSP1D, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+ const size_t n = grb::size( z );
+ if( n != grb::size( y ) ) {
+ return MISMATCH;
+ }
+ if( config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() && phase == RESIZE ) {
+ return SUCCESS;
+ }
+ if( phase == RESIZE ) {
+ return resize( z, n );
+ }
+
+ assert( phase == EXECUTE );
+ internal::setDense( z );
+ return grb::eWiseMulAdd< descr >(
+ internal::getLocal( z ), alpha, beta, internal::getLocal( y ), ring
+ );
+ }
+
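+ /*
+  * Sketch of the replacement the above deprecation note suggests
+  * (illustrative only; assumes a grb::Semiring instance named ring, whose
+  * additive monoid is retrieved via its getAdditiveMonoid() member):
+  *
+  *   // replaces z += alpha * beta + y (elementwise), per the note above
+  *   grb::RC rc = grb::foldl( z, y, ring.getAdditiveMonoid() );
+  *   if( rc == grb::SUCCESS ) {
+  *     rc = grb::eWiseMul( z, alpha, beta, ring );
+  *   }
+  */
+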
+ /**
+ * \internal Does not require communication.
+ *
+ * \warning This function has been deprecated since version 0.5. If required,
+ * consider instead a sequence of grb::foldl using the additive
+ * monoid, followed by a call to grb::eWiseMul.
+ */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename InputType3,
+ typename OutputType, typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, BSP1D, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const InputType3 gamma,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ if( config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() && phase == RESIZE ) {
+ return SUCCESS;
+ }
+ if( phase == RESIZE ) {
+ return resize( z, size( z ) );
+ }
+ assert( phase == EXECUTE );
+ internal::setDense( z );
+ return grb::eWiseMulAdd< descr >( internal::getLocal( z ), alpha, beta,
+ gamma, ring );
+ }
+
+ /** \internal Requires syncing of output nonzero count. */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, BSP1D, Coords > &z,
+ const Vector< InputType1, BSP1D, Coords > &x,
+ const Vector< InputType2, BSP1D, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMul",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMul",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ),
+ "grb::eWiseMul",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+
+ // dynamic checks
+ const size_t n = grb::size( z );
+ if( n != grb::size( x ) ) {
+ return MISMATCH;
+ }
+ if( n != grb::size( y ) ) {
+ return MISMATCH;
+ }
+ if( descr & descriptors::dense ) {
+ if( nnz( z ) < n || nnz( x ) < n || nnz( y ) < n ) {
+ return ILLEGAL;
+ }
+ }
+
+ // handle trivial resize
+ if( config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() &&
+ phase == RESIZE
+ ) {
+ return SUCCESS;
+ }
+
+ // delegate
+ RC ret = eWiseMul< descr >(
+ internal::getLocal( z ),
+ internal::getLocal( x ), internal::getLocal( y ),
+ ring, phase
+ );
+ if( !config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() ) {
+ if( collectives< BSP1D >::allreduce(
+ ret, grb::operators::any_or< RC >()
+ ) != SUCCESS ) {
+ return PANIC;
+ }
+ }
+
+ // handle try and execute phases
+ if( phase != RESIZE ) {
+ if( ret == SUCCESS ) {
+ ret = internal::updateNnz( z );
+ } else if( ret == FAILED ) {
+ const RC subrc = internal::updateNnz( z );
+ if( subrc != SUCCESS ) { ret = PANIC; }
+ }
+ }
+
+ // done
+ return ret;
+ }
+
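+ /*
+  * Usage sketch for the above (illustrative only; assumes double-valued
+  * vectors and the standard plus-times semiring):
+  *
+  *   grb::Semiring<
+  *     grb::operators::add< double >, grb::operators::mul< double >,
+  *     grb::identities::zero, grb::identities::one
+  *   > ring;
+  *   // z[ i ] += x[ i ] * y[ i ] over the intersection of x and y
+  *   grb::RC rc = grb::eWiseMul( z, x, y, ring );
+  */
+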
+ /** \internal Requires syncing of output nonzero count. */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, BSP1D, Coords > &z,
+ const InputType1 alpha,
+ const Vector< InputType2, BSP1D, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMul",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMul",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ),
+ "grb::eWiseMul",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+
+ // dynamic checks
+ const size_t n = grb::size( z );
+ if( n != grb::size( y ) ) {
+ return MISMATCH;
+ }
+ if( descr & descriptors::dense ) {
+ if( nnz( z ) < n || nnz( y ) < n ) {
+ return ILLEGAL;
+ }
+ }
+
+ // handle trivial resize
+ if( config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() &&
+ phase == RESIZE
+ ) {
+ return SUCCESS;
+ }
+
+ // delegate
+ RC ret = eWiseMul< descr >( internal::getLocal( z ), alpha,
+ internal::getLocal( y ), ring, phase );
+ if( !config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() ) {
+ if( collectives< BSP1D >::allreduce(
+ ret, grb::operators::any_or< RC >()
+ ) != SUCCESS ) {
+ return PANIC;
+ }
+ }
+
+ // handle execute and try phases
+ if( phase != RESIZE ) {
+ if( ret == SUCCESS ) {
+ ret = internal::updateNnz( z );
+ } else if( ret == FAILED ) {
+ const RC subrc = internal::updateNnz( z );
+ if( subrc != SUCCESS ) { ret = PANIC; }
+ }
+ }
+
+ // done
+ return ret;
+ }
+
+ /** \internal Requires syncing of output nonzero count. */
template<
Descriptor descr = descriptors::no_operation, class Ring,
- typename InputType1, typename InputType2, typename InputType3,
- typename OutputType, typename Coords
+ typename InputType1, typename InputType2, typename OutputType,
+ typename Coords
>
- RC eWiseMulAdd( Vector< OutputType, BSP1D, Coords > &z,
- const InputType1 alpha,
+ RC eWiseMul(
+ Vector< OutputType, BSP1D, Coords > &z,
+ const Vector< InputType1, BSP1D, Coords > &x,
const InputType2 beta,
- const Vector< InputType3, BSP1D, Coords > & y,
const Ring &ring = Ring(),
const Phase &phase = EXECUTE,
const typename std::enable_if< !grb::is_object< OutputType >::value &&
!grb::is_object< InputType1 >::value &&
!grb::is_object< InputType2 >::value &&
- !grb::is_object< InputType3 >::value &&
grb::is_semiring< Ring >::value, void
>::type * const = nullptr
) {
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMul",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMul",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ),
+ "grb::eWiseMul",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+
+ // dynamic checks
const size_t n = grb::size( z );
- if( n != grb::size( y ) ) {
+ if( n != grb::size( x ) ) {
return MISMATCH;
}
- if( config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() && phase == RESIZE ) {
+ if( descr & descriptors::dense ) {
+ if( nnz( z ) < n ) { return ILLEGAL; }
+ if( nnz( x ) < n ) { return ILLEGAL; }
+ }
+
+ // handle trivial resize
+ if( config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() &&
+ phase == RESIZE
+ ) {
return SUCCESS;
}
- if( phase == RESIZE ) {
- return resize( z, n );
+
+ // delegate
+ RC ret = eWiseMul< descr >( internal::getLocal( z ),
+ internal::getLocal( x ), beta, ring, phase );
+ if( !config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() ) {
+ if( collectives< BSP1D >::allreduce(
+ ret, grb::operators::any_or< RC >()
+ ) != SUCCESS ) {
+ return PANIC;
+ }
}
- assert( phase == EXECUTE );
- internal::setDense( z );
- return grb::eWiseMulAdd< descr >(
- internal::getLocal( z ), alpha, beta, internal::getLocal( y ), ring
- );
+ // handle try and execute phases
+ if( phase != RESIZE ) {
+ if( ret == SUCCESS ) {
+ ret = internal::updateNnz( z );
+ } else if( ret == FAILED ) {
+ const RC subrc = internal::updateNnz( z );
+ if( subrc != SUCCESS ) { ret = PANIC; }
+ }
+ }
+
+ // done
+ return ret;
}
- /**
- * \internal Does not require communication.
- *
- * \warning This function has been deprecated since version 0.5. If required,
- * consider instead a sequence of grb::foldl using the additive
- * monoid, followed by a call to grb::eWiseMul.
- */
+ /** \internal No implementation notes. */
template<
- Descriptor descr = descriptors::no_operation, class Ring,
- typename InputType1, typename InputType2, typename InputType3,
- typename OutputType, typename Coords
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename Coords
>
- RC eWiseMulAdd( Vector< OutputType, BSP1D, Coords > &z,
+ RC eWiseMul(
+ Vector< OutputType, BSP1D, Coords > &z,
const InputType1 alpha,
const InputType2 beta,
- const InputType3 gamma,
const Ring &ring = Ring(),
const Phase &phase = EXECUTE,
const typename std::enable_if< !grb::is_object< OutputType >::value &&
!grb::is_object< InputType1 >::value &&
!grb::is_object< InputType2 >::value &&
- !grb::is_object< InputType3 >::value &&
- grb::is_semiring< Ring >::value,
- void >::type * const = nullptr
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
) {
- if( config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() && phase == RESIZE ) {
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMul",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMul",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ),
+ "grb::eWiseMul",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+
+ // dynamic checks
+ const size_t n = grb::size( z );
+ if( descr & descriptors::dense ) {
+ if( nnz( z ) < n ) { return ILLEGAL; }
+ }
+
+ // handle trivial resize
+ if( config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() &&
+ phase == RESIZE
+ ) {
return SUCCESS;
}
- if( phase == RESIZE ) {
- return resize( z, size( z ) );
+
+ // delegate
+ RC ret = eWiseMul< descr >( internal::getLocal( z ),
+ alpha, beta, ring, phase );
+ if( !config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() ) {
+ if( collectives< BSP1D >::allreduce(
+ ret, grb::operators::any_or< RC >()
+ ) != SUCCESS ) {
+ return PANIC;
+ }
}
- assert( phase == EXECUTE );
- internal::setDense( z );
- return grb::eWiseMulAdd< descr >( internal::getLocal( z ), alpha, beta,
- gamma, ring );
+
+ // handle try and execute phases
+ if( phase != RESIZE ) {
+ if( ret == SUCCESS ) {
+ internal::setDense( z );
+ }
+ }
+
+ // done
+ return ret;
}
/** \internal Requires syncing of output nonzero count. */
template<
- Descriptor descr = descriptors::no_operation, class Ring,
+ Descriptor descr = descriptors::no_operation,
+ class Ring, typename MaskType,
typename InputType1, typename InputType2, typename OutputType,
typename Coords
>
RC eWiseMul(
Vector< OutputType, BSP1D, Coords > &z,
+ const Vector< MaskType, BSP1D, Coords > &m,
const Vector< InputType1, BSP1D, Coords > &x,
const Vector< InputType2, BSP1D, Coords > &y,
const Ring &ring = Ring(),
@@ -2318,8 +3320,28 @@ namespace grb {
grb::is_semiring< Ring >::value, void
>::type * const = nullptr
) {
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ), "grb::eWiseMul",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ), "grb::eWiseMul",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ), "grb::eWiseMul",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ), "grb::eWiseMulAdd",
+ "called with a mask vector with a non-bool element type" );
+
// dynamic checks
const size_t n = grb::size( z );
+ if( n != grb::size( m ) ) {
+ return MISMATCH;
+ }
if( n != grb::size( x ) ) {
return MISMATCH;
}
@@ -2327,7 +3349,7 @@ namespace grb {
return MISMATCH;
}
if( descr & descriptors::dense ) {
- if( nnz( x ) < n || nnz( y ) < n ) {
+ if( nnz( z ) < n || nnz( m ) < n || nnz( x ) < n || nnz( y ) < n ) {
return ILLEGAL;
}
}
@@ -2341,7 +3363,7 @@ namespace grb {
// delegate
RC ret = eWiseMul< descr >(
- internal::getLocal( z ),
+ internal::getLocal( z ), internal::getLocal( m ),
internal::getLocal( x ), internal::getLocal( y ),
ring, phase
);
@@ -2369,12 +3391,14 @@ namespace grb {
/** \internal Requires syncing of output nonzero count. */
template<
- Descriptor descr = descriptors::no_operation, class Ring,
+ Descriptor descr = descriptors::no_operation,
+ class Ring, typename MaskType,
typename InputType1, typename InputType2, typename OutputType,
typename Coords
>
RC eWiseMul(
Vector< OutputType, BSP1D, Coords > &z,
+ const Vector< MaskType, BSP1D, Coords > &m,
const InputType1 alpha,
const Vector< InputType2, BSP1D, Coords > &y,
const Ring &ring = Ring(),
@@ -2385,13 +3409,39 @@ namespace grb {
grb::is_semiring< Ring >::value, void
>::type * const = nullptr
) {
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMul",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMul",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ),
+ "grb::eWiseMul",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a mask vector _m with a non-bool element type" );
+
+ // check empty mask
+ if( size( m ) == 0 ) {
+ return eWiseMul< descr >( z, alpha, y, ring, phase );
+ }
+
// dynamic checks
- const size_t n = grb::size( z );
- if( n != grb::size( y ) ) {
+ const size_t n = size( z );
+ if( n != size( m ) || n != size( y ) ) {
return MISMATCH;
}
if( descr & descriptors::dense ) {
- if( nnz( y ) < n ) {
+ if( nnz( z ) < n || nnz( m ) < n || nnz( y ) < n ) {
return ILLEGAL;
}
}
@@ -2404,8 +3454,11 @@ namespace grb {
}
// delegate
- RC ret = eWiseMul< descr >( internal::getLocal( z ), alpha,
- internal::getLocal( y ), ring, phase );
+ RC ret = eWiseMul< descr >(
+ internal::getLocal( z ), internal::getLocal( m ),
+ alpha, internal::getLocal( y ),
+ ring, phase
+ );
if( !config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() ) {
if( collectives< BSP1D >::allreduce(
ret, grb::operators::any_or< RC >()
@@ -2430,12 +3483,14 @@ namespace grb {
/** \internal Requires syncing of output nonzero count. */
template<
- Descriptor descr = descriptors::no_operation, class Ring,
+ Descriptor descr = descriptors::no_operation,
+ class Ring, typename MaskType,
typename InputType1, typename InputType2, typename OutputType,
typename Coords
>
RC eWiseMul(
Vector< OutputType, BSP1D, Coords > &z,
+ const Vector< MaskType, BSP1D, Coords > &m,
const Vector< InputType1, BSP1D, Coords > &x,
const InputType2 beta,
const Ring &ring = Ring(),
@@ -2446,17 +3501,132 @@ namespace grb {
grb::is_semiring< Ring >::value, void
>::type * const = nullptr
) {
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMul",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMul",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ),
+ "grb::eWiseMul",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a mask vector _m with a non-bool element type" );
+
+ // check empty mask
+ if( size( m ) == 0 ) {
+ return eWiseMul< descr >( z, x, beta, ring, phase );
+ }
+
// dynamic checks
- const size_t n = grb::size( z );
- if( n != grb::size( x ) ) {
+ const size_t n = size( z );
+ if( n != size( m ) || n != size( x ) ) {
return MISMATCH;
}
if( descr & descriptors::dense ) {
- if( nnz( x ) < n ) {
- return ILLEGAL;
+ if( nnz( z ) < n ) { return ILLEGAL; }
+ if( nnz( m ) < n ) { return ILLEGAL; }
+ if( nnz( x ) < n ) { return ILLEGAL; }
+ }
+
+ // handle trivial resize
+ if( config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() &&
+ phase == RESIZE
+ ) {
+ return SUCCESS;
+ }
+
+ // delegate
+ RC ret = eWiseMul< descr >(
+ internal::getLocal( z ), internal::getLocal( m ),
+ internal::getLocal( x ), beta,
+ ring, phase
+ );
+ if( !config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() ) {
+ if( collectives< BSP1D >::allreduce(
+ ret, grb::operators::any_or< RC >()
+ ) != SUCCESS ) {
+ return PANIC;
+ }
+ }
+
+ // handle try and execute phases
+ if( phase != RESIZE ) {
+ if( ret == SUCCESS ) {
+ ret = internal::updateNnz( z );
+ } else if( ret == FAILED ) {
+ const RC subrc = internal::updateNnz( z );
+ if( subrc != SUCCESS ) { ret = PANIC; }
}
}
+ // done
+ return ret;
+ }
+
+ /** \internal Requires syncing of output nonzero count. */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring, typename MaskType,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, BSP1D, Coords > &z,
+ const Vector< MaskType, BSP1D, Coords > &m,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMul",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMul",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ),
+ "grb::eWiseMul",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a mask vector _m with a non-bool element type" );
+
+ // check empty mask
+ if( size( m ) == 0 ) {
+ return eWiseMul< descr >( z, alpha, beta, ring, phase );
+ }
+
+ // dynamic checks
+ const size_t n = size( z );
+ if( n != size( m ) ) { return MISMATCH; }
+ if( descr & descriptors::dense ) {
+ if( nnz( z ) < n ) { return ILLEGAL; }
+ if( nnz( m ) < n ) { return ILLEGAL; }
+ }
+
// handle trivial resize
if( config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() &&
phase == RESIZE
@@ -2465,8 +3635,11 @@ namespace grb {
}
// delegate
- RC ret = eWiseMul< descr >( internal::getLocal( z ),
- internal::getLocal( x ), beta, ring, phase );
+ RC ret = eWiseMul< descr >(
+ internal::getLocal( z ), internal::getLocal( m ),
+ alpha, beta,
+ ring, phase
+ );
if( !config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() ) {
if( collectives< BSP1D >::allreduce(
ret, grb::operators::any_or< RC >()
diff --git a/include/graphblas/bsp1d/blas2.hpp b/include/graphblas/bsp1d/blas2.hpp
index 7a0124bcc..42c5875d9 100644
--- a/include/graphblas/bsp1d/blas2.hpp
+++ b/include/graphblas/bsp1d/blas2.hpp
@@ -506,6 +506,39 @@ namespace grb {
}
}
+ /** \internal Dispatches to bsp1d_vxm or bsp1d_mxv */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring, typename Coords, typename RIT, typename CIT, typename NIT,
+ typename IOType = typename Ring::D4,
+ typename InputType1 = typename Ring::D1,
+ typename InputType2 = typename Ring::D2,
+ typename InputType3 = bool
+ >
+ RC vxm(
+ Vector< IOType, BSP1D, Coords > &u,
+ const Vector< InputType3, BSP1D, Coords > &u_mask,
+ const Vector< InputType1, BSP1D, Coords > &v,
+ const Matrix< InputType2, BSP1D, RIT, CIT, NIT > &A,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+ const Vector< bool, BSP1D, Coords > empty_mask( 0 );
+ // transpose is delegated to mxv
+ if( descr & descriptors::transpose_matrix ) {
+ return internal::bsp1d_mxv<
+ descr & ~( descriptors::transpose_matrix ), true, false, true
+ >( u, u_mask, A, v, empty_mask, ring, phase );
+ } else {
+ return internal::bsp1d_vxm< descr, true, false, true >(
+ u, u_mask, v, empty_mask, A, ring, phase
+ );
+ }
+ }
+
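+ /*
+  * Usage sketch for the masked vxm overload above (illustrative only; all
+  * names are placeholders):
+  *
+  *   // u = u_mask .* ( v A ); with transpose_matrix the call instead
+  *   // evaluates u = u_mask .* ( v A^T ) and is dispatched to mxv
+  *   grb::RC rc = grb::vxm< grb::descriptors::transpose_matrix >(
+  *     u, u_mask, v, A, ring
+  *   );
+  */
+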
/** \internal Dispatches to bsp1d_vxm or bsp1d_mxv */
template<
Descriptor descr = descriptors::no_operation,
diff --git a/include/graphblas/bsp1d/config.hpp b/include/graphblas/bsp1d/config.hpp
index 6134e2b58..12641135b 100644
--- a/include/graphblas/bsp1d/config.hpp
+++ b/include/graphblas/bsp1d/config.hpp
@@ -18,7 +18,7 @@
/**
* @file
*
- * Implements the various grb::config items for the grb::BSP1D backend.
+ * Contains the configuration parameters for the BSP1D backend.
*
* @author A. N. Yzelman
* @date 5th of May, 2017
@@ -41,64 +41,95 @@
namespace grb {
- /**
- * \defgroup bsp1d The BSP1D backend implementation
- *
- * Groups all definitions and documentations corresponding to the #BSP1D
- * backend.
- * @{
- */
-
namespace config {
/**
- * Defaults for the BSP1D implementation
+ * \defgroup bsp1dConfig BSP1D backend configuration
+ * \ingroup config
+ *
+ * All configuration parameters for the #BSP1D and #hybrid backends.
+ *
+ * @{
+ */
+
+ /**
+ * This class collects configuration parameters that are specific to the
+ * #grb::BSP1D and #grb::hybrid backends.
+ *
+ * \note The full set of implementation details are only visible within the
+ * developer documentation.
+ *
+ * \ingroup bsp1d
*/
template<>
- class IMPLEMENTATION< grb::Backend::BSP1D > {
+ class IMPLEMENTATION< BSP1D > {
private:
/**
+ * \internal
* \a true if and only if \a mode was set. By default, value is \a false.
+ * \endinternal
*/
static bool set;
/**
+ * \internal
* The selected mode. Only set if \a set is \a true.
+ * \endinternal
*/
static grb::config::ALLOC_MODE mode;
- /** Attempts to automatically deduce the best value for \a mode. */
+ /**
+ * \internal
+ * Attempts to automatically deduce the best value for \a mode.
+ * \endinternal
+ */
static void deduce() noexcept;
public:
/**
- * For private memory segments, which is the default, simply choose aligned
- * allocations.
+ * @returns The default allocation strategy for private memory segments.
*/
static constexpr ALLOC_MODE defaultAllocMode() {
return grb::config::ALLOC_MODE::ALIGNED;
}
/**
+ * \internal
* Whether the backend has vector capacities always fixed to their
* defaults.
+ * \endinternal
*/
static constexpr bool fixedVectorCapacities() {
return IMPLEMENTATION< _GRB_BSP1D_BACKEND >::fixedVectorCapacities();
}
/**
- * For the BSP1D backend, a shared memory-segment should use interleaved
- * alloc only if is running one process per compute node.
+ * @returns The default allocation strategy for shared memory regions.
+ *
+ * By default, for the BSP1D backend, a shared memory segment should use
+ * interleaved allocation only if the run deploys one process per compute
+ * node. This implies a run-time component to this function, which is why,
+ * for this backend, this function is \em not constexpr.
+ *
+ * \warning This function assumes that the number of processes does not
+ * change over the lifetime of an ALP context.
+ *
+ * \note While the above may seem a reasonably safe assumption, use of the
+ * launcher in #MANUAL mode may make this issue realistic. In such cases
+ * the deduction should be re-initiated. If you encounter this problem,
+ * please report it so that such a fix can be implemented.
*/
static grb::config::ALLOC_MODE sharedAllocMode() noexcept;
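+
+ /*
+  * Illustrative query of the above (sketch only):
+  *
+  *   const grb::config::ALLOC_MODE mode =
+  *     grb::config::IMPLEMENTATION< grb::BSP1D >::sharedAllocMode();
+  */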
/**
+ * \internal
* Select the coordinates backend of the selected process-local backend.
+ * \endinternal
*/
static constexpr Backend coordinatesBackend() {
return IMPLEMENTATION< _GRB_BSP1D_BACKEND >::coordinatesBackend();
@@ -106,9 +137,9 @@ namespace grb {
};
- } // namespace config
+ /** @} */
- /** @} */
+ } // namespace config
} // namespace grb
diff --git a/include/graphblas/bsp1d/exec.hpp b/include/graphblas/bsp1d/exec.hpp
index b520ada6d..e8e627aa9 100644
--- a/include/graphblas/bsp1d/exec.hpp
+++ b/include/graphblas/bsp1d/exec.hpp
@@ -337,11 +337,17 @@ namespace grb {
/** No implementation notes. */
template< typename U >
- RC exec( void ( *grb_program )( const void *, const size_t, U & ),
+ RC exec(
+ void ( *grb_program )( const void *, const size_t, U & ),
const void * data_in, const size_t in_size,
U &data_out,
const bool broadcast = false
) const {
+ // check input arguments
+ if( in_size > 0 && data_in == nullptr ) {
+ return ILLEGAL;
+ }
+
// prepare args
lpf_func_t fargs[ 2 ];
lpf_args_t args;
@@ -364,8 +370,9 @@ namespace grb {
/** No implementation notes. */
template< typename T, typename U >
- RC exec( void ( *grb_program )( const T &, U & ), // user GraphBLAS program
- const T &data_in, U &data_out, // input & output data
+ RC exec(
+ void ( *grb_program )( const T &, U & ), // user GraphBLAS program
+ const T &data_in, U &data_out, // input & output data
const bool broadcast = false
) {
// prepare args
@@ -451,14 +458,13 @@ namespace grb {
* @throws runtime_error When the requested launcher group
* could not be created.
*/
- Launcher( const size_t process_id = 0, // user process ID
+ Launcher(
+ const size_t process_id = 0, // user process ID
const size_t nprocs = 1, // total number of user processes
const std::string hostname = "localhost", // one of the process' hostnames
const std::string port = "0", // a free port at hostname
const bool is_mpi_inited = false
- ) : _s( process_id ),
- _P( nprocs ), _hostname( hostname ), _port( port )
- {
+ ) : _s( process_id ), _P( nprocs ), _hostname( hostname ), _port( port ) {
// sanity check
if( nprocs == 0 ) {
throw std::invalid_argument( "Total number of user processes must be "
@@ -556,6 +562,11 @@ namespace grb {
U &data_out,
const bool broadcast = false
) const {
+ // check input arguments
+ if( in_size > 0 && data_in == nullptr ) {
+ return ILLEGAL;
+ }
+
// prepare args
lpf_func_t fargs[ 2 ];
lpf_args_t args;
@@ -587,8 +598,9 @@ namespace grb {
/** No implementation notes. */
template< typename T, typename U >
- RC exec( void ( *grb_program )( const T &, U & ), // user GraphBLAS program
- const T &data_in, U &data_out, // input & output data
+ RC exec(
+ void ( *grb_program )( const T &, U & ), // user GraphBLAS program
+ const T &data_in, U &data_out, // input & output data
const bool broadcast = false
) {
// prepare args
diff --git a/include/graphblas/bsp1d/io.hpp b/include/graphblas/bsp1d/io.hpp
index 5302783e5..3643ca559 100644
--- a/include/graphblas/bsp1d/io.hpp
+++ b/include/graphblas/bsp1d/io.hpp
@@ -504,6 +504,13 @@ namespace grb {
) noexcept {
const size_t n = size( x );
const size_t old_nnz = nnz( x );
+
+ // dynamic checks
+ if( (descr & descriptors::dense) && nnz( x ) < n ) {
+ return ILLEGAL;
+ }
+
+ // capacity check
if( capacity( x ) < n ) {
if( phase == RESIZE ) {
return resize( x, n );
@@ -517,16 +524,20 @@ namespace grb {
}
}
+ // handle trivial resize
assert( capacity( x ) == n );
if( phase == RESIZE ) {
return SUCCESS;
}
+ // dispatch
assert( phase == EXECUTE );
RC ret = internal::set_handle_use_index< descr >( x, old_nnz, val );
if( ret == SUCCESS ) {
internal::setDense( x );
}
+
+ // done
return ret;
}
@@ -622,7 +633,7 @@ namespace grb {
return MISMATCH;
}
if( descr & descriptors::dense ) {
- if( nnz( y ) < size( y ) ) {
+ if( nnz( x ) < size( x ) || nnz( y ) < size( y ) ) {
return ILLEGAL;
}
}
@@ -699,7 +710,10 @@ namespace grb {
return MISMATCH;
}
if( descr & descriptors::dense ) {
- if( nnz( y ) < size( y ) || nnz( mask ) < size( mask ) ) {
+ if( nnz( x ) < size( x ) ||
+ nnz( y ) < size( y ) ||
+ nnz( mask ) < size( mask )
+ ) {
return ILLEGAL;
}
}
@@ -765,11 +779,21 @@ namespace grb {
return MISMATCH;
}
+ // dynamic checks
+ if( (descr & descriptors::dense) && nnz( x ) < size( x ) ) {
+ return ILLEGAL;
+ }
+ if( (descr & descriptors::dense) && nnz( mask ) < size( mask ) ) {
+ return ILLEGAL;
+ }
+
// on capacity pre-check, see above
// all OK, try to do assignment
- RC ret = set< descr >( internal::getLocal( x ),
- internal::getLocal( mask ), y, phase );
+ RC ret = set< descr >(
+ internal::getLocal( x ),
+ internal::getLocal( mask ), y, phase
+ );
if( collectives< BSP1D >::allreduce( ret, operators::any_or< RC >() )
!= SUCCESS
diff --git a/include/graphblas/bsp1d/properties.hpp b/include/graphblas/bsp1d/properties.hpp
index f87cb1c54..8c28386bf 100644
--- a/include/graphblas/bsp1d/properties.hpp
+++ b/include/graphblas/bsp1d/properties.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Gathers the properties of the BSP1D and hybrid backends.
+ *
* @author A. N. Yzelman
* @date 5th of May 2017
*/
@@ -30,11 +34,31 @@ namespace grb {
/** No implementation notes. */
template<>
class Properties< BSP1D > {
- public:
- /** No implementation notes. */
- constexpr static bool writableCaptured = Properties< _GRB_BSP1D_BACKEND >::writableCaptured;
+
+ public:
+
+ /** This property is inherited from the backend it depends on. */
+ static constexpr const bool writableCaptured =
+ Properties< _GRB_BSP1D_BACKEND >::writableCaptured;
+
+ /**
+ * This implementation at present only supports blocking execution.
+ */
+ static constexpr const bool isBlockingExecution = true;
+
+ /**
+ * This implementation at present only supports blocking execution.
+ */
+ static constexpr const bool isNonblockingExecution = false;
+
+ static_assert( Properties< _GRB_BSP1D_BACKEND >::isBlockingExecution,
+ "This implementation assumes blocking behaviour of the underlying "
+ "process-local backend"
+ );
+
};
} // namespace grb
#endif // end ``_H_GRB_BSP1D_PROPERTIES''
+
diff --git a/include/graphblas/bsp1d/vector.hpp b/include/graphblas/bsp1d/vector.hpp
index 465f5a661..1e85db74e 100644
--- a/include/graphblas/bsp1d/vector.hpp
+++ b/include/graphblas/bsp1d/vector.hpp
@@ -618,14 +618,15 @@ namespace grb {
const size_t bufferSize =
internal::Coordinates< _GRB_BSP1D_BACKEND >::bufferSize( _local_n ) +
internal::Coordinates< _GRB_BSP1D_BACKEND >::bufferSize( cap_in );
+ // allocate raw, assigned, and stack arrays
const RC rc = grb::utils::alloc(
"grb::Vector< T, BSP1D, C > (initialize)", sstream.str(),
- _raw, cap_in, true, _raw_deleter, // allocate raw array
+ _raw, cap_in, true, _raw_deleter,
new_assigned,
internal::Coordinates< _GRB_BSP1D_BACKEND >::arraySize( cap_in ),
true,
- _assigned_deleter, // allocate assigned array
- _buffer, bufferSize, true, _buffer_deleter // allocate (stack) buffer
+ _assigned_deleter,
+ _buffer, bufferSize, true, _buffer_deleter
);
// identify error and throw
if( rc == OUTOFMEM ) {
@@ -2439,6 +2440,26 @@ namespace grb {
// done
}
+ /**
+ * Copy-assignment.
+ *
+ * Same performance semantics as #grb::set.
+ *
+ * \warning Errors will be thrown as standard C++ exceptions. Users who would
+ * rather not deal with exceptions are encouraged to use #grb::set directly.
+ *
+ * \internal Dispatches to #grb::set.
+ */
+ Vector< D, BSP1D, C > & operator=( Vector< D, BSP1D, C > &x ) {
+ const auto rc = set( *this, x );
+ if( rc != SUCCESS ) {
+ throw std::runtime_error( "grb::set inside copy-assignment: "
+ + toString( rc )
+ );
+ }
+ return *this;
+ }
+
/**
* Assign-from-temporary. This is a \f$ \Theta(1) \f$ operation.
*
diff --git a/include/graphblas/collectives.hpp b/include/graphblas/collectives.hpp
index a99eac739..8ca63fd3e 100644
--- a/include/graphblas/collectives.hpp
+++ b/include/graphblas/collectives.hpp
@@ -28,13 +28,19 @@
// include template specialisations
#ifdef _GRB_WITH_REFERENCE
-#include <graphblas/reference/collectives.hpp>
+ #include <graphblas/reference/collectives.hpp>
+#endif
+#ifdef _GRB_WITH_HYPERDAGS
+ #include <graphblas/hyperdags/collectives.hpp>
+#endif
+#ifdef _GRB_WITH_NONBLOCKING
+ #include "graphblas/nonblocking/collectives.hpp"
#endif
#ifdef _GRB_WITH_LPF
-#include <graphblas/bsp1d/collectives.hpp>
+ #include <graphblas/bsp1d/collectives.hpp>
#endif
#ifdef _GRB_WITH_BANSHEE
-#include <graphblas/banshee/collectives.hpp>
+ #include <graphblas/banshee/collectives.hpp>
#endif
// specify default only if requested during compilation
@@ -46,3 +52,4 @@ namespace grb {
#endif
#endif // end ``_H_GRB_COLL''
+
diff --git a/include/graphblas/config.hpp b/include/graphblas/config.hpp
index 8ef4e6a8d..d7c2a650f 100644
--- a/include/graphblas/config.hpp
+++ b/include/graphblas/config.hpp
@@ -32,6 +32,9 @@
#ifdef _GRB_WITH_HYPERDAGS
#include "graphblas/hyperdags/config.hpp"
#endif
+#ifdef _GRB_WITH_NONBLOCKING
+ #include "graphblas/nonblocking/config.hpp"
+#endif
#ifdef _GRB_WITH_OMP
#include "graphblas/omp/config.hpp"
#endif
diff --git a/include/graphblas/coordinates.hpp b/include/graphblas/coordinates.hpp
index 3ddf662bb..43f5c9845 100644
--- a/include/graphblas/coordinates.hpp
+++ b/include/graphblas/coordinates.hpp
@@ -27,13 +27,17 @@
// now include all specialisations contained in the backend directories:
#ifdef _GRB_WITH_REFERENCE
-#include <graphblas/reference/coordinates.hpp>
+ #include <graphblas/reference/coordinates.hpp>
+#endif
+#ifdef _GRB_WITH_NONBLOCKING
+ #include "graphblas/nonblocking/coordinates.hpp"
#endif
#ifdef _GRB_WITH_LPF
// #include <graphblas/bsp1d/coordinates.hpp>
#endif
#ifdef _GRB_WITH_BANSHEE
-#include <graphblas/banshee/coordinates.hpp>
+ #include <graphblas/banshee/coordinates.hpp>
#endif
#endif // _H_GRB_COORDINATES
+
diff --git a/include/graphblas/descriptors.hpp b/include/graphblas/descriptors.hpp
index 1fe3f9836..c90cb5c3d 100644
--- a/include/graphblas/descriptors.hpp
+++ b/include/graphblas/descriptors.hpp
@@ -18,7 +18,7 @@
/**
* @file
*
- * Defines the GraphBLAS various descriptors.
+ * Defines all ALP/GraphBLAS descriptors.
*
* @author A. N. Yzelman
* @date 15 March, 2016
@@ -117,11 +117,11 @@ namespace grb {
static constexpr Descriptor structural_complement = structural | invert_mask;
/**
- * Indicates that all input vectors to an ALP/GraphBLAS primitive are
- * structurally dense.
+ * Indicates that all input and output vectors to an ALP/GraphBLAS primitive
+ * are structurally dense.
*
- * If a user passes this descriptor but one or more vectors input to the call
- * are \em not structurally dense, then #ILLEGAL shall be returned.
+ * If a user passes this descriptor but one or more vectors to the call are
+ * \em not structurally dense, then #ILLEGAL shall be returned.
*
* \warning All vectors includes any vectors that operate as masks.
* Thus if the primitive is to operate with structurally sparse masks
@@ -134,6 +134,10 @@ namespace grb {
* passing this descriptor to such primitive indicates that also the
* output vector is structurally dense.
*
+ * \warning For out-of-place operations with vector output(s), passing this
+ * descriptor also demands that the output vectors are already
+ * dense.
+ *
* \warning Vectors with explicit zeroes (under the semiring passed to the
* related primitive) will be computed with explicitly.
*
@@ -141,6 +145,7 @@ namespace grb {
* 1) less run-time overhead as code handling sparsity is disabled;
* 2) smaller binary sizes as code handling structurally sparse vectors is
* not emitted (unless required elsewhere).
+ *
* The consistent use of this descriptor is hence strongly encouraged.
*/
static constexpr Descriptor dense = 16;
diff --git a/include/graphblas/distribution.hpp b/include/graphblas/distribution.hpp
index 845400337..a382b27ef 100644
--- a/include/graphblas/distribution.hpp
+++ b/include/graphblas/distribution.hpp
@@ -26,10 +26,11 @@
#include "base/distribution.hpp"
#ifdef _GRB_WITH_LPF
-#include "graphblas/bsp1d/distribution.hpp"
+ #include "graphblas/bsp1d/distribution.hpp"
#endif
#ifdef _GRB_WITH_BANSHEE
-#include "graphblas/banshee/distribution.hpp"
+ #include "graphblas/banshee/distribution.hpp"
#endif
#endif // end `_H_GRB_DISTRIBUTION'
+
diff --git a/include/graphblas/exec.hpp b/include/graphblas/exec.hpp
index 22a5bc422..2bcf796aa 100644
--- a/include/graphblas/exec.hpp
+++ b/include/graphblas/exec.hpp
@@ -28,13 +28,19 @@
// include template specialisations
#ifdef _GRB_WITH_REFERENCE
-#include "graphblas/reference/exec.hpp"
+ #include "graphblas/reference/exec.hpp"
+#endif
+#ifdef _GRB_WITH_HYPERDAGS
+ #include "graphblas/hyperdags/exec.hpp"
+#endif
+#ifdef _GRB_WITH_NONBLOCKING
+ #include "graphblas/nonblocking/exec.hpp"
#endif
#ifdef _GRB_WITH_LPF
-#include "graphblas/bsp1d/exec.hpp"
+ #include "graphblas/bsp1d/exec.hpp"
#endif
#ifdef _GRB_WITH_BANSHEE
-#include "graphblas/banshee/exec.hpp"
+ #include "graphblas/banshee/exec.hpp"
#endif
#ifdef _GRB_BACKEND
@@ -45,3 +51,4 @@ namespace grb {
#endif
#endif // end ``_H_GRB_EXEC''
+
diff --git a/include/graphblas/hyperdags/README.md b/include/graphblas/hyperdags/README.md
new file mode 100644
index 000000000..4ebfe820f
--- /dev/null
+++ b/include/graphblas/hyperdags/README.md
@@ -0,0 +1,304 @@
+
+This backend gathers meta-data while user programs execute. The actual compute
+logic is executed by a compile-time selected secondary backend, which by default
+is the `reference` backend. The meta-data will be used to generate, at program
+exit, a HyperDAG representation of the executed computation. We foresee two
+possible HyperDAG representations:
+
+ 1. a coarse-grain representation where vertices correspond to a) source
+ containers (vectors or matrices-- not scalars), b) output containers, or
+ c) ALP/GraphBLAS primitives (such as grb::mxv or grb::dot). Hyperedges
+ capture which vertices act as sources to operations or to outputs in other
+ vertices. Each hyperedge has exactly one source vertex.
+
+ 2. a fine-grain representation where source vertices correspond to nonzeroes
+ in a source container, not the container as a whole, and likewise for output
+ vertices that correspond to individual elements of output containers. Also,
+ a single ALP/GraphBLAS primitive now expands into many fine-grained
+ operation vertices. For example, a call to grb::vxm will emit two
+ hyperedges for every nonzero in the sparse input matrix.
+
+Only the extraction of a coarse-grained representation is presently implemented.
+
+Usage
+=====
+
+To use the HyperDAG generation backend, take the following steps. Note that
+steps 1-5 are common to building the general ALP/GraphBLAS template library.
+Steps 6 and 7 showcase the HyperDAG generation using representation no. 1 on
+the `tests/unit/dot.cpp` unit test.
+
+1. `cd /path/to/ALP/GraphBLAS/root/directory`
+
+2. `./configure --prefix=/path/to/install/directory`
+
+3. `cd build`
+
+4. `make -j && make -j install`
+
+5. `source /path/to/install/directory/bin/setenv`
+
+6. `grbcxx -b hyperdags -g -O0 -Wall -o dot_hyperdag ../tests/unit/dot.cpp`
+
+7. `grbrun -b hyperdags ./dot_hyperdag`
+
+After these steps, something like the following will be produced:
+
+```
+This is functional test ./dot_hyperdag
+Info: grb::init (hyperdags) called.
+Info: grb::init (reference) called.
+Info: grb::finalize (hyperdags) called.
+ dumping HyperDAG to stdout
+%%MatrixMarket matrix coordinate pattern general
+% Source vertices:
+% 0: container initialised by a call to set no. 0
+% 1: container initialised by a call to set no. 1
+% 2: input scalar no. 0
+% 6: input scalar no. 1
+...more comment lines follow...
+% 212: input scalar no. 103
+% 213: user-initialised container no. 0
+% 214: user-initialised container no. 1
+214 216 428
+0 2
+0 3
+1 0
+1 3
+2 1
+2 3
+...more pins follow...
+213 214
+213 215
+Info: grb::finalize (reference) called.
+Test OK
+```
+
+This output contains the HyperDAG corresponding to the code in the given source
+file, `tests/unit/dot.cpp`. Let us examine it. First, ALP/GraphBLAS will always
+print info (and warning) statements to the standard error stream. These are:
+
+```
+$ grbrun -b hyperdags ./dot_hyperdag 1> /dev/null
+Info: grb::init (hyperdags) called.
+Info: grb::init (reference) called.
+Info: grb::finalize (hyperdags) called.
+ dumping HyperDAG to stdout
+Info: grb::finalize (reference) called.
+```
+
+These statements indicate which backends are used and when they are
+initialised and finalised, respectively. The info messages indicate that the
+hyperdags backend is used, which, in turn, employs the standard sequential
+reference backend for the actual computations. The second-to-last message
+reports that, as part of finalising the hyperdags backend, it dumps the
+HyperDAG constructed during computations to the standard output stream
+(stdout).
+
+The output to stdout starts with
+
+```
+%%MatrixMarket matrix coordinate pattern general
+```
+
+This indicates the HyperDAG is stored using a MatrixMarket format. As the name
+implies, this format stores sparse matrices, so we need a definition of how the
+sparse matrix is mapped back to a HyperDAG. Here, rows correspond to hyperedges
+while columns correspond to vertices.
+
+In the MatrixMarket format, comments are allowed and should start with a `%`.
+The hyperdags backend presently prints which vertices are sources as comment
+lines. Information on the operation and output vertices may be added later.
+
+After the comments follows the so-called header line:
+
+```
+214 216 428
+```
+
+This indicates that there are 214 hyperedges, 216 vertices, and 428 pins in
+the output HyperDAG. What then follows is one line for each of the pins,
+printed as a pair of hyperedge and vertex IDs.
+
+For example, the first two pins contain:
+
+```
+0 2
+0 3
+```
+
+These pins connect hyperedge 0 to vertices 2 and 3, which the comments note
+are an input scalar and a non-source vertex, respectively. The corresponding
+first statements of `tests/unit/dot.cpp` are as follows. It stands to reason
+that vertex 2 corresponds to the scalar `out` in the below code, while vertex
+3 corresponds to the scalar output of the `grb::dot`.
+
+```
+ double out = 2.55;
+ grb::Vector< double > left( n );
+ grb::Vector< double > right( n );
+ grb::set( left, 1.5 );
+ grb::set( right, -1.0 );
+ grb::dot( out, left, right, ring );
+```
+
+If this reading is correct, then there should also be two hyperedges connecting
+`left` and `right` to vertex 3, the output of `grb::dot`. Indeed the next four
+pins are
+
+```
+1 0
+1 3
+2 1
+2 3
+```
+
+which indeed correspond to two hyperedges connecting `left` and `right` to the
+output of `grb::dot`. Do note that thus far the HyperDAG is in fact just a DAG,
+given every hyperedge has exactly two pins.
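+
+Such a dump may be consumed programmatically like any other MatrixMarket
+coordinate pattern file. The below is a minimal sketch only: it assumes the
+MatrixMarket portion of the output was saved to a hypothetically-named file
+`hyperdag.mtx`, and it treats the printed IDs as zero-based (as in the dump
+above), whereas standard MatrixMarket files are one-based.
+
+```
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+int main() {
+	std::ifstream in( "hyperdag.mtx" );
+	std::string line;
+	// skip the banner and all comment lines
+	while( std::getline( in, line ) && !line.empty() && line[ 0 ] == '%' ) {}
+	// the remaining line is the header: #hyperedges #vertices #pins
+	std::istringstream header( line );
+	size_t m = 0, n = 0, pins = 0;
+	header >> m >> n >> pins;
+	// read each pin as a (hyperedge ID, vertex ID) pair
+	std::vector< std::vector< size_t > > hyperedges( m );
+	for( size_t k = 0; k < pins; ++k ) {
+		size_t e, v;
+		in >> e >> v;
+		hyperedges[ e ].push_back( v );
+	}
+	std::cout << "Read " << m << " hyperedges over " << n << " vertices\n";
+	return 0;
+}
+```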
+
+
+Extending the HyperDAGs backend
+===============================
+
+We now briefly visit the implementation of the HyperDAGs backend. The
+implementation of the `hyperdags` `grb::dot` is as follows:
+
+```
+template<
+ Descriptor descr = descriptors::no_operation,
+ class AddMonoid, class AnyOp,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+>
+RC dot( OutputType &z,
+ const Vector< InputType1, hyperdags, Coords > &x,
+ const Vector< InputType2, hyperdags, Coords > &y,
+ const AddMonoid &addMonoid = AddMonoid(),
+ const AnyOp &anyOp = AnyOp(),
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< AddMonoid >::value &&
+ grb::is_operator< AnyOp >::value,
+ void >::type * const = nullptr
+) {
+...
+```
+
+The signature of the `grb::dot` follows the specification that is found in
+`include/graphblas/reference/blas1.hpp`-- if we need to add a new primitive,
+the first step is to simply copy the signature from the reference backend and
+then change any container template arguments that read `reference` into
+`hyperdags`. This makes sure that the compiler will select the implementation
+we are providing here whenever it needs to generate code for a dot-product using
+the hyperdags backend.
+
+The source file continues:
+```
+ // always force input scalar to be a new source
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &z
+ );
+ ...
+```
+
+Here, we recognise that `z` is an input to the algorithm and needs to be
+registered as a source vertex. Recall that by the `grb::dot` specification,
+`z` is indeed computed in-place: `z += < x, y >`.
+
+The source continues with registering the sources and destinations (outputs) of
+the dot-operation itself:
+
+```
+ std::array< const void *, 1 > sourcesP{ &z };
+ std::array< uintptr_t, 2 > sourcesC{ getID( x ), getID( y ) };
+ std::array< uintptr_t, 1 > destinations{ getID( z ) };
+ ...
+```
+Note that this records auxiliary scalars using pointers, while ALP/GraphBLAS
+containers are registered using their container IDs. With that done, we finally
+record the operation, as follows:
+
+```
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::DOT,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ ...
+```
+
+Here, the `addOperation` needs to know the type of operation (`DOT`), what its
+sources are (given here by iterator pairs over the `sourcesP` and `sourcesC`
+arrays), and what its destinations are (ditto).
+
+The attentive reader will realise that so far no computation has occurred--
+we only recorded sources and the intended operation. We hence finish up by
+actually performing the requested computation, relying fully on the
+reference backend instead of reimplementing things all over again:
+
+```
+ return dot( z,
+ internal::getVector(x), internal::getVector(y),
+ addMonoid, anyOp
+ );
+}
+```
+
+Here, the `internal::getVector` wrapper function retrieves the reference
+backend's version of each input vector, so that the actual computation is
+carried out by the reference backend.
+
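+Putting the above together: a new primitive in the hyperdags backend typically
+(i) copies the reference signature, (ii) registers non-container inputs as
+sources, (iii) records the operation, and (iv) delegates the computation. The
+following minimal sketch illustrates this pattern for a hypothetical unary
+primitive `foo` with a likewise hypothetical operation type `FOO`; phases and
+error codes are again ignored:
+
+```
+template< typename IOType, typename InputType, typename Coords >
+RC foo(
+	Vector< IOType, hyperdags, Coords > &z,
+	const Vector< InputType, hyperdags, Coords > &x
+) {
+	// this example has no scalar inputs; otherwise, register each of them
+	// as a source vertex via internal::hyperdags::generator.addSource
+	std::array< const void *, 0 > sourcesP{};
+	std::array< uintptr_t, 2 > sourcesC{
+		getID( internal::getVector(x) ),
+		getID( internal::getVector(z) )
+	};
+	std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+	internal::hyperdags::generator.addOperation(
+		internal::hyperdags::FOO,
+		sourcesP.begin(), sourcesP.end(),
+		sourcesC.begin(), sourcesC.end(),
+		destinations.begin(), destinations.end()
+	);
+	// delegate the actual computation to the secondary backend
+	return foo( internal::getVector(z), internal::getVector(x) );
+}
+```
+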
+This quick description ignores phases and error codes -- please see some of the
+actual code snippets in the hyperdags backend for error-safe programming
+patterns.
+
+
+Registering new operation and source types
+==========================================
+
+One may want to register a new type of operation vertex or source vertex. For
+this, see `include/graphblas/hyperdags/hyperdags.hpp` and, in the case of source
+vertices, look for the following enum:
+
+```
+enum SourceVertexType {
+ SCALAR,
+ CONTAINER,
+ ITERATOR,
+ USER_INT
+};
+
+const constexpr size_t numSourceVertexTypes = 4;
+
+const constexpr enum SourceVertexType
+ allSourceVertexTypes[ numSourceVertexTypes ] =
+{
+ SCALAR,
+ CONTAINER,
+ ITERATOR,
+ USER_INT
+};
+```
+
+A new type of source vertex should:
+
+1. be added to the enum. While not copied here, every entry is accompanied by
+ documentation that describes unambiguously where such a source vertex comes
+ from and how and when it is generated;
+
+2. increment the `numSourceVertexTypes` counter;
+
+3. be added to the `allSourceVertexTypes` array; and, finally,
+
+4. be handled by the `toString` function in
+ `src/graphblas/hyperdags/hyperdags.cpp` (the sketch below applies all four
+ steps).
+
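+For illustration, the following minimal sketch registers a hypothetical new
+source type `FILE_SOURCE`-- the name is a placeholder only, not part of the
+actual backend:
+
+```
+enum SourceVertexType {
+	SCALAR,
+	CONTAINER,
+	ITERATOR,
+	USER_INT,
+	FILE_SOURCE // (1) hypothetical new entry, with documentation
+};
+
+const constexpr size_t numSourceVertexTypes = 5; // (2) was 4
+
+const constexpr enum SourceVertexType
+	allSourceVertexTypes[ numSourceVertexTypes ] =
+{
+	SCALAR,
+	CONTAINER,
+	ITERATOR,
+	USER_INT,
+	FILE_SOURCE // (3) also appended here
+};
+
+// (4) finally, extend toString in src/graphblas/hyperdags/hyperdags.cpp so
+// that it also maps FILE_SOURCE to a human-readable string
+```
+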
+To add new operation vertex types, the same recipe should be followed, using
+the `OperationVertexType` enum, the `numOperationVertexTypes` counter, and the
+`allOperationVertexTypes` array; and similarly for output vertex types.
+
diff --git a/include/graphblas/hyperdags/alloc.hpp b/include/graphblas/hyperdags/alloc.hpp
new file mode 100644
index 000000000..4806f694f
--- /dev/null
+++ b/include/graphblas/hyperdags/alloc.hpp
@@ -0,0 +1,58 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Provides allocators for the hyperdags backend
+ *
+ * @author A. N. Yzelman
+ * @date 31st of January, 2022
+ */
+
+#ifndef _H_GRB_HYPERDAGS_ALLOC
+#define _H_GRB_HYPERDAGS_ALLOC
+
+
+namespace grb {
+
+ namespace utils {
+
+ namespace internal {
+
+ template<>
+ class Allocator< hyperdags > {
+
+ private:
+
+ /** Prevent instantiation. */
+ Allocator();
+
+ public:
+
+ /** Refer to the standard allocation mechanism. */
+ typedef AllocatorFunctions< _GRB_WITH_HYPERDAGS_USING > functions;
+ };
+
+ } // namespace internal
+
+ } // namespace utils
+
+} // namespace grb
+
+#endif
+
diff --git a/include/graphblas/hyperdags/benchmark.hpp b/include/graphblas/hyperdags/benchmark.hpp
new file mode 100644
index 000000000..23502f33c
--- /dev/null
+++ b/include/graphblas/hyperdags/benchmark.hpp
@@ -0,0 +1,101 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Provides the Benchmarker for the HyperDAGs backend
+ *
+ * @author A. Karanasiou
+ * @date 11th of May, 2022
+ */
+
+#ifndef _H_GRB_HYPERDAGS_BENCH
+#define _H_GRB_HYPERDAGS_BENCH
+
+#include <graphblas/rc.hpp>
+#include <graphblas/base/benchmark.hpp>
+
+#include "exec.hpp"
+
+
+namespace grb {
+
+ /** \internal Simply wraps around the underlying Benchmarker implementation. */
+ template< enum EXEC_MODE mode >
+ class Benchmarker< mode, hyperdags > :
+ protected Launcher< mode, hyperdags >, protected internal::BenchmarkerBase
+ {
+
+ private:
+
+ typedef Benchmarker< mode, _GRB_WITH_HYPERDAGS_USING > MyBenchmarkerType;
+
+ MyBenchmarkerType benchmarker;
+
+
+ public:
+
+ /** \internal Simple delegation. */
+ Benchmarker(
+ const size_t process_id = 0,
+ const size_t nprocs = 1,
+ const std::string hostname = "localhost",
+ const std::string port = "0"
+ ) :
+ benchmarker( process_id, nprocs, hostname, port )
+ {}
+
+ /** \internal Simple delegation. */
+ template< typename U >
+ RC exec( void ( *grb_program )( const void *, const size_t, U & ),
+ const void * const data_in, const size_t in_size,
+ U &data_out,
+ const size_t inner, const size_t outer,
+ const bool broadcast = false
+ ) const {
+ return benchmarker.exec(
+ grb_program,
+ data_in, in_size,
+ data_out,
+ inner, outer,
+ broadcast
+ );
+ }
+
+ /** \internal Simple delegation. */
+ template< typename T, typename U >
+ RC exec(
+ void ( *grb_program )( const T &, U & ),
+ const T &data_in, U &data_out,
+ const size_t inner, const size_t outer,
+ const bool broadcast = false
+ ) {
+ return benchmarker.exec(
+ grb_program,
+ data_in, data_out,
+ inner, outer,
+ broadcast
+ );
+ }
+
+ };
+
+} // namespace grb
+
+#endif // end ``_H_GRB_HYPERDAGS_BENCH''
+
diff --git a/include/graphblas/hyperdags/blas1.hpp b/include/graphblas/hyperdags/blas1.hpp
new file mode 100644
index 000000000..623174345
--- /dev/null
+++ b/include/graphblas/hyperdags/blas1.hpp
@@ -0,0 +1,2987 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Provides the "level-1" primitives for the HyperDAGs backend
+ *
+ * @author A. N. Yzelman
+ * @date 31st of January, 2022
+ */
+
+#ifndef _H_GRB_HYPERDAGS_BLAS1
+#define _H_GRB_HYPERDAGS_BLAS1
+
+#include <array>
+
+#include <graphblas/rc.hpp>
+
+#include <graphblas/hyperdags/hyperdags.hpp>
+
+
+namespace grb {
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class AddMonoid, class AnyOp,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC dot(
+ OutputType &z,
+ const Vector< InputType1, hyperdags, Coords > &x,
+ const Vector< InputType2, hyperdags, Coords > &y,
+ const AddMonoid &addMonoid = AddMonoid(),
+ const AnyOp &anyOp = AnyOp(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< AddMonoid >::value &&
+ grb::is_operator< AnyOp >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = dot< descr >(
+ z, internal::getVector(x), internal::getVector(y),
+ addMonoid, anyOp, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &z
+ );
+ std::array< const void *, 1 > sourcesP{ &z };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) )
+ };
+ std::array< uintptr_t, 0 > destinations{};
+ // NOTE scalar output is ignored
+ //std::array< const void *, 1 > destinationsP{ &z };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::DOT,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename InputType1, typename InputType2,
+ class Semiring, typename Coords
+ >
+ RC dot(
+ OutputType &z,
+ const Vector< InputType1, hyperdags, Coords > &x,
+ const Vector< InputType2, hyperdags, Coords > &y,
+ const Semiring &ring = Semiring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Semiring >::value,
+ void >::type * const = nullptr
+ ) {
+ // note: dispatches to the above dot-variant, which will handle the HyperDAG
+ // generation.
+ return dot< descr >(
+ z, x, y,
+ ring.getAdditiveMonoid(), ring.getMultiplicativeOperator(),
+ phase
+ );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename T, typename U, typename Coords
+ >
+ RC zip(
+ Vector< std::pair< T, U >, hyperdags, Coords > &z,
+ const Vector< T, hyperdags, Coords > &x,
+ const Vector< U, hyperdags, Coords > &y,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< T >::value &&
+ !grb::is_object< U >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = zip< descr >(
+ internal::getVector(z),
+ internal::getVector(x), internal::getVector(y),
+ phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::ZIP,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename T, typename U, typename Coords
+ >
+ RC unzip(
+ Vector< T, hyperdags, Coords > &x,
+ Vector< U, hyperdags, Coords > &y,
+ const Vector< std::pair< T, U >, hyperdags, Coords > &in,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< T >::value &&
+ !grb::is_object< U >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = unzip< descr >(
+ internal::getVector(x), internal::getVector(y), internal::getVector(in),
+ phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(in) ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(in) )
+ };
+ std::array< uintptr_t, 2 > destinations{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) )
+ };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::UNZIP_VECTOR_VECTOR_VECTOR,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class OP,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< InputType1, hyperdags, Coords > &x,
+ const Vector< InputType2, hyperdags, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseApply< descr >(
+ internal::getVector(z),
+ internal::getVector(x), internal::getVector(y),
+ op, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::E_WISE_APPLY_VECTOR_VECTOR_VECTOR_OP,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename InputType, typename IOType, typename Coords
+ >
+ RC foldr(
+ const Vector< InputType, hyperdags, Coords > &x,
+ IOType &beta,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< InputType >::value &&
+ !grb::is_object< IOType >::value &&
+ grb::is_monoid< Monoid >::value, void
+ >::type * const = nullptr
+ ) {
+ const RC ret = foldr< descr >( internal::getVector(x), beta, monoid, phase );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 1 > sourcesP{ &beta };
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getVector(x) ) };
+ std::array< uintptr_t, 0 > destinations{};
+ // NOTE scalar output is ignored
+ //std::array< const void *, 1 > destinationsP{ &beta };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::FOLDR_VECTOR_SCALAR_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename InputType, typename MaskType, typename IOType, typename Coords
+ >
+ RC foldr(
+ const Vector< InputType, hyperdags, Coords > &x,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ IOType &beta,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< InputType >::value &&
+ !grb::is_object< IOType >::value &&
+ grb::is_monoid< Monoid >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return foldr< descr >( x, beta, monoid, phase );
+ }
+ const RC ret = foldr< descr >(
+ internal::getVector(x), internal::getVector(m),
+ beta, monoid, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 1 > sourcesP{ &beta };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(m) )
+ };
+ std::array< uintptr_t, 0 > destinations{};
+ // NOTE scalar output is ignored
+ // std::array< const void *, 1 > destinationsP{ &beta };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::FOLDR_VECTOR_MASK_SCALAR_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename IOType, typename InputType, typename Coords
+ >
+ RC foldr(
+ const InputType &alpha,
+ Vector< IOType, hyperdags, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< InputType >::value &&
+ !grb::is_object< IOType >::value &&
+ grb::is_monoid< Monoid >::value, void
+ >::type * const = nullptr
+ ) {
+ const RC ret = foldr< descr >( alpha, internal::getVector(y), monoid, phase );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(y) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ std::array< const void *, 1 > sourcesP{ &alpha };
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getVector(y) ) };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(y) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::FOLDR_APLHA_VECTOR_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP, typename IOType, typename InputType, typename Coords
+ >
+ RC foldr(
+ const InputType &alpha,
+ Vector< IOType, hyperdags, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< InputType >::value &&
+ !grb::is_object< IOType >::value &&
+ grb::is_operator< OP >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = foldr< descr >( alpha, internal::getVector(y), op, phase );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(y) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ std::array< const void *, 1 > sourcesP{ &alpha };
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getVector(y) ) };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(y) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::FOLDR_APLHA_VECTOR_OPERATOR,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class OP,
+ typename IOType, typename InputType, typename Coords
+ >
+ RC foldr(
+ const Vector< InputType, hyperdags, Coords > &x,
+ Vector< IOType, hyperdags, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_operator< OP >::value &&
+ !grb::is_object< InputType >::value &&
+ !grb::is_object< IOType >::value,
+ void >::type * = nullptr
+ ) {
+ const RC ret = foldr< descr >(
+ internal::getVector(x),
+ internal::getVector(y),
+ op, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(y) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::FOLDR_VECTOR_VECTOR_OPERATOR,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class OP,
+ typename IOType, typename MaskType, typename InputType, typename Coords
+ >
+ RC foldr(
+ const Vector< InputType, hyperdags, Coords > &x,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ Vector< IOType, hyperdags, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_operator< OP >::value &&
+ !grb::is_object< InputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< IOType >::value,
+ void >::type * = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return foldr< descr >( x, y, op, phase );
+ }
+ const RC ret = foldr< descr >(
+ internal::getVector(x),
+ internal::getVector(m),
+ internal::getVector(y),
+ op, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(m) ),
+ getID( internal::getVector(y) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(y) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::FOLDR_VECTOR_VECTOR_VECTOR_OPERATOR,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid, typename IOType, typename InputType, typename Coords
+ >
+ RC foldr(
+ const Vector< InputType, hyperdags, Coords > &x,
+ Vector< IOType, hyperdags, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_monoid< Monoid >::value &&
+ !grb::is_object< InputType >::value &&
+ !grb::is_object< IOType >::value,
+ void >::type * = nullptr
+ ) {
+ const RC ret = foldr< descr >(
+ internal::getVector(x), internal::getVector(y),
+ monoid, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(y) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::FOLDR_VECTOR_VECTOR_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename IOType, typename MaskType, typename InputType,
+ typename Coords
+ >
+ RC foldr(
+ const Vector< InputType, hyperdags, Coords > &x,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ Vector< IOType, hyperdags, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_monoid< Monoid >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType >::value &&
+ !grb::is_object< IOType >::value,
+ void >::type * = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return foldr< descr >( x, y, monoid, phase );
+ }
+ const RC ret = foldr< descr >(
+ internal::getVector(x), internal::getVector(m),
+ internal::getVector(y), monoid, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(m) ),
+ getID( internal::getVector(y) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(y) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::FOLDR_VECTOR_VECTOR_VECTOR_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename InputType, typename IOType, typename Coords
+ >
+ RC foldl(
+ IOType &x,
+ const Vector< InputType, hyperdags, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType >::value &&
+ grb::is_monoid< Monoid >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = foldl< descr >(
+ x, internal::getVector(y), monoid, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(y) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &x
+ );
+ std::array< const void *, 1 > sourcesP{ &x };
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getVector(y) ) };
+ std::array< uintptr_t, 0 > destinations{};
+ // NOTE scalar outputs are ignored
+ //std::array< const void *, 1 > destinationsP{ &x };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::FOLDL_SCALAR_VECTOR_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename InputType, typename IOType, typename MaskType,
+ typename Coords
+ >
+ RC foldl(
+ IOType &x,
+ const Vector< InputType, hyperdags, Coords > &y,
+ const Vector< MaskType, hyperdags, Coords > &mask,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ grb::is_monoid< Monoid >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(mask) ) == 0 ) {
+ return foldl< descr >( x, y, monoid, phase );
+ }
+ const RC ret = foldl< descr >(
+ x, internal::getVector(y), internal::getVector(mask),
+ monoid, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &x
+ );
+ std::array< const void *, 1 > sourcesP{ &x };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(mask) )
+ };
+ std::array< uintptr_t, 0 > destinations{};
+ // NOTE scalar outputs are ignored
+ // std::array< const void * const, 1 > destinationsP{ &x };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::FOLDL_SCALAR_VECTOR_MASK_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Op, typename IOType, typename InputType, typename Coords
+ >
+ RC foldl(
+ Vector< IOType, hyperdags, Coords > &x,
+ const InputType beta,
+ const Op &op = Op(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType >::value &&
+ grb::is_operator< Op >::value,
+ void >::type * = nullptr
+ ) {
+ const RC ret = foldl< descr >( internal::getVector(x), beta, op, phase );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 1 > sourcesP{ &beta };
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getVector(x) ) };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::FOLDL_VECTOR_BETA_OP,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Op,
+ typename IOType, typename MaskType, typename InputType, typename Coords
+ >
+ RC foldl(
+ Vector< IOType, hyperdags, Coords > &x,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ const InputType beta,
+ const Op &op = Op(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType >::value &&
+ grb::is_operator< Op >::value,
+ void >::type * = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return foldl< descr >( x, beta, op, phase );
+ }
+ const RC ret = foldl< descr >(
+ internal::getVector(x), internal::getVector(m),
+ beta, op, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 1 > sourcesP{ &beta };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(m) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::FOLDL_VECTOR_VECTOR_BETA_OP,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename IOType, typename InputType, typename Coords
+ >
+ RC foldl(
+ Vector< IOType, hyperdags, Coords > &x,
+ const InputType beta,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType >::value &&
+ grb::is_monoid< Monoid >::value, void
+ >::type * = nullptr
+ ) {
+ const RC ret = foldl< descr >( internal::getVector(x), beta, monoid, phase );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 1 > sourcesP{ &beta };
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getVector(x) ) };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::FOLDL_VECTOR_BETA_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename IOType, typename MaskType, typename InputType,
+ typename Coords
+ >
+ RC foldl(
+ Vector< IOType, hyperdags, Coords > &x,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ const InputType &beta,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType >::value &&
+ grb::is_monoid< Monoid >::value,
+ void >::type * = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return foldl< descr >( x, beta, monoid, phase );
+ }
+ const RC ret = foldl< descr >(
+ internal::getVector(x), internal::getVector(m),
+ beta, monoid, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 1 > sourcesP{ &beta };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(m) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::FOLDL_VECTOR_VECTOR_BETA_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template <
+ Descriptor descr = descriptors::no_operation,
+ class Monoid, typename IOType, typename InputType,
+ typename Coords
+ >
+ RC foldl(
+ Vector< IOType, hyperdags, Coords > &x,
+ const Vector< InputType, hyperdags, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_monoid< Monoid >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType >::value,
+ void >::type * = nullptr
+ ) {
+ const RC ret = foldl< descr >(
+ internal::getVector(x), internal::getVector(y),
+ monoid, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(y) ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::FOLDL_VECTOR_VECTOR_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template <
+ Descriptor descr = descriptors::no_operation, class OP,
+ typename IOType, typename MaskType, typename InputType,
+ typename Coords
+ >
+ RC foldl(
+ Vector< IOType, hyperdags, Coords > &x,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ const Vector< InputType, hyperdags, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_operator< OP >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType >::value, void
+ >::type * = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return foldl< descr >( x, y, op, phase );
+ }
+ const RC ret = foldl< descr >(
+ internal::getVector(x), internal::getVector(m),
+ internal::getVector(y), op, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(m) ),
+ getID( internal::getVector(y) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::FOLDL_VECTOR_VECTOR_VECTOR_OP,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename IOType, typename MaskType, typename InputType,
+ typename Coords
+ >
+ RC foldl(
+ Vector< IOType, hyperdags, Coords > &x,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ const Vector< InputType, hyperdags, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_monoid< Monoid >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType >::value,
+ void >::type * = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return foldl< descr >( x, y, monoid, phase );
+ }
+ const RC ret = foldl< descr >(
+ internal::getVector(x), internal::getVector(m),
+ internal::getVector(y), monoid, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(m) ),
+ getID( internal::getVector(y) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::FOLDL_VECTOR_VECTOR_VECTOR_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP, typename IOType, typename InputType,
+ typename Coords
+ >
+ RC foldl(
+ Vector< IOType, hyperdags, Coords > &x,
+ const Vector< InputType, hyperdags, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_operator< OP >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType >::value,
+ void >::type * = nullptr
+ ) {
+ const RC ret = foldl< descr >(
+ internal::getVector(x), internal::getVector(y),
+ op, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::FOLDL_VECTOR_VECTOR_OP,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template< typename Func, typename DataType, typename Coords >
+ RC eWiseLambda(
+ const Func f, const Vector< DataType, hyperdags, Coords > &x
+ ) {
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getVector(x) ) };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISELAMBDA,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return eWiseLambda( f, internal::getVector(x) );
+ }
+
+ namespace internal {
+
+ /** \internal This is the end recursion */
+ template<
+ typename Func, typename DataType,
+ typename Coords
+ >
+ RC hyperdag_ewisevector(
+ const Func f,
+ const Vector< DataType, grb::hyperdags, Coords > &x,
+ std::vector< uintptr_t > &sources,
+ std::vector< uintptr_t > &destinations
+ ) {
+ const RC ret = grb::eWiseLambda( f, internal::getVector(x) );
+ if( ret != grb::SUCCESS ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ sources.push_back( getID( internal::getVector(x) ) );
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISELAMBDA_FUNC_VECTOR,
+ sourcesP.cbegin(), sourcesP.cend(),
+ sources.cbegin(), sources.cend(),
+ destinations.cbegin(), destinations.cend()
+ );
+ return ret;
+ }
+
+ /** \internal This is the base recursion */
+ template<
+ typename Func, typename DataType1, typename DataType2,
+ typename Coords, typename... Args
+ >
+ RC hyperdag_ewisevector(
+ const Func f,
+ const Vector< DataType1, grb::hyperdags, Coords > &x,
+ std::vector< uintptr_t > &sources,
+ std::vector< uintptr_t > &destinations,
+ const Vector< DataType2, grb::hyperdags, Coords > &y,
+ Args... args
+ ) {
+ sources.push_back( getID( internal::getVector(y) ) );
+ destinations.push_back( getID( internal::getVector(y) ) );
+ return hyperdag_ewisevector( f, x, sources, destinations, args... );
+ }
+
+ } // end namespace grb::internal
+
+ template<
+ typename Func,
+ typename DataType1, typename DataType2, typename Coords,
+ typename... Args
+ >
+ RC eWiseLambda(
+ const Func f,
+ const Vector< DataType1, hyperdags, Coords > &x,
+ const Vector< DataType2, hyperdags, Coords > &y,
+ Args const &... args
+ ) {
+ std::vector< uintptr_t > sources, destinations;
+ return internal::hyperdag_ewisevector(
+ f, x, sources, destinations, y, args...
+ );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class OP,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseApply< descr >(
+ internal::getVector(z), alpha, beta,
+ op, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(z) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 2 > sourcesP{ &alpha, &beta };
+ std::array< uintptr_t, 1 > sourcesC{
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEAPPLY_VECTOR_ALPHA_BETA_OP,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename OutputType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseApply< descr >(
+ internal::getVector(z), alpha, beta,
+ monoid, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 2 > sourcesP{ &alpha, &beta };
+ std::array< uintptr_t, 1 > sourcesC{
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEAPPLY_VECTOR_ALPHA_BETA_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
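+ /*
+  * Masked variants share a common dispatch rule: an empty mask (of size
+  * zero) means "no mask", so such calls forward to the corresponding
+  * unmasked variant before any hyperDAG state is touched. A sketch:
+  *
+  *   grb::Vector< bool > empty_mask( 0 );
+  *   grb::eWiseApply( z, empty_mask, alpha, beta, op );
+  *   // behaves exactly as grb::eWiseApply( z, alpha, beta, op )
+  */
+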
+ template<
+ Descriptor descr = descriptors::no_operation, class OP,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &mask,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(mask) ) == 0 ) {
+ return eWiseApply< descr >( z, alpha, beta, op, phase );
+ }
+ const RC ret = eWiseApply< descr >(
+ internal::getVector(z), internal::getVector(mask),
+ alpha, beta,
+ op, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(z) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 2 > sourcesP{ &alpha, &beta };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(mask) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{
+ getID( internal::getVector(z) )
+ };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEAPPLY_VECTOR_MASK_ALPHA_BETA_OP,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &mask,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(mask) ) == 0 ) {
+ return eWiseApply< descr >( z, alpha, beta, monoid, phase );
+ }
+ const RC ret = eWiseApply< descr >(
+ internal::getVector(z), internal::getVector(mask),
+ alpha, beta,
+ monoid, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 2 > sourcesP{ &alpha, &beta };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(mask) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEAPPLY_VECTOR_MASK_ALPHA_BETA_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class OP,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< InputType1, hyperdags, Coords > &x,
+ const InputType2 beta,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value
+ && !grb::is_object< InputType1 >::value
+ && !grb::is_object< InputType2 >::value
+ && grb::is_operator< OP >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseApply< descr >(
+ internal::getVector(z), internal::getVector(x), beta,
+ op, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 1 > sourcesP{ &beta };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEAPPLY_VECTOR_VECTOR_BETA_OP,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class OP, typename OutputType,
+ typename InputType1, typename InputType2, typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const InputType1 alpha,
+ const Vector< InputType2, hyperdags, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value
+ && grb::is_operator< OP >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseApply< descr >(
+ internal::getVector(z), alpha, internal::getVector(y),
+ op, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(z) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ std::array< const void *, 1 > sourcesP{ &alpha };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEAPPLY_VECTOR_ALPHA_VECTOR_OP,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &mask,
+ const Vector< InputType1, hyperdags, Coords > &x,
+ const InputType2 beta,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(mask) ) == 0 ) {
+ return eWiseApply< descr >( z, x, beta, monoid, phase );
+ }
+ const RC ret = eWiseApply< descr >(
+ internal::getVector(z), internal::getVector(mask),
+ internal::getVector(x), beta,
+ monoid, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 1 > sourcesP{ &beta };
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(mask) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEAPPLY_VECTOR_MASK_VECTOR_BETA_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class OP,
+ typename OutputType, typename MaskType, typename InputType1,
+ typename InputType2, typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &mask,
+ const Vector< InputType1, hyperdags, Coords > &x,
+ const InputType2 beta,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(mask) ) == 0 ) {
+ return eWiseApply< descr >( z, x, beta, op, phase );
+ }
+ const RC ret = eWiseApply< descr >(
+ internal::getVector(z), internal::getVector(mask),
+ internal::getVector(x), beta,
+ op, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 1 > sourcesP{ &beta };
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(mask) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEAPPLY_VECTOR_MASK_VECTOR_BETA_OP,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &mask,
+ const InputType1 alpha,
+ const Vector< InputType2, hyperdags, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(mask) ) == 0 ) {
+ return eWiseApply< descr >( z, alpha, y, monoid, phase );
+ }
+ const RC ret = eWiseApply< descr >(
+ internal::getVector(z), internal::getVector(mask),
+ alpha, internal::getVector(y),
+ monoid, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ std::array< const void *, 1 > sourcesP{ &alpha };
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(mask) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEAPPLY_VECTOR_MASK_ALPHA_VECTOR_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class OP,
+ typename OutputType, typename MaskType, typename InputType1,
+ typename InputType2, typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &mask,
+ const InputType1 alpha,
+ const Vector< InputType2, hyperdags, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(mask) ) == 0 ) {
+ return eWiseApply< descr >( z, alpha, y, op, phase );
+ }
+ const RC ret = eWiseApply< descr >(
+ internal::getVector(z), internal::getVector(mask),
+ alpha, internal::getVector(y),
+ op, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ std::array< const void *, 1 > sourcesP{ &alpha };
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(mask) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEAPPLY_VECTOR_MASK_ALPHA_VECTOR_OP,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class OP,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &mask,
+ const Vector< InputType1, hyperdags, Coords > &x,
+ const Vector< InputType2, hyperdags, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(mask) ) == 0 ) {
+ return eWiseApply< descr >( z, x, y, op, phase );
+ }
+ const RC ret = eWiseApply< descr >(
+ internal::getVector(z), internal::getVector(mask),
+ internal::getVector(x), internal::getVector(y),
+ op, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 4 > sourcesC{
+ getID( internal::getVector(mask) ),
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEAPPLY_VECTOR_MASK_VECTOR_VECTOR_OP,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< InputType1, hyperdags, Coords > &x,
+ const InputType2 beta,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseApply< descr >(
+ internal::getVector(z),
+ internal::getVector(x), beta,
+ monoid, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 1 > sourcesP{ &beta };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEAPPLY_VECTOR_VECTOR_BETA_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const InputType1 alpha,
+ const Vector< InputType2, hyperdags, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseApply< descr >(
+ internal::getVector(z),
+ alpha, internal::getVector(y),
+ monoid, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(y) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ std::array< const void *, 1 > sourcesP{ &alpha };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEAPPLY_VECTOR_ALPHA_VECTOR_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2, typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &mask,
+ const Vector< InputType1, hyperdags, Coords > &x,
+ const Vector< InputType2, hyperdags, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(mask) ) == 0 ) {
+ return eWiseApply< descr >( z, x, y, monoid, phase );
+ }
+ const RC ret = eWiseApply< descr >(
+ internal::getVector(z), internal::getVector(mask),
+ internal::getVector(x), internal::getVector(y),
+ monoid, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 4 > sourcesC{
+ getID( internal::getVector(mask) ),
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEAPPLY_VECTOR_MASK_VECTOR_VECTOR_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< InputType1, hyperdags, Coords > &x,
+ const Vector< InputType2, hyperdags, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseApply< descr >(
+ internal::getVector(z),
+ internal::getVector(x), internal::getVector(y),
+ monoid, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEAPPLY_VECTOR_VECTOR_VECTOR_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
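+ /*
+  * The deprecated eWiseMulAdd family is nonetheless traced: each variant
+  * forwards to the underlying backend and, on a successful EXECUTE
+  * phase, records a single fused multiply-add node whose sources are all
+  * scalar and container inputs plus the output, and whose destination is
+  * the output. A sketch, assuming a semiring `ring` and initialised
+  * vectors of matching size:
+  *
+  *   grb::eWiseMulAdd( z, a, x, y, ring );
+  *   // roughly z_i gets a_i * x_i + y_i under ring; one hyperDAG node
+  */
+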
+ /** \warning This function is deprecated */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename InputType3,
+ typename OutputType, typename MaskType, typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ const Vector< InputType1, hyperdags, Coords > &a,
+ const Vector< InputType2, hyperdags, Coords > &x,
+ const Vector< InputType3, hyperdags, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value &&
+ !grb::is_object< MaskType >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return eWiseMulAdd< descr >( z, a, x, y, ring, phase );
+ }
+ const RC ret = eWiseMulAdd< descr >(
+ internal::getVector(z), internal::getVector(m),
+ internal::getVector(a), internal::getVector(x), internal::getVector(y),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 5 > sourcesC{
+ getID( internal::getVector(m) ),
+ getID( internal::getVector(a) ),
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISE_MUL_ADD,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ /** \warning This function is deprecated */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename InputType3,
+ typename OutputType, typename MaskType, typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ const Vector< InputType1, hyperdags, Coords > &a,
+ const Vector< InputType2, hyperdags, Coords > &x,
+ const InputType3 gamma,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value &&
+ !grb::is_object< MaskType >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return eWiseMulAdd< descr >( z, a, x, gamma, ring, phase );
+ }
+ const RC ret = eWiseMulAdd< descr >(
+ internal::getVector(z), internal::getVector(m),
+ internal::getVector(a), internal::getVector(x), gamma,
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &gamma
+ );
+ std::array< const void *, 1 > sourcesP{ &gamma };
+ std::array< uintptr_t, 4 > sourcesC{
+ getID( internal::getVector(m) ),
+ getID( internal::getVector(a) ),
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISE_MUL_ADD_FOUR_VECTOR,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ /** \warning This function is deprecated */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename InputType3,
+ typename OutputType, typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const InputType1 alpha,
+ const Vector< InputType2, hyperdags, Coords > &x,
+ const Vector< InputType3, hyperdags, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseMulAdd< descr >(
+ internal::getVector(z), alpha,
+ internal::getVector(x), internal::getVector(y),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ std::array< const void *, 1 > sourcesP{ &alpha };
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISE_MUL_ADD_THREE_VECTOR_ALPHA,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ /** \warning This function is deprecated */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring, typename InputType1,
+ typename InputType2, typename InputType3, typename OutputType, typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< InputType1, hyperdags, Coords > &a,
+ const InputType2 chi,
+ const Vector< InputType3, hyperdags, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseMulAdd< descr >(
+ internal::getVector(z),
+ internal::getVector(a), chi, internal::getVector(y),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(y) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &chi
+ );
+ std::array< const void *, 1 > sourcesP{ &chi };
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(a) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISE_MUL_ADD_THREE_VECTOR_CHI,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ /** \warning This function is deprecated */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename InputType3,
+ typename OutputType, typename MaskType, typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ const InputType1 alpha,
+ const Vector< InputType2, hyperdags, Coords > &x,
+ const Vector< InputType3, hyperdags, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value &&
+ !grb::is_object< MaskType >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return eWiseMulAdd< descr >( z, alpha, x, y, ring, phase );
+ }
+ const RC ret = eWiseMulAdd< descr >(
+ internal::getVector(z), internal::getVector(m),
+ alpha, internal::getVector(x), internal::getVector(y),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ std::array< const void *, 1 > sourcesP{ &alpha };
+ std::array< uintptr_t, 4 > sourcesC{
+ getID( internal::getVector(m) ),
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISE_MUL_ADD_FOUR_VECTOR_CHI,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ /** \warning This function is deprecated */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename InputType3,
+ typename OutputType, typename MaskType, typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ const Vector< InputType1, hyperdags, Coords > &a,
+ const InputType2 chi,
+ const Vector< InputType3, hyperdags, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value &&
+ !grb::is_object< MaskType >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return eWiseMulAdd< descr >( z, a, chi, y, ring, phase );
+ }
+ const RC ret = eWiseMulAdd< descr >(
+ internal::getVector(z), internal::getVector(m),
+ internal::getVector(a), chi, internal::getVector(y),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &chi
+ );
+ std::array< const void *, 1 > sourcesP{ &chi };
+ std::array< uintptr_t, 4 > sourcesC{
+ getID( internal::getVector(m) ),
+ getID( internal::getVector(a) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISE_MUL_ADD_FOUR_VECTOR_CHI_RING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ /** \warning This function is deprecated */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename InputType3,
+ typename OutputType, typename MaskType, typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ const Vector< InputType1, hyperdags, Coords > &a,
+ const InputType2 beta,
+ const InputType3 gamma,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value &&
+ !grb::is_object< MaskType >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return eWiseMulAdd< descr >( z, a, beta, gamma, ring, phase );
+ }
+ const RC ret = eWiseMulAdd< descr >(
+ internal::getVector(z), internal::getVector(m),
+ internal::getVector(a), beta, gamma,
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &gamma
+ );
+ std::array< const void *, 2 > sourcesP{ &beta, &gamma };
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(m) ),
+ getID( internal::getVector(a) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISE_MUL_ADD_THREE_VECTOR_BETA,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ /** \warning This function is deprecated */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename InputType3,
+ typename OutputType, typename MaskType, typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ const InputType1 alpha,
+ const Vector< InputType2, hyperdags, Coords > &x,
+ const InputType3 gamma,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value &&
+ !grb::is_object< MaskType >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return eWiseMulAdd< descr >( z, alpha, x, gamma, ring, phase );
+ }
+ const RC ret = eWiseMulAdd< descr >(
+ internal::getVector(z), internal::getVector(m),
+ alpha, internal::getVector(x), gamma,
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &gamma
+ );
+ std::array< const void *, 2 > sourcesP{ &alpha, &gamma };
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(m) ),
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISE_MUL_ADD_THREE_VECTOR_ALPHA_GAMMA,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ /** \warning This function is deprecated */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2, typename InputType3,
+ typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Vector< InputType3, hyperdags, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value &&
+ !grb::is_object< MaskType >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return eWiseMulAdd< descr >( z, alpha, beta, y, ring, phase );
+ }
+ const RC ret = eWiseMulAdd< descr >(
+ internal::getVector(z), internal::getVector(m),
+ alpha, beta, internal::getVector(y),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 2 > sourcesP{ &alpha, &beta };
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(m) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISE_MUL_ADD_TWO_VECTOR_ALPHA_BETA,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ /** \warning This function is deprecated */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename OutputType, typename MaskType, typename InputType1,
+ typename InputType2, typename InputType3, typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const InputType3 gamma,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ !grb::is_object< MaskType >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return eWiseMulAdd< descr >( z, alpha, beta, gamma, ring, phase );
+ }
+ const RC ret = eWiseMulAdd< descr >(
+ internal::getVector(z), internal::getVector(m),
+ alpha, beta, gamma,
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &gamma
+ );
+ std::array< const void *, 3 > sourcesP{ &alpha, &beta, &gamma };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(m) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISE_MUL_ADD_TWO_VECTOR_ALPHA_BETA_GAMMA,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ /** \warning This function is deprecated */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename InputType3,
+ typename OutputType, typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< InputType1, hyperdags, Coords > &a,
+ const Vector< InputType2, hyperdags, Coords > &x,
+ const InputType3 gamma,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseMulAdd< descr >(
+ internal::getVector(z),
+ internal::getVector(a), internal::getVector(x), gamma,
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &gamma
+ );
+ std::array< const void *, 1 > sourcesP{ &gamma };
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(a) ),
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEMULADD_VECTOR_VECTOR_VECTOR_GAMMA_RING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ /** \warning This function is deprecated */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename InputType3,
+ typename OutputType, typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< InputType1, hyperdags, Coords > &a,
+ const InputType2 beta,
+ const InputType3 gamma,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseMulAdd< descr >(
+ internal::getVector(z),
+ internal::getVector(a), beta, gamma,
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(z) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &gamma
+ );
+ std::array< const void *, 2 > sourcesP{ &beta, &gamma };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(a) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEMULADD_VECTOR_VECTOR_BETA_GAMMA_RING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ /** \warning This function is deprecated */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename InputType3,
+ typename OutputType, typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const InputType1 alpha,
+ const Vector< InputType2, hyperdags, Coords > &x,
+ const InputType3 gamma,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseMulAdd< descr >(
+ internal::getVector(z),
+ alpha, internal::getVector(x), gamma,
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &gamma
+ );
+ std::array< const void *, 2 > sourcesP{ &alpha, &gamma };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEMULADD_VECTOR_ALPHA_VECTOR_GAMMA_RING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ /** \warning This function is deprecated */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename InputType3, typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Vector< InputType3, hyperdags, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseMulAdd< descr >(
+ internal::getVector(z),
+ alpha, beta, internal::getVector(y),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(y) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 2 > sourcesP{ &alpha, &beta };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEMULADD_VECTOR_ALPHA_BETA_VECTOR_RING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ /** \warning This function is deprecated */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename InputType3, typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const InputType3 gamma,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseMulAdd< descr >(
+ internal::getVector(z),
+ alpha, beta, gamma,
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(z) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &gamma
+ );
+ std::array< const void *, 3 > sourcesP{ &alpha, &beta, &gamma };
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getVector(z) ) };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEMULADD_VECTOR_ALPHA_BETA_GAMMA_RING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ /** \warning This function is deprecated */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename InputType3,
+ typename OutputType, typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< InputType1, hyperdags, Coords > &a,
+ const Vector< InputType2, hyperdags, Coords > &x,
+ const Vector< InputType3, hyperdags, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseMulAdd< descr >(
+ internal::getVector(z),
+ internal::getVector(a), internal::getVector(x), internal::getVector(y),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 4 > sourcesC{
+ getID( internal::getVector(a) ),
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEMULADD_VECTOR_VECTOR_VECTOR_VECTOR_RING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
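+ /*
+  * The eWiseMul variants below follow the same pattern. As elsewhere in
+  * this backend, only the EXECUTE phase is recorded; e.g. (sketch):
+  *
+  *   grb::eWiseMul( z, x, y, ring, grb::RESIZE );  // not recorded
+  *   grb::eWiseMul( z, x, y, ring );               // EXECUTE: recorded
+  */
+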
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< InputType1, hyperdags, Coords > &x,
+ const Vector< InputType2, hyperdags, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseMul< descr >(
+ internal::getVector(z), internal::getVector(x), internal::getVector(y),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEMUL_VECTOR_VECTOR_VECTOR_RING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const InputType1 alpha,
+ const Vector< InputType2, hyperdags, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseMul< descr >(
+ internal::getVector(z),
+ alpha, internal::getVector(y),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(y) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ std::array< const void *, 1 > sourcesP{ &alpha };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEMUL_VECTOR_ALPHA_VECTOR_RING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< InputType1, hyperdags, Coords > &x,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseMul< descr >(
+ internal::getVector(z),
+ internal::getVector(x), beta,
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 1 > sourcesP{ &beta };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEMUL_VECTOR_VECTOR_BETA_RING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseMul< descr >(
+ internal::getVector(z),
+ alpha, beta,
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 2 > sourcesP{ &alpha, &beta };
+ std::array< uintptr_t, 1 > sourcesC{
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEMUL_VECTOR_ALPHA_BETA_RING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename MaskType, typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ const Vector< InputType1, hyperdags, Coords > &x,
+ const Vector< InputType2, hyperdags, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< MaskType >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return eWiseMul< descr >( z, x, y, ring, phase );
+ }
+ const RC ret = eWiseMul< descr >(
+ internal::getVector(z),
+ internal::getVector(m), internal::getVector(x), internal::getVector(y),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 4 > sourcesC{
+ getID( internal::getVector(m) ),
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEMUL_VECTOR_VECTOR_VECTOR_VECTOR_RING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename MaskType, typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ const InputType1 alpha,
+ const Vector< InputType2, hyperdags, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< MaskType >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return eWiseMul< descr >( z, alpha, y, ring, phase );
+ }
+ const RC ret = eWiseMul< descr >(
+ internal::getVector(z), internal::getVector(m),
+ alpha, internal::getVector(y),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ std::array< const void *, 1 > sourcesP{ &alpha };
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(m) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEMUL_VECTOR_VECTOR_ALPHA_VECTOR_RING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename MaskType, typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ const Vector< InputType1, hyperdags, Coords > &x,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< MaskType >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return eWiseMul< descr >( z, x, beta, ring, phase );
+ }
+ const RC ret = eWiseMul< descr >(
+ internal::getVector(z), internal::getVector(m),
+ internal::getVector(x), beta,
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 1 > sourcesP{ &beta };
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(m) ),
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEMUL_VECTOR_VECTOR_VECTOR_BETA_RING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename MaskType, typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< MaskType >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return eWiseMul< descr >( z, alpha, beta, ring, phase );
+ }
+ const RC ret = eWiseMul< descr >(
+ internal::getVector(z), internal::getVector(m),
+ alpha, beta,
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 2 > sourcesP{ &alpha, &beta };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(m) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEMUL_VECTOR_VECTOR_ALPHA_BETA_RING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+} // end namespace grb
+
+#endif
+
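All of the eWiseMul wrappers above follow one interception pattern: delegate to the underlying backend, return early unless the call succeeded in the EXECUTE phase, register any scalar inputs as source vertices, and then record a single operation vertex whose sources are the scalar pointers plus the IDs of all containers read, and whose destination is the ID of the container written. The stand-alone sketch below mimics that pattern with a hypothetical, much-simplified recorder that only prints the hyperedge; none of its names are part of the ALP API.

	#include <array>
	#include <cstdint>
	#include <iostream>

	// Hypothetical stand-in for internal::hyperdags::generator.addOperation:
	// prints the container IDs that form one recorded operation vertex.
	template< typename FwdIt >
	void recordOperation(
		const char * const type,
		FwdIt srcBegin, const FwdIt srcEnd,
		const uintptr_t destination
	) {
		std::cout << type << ": sources =";
		for( ; srcBegin != srcEnd; ++srcBegin ) {
			std::cout << " " << *srcBegin;
		}
		std::cout << ", destination = " << destination << "\n";
	}

	int main() {
		// Suppose getID returned 1 for a vector x and 2 for a vector z.
		const std::array< uintptr_t, 2 > sourcesC{ 1, 2 };
		// Mirrors the EWISEMUL_VECTOR_VECTOR_BETA_RING recording above.
		recordOperation( "EWISEMUL_VECTOR_VECTOR_BETA_RING",
			sourcesC.cbegin(), sourcesC.cend(), 2 );
		return 0;
	}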
diff --git a/include/graphblas/hyperdags/blas2.hpp b/include/graphblas/hyperdags/blas2.hpp
new file mode 100644
index 000000000..e2e781be2
--- /dev/null
+++ b/include/graphblas/hyperdags/blas2.hpp
@@ -0,0 +1,687 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Implements the BLAS-2 API for the HyperDAGs backend
+ *
+ * @author A. Karanasiou
+ * @date 3rd of March, 2022
+ */
+
+#ifndef _H_GRB_HYPERDAGS_BLAS2
+#define _H_GRB_HYPERDAGS_BLAS2
+
+#include
+
+#include
+
+#include
+
+
+namespace grb {
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename IOType, typename InputType1, typename InputType2,
+ typename InputType3, typename Coords
+ >
+ RC vxm(
+ Vector< IOType, hyperdags, Coords > &u,
+ const Vector< InputType3, hyperdags, Coords > &mask,
+ const Vector< InputType1, hyperdags, Coords > &v,
+ const Matrix< InputType2, hyperdags > &A,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
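+ // An empty mask indicates an unmasked call: delegate so that the unmasked
+ // variant, and its operation type, is recorded instead.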
+ if( size( internal::getVector(mask) ) == 0 ) {
+ return vxm< descr >( u, v, A, ring, phase );
+ }
+ const RC ret = vxm< descr >(
+ internal::getVector(u), internal::getVector(mask),
+ internal::getVector(v), internal::getMatrix(A),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 4 > sourcesC{
+ getID( internal::getVector(mask) ),
+ getID( internal::getVector(v) ),
+ getID( internal::getMatrix(A) ),
+ getID( internal::getVector(u) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(u) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::VXM_VECTOR_VECTOR_VECTOR_MATRIX,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class AdditiveMonoid, class MultiplicativeOperator,
+ typename IOType, typename InputType1, typename InputType2,
+ typename InputType3, typename Coords
+ >
+ RC vxm(
+ Vector< IOType, hyperdags, Coords > &u,
+ const Vector< InputType3, hyperdags, Coords > &mask,
+ const Vector< InputType1, hyperdags, Coords > &v,
+ const Matrix< InputType2, hyperdags > &A,
+ const AdditiveMonoid &add = AdditiveMonoid(),
+ const MultiplicativeOperator &mul = MultiplicativeOperator(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_monoid< AdditiveMonoid >::value &&
+ grb::is_operator< MultiplicativeOperator >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ !std::is_same< InputType2, void >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(mask) ) == 0 ) {
+ return vxm< descr >( u, v, A, add, mul, phase );
+ }
+ const RC ret = vxm< descr >(
+ internal::getVector(u), internal::getVector(mask),
+ internal::getVector(v), internal::getMatrix(A),
+ add, mul, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 4 > sourcesC{
+ getID( internal::getVector(mask) ),
+ getID( internal::getVector(v) ),
+ getID( internal::getMatrix(A) ),
+ getID( internal::getVector(u) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(u) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::VXM_VECTOR_VECTOR_VECTOR_MATRIX_ADD_MUL,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename IOType = typename Ring::D4,
+ typename InputType1 = typename Ring::D1,
+ typename InputType2 = typename Ring::D2,
+ typename Coords
+ >
+ RC vxm(
+ Vector< IOType, hyperdags, Coords > &u,
+ const Vector< InputType1, hyperdags, Coords > &v,
+ const Matrix< InputType2, hyperdags > &A,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = vxm< descr >(
+ internal::getVector(u),
+ internal::getVector(v), internal::getMatrix(A),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(v) ),
+ getID( internal::getMatrix(A) ),
+ getID( internal::getVector(u) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(u) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::VXM_VECTOR_VECTOR_MATRIX_RING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename IOType = typename Ring::D4,
+ typename InputType1 = typename Ring::D1,
+ typename InputType2 = typename Ring::D2,
+ typename InputType3 = bool,
+ typename Coords
+ >
+ RC mxv(
+ Vector< IOType, hyperdags, Coords > &u,
+ const Vector< InputType3, hyperdags, Coords > &mask,
+ const Matrix< InputType2, hyperdags > &A,
+ const Vector< InputType1, hyperdags, Coords > &v,
+ const Ring &ring,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(mask) ) == 0 ) {
+ return mxv< descr >( u, A, v, ring, phase );
+ }
+ const RC ret = mxv< descr >(
+ internal::getVector(u), internal::getVector(mask),
+ internal::getMatrix(A), internal::getVector(v),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 4 > sourcesC{
+ getID( internal::getVector(mask) ),
+ getID( internal::getMatrix(A) ),
+ getID( internal::getVector(v) ),
+ getID( internal::getVector(u) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(u) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::MXV_VECTOR_VECTOR_MATRIX_VECTOR_RING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ bool output_may_be_masked = true,
+ bool input_may_be_masked = true,
+ class Ring,
+ typename IOType, typename InputType1, typename InputType2,
+ typename InputType3, typename InputType4, typename Coords
+ >
+ RC mxv(
+ Vector< IOType, hyperdags, Coords > &u,
+ const Vector< InputType3, hyperdags, Coords > &mask,
+ const Matrix< InputType2, hyperdags > &A,
+ const Vector< InputType1, hyperdags, Coords > &v,
+ const Vector< InputType4, hyperdags, Coords > &v_mask,
+ const Ring &ring,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ !grb::is_object< InputType4 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(v_mask) ) == 0 ) {
+ return mxv< descr >( u, mask, A, v, ring, phase );
+ }
+ const RC ret = mxv< descr >(
+ internal::getVector(u), internal::getVector(mask),
+ internal::getMatrix(A), internal::getVector(v), internal::getVector(v_mask),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ std::vector< uintptr_t > sourcesC{
+ getID( internal::getVector(v_mask) ),
+ getID( internal::getMatrix(A) ),
+ getID( internal::getVector(v) ),
+ getID( internal::getVector(u) )
+ };
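+ // The output mask is optional; record it as an extra source only when it
+ // is non-empty.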
+ if( size( internal::getVector(mask) ) > 0 ) {
+ sourcesC.push_back( getID( internal::getVector(mask) ) );
+ }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(u) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::MXV_VECTOR_VECTOR_MATRIX_VECTOR_VECTOR_R,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ bool output_may_be_masked = true,
+ bool input_may_be_masked = true,
+ class AdditiveMonoid, class MultiplicativeOperator,
+ typename IOType, typename InputType1, typename InputType2,
+ typename InputType3, typename InputType4, typename Coords
+ >
+ RC mxv(
+ Vector< IOType, hyperdags, Coords > &u,
+ const Vector< InputType3, hyperdags, Coords > &mask,
+ const Matrix< InputType2, hyperdags > &A,
+ const Vector< InputType1, hyperdags, Coords > &v,
+ const Vector< InputType4, hyperdags, Coords > &v_mask,
+ const AdditiveMonoid &add = AdditiveMonoid(),
+ const MultiplicativeOperator &mul = MultiplicativeOperator(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_monoid< AdditiveMonoid >::value &&
+ grb::is_operator< MultiplicativeOperator >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ !grb::is_object< InputType4 >::value &&
+ !std::is_same< InputType2, void >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(v_mask) ) == 0 ) {
+ return mxv< descr >( u, mask, A, v, add, mul, phase );
+ }
+ const RC ret = mxv< descr >(
+ internal::getVector(u), internal::getVector(mask),
+ internal::getMatrix(A), internal::getVector(v), internal::getVector(v_mask),
+ add, mul, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::vector< uintptr_t > sourcesC{
+ getID( internal::getMatrix(A) ),
+ getID( internal::getVector(v) ),
+ getID( internal::getVector(v_mask) ),
+ getID( internal::getVector(u) )
+ };
+ if( size( internal::getVector(mask) ) > 0 ) {
+ sourcesC.push_back( getID( internal::getVector(mask) ) );
+ }
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(u) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::MXV_VECTOR_VECTOR_MATRIX_VECTOR_VECTOR_A,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename IOType = typename Ring::D4,
+ typename InputType1 = typename Ring::D1,
+ typename InputType2 = typename Ring::D2,
+ typename Coords
+ >
+ RC mxv(
+ Vector< IOType, hyperdags, Coords > &u,
+ const Matrix< InputType2, hyperdags > &A,
+ const Vector< InputType1, hyperdags, Coords > &v,
+ const Ring &ring,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = mxv< descr >(
+ internal::getVector(u),
+ internal::getMatrix(A), internal::getVector(v),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getMatrix(A) ),
+ getID( internal::getVector(v) ),
+ getID( internal::getVector(u) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(u) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::MXV_VECTOR_MATRIX_VECTOR_RING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class AdditiveMonoid, class MultiplicativeOperator,
+ typename IOType, typename InputType1, typename InputType2, typename Coords
+ >
+ RC mxv(
+ Vector< IOType, hyperdags, Coords > &u,
+ const Matrix< InputType2, hyperdags > &A,
+ const Vector< InputType1, hyperdags, Coords > &v,
+ const AdditiveMonoid &add = AdditiveMonoid(),
+ const MultiplicativeOperator &mul = MultiplicativeOperator(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_monoid< AdditiveMonoid >::value &&
+ grb::is_operator< MultiplicativeOperator >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !std::is_same< InputType2, void >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = mxv< descr >(
+ internal::getVector(u),
+ internal::getMatrix(A), internal::getVector(v),
+ add, mul, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getMatrix(A) ),
+ getID( internal::getVector(v) ),
+ getID( internal::getVector(u) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(u) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::MXV_VECTOR_MATRIX_VECTOR_ADD_MUL,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ /** \internal Uses a direct implementation. */
+ template<
+ typename Func, typename DataType
+ >
+ RC eWiseLambda(
+ const Func f,
+ const Matrix< DataType, hyperdags > &A
+ ) {
+ const RC ret = eWiseLambda( f, internal::getMatrix(A) );
+ if( ret != SUCCESS ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getMatrix(A) ) };
+ std::array< uintptr_t, 0 > destinations{};
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISELAMBDA_FUNC_MATRIX,
+ sourcesP.cbegin(), sourcesP.cend(),
+ sourcesC.cbegin(), sourcesC.cend(),
+ destinations.cbegin(), destinations.cend()
+ );
+ return ret;
+ }
+
+ namespace internal {
+
+ /** \internal This is the end of the recursion */
+ template<
+ typename Func, typename DataType
+ >
+ RC hyperdag_ewisematrix(
+ const Func f,
+ const Matrix< DataType, grb::hyperdags > &A,
+ std::vector< uintptr_t > &sources,
+ std::vector< uintptr_t > &destinations
+ ) {
+ const RC ret = grb::eWiseLambda( f, internal::getMatrix(A) );
+ if( ret != SUCCESS ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ sources.push_back( getID( internal::getMatrix(A) ) );
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISELAMBDA_FUNC_MATRIX,
+ sourcesP.cbegin(), sourcesP.cend(),
+ sources.cbegin(), sources.cend(),
+ destinations.cbegin(), destinations.cend()
+ );
+ return ret;
+ }
+
+ /** \internal This is the recursive step */
+ template<
+ typename Func, typename DataType1, typename DataType2,
+ typename Coords, typename... Args
+ >
+ RC hyperdag_ewisematrix(
+ const Func f,
+ const Matrix< DataType1, grb::hyperdags > &A,
+ std::vector< uintptr_t > &sources,
+ std::vector< uintptr_t > &destinations,
+ const Vector< DataType2, grb::hyperdags, Coords > &x,
+ Args... args
+ ) {
+ sources.push_back( getID( internal::getVector(x) ) );
+ destinations.push_back( getID( internal::getVector(x) ) );
+ return hyperdag_ewisematrix( f, A, sources, destinations, args... );
+ }
+
+ } // end namespace grb::internal
+
+ /** \internal Starts the above recursion over all given vectors */
+ template<
+ typename Func,
+ typename DataType1, typename DataType2,
+ typename Coords, typename... Args
+ >
+ RC eWiseLambda(
+ const Func f,
+ const Matrix< DataType1, hyperdags > &A,
+ const Vector< DataType2, hyperdags, Coords > &x,
+ Args... args
+ ) {
+ std::vector< uintptr_t > sources, destinations;
+ return internal::hyperdag_ewisematrix(
+ f, A, sources, destinations, x, args...
+ );
+ }
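The eWiseLambda overload above gathers the IDs of all captured vectors through a variadic recursion: each step of hyperdag_ewisematrix peels one vector off the argument pack and appends its ID to both the source and the destination lists, and the end case records the operation once the lists are complete. A stripped-down illustration of that peeling, with plain integers standing in for getID results (all names below are illustrative, not ALP API):

	#include <cstdint>
	#include <iostream>
	#include <vector>

	// End of the recursion: all IDs collected; here we simply print them.
	void collect( std::vector< uintptr_t > &ids ) {
		for( const auto id : ids ) { std::cout << id << " "; }
		std::cout << "\n";
	}

	// Recursive step: peel one ID off the pack, then recurse on the rest.
	template< typename... Args >
	void collect( std::vector< uintptr_t > &ids,
		const uintptr_t head, Args... tail
	) {
		ids.push_back( head );
		collect( ids, tail... );
	}

	int main() {
		std::vector< uintptr_t > ids;
		collect( ids, 3, 1, 4 ); // prints: 3 1 4
		return 0;
	}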
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ bool output_may_be_masked = true,
+ bool input_may_be_masked = true,
+ class Ring,
+ typename IOType, typename InputType1, typename InputType2,
+ typename InputType3, typename InputType4, typename Coords
+ >
+ RC vxm(
+ Vector< IOType, hyperdags, Coords > &u,
+ const Vector< InputType3, hyperdags, Coords > &mask,
+ const Vector< InputType1, hyperdags, Coords > &v,
+ const Vector< InputType4, hyperdags, Coords > &v_mask,
+ const Matrix< InputType2, hyperdags > &A,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ !grb::is_object< InputType4 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(v_mask) ) == 0 ) {
+ return vxm< descr >( u, mask, v, A, ring, phase );
+ }
+ const RC ret = vxm< descr >(
+ internal::getVector(u), internal::getVector(mask),
+ internal::getVector(v), internal::getVector(v_mask), internal::getMatrix(A),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::vector< uintptr_t > sourcesC{
+ getID( internal::getVector(v) ),
+ getID( internal::getMatrix(A) ),
+ getID( internal::getVector(v_mask) ),
+ getID( internal::getVector(u) )
+ };
+ if( size( internal::getVector(mask) ) > 0 ) {
+ sourcesC.push_back( getID( internal::getVector(mask) ) );
+ }
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(u) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::VXM_GENERIC_VECTOR_VECTOR_VECTOR_VECTOR_MATRIX_ADD_MUL,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ bool output_may_be_masked = true,
+ bool input_may_be_masked = true,
+ class AdditiveMonoid, class MultiplicativeOperator,
+ typename IOType, typename InputType1, typename InputType2,
+ typename InputType3, typename InputType4, typename Coords
+ >
+ RC vxm(
+ Vector< IOType, hyperdags, Coords > &u,
+ const Vector< InputType3, hyperdags, Coords > &mask,
+ const Vector< InputType1, hyperdags, Coords > &v,
+ const Vector< InputType4, hyperdags, Coords > &v_mask,
+ const Matrix< InputType2, hyperdags > &A,
+ const AdditiveMonoid &add = AdditiveMonoid(),
+ const MultiplicativeOperator &mul = MultiplicativeOperator(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_monoid< AdditiveMonoid >::value &&
+ grb::is_operator< MultiplicativeOperator >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ !grb::is_object< InputType4 >::value &&
+ !std::is_same< InputType2, void >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(v_mask) ) == 0 ) {
+ return vxm< descr >( u, mask, v, A, add, mul, phase );
+ }
+ const RC ret = vxm< descr >(
+ internal::getVector(u), internal::getVector(mask),
+ internal::getVector(v), internal::getVector(v_mask), internal::getMatrix(A),
+ add, mul, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::vector< uintptr_t > sourcesC{
+ getID( internal::getVector(v) ),
+ getID( internal::getMatrix(A) ),
+ getID( internal::getVector(v_mask) ),
+ getID( internal::getVector(u) )
+ };
+ if( size( internal::getVector(mask) ) > 0 ) {
+ sourcesC.push_back( getID( internal::getVector(mask) ) );
+ }
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(u) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::VXM_VECTOR_VECTOR_VECTOR_VECTOR_MATRIX_ADD_MUL,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class AdditiveMonoid, class MultiplicativeOperator,
+ typename IOType, typename InputType1, typename InputType2, typename Coords
+ >
+ RC vxm(
+ Vector< IOType, hyperdags, Coords > &u,
+ const Vector< InputType1, hyperdags, Coords > &v,
+ const Matrix< InputType2, hyperdags > &A,
+ const AdditiveMonoid &add = AdditiveMonoid(),
+ const MultiplicativeOperator &mul = MultiplicativeOperator(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_monoid< AdditiveMonoid >::value &&
+ grb::is_operator< MultiplicativeOperator >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !std::is_same< InputType2, void >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = vxm< descr >(
+ internal::getVector(u),
+ internal::getVector(v), internal::getMatrix(A),
+ add, mul, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(v) ),
+ getID( internal::getMatrix(A) ),
+ getID( internal::getVector(u) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(u) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::VXM_VECTOR_VECTOR_MATRIX_ADD_MUL,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+} // end namespace grb
+
+#endif
+
diff --git a/include/graphblas/hyperdags/blas3.hpp b/include/graphblas/hyperdags/blas3.hpp
new file mode 100644
index 000000000..9448f5f57
--- /dev/null
+++ b/include/graphblas/hyperdags/blas3.hpp
@@ -0,0 +1,334 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Implements the BLAS-3 API for the HyperDAGs backend
+ *
+ * @author A. Karanasiou
+ * @date 3rd of March, 2022
+ */
+
+#ifndef _H_GRB_HYPERDAGS_BLAS3
+#define _H_GRB_HYPERDAGS_BLAS3
+
+#include
+#include
+
+#include
+
+#include
+
+
+namespace grb {
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename RIT, typename CIT, typename NIT,
+ class MulMonoid
+ >
+ RC eWiseApply(
+ Matrix< OutputType, hyperdags, RIT, CIT, NIT > &C,
+ const Matrix< InputType1, hyperdags > &A,
+ const Matrix< InputType2, hyperdags > &B,
+ const MulMonoid &mulmono,
+ const Phase phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< MulMonoid >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseApply< descr >(
+ internal::getMatrix( C ),
+ internal::getMatrix( A ), internal::getMatrix( B ),
+ mulmono, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getMatrix(A) ),
+ getID( internal::getMatrix(B) ),
+ getID( internal::getMatrix(C) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getMatrix(C) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEAPPLY_MATRIX_MATRIX_MATRIX_MULMONOID_PHASE,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = grb::descriptors::no_operation,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename RIT, typename CIT, typename NIT,
+ class Operator
+ >
+ RC eWiseApply(
+ Matrix< OutputType, hyperdags, RIT, CIT, NIT > &C,
+ const Matrix< InputType1, hyperdags, RIT, CIT, NIT > &A,
+ const Matrix< InputType2, hyperdags, RIT, CIT, NIT > &B,
+ const Operator &mulOp,
+ const Phase phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< Operator >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseApply< descr >(
+ internal::getMatrix( C ),
+ internal::getMatrix( A ), internal::getMatrix( B ),
+ mulOp, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getMatrix(A) ),
+ getID( internal::getMatrix(B) ),
+ getID( internal::getMatrix(C) ),
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getMatrix(C) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEAPPLY_MATRIX_MATRIX_MATRIX_OPERATOR_PHASE,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, typename OutputType,
+ typename InputType1, typename InputType2,
+ typename RIT, typename CIT, typename NIT,
+ class Semiring
+ >
+ RC mxm(
+ Matrix< OutputType, hyperdags, RIT, CIT, NIT > &C,
+ const Matrix< InputType1, hyperdags, RIT, CIT, NIT > &A,
+ const Matrix< InputType2, hyperdags, RIT, CIT, NIT > &B,
+ const Semiring &ring = Semiring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Semiring >::value, void
+ >::type * const = nullptr
+ ) {
+ const RC ret = mxm< descr >( internal::getMatrix( C ),
+ internal::getMatrix( A ), internal::getMatrix( B ),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getMatrix(A) ),
+ getID( internal::getMatrix(B) ),
+ getID( internal::getMatrix(C) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getMatrix(C) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::MXM_MATRIX_MATRIX_MATRIX_SEMIRING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = grb::descriptors::no_operation,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename RIT, typename CIT, typename NIT,
+ class Operator, class Monoid
+ >
+ RC mxm(
+ Matrix< OutputType, hyperdags, RIT, CIT, NIT > &C,
+ const Matrix< InputType1, hyperdags, RIT, CIT, NIT > &A,
+ const Matrix< InputType2, hyperdags, RIT, CIT, NIT > &B,
+ const Monoid &addM,
+ const Operator &mulOp,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< Operator >::value &&
+ grb::is_monoid< Monoid >::value, void
+ >::type * const = nullptr
+ ) {
+ const RC ret = mxm< descr >(
+ internal::getMatrix( C ),
+ internal::getMatrix( A ), internal::getMatrix( B ),
+ addM, mulOp, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getMatrix(A) ),
+ getID( internal::getMatrix(B) ),
+ getID( internal::getMatrix(C) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getMatrix(C) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::MXM_MATRIX_MATRIX_MATRIX_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename RIT, typename CIT, typename NIT,
+ typename Coords, class Operator
+ >
+ RC outer(
+ Matrix< OutputType, hyperdags, RIT, CIT, NIT > &A,
+ const Vector< InputType1, hyperdags, Coords > &u,
+ const Vector< InputType2, hyperdags, Coords > &v,
+ const Operator &mul = Operator(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_operator< Operator >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< OutputType >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = outer< descr >(
+ internal::getMatrix( A ),
+ internal::getVector( u ), internal::getVector( v ),
+ mul, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(u) ),
+ getID( internal::getVector(v) ),
+ getID( internal::getMatrix(A) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getMatrix(A) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::OUTER,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename InputType3, typename RIT, typename CIT, typename NIT,
+ typename Coords
+ >
+ RC zip(
+ Matrix< OutputType, hyperdags, RIT, CIT, NIT > &A,
+ const Vector< InputType1, hyperdags, Coords > &x,
+ const Vector< InputType2, hyperdags, Coords > &y,
+ const Vector< InputType3, hyperdags, Coords > &z,
+ const Phase &phase = EXECUTE
+ ) {
+ const RC ret = zip< descr >(
+ internal::getMatrix( A ),
+ internal::getVector( x ), internal::getVector( y ),
+ internal::getVector( z ),
+ phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 4 > sourcesC{
+ getID( internal::getMatrix(A) ),
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getMatrix(A) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::ZIP_MATRIX_VECTOR_VECTOR_VECTOR,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType1, typename InputType2,
+ typename RIT, typename CIT, typename NIT,
+ typename Coords
+ >
+ RC zip(
+ Matrix< void, hyperdags, RIT, CIT, NIT > &A,
+ const Vector< InputType1, hyperdags, Coords > &x,
+ const Vector< InputType2, hyperdags, Coords > &y,
+ const Phase &phase = EXECUTE
+ ) {
+ const RC ret = zip< descr >(
+ internal::getMatrix( A ),
+ internal::getVector( x ), internal::getVector( y ),
+ phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getMatrix(A) ),
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getMatrix(A) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::ZIP_MATRIX_VECTOR_VECTOR,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+} // end namespace grb
+
+#endif
+
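Every wrapper in this file records its operation vertex only when the delegated call succeeds and the phase is EXECUTE, so a standard two-phase invocation contributes exactly one vertex to the HyperDAG. A hedged usage sketch of that behaviour for mxm, assuming the usual ALP plus-times semiring composition and illustrative container sizes:

	// Sketch only: assumes the ALP umbrella header and the hyperdags backend.
	#include <graphblas.hpp>

	grb::RC twoPhaseMxm() {
		grb::Matrix< double, grb::hyperdags > C( 10, 10 ), A( 10, 10 ), B( 10, 10 );
		grb::Semiring<
			grb::operators::add< double >, grb::operators::mul< double >,
			grb::identities::zero, grb::identities::one
		> ring;
		grb::RC rc = grb::mxm( C, A, B, ring, grb::RESIZE ); // no vertex recorded
		if( rc == grb::SUCCESS ) {
			rc = grb::mxm( C, A, B, ring, grb::EXECUTE );    // one vertex recorded
		}
		return rc;
	}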
diff --git a/include/graphblas/hyperdags/collectives.hpp b/include/graphblas/hyperdags/collectives.hpp
new file mode 100644
index 000000000..6102db382
--- /dev/null
+++ b/include/graphblas/hyperdags/collectives.hpp
@@ -0,0 +1,128 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Provides the collectives API for the HyperDAGs backend
+ *
+ * Copies the reference implementation
+ *
+ * @author A. N. Yzelman & J. M. Nash
+ * @date 12th of April, 2017
+ */
+
+#ifndef _H_GRB_HYPERDAGS_COLL
+#define _H_GRB_HYPERDAGS_COLL
+
+#include
+
+#include
+
+#define NO_CAST_ASSERT( x, y, z ) \
+ static_assert( x, \
+ "\n\n" \
+ "********************************************************************" \
+ "********************************************************************" \
+ "******************************\n" \
+ "* ERROR | " y " " z ".\n" \
+ "********************************************************************" \
+ "********************************************************************" \
+ "******************************\n" \
+ "* Possible fix 1 | Remove no_casting from the template parameters " \
+ "in this call to " y ".\n" \
+ "* Possible fix 2 | Provide a value of the same type as the first " \
+ "domain of the given operator.\n" \
+ "* Possible fix 3 | Ensure the operator given to this call to " y " h" \
+ "as all of its domains equal to each other.\n" \
+ "********************************************************************" \
+ "********************************************************************" \
+ "******************************\n" );
+
+
+namespace grb {
+
+ template<>
+ class collectives< hyperdags > {
+
+ private:
+
+ /** Disallow instantiation of this class. */
+ collectives() {}
+
+ public:
+
+ /**
+ * Implementation details: delegates to the collectives of the underlying
+ * backend.
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Operator, typename IOType
+ >
+ static RC allreduce(
+ IOType &inout, const Operator op = Operator()
+ ) {
+ return grb::collectives< grb::_GRB_WITH_HYPERDAGS_USING >::allreduce(
+ inout, op
+ );
+ }
+
+ /**
+ * Implementation details: delegates to the collectives of the underlying
+ * backend.
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Operator, typename IOType
+ >
+ static RC reduce(
+ IOType &inout, const size_t root = 0, const Operator op = Operator()
+ ) {
+ return grb::collectives< grb::_GRB_WITH_HYPERDAGS_USING >::reduce(
+ inout, root, op
+ );
+ }
+
+ /**
+ * Implementation details: delegates to the collectives of the underlying
+ * backend.
+ */
+ template< typename IOType >
+ static RC broadcast( IOType &inout, const size_t root = 0 ) {
+ return grb::collectives< grb::_GRB_WITH_HYPERDAGS_USING >::broadcast(
+ inout, root
+ );
+ }
+
+ /** Implementation details: delegates to the underlying backend. */
+ template< Descriptor descr = descriptors::no_operation, typename IOType >
+ static RC broadcast(
+ IOType * inout, const size_t size, const size_t root = 0
+ ) {
+ return grb::collectives< grb::_GRB_WITH_HYPERDAGS_USING >::broadcast(
+ inout, size, root
+ );
+ }
+
+ }; // end class `collectives< hyperdags >'
+
+} // namespace grb
+
+#endif // end ``_H_GRB_HYPERDAGS_COLL''
+
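The NO_CAST_ASSERT macro defined above wraps a static_assert so that violations of the no_casting descriptor produce a boxed, human-readable compile-time error. A minimal sketch of the same technique, with hypothetical names and a far shorter banner:

	#include <type_traits>

	// Hypothetical, much-shortened analogue of NO_CAST_ASSERT.
	#define MY_NO_CAST_ASSERT( cond, fname ) \
		static_assert( cond, "ERROR in " fname ": domains must match (no_casting)" );

	template< typename D, typename T >
	void applyChecked( const T &x ) {
		// Fires at instantiation whenever the argument type T differs from
		// the expected domain D.
		MY_NO_CAST_ASSERT( (std::is_same< D, T >::value), "applyChecked" )
		(void) x;
	}

	int main() {
		applyChecked< int >( 5 );      // OK: D = int, T = int
		// applyChecked< int >( 5.0 ); // would trigger the static_assert
		return 0;
	}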
diff --git a/include/graphblas/hyperdags/config.hpp b/include/graphblas/hyperdags/config.hpp
new file mode 100644
index 000000000..dbf0cc1ca
--- /dev/null
+++ b/include/graphblas/hyperdags/config.hpp
@@ -0,0 +1,107 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Contains the configuration parameters for the HyperDAGs backend
+ *
+ * @author A. N. Yzelman
+ * @date 31st of January 2022.
+ */
+
+#ifndef _H_GRB_HYPERDAGS_CONFIG
+#define _H_GRB_HYPERDAGS_CONFIG
+
+#include
+
+#ifndef _GRB_WITH_HYPERDAGS_USING
+ #error "_GRB_WITH_HYPERDAGS_USING must be defined"
+#endif
+
+
+namespace grb {
+
+ namespace config {
+
+ /**
+ * The implementation details of the #grb::hyperdags backend.
+ *
+ * Since the HyperDAGs backend simply intercepts primitive calls and relies
+ * on a second backend for its functional execution, this class simply
+ * delegates all fields to that underlying backend.
+ *
+ * \note The user documentation only specifies the fields that users may,
+ * under some circumstances, benefit from adapting. For a view of all
+ * fields, please see the developer documentation.
+ *
+ * \note Adapting the fields should be done with care and may require
+ * re-compilation and re-installation of the ALP framework.
+ */
+ template<>
+ class IMPLEMENTATION< hyperdags > {
+
+ public:
+
+ /**
+ * @returns The default allocation policy for private memory regions of the
+ * underlying backend.
+ */
+ static constexpr ALLOC_MODE defaultAllocMode() {
+ return IMPLEMENTATION< _GRB_WITH_HYPERDAGS_USING >::defaultAllocMode();
+ }
+
+ /**
+ * @returns The default allocation policy for shared memory regions of the
+ * underlying backend.
+ */
+ static constexpr ALLOC_MODE sharedAllocMode() {
+ return IMPLEMENTATION< _GRB_WITH_HYPERDAGS_USING >::sharedAllocMode();
+ }
+
+ /**
+ * \internal
+ * @returns The default vector coordinates instance of the underlying
+ * backend.
+ *
+ * \note This is an extension for compatibility with the reference and BSP1D
+ * backends.
+ * \endinternal
+ */
+ static constexpr Backend coordinatesBackend() {
+ return IMPLEMENTATION< _GRB_WITH_HYPERDAGS_USING >::coordinatesBackend();
+ }
+
+ /**
+ * \internal
+ * @returns The fixed vector capacity property of the underlying
+ * implementation.
+ * \endinternal
+ */
+ static constexpr bool fixedVectorCapacities() {
+ return IMPLEMENTATION< _GRB_WITH_HYPERDAGS_USING >::
+ fixedVectorCapacities();
+ }
+
+ };
+
+ }
+
+} // end namespace grb
+
+#endif
+
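The IMPLEMENTATION< hyperdags > specialisation above forwards every configuration field at compile time, so selecting the HyperDAGs backend cannot silently change any allocation or capacity policy of the backend it wraps. A simplified, hypothetical mirror of that constexpr delegation pattern:

	// Hypothetical, simplified mirror of the delegation; not the ALP types.
	enum class Backend { reference, hyperdags };

	template< Backend b >
	struct Impl;

	template<>
	struct Impl< Backend::reference > {
		static constexpr bool fixedVectorCapacities() { return true; }
	};

	template<>
	struct Impl< Backend::hyperdags > {
		// Forwards to the compiled-in underlying backend, at compile time.
		static constexpr bool fixedVectorCapacities() {
			return Impl< Backend::reference >::fixedVectorCapacities();
		}
	};

	static_assert( Impl< Backend::hyperdags >::fixedVectorCapacities(),
		"the delegated value is available as a constant expression" );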
diff --git a/include/graphblas/hyperdags/exec.hpp b/include/graphblas/hyperdags/exec.hpp
new file mode 100644
index 000000000..376e78b5b
--- /dev/null
+++ b/include/graphblas/hyperdags/exec.hpp
@@ -0,0 +1,104 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Provides the Launcher for the HyperDAGs backend
+ *
+ * @author A. N. Yzelman
+ * @date 31st of January, 2022
+ */
+
+#ifndef _H_GRB_HYPERDAGS_EXEC
+#define _H_GRB_HYPERDAGS_EXEC
+
+#include
+#include
+
+
+namespace grb {
+
+ /**
+ * No implementation notes.
+ */
+ template< EXEC_MODE mode >
+ class Launcher< mode, hyperdags > {
+
+ private:
+
+ /**
+ * Rely on underlying backend.
+ */
+ typedef Launcher< mode, _GRB_WITH_HYPERDAGS_USING > MyLauncherType;
+
+ /**
+ * Instantiate the sub-backend.
+ */
+ MyLauncherType launcher;
+
+
+ public:
+
+ /**
+ * Default constructor.
+ *
+ * Simply forwards to the constructor of the underlying launcher.
+ */
+ Launcher(
+ const size_t process_id = 0, const size_t nprocs = 1,
+ const std::string hostname = "localhost",
+ const std::string port = "0"
+ ) : launcher( process_id, nprocs, hostname, port ) {}
+
+ /**
+ * Variable input-size execution.
+ *
+ * Simply calls underlying launcher.
+ */
+ template< typename U >
+ RC exec(
+ void ( *grb_program )( const void *, const size_t, U & ),
+ const void * data_in,
+ const size_t in_size,
+ U &data_out,
+ const bool broadcast = false
+ ) {
+ return launcher.exec( grb_program, data_in, in_size, data_out, broadcast );
+ }
+
+ /**
+ * Fixed-size execution.
+ *
+ * Simply calls underlying launcher.
+ */
+ template< typename T, typename U >
+ RC exec(
+ void ( *grb_program )( const T &, U & ),
+ const T &data_in,
+ U &data_out,
+ const bool broadcast = false
+ ) {
+ return launcher.exec( grb_program, data_in, data_out, broadcast );
+ }
+
+ };
+
+}
+
+#endif
+
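Since this Launcher forwards both exec variants unchanged, running an ALP program under the HyperDAGs backend requires no changes to the launch code itself. A hedged fixed-size usage sketch (the program body and values are illustrative):

	#include <graphblas.hpp>

	// An arbitrary ALP program: reads an int, writes back its double.
	void grbProgram( const int &in, double &out ) {
		out = 2.0 * in;
	}

	int main() {
		grb::Launcher< grb::AUTOMATIC > launcher;
		double out = 0.0;
		const grb::RC rc = launcher.exec( &grbProgram, 21, out, true );
		return ( rc == grb::SUCCESS && out == 42.0 ) ? 0 : 1;
	}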
diff --git a/include/graphblas/hyperdags/hyperdags.hpp b/include/graphblas/hyperdags/hyperdags.hpp
new file mode 100644
index 000000000..4ef0e0059
--- /dev/null
+++ b/include/graphblas/hyperdags/hyperdags.hpp
@@ -0,0 +1,1305 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Provides mechanisms to track HyperDAG representations of ALP programs
+ *
+ * @author A. N. Yzelman
+ * @date 1st of February, 2022
+ */
+
+#ifndef _H_GRB_HYPERDAGS_STATE
+#define _H_GRB_HYPERDAGS_STATE
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+
+namespace grb {
+
+ namespace internal {
+
+ namespace hyperdags {
+
+ /** \internal The three vertex types in a HyperDAG */
+ enum VertexType {
+ SOURCE,
+ OPERATION,
+ OUTPUT
+ };
+
+ // 1: all source vertex definitions
+
+ /** \internal The types of source vertices that may be generated. */
+ enum SourceVertexType {
+
+ /**
+ * \internal Scalars are always handled as a new source. We do not track
+ * whether the same scalars are re-used, because we cannot reliably do so
+ * due to the lack of a grb::Scalar.
+ */
+ SCALAR,
+
+ /**
+ * \internal The source is a container managed by ALP.
+ */
+ CONTAINER,
+
+ /**
+ * \internal The source is an iterator passed to ALP.
+ */
+ ITERATOR,
+
+ /**
+ * \internal The source is a user integer passed to ALP, usually signifying
+ * an index or a size.
+ */
+ USER_INT
+
+ };
+
+ /** \internal The number of source vertex types. */
+ const constexpr size_t numSourceVertexTypes = 4;
+
+ /** \internal An array of all source vertex types. */
+ const constexpr enum SourceVertexType
+ allSourceVertexTypes[ numSourceVertexTypes ] =
+ {
+ SCALAR,
+ CONTAINER,
+ ITERATOR,
+ USER_INT
+ };
+
+ /** \internal @returns The type, as a string, of a source vertex. */
+ std::string toString( const enum SourceVertexType type ) noexcept;
+
+ /** \internal A source vertex. */
+ class SourceVertex {
+
+ private:
+
+ /** \internal The type of source */
+ enum SourceVertexType type;
+
+ /** \internal The ID amongst vertices of the same type */
+ size_t local_id;
+
+ /** \internal The global ID of the vertex */
+ size_t global_id;
+
+
+ public:
+
+ /**
+ * \internal The default source vertex constructor.
+ *
+ * @param[in] type The type of the vertex.
+ * @param[in] lid The ID amongst vertices of the same type.
+ * @param[in] gid The global ID of the vertex.
+ */
+ SourceVertex(
+ const enum SourceVertexType type,
+ const size_t lid, const size_t gid
+ ) noexcept;
+
+ /** \internal @returns The vertex type. */
+ enum SourceVertexType getType() const noexcept;
+
+ /** \internal @returns The type ID. */
+ size_t getLocalID() const noexcept;
+
+ /** \internal @returns The global ID. */
+ size_t getGlobalID() const noexcept;
+
+ };
+
+ /** \internal Helps create a new source vertex */
+ class SourceVertexGenerator {
+
+ private:
+
+ /** \internal Map of next local IDs. */
+ std::map< enum SourceVertexType, size_t > nextID;
+
+
+ public:
+
+ /** \internal Default constructor. */
+ SourceVertexGenerator();
+
+ /**
+ * \internal
+ *
+ * @param[in] type the type of source vertex
+ * @param[in] id a unique global ID
+ *
+ * @returns a new source vertex with a unique local ID
+ *
+ * \endinternal
+ */
+ SourceVertex create( const SourceVertexType type, const size_t id );
+
+ /**
+ * \internal
+ *
+ * @returns The total number of source vertices generated, of any type.
+ *
+ * \endinternal
+ */
+ size_t size() const;
+
+ };
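SourceVertexGenerator dispenses local IDs per vertex type by keeping a map from type to the next free ID; std::map::operator[] value-initialises absent entries to zero, so the first vertex of each type receives local ID 0. A compact sketch of that dispensing pattern (simplified, not the actual implementation):

	#include <cstddef>
	#include <iostream>
	#include <map>

	enum class Type { SCALAR, CONTAINER };

	// Simplified generator: returns a fresh local ID for the given type.
	struct Generator {
		std::map< Type, std::size_t > nextID;
		std::size_t create( const Type t ) { return nextID[ t ]++; }
	};

	int main() {
		Generator gen;
		std::cout << gen.create( Type::SCALAR )      // 0
			<< " " << gen.create( Type::SCALAR )     // 1
			<< " " << gen.create( Type::CONTAINER )  // 0
			<< "\n";
		return 0;
	}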
+
+ // 2: everything related to output vertices
+
+ /** \internal The types of output vertices that may be generated. */
+ enum OutputVertexType {
+
+ /**
+ * \internal The output is an ALP container.
+ */
+ CONTAINER_OUTPUT
+
+ };
+
+ /** \internal The number of distinct output vertex types. */
+ const constexpr size_t numOutputVertexTypes = 1;
+
+ /** \internal An array of output vertex types. */
+ const constexpr enum OutputVertexType
+ allOutputVertexTypes[ numOutputVertexTypes ] =
+ {
+ CONTAINER_OUTPUT
+ };
+
+ /** \internal @returns A string form of a given output vertex type. */
+ std::string toString( const enum OutputVertexType type ) noexcept;
+
+ /** \internal An output vertex. */
+ class OutputVertex {
+
+ private:
+
+ /** \internal The type of the output */
+ enum OutputVertexType type;
+
+ /** \internal The output vertex ID */
+ const size_t local_id;
+
+ /** \internal The global vertex ID */
+ const size_t global_id;
+
+
+ public:
+
+ /**
+ * \internal Default constructor.
+ *
+ * @param[in] lid The ID within vertices of this type.
+ * @param[in] gid The global vertex ID.
+ *
+ * Recall there is only one output vertex type, hence the precise type is
+ * not a constructor argument.
+ */
+ OutputVertex( const size_t lid, const size_t gid ) noexcept;
+
+ /** \internal @returns The type of this output vertex. */
+ enum OutputVertexType getType() const noexcept;
+
+ /** \internal @returns The ID amongst vertices of the same type. */
+ size_t getLocalID() const noexcept;
+
+ /** \internal @returns The ID amongst all vertices. */
+ size_t getGlobalID() const noexcept;
+
+ };
+
+ /** \internal Helps create output vertices. */
+ class OutputVertexGenerator {
+
+ private:
+
+ /** \internal Keeps track of the next output vertex ID. */
+ size_t nextID;
+
+
+ public:
+
+ /** \internal Default constructor. */
+ OutputVertexGenerator() noexcept;
+
+ /**
+ * \internal
+ *
+ * @param[in] id a unique global ID
+ *
+ * @returns a new output vertex with a unique local ID
+ *
+ * \endinternal
+ */
+ OutputVertex create( const size_t id );
+
+ /**
+ * \internal
+ *
+ * @returns The total number of output vertices generated.
+ *
+ * \endinternal
+ */
+ size_t size() const noexcept;
+
+ };
+
+ // 3: everything related to operation vertices
+
+ /** \internal Which operation an OperationVertex encodes. */
+ enum OperationVertexType {
+
+ NNZ_VECTOR,
+
+ NNZ_MATRIX,
+
+ CLEAR_VECTOR,
+
+ SET_VECTOR_ELEMENT,
+
+ DOT,
+
+ SET_USING_VALUE,
+
+ SET_USING_MASK_AND_VECTOR,
+
+ SET_USING_MASK_AND_SCALAR,
+
+ SET_FROM_VECTOR,
+
+ ZIP,
+
+ E_WISE_APPLY_VECTOR_VECTOR_VECTOR_OP,
+
+ FOLDR_VECTOR_SCALAR_MONOID,
+
+ FOLDR_VECTOR_MASK_SCALAR_MONOID,
+
+ FOLDL_SCALAR_VECTOR_MONOID,
+
+ FOLDL_SCALAR_VECTOR_MASK_MONOID,
+
+ EWISELAMBDA,
+
+ BUILD_VECTOR,
+
+ BUILD_VECTOR_WITH_VALUES,
+
+ SIZE,
+
+ NROWS,
+
+ NCOLS,
+
+ EWISEAPPLY_VECTOR_ALPHA_BETA_OP,
+
+ EWISEAPPLY_VECTOR_ALPHA_VECTOR_OP,
+
+ EWISEAPPLY_VECTOR_VECTOR_BETA_OP,
+
+ EWISEAPPLY_VECTOR_VECTOR_VECTOR_OP,
+
+ EWISEAPPLY_VECTOR_MASK_ALPHA_BETA_OP,
+
+ EWISEAPPLY_VECTOR_MASK_ALPHA_VECTOR_OP,
+
+ EWISEAPPLY_VECTOR_MASK_VECTOR_BETA_OP,
+
+ EWISEAPPLY_VECTOR_MASK_VECTOR_VECTOR_OP,
+
+ EWISEAPPLY_VECTOR_ALPHA_BETA_MONOID,
+
+ EWISEAPPLY_VECTOR_ALPHA_VECTOR_MONOID,
+
+ EWISEAPPLY_VECTOR_VECTOR_BETA_MONOID,
+
+ EWISEAPPLY_VECTOR_VECTOR_VECTOR_MONOID,
+
+ EWISEAPPLY_VECTOR_MASK_ALPHA_BETA_MONOID,
+
+ EWISEAPPLY_VECTOR_MASK_ALPHA_VECTOR_MONOID,
+
+ EWISEAPPLY_VECTOR_MASK_VECTOR_BETA_MONOID,
+
+ EWISEAPPLY_VECTOR_MASK_VECTOR_VECTOR_MONOID,
+
+ EWISE_MUL_ADD,
+
+ EWISE_MUL_ADD_FOUR_VECTOR,
+
+ EWISE_MUL_ADD_THREE_VECTOR_ALPHA,
+
+ EWISE_MUL_ADD_THREE_VECTOR_CHI,
+
+ EWISE_MUL_ADD_FOUR_VECTOR_CHI,
+
+ EWISE_MUL_ADD_FOUR_VECTOR_CHI_RING,
+
+ EWISE_MUL_ADD_THREE_VECTOR_BETA,
+
+ EWISE_MUL_ADD_THREE_VECTOR_ALPHA_GAMMA,
+
+ EWISE_MUL_ADD_TWO_VECTOR_ALPHA_BETA,
+
+ EWISE_MUL_ADD_TWO_VECTOR_ALPHA_BETA_GAMMA,
+
+ EWISEAPPLY_MATRIX_MATRIX_MATRIX_MULMONOID_PHASE,
+
+ EWISEAPPLY_MATRIX_MATRIX_MATRIX_OPERATOR_PHASE,
+
+ SET_MATRIX_MATRIX,
+
+ SET_MATRIX_MATRIX_INPUT2,
+
+ MXM_MATRIX_MATRIX_MATRIX_SEMIRING,
+
+ MXM_MATRIX_MATRIX_MATRIX_MONOID,
+
+ OUTER,
+
+ UNZIP_VECTOR_VECTOR_VECTOR,
+
+ ZIP_MATRIX_VECTOR_VECTOR_VECTOR,
+
+ ZIP_MATRIX_VECTOR_VECTOR,
+
+ CLEAR_MATRIX,
+
+ EWISEMULADD_VECTOR_VECTOR_VECTOR_GAMMA_RING,
+
+ EWISEMULADD_VECTOR_VECTOR_BETA_GAMMA_RING,
+
+ EWISEMULADD_VECTOR_ALPHA_VECTOR_GAMMA_RING,
+
+ EWISEMULADD_VECTOR_ALPHA_BETA_VECTOR_RING,
+
+ EWISEMULADD_VECTOR_ALPHA_BETA_GAMMA_RING,
+
+ EWISEMULADD_VECTOR_VECTOR_VECTOR_VECTOR_RING,
+
+ VXM_VECTOR_VECTOR_VECTOR_MATRIX,
+
+ VXM_VECTOR_VECTOR_VECTOR_MATRIX_ADD_MUL,
+
+ VXM_VECTOR_VECTOR_MATRIX_RING,
+
+ MXV_VECTOR_VECTOR_MATRIX_VECTOR_RING,
+
+ MXV_VECTOR_VECTOR_MATRIX_VECTOR_VECTOR_R,
+
+ MXV_VECTOR_VECTOR_MATRIX_VECTOR_VECTOR_A,
+
+ MXV_VECTOR_MATRIX_VECTOR_RING,
+
+ MXV_VECTOR_MATRIX_VECTOR_ADD_MUL,
+
+ BUILDMATRIXUNIQUE_MATRIX_START_END_MODE,
+
+ CAPACITY_VECTOR,
+
+ CAPACITY_MATRIX,
+
+ RESIZE,
+
+ RESIZE_MATRIX,
+
+ GETID_VECTOR,
+
+ GETID_MATRIX,
+
+ EWISELAMBDA_FUNC_MATRIX,
+
+ VXM_GENERIC_VECTOR_VECTOR_VECTOR_VECTOR_MATRIX_ADD_MUL,
+
+ VXM_VECTOR_VECTOR_VECTOR_VECTOR_MATRIX_ADD_MUL,
+
+ VXM_VECTOR_VECTOR_MATRIX_ADD_MUL,
+
+ FOLDL_VECTOR_BETA_OP,
+
+ FOLDL_VECTOR_VECTOR_BETA_OP,
+
+ FOLDL_VECTOR_BETA_MONOID,
+
+ FOLDL_VECTOR_VECTOR_BETA_MONOID,
+
+ FOLDL_VECTOR_VECTOR_MONOID,
+
+ FOLDL_VECTOR_VECTOR_VECTOR_MONOID,
+
+ FOLDL_VECTOR_VECTOR_VECTOR_OP,
+
+ FOLDL_VECTOR_VECTOR_OP,
+
+				FOLDR_ALPHA_VECTOR_MONOID,
+
+				FOLDR_ALPHA_VECTOR_OPERATOR,
+
+ FOLDR_VECTOR_VECTOR_OPERATOR,
+
+ FOLDR_VECTOR_VECTOR_VECTOR_OPERATOR,
+
+ FOLDR_VECTOR_VECTOR_MONOID,
+
+ FOLDR_VECTOR_VECTOR_VECTOR_MONOID,
+
+ EWISEMUL_VECTOR_VECTOR_VECTOR_RING,
+
+ EWISEMUL_VECTOR_ALPHA_VECTOR_RING,
+
+ EWISEMUL_VECTOR_VECTOR_BETA_RING,
+
+ EWISEMUL_VECTOR_ALPHA_BETA_RING,
+
+ EWISEMUL_VECTOR_VECTOR_VECTOR_VECTOR_RING,
+
+ EWISEMUL_VECTOR_VECTOR_ALPHA_VECTOR_RING,
+
+ EWISEMUL_VECTOR_VECTOR_VECTOR_BETA_RING,
+
+ EWISEMUL_VECTOR_VECTOR_ALPHA_BETA_RING,
+
+ EWISELAMBDA_FUNC_VECTOR
+
+ };
+
+			/** \internal How many operation vertex types exist; this count must
+			 *            match the number of entries in #OperationVertexType. */
+			const constexpr size_t numOperationVertexTypes = 106;
+
+ /** \internal An array of all operation vertex types. */
+ const constexpr enum OperationVertexType
+ allOperationVertexTypes[ numOperationVertexTypes ] =
+ {
+ NNZ_VECTOR,
+ NNZ_MATRIX,
+ CLEAR_VECTOR,
+ SET_VECTOR_ELEMENT,
+ DOT,
+ SET_USING_VALUE,
+ SET_USING_MASK_AND_VECTOR,
+ SET_USING_MASK_AND_SCALAR,
+ SET_FROM_VECTOR,
+ ZIP,
+ E_WISE_APPLY_VECTOR_VECTOR_VECTOR_OP,
+ FOLDR_VECTOR_SCALAR_MONOID,
+ FOLDR_VECTOR_MASK_SCALAR_MONOID,
+ FOLDL_SCALAR_VECTOR_MONOID,
+ FOLDL_SCALAR_VECTOR_MASK_MONOID,
+ EWISELAMBDA,
+ BUILD_VECTOR,
+ BUILD_VECTOR_WITH_VALUES,
+ SIZE,
+ NROWS,
+ NCOLS,
+ EWISEAPPLY_VECTOR_ALPHA_BETA_OP,
+ EWISEAPPLY_VECTOR_ALPHA_VECTOR_OP,
+ EWISEAPPLY_VECTOR_VECTOR_BETA_OP,
+ EWISEAPPLY_VECTOR_VECTOR_VECTOR_OP,
+ EWISEAPPLY_VECTOR_MASK_ALPHA_BETA_OP,
+ EWISEAPPLY_VECTOR_MASK_ALPHA_VECTOR_OP,
+ EWISEAPPLY_VECTOR_MASK_VECTOR_BETA_OP,
+ EWISEAPPLY_VECTOR_MASK_VECTOR_VECTOR_OP,
+ EWISEAPPLY_VECTOR_ALPHA_BETA_MONOID,
+ EWISEAPPLY_VECTOR_ALPHA_VECTOR_MONOID,
+ EWISEAPPLY_VECTOR_VECTOR_BETA_MONOID,
+ EWISEAPPLY_VECTOR_VECTOR_VECTOR_MONOID,
+ EWISEAPPLY_VECTOR_MASK_ALPHA_BETA_MONOID,
+ EWISEAPPLY_VECTOR_MASK_ALPHA_VECTOR_MONOID,
+ EWISEAPPLY_VECTOR_MASK_VECTOR_BETA_MONOID,
+ EWISEAPPLY_VECTOR_MASK_VECTOR_VECTOR_MONOID,
+ EWISE_MUL_ADD,
+ EWISE_MUL_ADD_FOUR_VECTOR,
+ EWISE_MUL_ADD_THREE_VECTOR_ALPHA,
+ EWISE_MUL_ADD_THREE_VECTOR_CHI,
+ EWISE_MUL_ADD_FOUR_VECTOR_CHI,
+ EWISE_MUL_ADD_FOUR_VECTOR_CHI_RING,
+ EWISE_MUL_ADD_THREE_VECTOR_BETA,
+ EWISE_MUL_ADD_THREE_VECTOR_ALPHA_GAMMA,
+ EWISE_MUL_ADD_TWO_VECTOR_ALPHA_BETA,
+ EWISE_MUL_ADD_TWO_VECTOR_ALPHA_BETA_GAMMA,
+ EWISEAPPLY_MATRIX_MATRIX_MATRIX_MULMONOID_PHASE,
+ EWISEAPPLY_MATRIX_MATRIX_MATRIX_OPERATOR_PHASE,
+ SET_MATRIX_MATRIX,
+ SET_MATRIX_MATRIX_INPUT2,
+ MXM_MATRIX_MATRIX_MATRIX_SEMIRING,
+ MXM_MATRIX_MATRIX_MATRIX_MONOID,
+ OUTER,
+ UNZIP_VECTOR_VECTOR_VECTOR,
+ ZIP_MATRIX_VECTOR_VECTOR_VECTOR,
+ ZIP_MATRIX_VECTOR_VECTOR,
+ CLEAR_MATRIX,
+ EWISEMULADD_VECTOR_VECTOR_VECTOR_GAMMA_RING,
+ EWISEMULADD_VECTOR_VECTOR_BETA_GAMMA_RING,
+ EWISEMULADD_VECTOR_ALPHA_VECTOR_GAMMA_RING,
+ EWISEMULADD_VECTOR_ALPHA_BETA_VECTOR_RING,
+ EWISEMULADD_VECTOR_ALPHA_BETA_GAMMA_RING,
+ EWISEMULADD_VECTOR_VECTOR_VECTOR_VECTOR_RING,
+ VXM_VECTOR_VECTOR_VECTOR_MATRIX,
+ VXM_VECTOR_VECTOR_VECTOR_MATRIX_ADD_MUL,
+ VXM_VECTOR_VECTOR_MATRIX_RING,
+ MXV_VECTOR_VECTOR_MATRIX_VECTOR_RING,
+ MXV_VECTOR_VECTOR_MATRIX_VECTOR_VECTOR_R,
+ MXV_VECTOR_VECTOR_MATRIX_VECTOR_VECTOR_A,
+ MXV_VECTOR_MATRIX_VECTOR_RING,
+ MXV_VECTOR_MATRIX_VECTOR_ADD_MUL,
+ BUILDMATRIXUNIQUE_MATRIX_START_END_MODE,
+ CAPACITY_VECTOR,
+ CAPACITY_MATRIX,
+ RESIZE,
+ RESIZE_MATRIX,
+ GETID_VECTOR,
+ GETID_MATRIX,
+ EWISELAMBDA_FUNC_MATRIX,
+ VXM_GENERIC_VECTOR_VECTOR_VECTOR_VECTOR_MATRIX_ADD_MUL,
+ VXM_VECTOR_VECTOR_VECTOR_VECTOR_MATRIX_ADD_MUL,
+ VXM_VECTOR_VECTOR_MATRIX_ADD_MUL,
+ FOLDL_VECTOR_BETA_OP,
+ FOLDL_VECTOR_VECTOR_BETA_OP,
+ FOLDL_VECTOR_BETA_MONOID,
+ FOLDL_VECTOR_VECTOR_BETA_MONOID,
+ FOLDL_VECTOR_VECTOR_MONOID,
+ FOLDL_VECTOR_VECTOR_VECTOR_MONOID,
+ FOLDL_VECTOR_VECTOR_VECTOR_OP,
+ FOLDL_VECTOR_VECTOR_OP,
+					FOLDR_ALPHA_VECTOR_MONOID,
+					FOLDR_ALPHA_VECTOR_OPERATOR,
+ FOLDR_VECTOR_VECTOR_OPERATOR,
+ FOLDR_VECTOR_VECTOR_VECTOR_OPERATOR,
+ FOLDR_VECTOR_VECTOR_MONOID,
+ FOLDR_VECTOR_VECTOR_VECTOR_MONOID,
+ EWISEMUL_VECTOR_VECTOR_VECTOR_RING,
+ EWISEMUL_VECTOR_ALPHA_VECTOR_RING,
+ EWISEMUL_VECTOR_VECTOR_BETA_RING,
+ EWISEMUL_VECTOR_ALPHA_BETA_RING,
+ EWISEMUL_VECTOR_VECTOR_VECTOR_VECTOR_RING,
+ EWISEMUL_VECTOR_VECTOR_ALPHA_VECTOR_RING,
+ EWISEMUL_VECTOR_VECTOR_VECTOR_BETA_RING,
+ EWISEMUL_VECTOR_VECTOR_ALPHA_BETA_RING,
+ EWISELAMBDA_FUNC_VECTOR
+ };
+
+ /** \internal @returns The operation vertex type as a string. */
+ std::string toString( const enum OperationVertexType ) noexcept;
+
+ /** \internal An operation vertex */
+ class OperationVertex {
+
+ private:
+
+ /** \internal The type of the vertex. */
+ const enum OperationVertexType type;
+
+ /** \internal The ID amongst vertices of the same type. */
+ const size_t local_id;
+
+ /** \internal The ID amongst all vertices. */
+ const size_t global_id;
+
+
+ public:
+
+ /**
+ * \internal
+ * Base constructor.
+ *
+ * @param[in] type The type of the new operation vertex.
+ * @param[in] lid An ID amongst vertices of the same type.
+ * @param[in] gid An ID unique amongst all vertices.
+ * \endinternal
+ */
+ OperationVertex(
+ const enum OperationVertexType type,
+ const size_t lid, const size_t gid
+ ) noexcept;
+
+ /** \internal @returns The type of this vertex. */
+ enum OperationVertexType getType() const noexcept;
+
+ /**
+ * \internal
+ * @returns An ID unique amongst all vertices of the same type.
+ * \endinternal
+ */
+ size_t getLocalID() const noexcept;
+
+ /**
+ * \internal
+ * @returns An ID unique amongst all vertices, regardless of type.
+ * \endinternal
+ */
+ size_t getGlobalID() const noexcept;
+
+ };
+
+ /** \internal Helps generate operation vertices. */
+ class OperationVertexGenerator {
+
+ private:
+
+ /**
+ * \internal
+ * A map that keeps track of the number of vertices of each type.
+ * \endinternal
+ */
+ std::map< enum OperationVertexType, size_t > nextID;
+
+
+ public:
+
+ /** \internal Base constructor. */
+ OperationVertexGenerator();
+
+ /**
+ * \internal
+ *
+				 * @param[in] type The type of the new operation vertex.
+				 * @param[in] id   A unique global ID.
+				 *
+				 * @returns a new operation vertex with a unique local ID
+ *
+ * \endinternal
+ */
+ OperationVertex create(
+ const OperationVertexType type,
+ const size_t id
+ );
+
+ /**
+ * \internal
+ *
+				 * @returns The total number of operation vertices generated.
+ *
+ * \endinternal
+ */
+ size_t size() const;
+
+ };
+
+ /**
+ * \internal
+ *
+ * Encodes any directed hypergraph that may yet grow.
+ *
+ * \endinternal
+ */
+ class DHypergraph {
+
+ private:
+
+ /** \internal The total number of vertices in the hypergraph. */
+ size_t num_vertices;
+
+ /**
+ * \internal
+ *
+				 * All hyperedges in the hypergraph, stored as a map from source
+				 * vertex ID to the set of destination vertex IDs.
+ *
+ * \endinternal
+ */
+ std::map< size_t, std::set< size_t > > hyperedges;
+
+ /** \internal The total number of pins in the hypergraph. */
+ size_t num_pins;
+
+
+ public:
+
+ DHypergraph() noexcept;
+
+ /**
+ * \internal
+ *
+				 * @param[in] source The global ID of the vertex from which the new
+				 *                   hyperedge emanates.
+				 * @param[in] start  An iterator over the vertex IDs to be added to
+				 *                   the hyperedge.
+				 * @param[in] end    The matching end iterator.
+ *
+ * There must be at least one vertex ID added, or undefined behaviour will
+ * occur.
+ *
+ * Non-unique elements in the IDs to be added will be filtered out.
+ *
+ * Performance is log-linear in the number of IDs to be added.
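+				 *
+				 * A minimal usage sketch (the instance and IDs are illustrative
+				 * only):
+				 * \code
+				 *  DHypergraph h;
+				 *  const size_t src = h.createVertex();
+				 *  const size_t dst = h.createVertex();
+				 *  std::vector< size_t > dsts{ dst };
+				 *  h.appendHyperedge( src, dsts.begin(), dsts.end() );
+				 * \endcode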
+ * \endinternal
+ */
+ template< typename FwdIt >
+ void appendHyperedge(
+ const size_t source,
+ FwdIt start, const FwdIt &end
+ ) {
+ static_assert( std::is_unsigned<
+ typename std::iterator_traits< FwdIt >::value_type
+				>::value, "Expected an iterator over unsigned integral values" );
+#ifdef _DEBUG
+ std::cerr << "in appendHyperedge\n\t source " << source
+ << "\n\t adds destinations ( ";
+ std::vector< size_t > warn;
+#endif
+ const auto it = hyperedges.find( source );
+ if( it == hyperedges.end() ) {
+ hyperedges[ source ] = std::set< size_t >();
+ }
+
+ std::set< size_t > &toAdd = hyperedges[ source ];
+ for( ; start != end; ++start ) {
+ assert( *start < num_vertices );
+ if( toAdd.find( static_cast< size_t >( *start ) ) == toAdd.end() ) {
+ toAdd.insert( *start );
+ (void) ++num_pins;
+#ifdef _DEBUG
+ std::cerr << *start << " ";
+#endif
+ } else {
+#ifdef _DEBUG
+ warn.push_back( *start );
+#endif
+ }
+ }
+#ifdef _DEBUG
+ std::cerr << ")\n";
+ if( warn.size() > 0 ) {
+ std::cerr << "\t Warning: the following edges were multiply-defined: ( ";
+ for( const auto &id : warn ) {
+ std::cerr << id << " ";
+						}
+						std::cerr << ")\n";
+					}
+					std::cerr << "\t exiting\n";
+#endif
+ }
+
+ /**
+ * \internal
+ *
+ * Creates a new vertex and returns its global ID.
+ *
+ * \endinternal
+ */
+ size_t createVertex() noexcept;
+
+ /** \internal @returns The number of vertices in the current graph. */
+ size_t numVertices() const noexcept;
+
+ /** \internal @returns The number of hyperedges in the current graph. */
+ size_t numHyperedges() const noexcept;
+
+ /** \internal @returns The total number of pins in the current graph. */
+ size_t numPins() const noexcept;
+
+ /**
+ * \internal
+ *
+ * Prints the hypergraph to a given output stream as a series of
+ * hyperedges. The output format is MatrixMarket-like, where every
+ * hyperedge is assigned a unique ID, and every hyperedge-to-vertex pair
+ * then is printed to \a out.
+ *
+ * @param[in,out] out Where to print the hypergraph to.
+ *
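+				 * For illustration, a hypergraph with two hyperedges, one covering
+				 * vertices { 0, 2 } and the other { 1, 2 }, could emit the pin pairs
+				 * \code
+				 *  0 0
+				 *  0 2
+				 *  1 1
+				 *  1 2
+				 * \endcode
+				 * where the first column is the hyperedge ID and the second the
+				 * vertex ID; the exact header and ordering depend on the
+				 * implementation.
+				 *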
+ * \endinternal
+ */
+ void render( std::ostream &out ) const;
+
+ };
+
+ /** \internal Represents a finalised HyperDAG */
+ class HyperDAG {
+
+ friend class HyperDAGGenerator;
+
+ private:
+
+ /** \internal The underlying hypergraph. */
+ DHypergraph hypergraph;
+
+ /** \internal The number of source vertices. */
+ size_t num_sources;
+
+ /** \internal The number of operation vertices. */
+ size_t num_operations;
+
+ /** \internal The number of output vertices. */
+ size_t num_outputs;
+
+ /** \internal A vector of source vertices. */
+ std::vector< SourceVertex > sourceVertices;
+
+ /** \internal A vector of operation vertices. */
+ std::vector< OperationVertex > operationVertices;
+
+ /** \internal A vector of output vertices. */
+ std::vector< OutputVertex > outputVertices;
+
+ /** \internal A map from source vertex IDs to global IDs. */
+ std::map< size_t, size_t > source_to_global_id;
+
+ /** \internal A map from operation vertex IDs to global IDs. */
+ std::map< size_t, size_t > operation_to_global_id;
+
+ /** \internal A map from output vertex IDs to global IDs. */
+ std::map< size_t, size_t > output_to_global_id;
+
+ /** \internal A map from global IDs to their types. */
+ std::map< size_t, enum VertexType > global_to_type;
+
+ /** \internal A map from global IDs to their local IDs. */
+ std::map< size_t, size_t > global_to_local_id;
+
+ /**
+ * \internal
+ *
+ * Base constructor.
+ *
+ * @param[in] _hypergraph The base hypergraph.
+ * @param[in] _srcVec Vector of source vertices.
+ * @param[in] _opVec Vector of operation vertices.
+ * @param[in] _outVec Vector of output vertices.
+ */
+ HyperDAG(
+ DHypergraph _hypergraph,
+ const std::vector< SourceVertex > &_srcVec,
+ const std::vector< OperationVertex > &_opVec,
+ const std::vector< OutputVertex > &_outVec
+ );
+
+
+ public:
+
+
+ /** @returns The hypergraph representation of the HyperDAG. */
+ DHypergraph get() const noexcept;
+
+ /** @returns The number of source vertices. */
+ size_t numSources() const noexcept;
+
+ /** @returns The number of operation vertices. */
+ size_t numOperations() const noexcept;
+
+ /** @returns The number of output vertices. */
+ size_t numOutputs() const noexcept;
+
+ /** @returns A start iterator to the source vertices. */
+ std::vector< SourceVertex >::const_iterator sourcesBegin() const;
+
+ /** @returns End iterator matching #sourcesBegin(). */
+ std::vector< SourceVertex >::const_iterator sourcesEnd() const;
+
+			/** @returns A start iterator to the operation vertices. */
+			std::vector< OperationVertex >::const_iterator operationsBegin() const;
+
+			/** @returns End iterator matching #operationsBegin. */
+ std::vector< OperationVertex >::const_iterator operationsEnd() const;
+
+ /** @returns A start iterator to the output vertices. */
+ std::vector< OutputVertex >::const_iterator outputsBegin() const;
+
+ /** @returns End iterator matching #outputsBegin. */
+ std::vector< OutputVertex >::const_iterator outputsEnd() const;
+
+ };
+
+ /** \internal Builds a HyperDAG representation of an ongoing computation. */
+ class HyperDAGGenerator {
+
+ private:
+
+ /** \internal The hypergraph under construction. */
+ DHypergraph hypergraph;
+
+ /**
+ * \internal
+ *
+			 * Once new source vertices are created, they are recorded here. This
+			 * storage differs from #sourceVerticesP and #sourceVerticesC in that
+			 * those maps only keep track of currently active source vertices, and
+			 * identify them by pointer or by container ID, respectively.
+ *
+ * \endinternal
+ */
+ std::vector< SourceVertex > sourceVec;
+
+ /**
+ * \internal
+ *
+			 * Once new operation vertices are created, they are recorded here. This
+			 * storage differs from #operationVertices in that the latter only keeps
+			 * track of currently active operation vertices, and identifies them by
+			 * their container ID.
+ *
+ * \endinternal
+ */
+ std::vector< OperationVertex > operationVec;
+
+ /** \internal Map of pointers to source vertices. */
+ std::map< const void *, SourceVertex > sourceVerticesP;
+
+ /** \internal Map of IDs to source vertices. */
+ std::map< uintptr_t, SourceVertex > sourceVerticesC;
+
+ /** \internal Map of IDs to operation vertices. */
+ std::map< uintptr_t, OperationVertex > operationVertices;
+
+ // note: there is no map of OutputVertices because only at the point we
+ // finalize to generate the final HyperDAG do we know for sure what
+ // the output vertices are. The same applies to an `outputVec`.
+
+ /**
+ * \internal
+ *
+ * During a computation, once an operation executes, its output container
+ * may be an intermediate result or an output. For as long as it is unknown
+			 * which it is, those containers are registered here. Each such vertex
+			 * is assigned a global ID which, together with its tentative operation
+			 * type, is stored as the value in this map.
+ *
+ * \endinternal
+ */
+ std::map< uintptr_t,
+ std::pair< size_t, OperationVertexType >
+ > operationOrOutputVertices;
+
+ /** \internal Source vertex generator. */
+ SourceVertexGenerator sourceGen;
+
+ /** \internal Operation vertex generator. */
+ OperationVertexGenerator operationGen;
+
+			// an OutputVertexGenerator is a local variable of #finalize()
+
+ /**
+ * \internal
+ * Adds a source vertex to the hypergraph.
+ *
+ * @param[in] type The type of source vertex.
+ * @param[in] pointer A unique identifier of the source.
+ * @param[in] id A unique identifier of the source.
+ *
+ * If the \a type corresponds to an ALP/GraphBLAS container, then
+			 * \a pointer is ignored; otherwise, \a id is ignored.
+			 *
+			 * @returns The global ID of the newly added source vertex.
+ * \endinternal
+ */
+ size_t addAnySource(
+ const SourceVertexType type,
+ const void * const pointer,
+ const uintptr_t id
+ );
+
+
+ public:
+
+ /**
+ * \internal Base constructor.
+ */
+ HyperDAGGenerator() noexcept;
+
+ /**
+ * \internal
+ *
+ * Sometimes a given \em operation generates a source vertex-- for example,
+ * the scalar input/output argument to grb::dot.
+ *
+ * In such cases, this function should be called to register the source
+ * vertex.
+ *
+ * @param[in] type The type of source vertex
+ * @param[in] pointer A unique identifier corresponding to the source
+ *
+ * \warning \a type cannot be #SourceVertexType::CONTAINER-- such source
+ * vertices should be automatically resolved via #addOperation.
+ *
+ * \endinternal
+ */
+ void addSource(
+ const SourceVertexType type,
+ const void * const pointer
+ );
+
+ /**
+ * \internal
+ *
+ * Registers a new source container with a given \a id.
+ *
+ * \endinternal
+ */
+ void addContainer( const uintptr_t id );
+
+ /**
+ * \internal
+ *
+ * Registers a new operation with the HyperDAG.
+ *
+ * @param[in] type The type of operation being registered.
+		 * @param[in] src_p_start, src_p_end Iterators over source scalar
+		 *                                   pointers.
+		 * @param[in] src_c_start, src_c_end Iterators over source container IDs.
+		 * @param[in] dst_start, dst_end     Iterators over destination container
+		 *                                   IDs.
+ *
+ * This function proceeds as follows:
+ * 1. for source pointers in #operationOrOutputVertices, a) upgrade them
+ * to #OperationVertex, and b) add them to #operationVertices. For
+ * source pointers in #operationVertices, do nothing.
+ * 2. for remaining source pointers that are not in #sourceVertices,
+ * upgrade them to #SourceVertex and add them to #sourceVertices.
+ * Otherwise, if already a source, add it from #sourceVertices
+ * directly.
+		 * 3. for each of the k source pointers, build a hyperedge. Each
+		 *    hyperedge contains only one entry at this point, namely the global
+		 *    ID of its source.
+		 * 4. if destination pointers already existed within this HyperDAG, the
+		 *    current operation cannot correspond to those earlier vertices-- we
+		 *    need to create new ones for them. Therefore, we first remove the
+		 *    old copies. Note that destinations that also doubled as sources are
+		 *    now safe to remove, because we already processed the source
+		 *    pointers.
+ * 5. Assign all destination pointers a new global ID, and add them to
+ * #operationOrOutputVertices.
+ * 6. Assign all these new global IDs to each of the k hyperedges that
+		 *    step 3 started to construct. Thus, if there are l destination
+		 *    pointers, we now have k hyperedges with l+1 entries each.
+ * 7. Store those k hyperedges and exit.
+ *
+ * \warning For in-place operations, the output container must be given
+ * both as a source \em and destination pointer.
+ *
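+		 * A minimal calling sketch mirroring the I/O wrappers (the instance
+		 * \a gen, scalar \a val, and container \a x are illustrative only):
+		 * \code
+		 *  std::array< const void *, 1 > sourcesP{ &val };
+		 *  std::array< uintptr_t, 1 > sourcesC{ getID( x ) };
+		 *  std::array< uintptr_t, 1 > destinations{ getID( x ) };
+		 *  gen.addOperation( SET_USING_VALUE,
+		 *  	sourcesP.begin(), sourcesP.end(),
+		 *  	sourcesC.begin(), sourcesC.end(),
+		 *  	destinations.begin(), destinations.end()
+		 *  );
+		 * \endcode
+		 *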
+ * \endinternal
+ */
+ template< typename SrcPIt, typename SrcCIt, typename DstIt >
+ void addOperation(
+ const OperationVertexType type,
+ SrcPIt src_p_start, const SrcPIt &src_p_end,
+ SrcCIt src_c_start, const SrcCIt &src_c_end,
+ DstIt dst_start, const DstIt &dst_end
+ ) {
+ static_assert( std::is_same< const void *,
+ typename std::iterator_traits< SrcPIt >::value_type
+ >::value,
+ "Source pointers should be given as const void pointers"
+ );
+ static_assert( std::is_same< uintptr_t,
+ typename std::iterator_traits< DstIt >::value_type
+ >::value,
+ "Destinations should be identified by their IDs"
+ );
+ static_assert( std::is_same< uintptr_t,
+ typename std::iterator_traits< SrcCIt >::value_type
+ >::value,
+ "Source containers should be identified by their IDs"
+ );
+
+#ifdef _DEBUG
+ std::cerr << "In HyperDAGGen::addOperation( "
+ << toString( type ) << ", ... )\n"
+ << "\t sourceVertices size: " << sourceVerticesP.size() << " pointers + "
+ << sourceVerticesC.size() << " containers\n"
+ << "\t sourceVec size: " << sourceVec.size() << "\n";
+#endif
+
+ // steps 1, 2, and 3
+ std::vector< std::pair< size_t, std::set< size_t > > > hyperedges;
+ for( ; src_p_start != src_p_end; ++src_p_start ) {
+#ifdef _DEBUG
+ std::cerr << "\t processing source pointer " << *src_p_start << "\n";
+#endif
+ // source pointers (input scalars, not input containers) are simple--
+ // they will never appear as operation vertices, nor as output vertices.
+ // Therefore step 1 does not apply.
+
+ // step 2
+ size_t sourceID;
+ const auto alreadySource = sourceVerticesP.find( *src_p_start );
+ if( alreadySource == sourceVerticesP.end() ) {
+#ifndef NDEBUG
+ const bool all_sources_should_already_be_added = false;
+ assert( all_sources_should_already_be_added );
+#endif
+ std::cerr << "Warning: unidentified source " << *src_p_start << ". "
+ << "Adding it as an input scalar.\n";
+ sourceID = addAnySource( SCALAR, *src_p_start, 0 );
+ } else {
+#ifdef _DEBUG
+ std::cerr << "\t found source in sourceVertices\n";
+#endif
+ sourceID = alreadySource->second.getGlobalID();
+ }
+ // step 3
+ hyperedges.push_back( std::make_pair( sourceID, std::set< size_t >() ) );
+ }
+ for( ; src_c_start != src_c_end; ++src_c_start ) {
+#ifdef _DEBUG
+ std::cerr << "\t processing source container " << *src_c_start << "\n";
+#endif
+ // step 1
+ size_t sourceID;
+ const auto &it = operationOrOutputVertices.find( *src_c_start );
+ const auto &it2 = operationVertices.find( *src_c_start );
+ if( it2 != operationVertices.end() ) {
+ // operation vertices are fine as a source -- no additional operations
+ // necessary
+ assert( it == operationOrOutputVertices.end() );
+#ifdef _DEBUG
+ std::cerr << "\t source was previously an operation\n";
+#endif
+ sourceID = it2->second.getGlobalID();
+ } else if( it == operationOrOutputVertices.end() ) {
+ // step 2
+ const auto alreadySource = sourceVerticesC.find( *src_c_start );
+ if( alreadySource == sourceVerticesC.end() ) {
+#ifndef NDEBUG
+ const bool all_sources_should_already_be_added = false;
+ assert( all_sources_should_already_be_added );
+#endif
+ std::cerr << "Warning: unidentified source " << *src_c_start << ". "
+ << "Adding it as a container.\n";
+ sourceID = addAnySource( CONTAINER, nullptr, *src_c_start );
+ } else {
+#ifdef _DEBUG
+ std::cerr << "\t found source in sourceVertices\n";
+#endif
+ sourceID = alreadySource->second.getGlobalID();
+ }
+ } else {
+#ifdef _DEBUG
+ std::cerr << "\t found source in operationOrOutputVertices\n";
+#endif
+					// step 1: upgrade this entry to an operation vertex
+ const auto &remove = operationVertices.find( it->first );
+ if( remove != operationVertices.end() ) {
+#ifdef _DEBUG
+ std::cerr << "\t found source in operationVertices; removing it\n";
+#endif
+ operationVertices.erase( remove );
+ }
+#ifdef _DEBUG
+					std::cerr << "\t moving the entry into operationVertices\n";
+#endif
+ const size_t global_id = it->second.first;
+ const auto &operationVertex = operationGen.create(
+ it->second.second, global_id
+ );
+ operationVertices.insert( std::make_pair( it->first, operationVertex ) );
+ operationVec.push_back( operationVertex );
+ operationOrOutputVertices.erase( it );
+ sourceID = global_id;
+ }
+ // step 3
+ hyperedges.push_back( std::make_pair( sourceID, std::set< size_t >() ) );
+ }
+
+
+ // step 4, 5, and 6
+ for( ; dst_start != dst_end; ++dst_start ) {
+#ifdef _DEBUG
+ std::cerr << "\t processing destination " << *dst_start << "\n";
+#endif
+ // step 4
+ {
+ const auto &it = sourceVerticesC.find( *dst_start );
+ if( it != sourceVerticesC.end() ) {
+#ifdef _DEBUG
+ std::cerr << "\t destination found in sources-- "
+ << "removing it from there\n";
+#endif
+ sourceVerticesC.erase( it );
+ }
+ }
+ {
+ const auto &it = operationVertices.find( *dst_start );
+ if( it != operationVertices.end() ) {
+#ifdef _DEBUG
+ std::cerr << "\t destination found in operations-- "
+ << "removing it from there\n";
+#endif
+ operationVertices.erase( it );
+ }
+ }
+ {
+ const auto &it = operationOrOutputVertices.find( *dst_start );
+ if( it != operationOrOutputVertices.end() ) {
+ std::cerr << "WARNING (hyperdags::addOperation): an unconsumed output "
+							<< "container was detected. This indicates the existence of "
+ << "an ALP primitive whose output is never used.\n";
+#ifdef _DEBUG
+ std::cerr << "\t destination found in operationsOrOutput-- "
+ << "removing it from there\n";
+#endif
+ operationOrOutputVertices.erase( it );
+ }
+ }
+ // step 5
+ const size_t global_id = hypergraph.createVertex();
+ operationOrOutputVertices.insert(
+ std::make_pair( *dst_start,
+ std::make_pair( global_id, type )
+ )
+ );
+#ifdef _DEBUG
+ std::cerr << "\t created a new operation vertex with global ID "
+ << global_id << "\n";
+#endif
+ // step 6
+ for( auto &hyperedge : hyperedges ) {
+ hyperedge.second.insert( global_id );
+ }
+ }
+
+ // step 7
+ for( const auto &hyperedge : hyperedges ) {
+#ifdef _DEBUG
+ std::cerr << "\t storing a hyperedge of size "
+ << (hyperedge.second.size()+1) << "\n";
+#endif
+ hypergraph.appendHyperedge(
+ hyperedge.first,
+ hyperedge.second.begin(), hyperedge.second.end()
+ );
+ }
+ }
+
+ /**
+ * \internal
+ *
+		 * Assumes that all remaining vertices in #operationOrOutputVertices
+		 * are of type #OutputVertex. It then generates a finalised HyperDAG.
+ *
+ * @returns The resulting HyperDAG.
+ *
+ * The current generator instance is left unmodified; this function takes
+ * a snapshot of the current state, and allows its further extension.
+ *
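+		 * A sketch of the intended lifecycle (the container ID \a id is
+		 * illustrative only):
+		 * \code
+		 *  HyperDAGGenerator gen;
+		 *  gen.addContainer( id );
+		 *  // ... record primitives via addSource and addOperation ...
+		 *  const HyperDAG dag = gen.finalize();
+		 *  dag.get().render( std::cout );
+		 * \endcode
+		 *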
+ * \endinternal
+ */
+ HyperDAG finalize() const;
+
+ };
+
+ } // end namespace grb::internal::hyperdags
+
+ } // end namespace grb::internal
+
+} // end namespace grb
+
+#endif // end _H_GRB_HYPERDAGS_STATE
+
diff --git a/include/graphblas/hyperdags/init.hpp b/include/graphblas/hyperdags/init.hpp
new file mode 100644
index 000000000..4afbb3765
--- /dev/null
+++ b/include/graphblas/hyperdags/init.hpp
@@ -0,0 +1,55 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Provides initialisers for the HyperDAGs backend
+ *
+ * @author A. N. Yzelman
+ * @date 31st of January, 2022
+ */
+
+#ifndef _H_GRB_HYPERDAGS_INIT
+#define _H_GRB_HYPERDAGS_INIT
+
+#include <graphblas/hyperdags/hyperdags.hpp> // declares HyperDAGGenerator
+
+
+namespace grb {
+
+ namespace internal {
+
+ namespace hyperdags {
+
+ /** Singleton generator instance. */
+ extern HyperDAGGenerator generator;
+
+ }
+
+ }
+
+ template<>
+ RC init< hyperdags >( const size_t, const size_t, void * const );
+
+ template<>
+ RC finalize< hyperdags >();
+
+} // end namespace grb
+
+#endif
+
diff --git a/include/graphblas/hyperdags/io.hpp b/include/graphblas/hyperdags/io.hpp
new file mode 100644
index 000000000..e68af3eb7
--- /dev/null
+++ b/include/graphblas/hyperdags/io.hpp
@@ -0,0 +1,562 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Provides the I/O primitives for the HyperDAGs backend
+ *
+ * @author A. Karanasiou
+ * @date 3rd of March 2022
+ */
+
+#include <array> // for std::array, used to pass sources and destinations
+
+#include <graphblas/hyperdags/init.hpp> // declares the generator singleton
+
+
+namespace grb {
+
+ // input:
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType, typename fwd_iterator, typename Coords,
+ class Dup = operators::right_assign< InputType >
+ >
+ RC buildVector(
+ Vector< InputType, hyperdags, Coords > &x,
+ fwd_iterator start, const fwd_iterator end,
+ const IOMode mode, const Dup &dup = Dup()
+ ) {
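+		// All hyperdags primitives in this file follow the same pattern: first
+		// delegate to the underlying backend, then, only on success and for
+		// non-trivial containers, record the corresponding source and operation
+		// vertices with the global HyperDAG generator.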
+		const RC ret = buildVector< descr >(
+ internal::getVector(x), start, end, mode, dup
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( size( x ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::ITERATOR,
+ &start
+ );
+ std::array< const void *, 1 > sourcesP{ &start };
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getVector(x) ) };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::BUILD_VECTOR,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType, typename fwd_iterator1, typename fwd_iterator2,
+ typename Coords, class Dup = operators::right_assign< InputType >
+ >
+ RC buildVector(
+ Vector< InputType, hyperdags, Coords > &x,
+ fwd_iterator1 ind_start, const fwd_iterator1 ind_end,
+ fwd_iterator2 val_start, const fwd_iterator2 val_end,
+ const IOMode mode,
+ const Dup &dup = Dup()
+ ) {
+ const RC ret = buildVector< descr >(
+ internal::getVector(x), ind_start, ind_end, val_start, val_end, mode, dup
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( size( x ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::ITERATOR,
+ &ind_start
+ );
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::ITERATOR,
+ &val_start
+ );
+ std::array< const void *, 2 > sourcesP{ &ind_start, &val_start };
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getVector(x) ) };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::BUILD_VECTOR_WITH_VALUES,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType, typename fwd_iterator
+ >
+ RC buildMatrixUnique(
+ Matrix< InputType, hyperdags > &A,
+ fwd_iterator start,
+ const fwd_iterator end,
+ const IOMode mode
+ ) {
+ const RC ret = buildMatrixUnique< descr >(
+ internal::getMatrix(A), start, end, mode
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( ncols( A ) == 0 || nrows( A ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::ITERATOR,
+ &start
+ );
+ std::array< const void *, 1 > sourcesP{ &start };
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getMatrix(A) ) };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getMatrix(A) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::BUILDMATRIXUNIQUE_MATRIX_START_END_MODE,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename DataType,
+ typename T, typename Coords
+ >
+ RC setElement(
+ Vector< DataType, hyperdags, Coords > &x,
+ const T val,
+ const size_t i,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< DataType >::value &&
+ !grb::is_object< T >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = setElement< descr >(
+ internal::getVector( x ), val, i, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ // x cannot be empty here or setElement would have failed-- no need to catch
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &val
+ );
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::USER_INT,
+ &i
+ );
+ std::array< const void *, 2 > sourcesP{ &val, &i };
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getVector(x) ) };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::SET_VECTOR_ELEMENT,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename DataType, typename Coords,
+ typename T
+ >
+ RC set(
+ Vector< DataType, hyperdags, Coords > &x, const T val,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< DataType >::value &&
+ !grb::is_object< T >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = set< descr >( internal::getVector( x ), val, phase );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( x ) == 0 ) { return ret; }
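+		// when the use_index descriptor is given, the underlying set ignores
+		// val, so val then is not recorded as a source of the operation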
+ if( !(descr & descriptors::use_index) ) {
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &val
+ );
+ std::array< const void *, 1 > sourcesP{ &val };
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getVector(x) ) };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::SET_USING_VALUE,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ } else {
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getVector(x) ) };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::SET_USING_VALUE,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ }
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename DataType, typename MaskType, typename T,
+ typename Coords
+ >
+ RC set(
+ Vector< DataType, hyperdags, Coords > &x,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ const T val,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< DataType >::value &&
+ !grb::is_object< T >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( m ) == 0 ) { return set< descr >( x, val, phase ); }
+ const RC ret = set< descr >(
+ internal::getVector(x), internal::getVector(m),
+ val, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( x ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &val
+ );
+ std::array< const void *, 1 > sourcesP{ &val };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(m) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::SET_USING_MASK_AND_SCALAR,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename MaskType, typename InputType,
+ typename Coords
+ >
+ RC set(
+ Vector< OutputType, hyperdags, Coords > &x,
+ const Vector< MaskType, hyperdags, Coords > &mask,
+ const Vector< InputType, hyperdags, Coords > &y,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( mask ) == 0 ) { return set< descr >( x, y, phase ); }
+ const RC ret = set< descr >(
+ internal::getVector(x),
+ internal::getVector(mask), internal::getVector(y),
+ phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( x ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(mask) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(x) )
+ };
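+		// note: x itself is listed among the source containers since a masked
+		// set is effectively in-place-- entries not covered by the mask retain
+		// their old values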
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::SET_USING_MASK_AND_VECTOR,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename InputType, typename Coords
+ >
+ RC set(
+ Vector< OutputType, hyperdags, Coords > &x,
+ const Vector< InputType, hyperdags, Coords > &y,
+ const Phase &phase = EXECUTE
+ ) {
+ const RC ret = set< descr >(
+ internal::getVector(x), internal::getVector(y), phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( x ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(x) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::SET_FROM_VECTOR,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename InputType,
+ typename RIT, typename CIT, typename NIT
+ >
+ RC set(
+ Matrix< OutputType, hyperdags, RIT, CIT, NIT > &C,
+ const Matrix< InputType, hyperdags, RIT, CIT, NIT > &A,
+ const Phase &phase = EXECUTE
+ ) {
+ const RC ret = set< descr >(
+ internal::getMatrix( C ), internal::getMatrix( A ), phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( nrows( C ) == 0 || ncols( C ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getMatrix(A) ),
+ getID( internal::getMatrix(C) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getMatrix(C) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::SET_MATRIX_MATRIX,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename RIT, typename CIT, typename NIT
+ >
+ RC set(
+ Matrix< OutputType, hyperdags, RIT, CIT, NIT > &C,
+ const Matrix< InputType1, hyperdags, RIT, CIT, NIT > &A,
+ const InputType2 &val,
+ const Phase &phase = EXECUTE
+ ) {
+ const RC ret = set< descr >(
+ internal::getMatrix( C ), internal::getMatrix( A ),
+ val, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &val
+ );
+ std::array< const void *, 1 > sourcesP{ &val };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getMatrix(A) ),
+ getID( internal::getMatrix(C) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getMatrix(C) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::SET_MATRIX_MATRIX_INPUT2,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template< typename DataType, typename Coords >
+ RC clear( Vector< DataType, hyperdags, Coords > &x ) {
+ const RC ret = clear( internal::getVector( x ) );
+ if( ret != SUCCESS ) { return ret; }
+ if( size( x ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getVector(x) ) };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::CLEAR_VECTOR,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template< typename InputType, typename RIT, typename CIT, typename NIT >
+ RC clear( Matrix< InputType, hyperdags, RIT, CIT, NIT > &A ) noexcept {
+ const RC ret = clear( internal::getMatrix(A) );
+ if( ret != SUCCESS ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getMatrix(A) ) };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getMatrix(A) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::CLEAR_MATRIX,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ // getters:
+
+	template< typename DataType, typename Coords >
+	size_t size( const Vector< DataType, hyperdags, Coords > &x ) {
+		return size( internal::getVector( x ) );
+	}
+
+	template< typename InputType >
+	size_t nrows( const Matrix< InputType, hyperdags > &A ) noexcept {
+		return nrows( internal::getMatrix( A ) );
+	}
+
+	template< typename InputType >
+	size_t ncols( const Matrix< InputType, hyperdags > &A ) noexcept {
+		return ncols( internal::getMatrix( A ) );
+	}
+
+	template< typename DataType, typename Coords >
+	size_t capacity( const Vector< DataType, hyperdags, Coords > &x ) noexcept {
+		return capacity( internal::getVector( x ) );
+	}
+
+	template< typename DataType >
+	size_t capacity( const Matrix< DataType, hyperdags > &A ) noexcept {
+		return capacity( internal::getMatrix( A ) );
+	}
+
+	template< typename DataType, typename Coords >
+	size_t nnz( const Vector< DataType, hyperdags, Coords > &x ) noexcept {
+		return nnz( internal::getVector( x ) );
+	}
+
+	template< typename InputType >
+	size_t nnz( const Matrix< InputType, hyperdags > &A ) noexcept {
+		return nnz( internal::getMatrix( A ) );
+	}
+
+	template< typename InputType, typename Coords >
+	uintptr_t getID( const Vector< InputType, hyperdags, Coords > &x ) {
+		return getID( internal::getVector( x ) );
+	}
+
+	template< typename InputType >
+	uintptr_t getID( const Matrix< InputType, hyperdags > &A ) {
+		return getID( internal::getMatrix( A ) );
+	}
+
+ // resizers:
+
+ template< typename InputType, typename Coords >
+ RC resize(
+ Vector< InputType, hyperdags, Coords > &x,
+ const size_t new_nz
+ ) noexcept {
+ const RC ret = resize( internal::getVector( x ), new_nz );
+ if( ret != SUCCESS ) { return ret; }
+ if( size( x ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::USER_INT,
+ &new_nz
+ );
+ std::array< const void *, 1 > sourcesP{ &new_nz };
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getVector(x) ) };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::RESIZE,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template< typename InputType >
+ RC resize(
+ Matrix< InputType, hyperdags > &A,
+ const size_t new_nz
+ ) noexcept {
+ const RC ret = resize( internal::getMatrix(A), new_nz );
+ if( ret != SUCCESS ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::USER_INT,
+ &new_nz
+ );
+ std::array< const void *, 1 > sourcesP{ &new_nz };
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getMatrix(A) ) };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getMatrix(A) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::RESIZE_MATRIX,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ // nonblocking I/O:
+
+ template<>
+ RC wait< hyperdags >();
+
+ /** \internal Dispatch to base wait implementation */
+ template<
+ typename InputType, typename Coords,
+ typename ... Args
+ >
+ RC wait(
+ const Vector< InputType, hyperdags, Coords > &x,
+ const Args &... args
+ ) {
+ (void) x;
+ return wait( args... );
+ }
+
+ /** \internal Dispatch to base wait implementation */
+ template< typename InputType, typename... Args >
+ RC wait(
+ const Matrix< InputType, hyperdags > &A,
+ const Args &... args
+ ) {
+ (void) A;
+ return wait( args... );
+ }
+
+} // namespace grb
+
diff --git a/include/graphblas/hyperdags/matrix.hpp b/include/graphblas/hyperdags/matrix.hpp
new file mode 100644
index 000000000..a80602bb6
--- /dev/null
+++ b/include/graphblas/hyperdags/matrix.hpp
@@ -0,0 +1,286 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Provides the matrix container for the HyperDAGs backend
+ *
+ * @author A. Karanasiou
+ * @date 3rd of March, 2022
+ */
+
+#ifndef _H_GRB_HYPERDAGS_MATRIX
+#define _H_GRB_HYPERDAGS_MATRIX
+
+#include <graphblas/hyperdags/init.hpp> // declares the generator singleton
+
+
+namespace grb {
+
+ namespace internal {
+
+ template< typename T, typename RIT, typename CIT, typename NIT >
+ Matrix< T, _GRB_WITH_HYPERDAGS_USING, RIT, CIT, NIT > & getMatrix(
+ Matrix< T, grb::hyperdags, RIT, CIT, NIT > &
+ );
+
+ template< typename T, typename RIT, typename CIT, typename NIT >
+ const Matrix< T, _GRB_WITH_HYPERDAGS_USING, RIT, CIT, NIT > & getMatrix(
+ const Matrix< T, grb::hyperdags, RIT, CIT, NIT > &x
+ );
+
+ template< typename T, typename RIT, typename CIT, typename NIT >
+ inline internal::Compressed_Storage<
+ T, RIT, NIT
+ > & getCRS( Matrix< T, grb::hyperdags, RIT, CIT, NIT > &A ) noexcept;
+
+ template< typename T, typename RIT, typename CIT, typename NIT >
+ inline const internal::Compressed_Storage<
+ T, RIT, NIT
+ > & getCRS( const Matrix< T, grb::hyperdags, RIT, CIT, NIT > &A ) noexcept;
+
+ template< typename T, typename RIT, typename CIT, typename NIT >
+ inline internal::Compressed_Storage<
+ T, CIT, NIT
+ > & getCCS( Matrix< T, grb::hyperdags, RIT, CIT, NIT > &A ) noexcept;
+
+ template< typename T, typename RIT, typename CIT, typename NIT >
+ inline const internal::Compressed_Storage<
+ T, CIT, NIT
+ > & getCCS( const Matrix< T, grb::hyperdags, RIT, CIT, NIT > &A ) noexcept;
+
+ }
+
+ template< typename T, typename RIT, typename CIT, typename NIT >
+ class Matrix< T, hyperdags, RIT, CIT, NIT > {
+
+ template< typename A, typename sRIT, typename sCIT, typename sNIT >
+ friend Matrix<
+ A, _GRB_WITH_HYPERDAGS_USING, sRIT, sCIT, sNIT
+ > & internal::getMatrix(
+ Matrix< A, grb::hyperdags, sRIT, sCIT, sNIT > &
+ );
+
+ template< typename A, typename sRIT, typename sCIT, typename sNIT >
+ friend const Matrix<
+ A, _GRB_WITH_HYPERDAGS_USING, sRIT, sCIT, sNIT
+ > & internal::getMatrix(
+ const Matrix< A, grb::hyperdags, sRIT, sCIT, sNIT > &
+ );
+
+
+ private:
+
+ /** \internal My own type */
+ typedef Matrix< T, hyperdags, RIT, CIT, NIT > SelfType;
+
+ /** \internal Simply use an underlying implementation */
+ typedef Matrix< T, _GRB_WITH_HYPERDAGS_USING, RIT, CIT, NIT > MyMatrixType;
+
+ /** \internal Underlying matrix */
+ MyMatrixType matrix;
+
+		/** \internal Registers this matrix. */
+ void register_matrix() {
+#ifdef _DEBUG
+ std::cout << "\t registering matrix with pointer " << this << "\n";
+#endif
+ if( nrows( matrix ) > 0 && ncols( matrix ) > 0 ) {
+ internal::hyperdags::generator.addContainer( getID( matrix ) );
+ }
+ }
+
+
+ public:
+
+ /** \internal Base constructor, no capacity */
+ Matrix( const size_t rows, const size_t columns ) :
+ matrix( rows, columns )
+ {
+#ifdef _DEBUG
+ std::cout << "Matrix (hyperdags) constructor\n";
+#endif
+ register_matrix();
+ }
+
+ /** \internal Base constructor with capacity */
+ Matrix( const size_t rows, const size_t columns, const size_t nz ) :
+ matrix( rows, columns, nz )
+ {
+#ifdef _DEBUG
+ std::cout << "Matrix (hyperdags) capacity constructor\n";
+#endif
+ register_matrix();
+ }
+
+ /** \internal Copy constructor */
+ Matrix( const SelfType &x ) : matrix( x.matrix ) {
+#ifdef _DEBUG
+ std::cout << "Matrix (hyperdags) copy constructor\n";
+#endif
+ register_matrix();
+ }
+
+ /** \internal Move constructor */
+ Matrix( SelfType &&x ) {
+#ifdef _DEBUG
+ std::cout << "Matrix (hyperdags) move constructor\n";
+#endif
+ matrix = std::move( x.matrix );
+ register_matrix();
+ }
+
+ ~Matrix() {
+#ifdef _DEBUG
+ std::cout << "Matrix (hyperdags) destructor\n";
+#endif
+ }
+
+ /** \internal Copy-assignment */
+ SelfType& operator=( const SelfType &x ) {
+#ifdef _DEBUG
+ std::cout << "Matrix (hyperdags) copy assignment\n";
+#endif
+ matrix = x.matrix;
+ return *this;
+ }
+
+ /** \internal Move-assignment */
+ SelfType& operator=( SelfType &&x ) {
+#ifdef _DEBUG
+ std::cout << "Matrix (hyperdags) move assignment\n";
+#endif
+ matrix = std::move( x.matrix );
+ return *this;
+ }
+
+ /** \internal Start const-iterator */
+ template<
+ class ActiveDistribution = internal::Distribution<
+ _GRB_WITH_HYPERDAGS_USING
+ >
+ >
+ typename internal::Compressed_Storage<
+ T, grb::config::RowIndexType, grb::config::NonzeroIndexType
+ >::template ConstIterator< ActiveDistribution > begin(
+ const IOMode mode = PARALLEL, const size_t s = 0, const size_t P = 1
+ ) const {
+ return matrix.begin( mode, s, P );
+ }
+
+ /** \internal Matching end-iterator to begin */
+ template<
+ class ActiveDistribution = internal::Distribution<
+ _GRB_WITH_HYPERDAGS_USING
+ >
+ >
+ typename internal::Compressed_Storage<
+ T, grb::config::RowIndexType, grb::config::NonzeroIndexType
+ >::template ConstIterator< ActiveDistribution > end(
+ const IOMode mode = PARALLEL, const size_t s = 0, const size_t P = 1
+ ) const {
+			return matrix.end( mode, s, P );
+ }
+
+ /** \internal Start const-iterator */
+ template<
+ class ActiveDistribution = internal::Distribution<
+ _GRB_WITH_HYPERDAGS_USING
+ >
+ >
+ typename internal::Compressed_Storage<
+ T, grb::config::RowIndexType, grb::config::NonzeroIndexType
+ >::template ConstIterator< ActiveDistribution > cbegin(
+ const IOMode mode = PARALLEL
+ ) const {
+			return matrix.cbegin( mode );
+ }
+
+ /** \internal Matching end iterator to cbegin */
+ template<
+ class ActiveDistribution = internal::Distribution<
+ _GRB_WITH_HYPERDAGS_USING
+ >
+ >
+ typename internal::Compressed_Storage<
+ T, grb::config::RowIndexType, grb::config::NonzeroIndexType
+ >::template ConstIterator< ActiveDistribution > cend(
+ const IOMode mode = PARALLEL
+ ) const {
+			return matrix.cend( mode );
+ }
+
+ };
+
+ /** \internal Basic type trait for matrices */
+ template< typename D, typename RIT, typename CIT, typename NIT >
+ struct is_container< Matrix< D, hyperdags, RIT, CIT, NIT > > {
+ /** A hyperdags matrix is an ALP container. */
+ static const constexpr bool value = true;
+ };
+
+ namespace internal {
+
+ template< typename T, typename RIT, typename CIT, typename NIT >
+ Matrix< T, _GRB_WITH_HYPERDAGS_USING, RIT, CIT, NIT > & getMatrix(
+ Matrix< T, grb::hyperdags, RIT, CIT, NIT > &x
+ ) {
+ return x.matrix;
+ }
+
+ template< typename T, typename RIT, typename CIT, typename NIT >
+ const Matrix< T, _GRB_WITH_HYPERDAGS_USING, RIT, CIT, NIT > & getMatrix(
+ const Matrix< T, grb::hyperdags, RIT, CIT, NIT > &x
+ ) {
+ return x.matrix;
+ }
+
+ template< typename T, typename RIT, typename CIT, typename NIT >
+ inline internal::Compressed_Storage<
+ T, RIT, NIT
+ > & getCRS( Matrix< T, grb::hyperdags, RIT, CIT, NIT > &A ) noexcept {
+ return getCRS( internal::getMatrix( A ) );
+ }
+
+ template< typename T, typename RIT, typename CIT, typename NIT >
+ inline const internal::Compressed_Storage<
+ T, RIT, NIT
+ > & getCRS( const Matrix< T, grb::hyperdags, RIT, CIT, NIT > &A ) noexcept {
+ return getCRS( internal::getMatrix(A) );
+ }
+
+ template< typename T, typename RIT, typename CIT, typename NIT >
+ inline internal::Compressed_Storage<
+ T, CIT, NIT
+ > & getCCS( Matrix< T, grb::hyperdags, RIT, CIT, NIT > &A ) noexcept {
+ return getCCS( internal::getMatrix(A) );
+ }
+
+ template< typename T, typename RIT, typename CIT, typename NIT >
+ inline const internal::Compressed_Storage<
+ T, CIT, NIT
+ > & getCCS( const Matrix< T, grb::hyperdags, RIT, CIT, NIT > &A ) noexcept {
+ return getCCS( internal::getMatrix(A) );
+ }
+
+ } // end ``grb::internal''
+
+}
+
+#endif // end ``_H_GRB_HYPERDAGS_MATRIX''
+
diff --git a/include/graphblas/hyperdags/pinnedvector.hpp b/include/graphblas/hyperdags/pinnedvector.hpp
new file mode 100644
index 000000000..184a4987e
--- /dev/null
+++ b/include/graphblas/hyperdags/pinnedvector.hpp
@@ -0,0 +1,103 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Contains the hyperdags implementations for the PinnedVector class
+ *
+ * @author A. Karanasiou
+ * @date August 17, 2022
+ */
+
+#ifndef _H_GRB_HYPERDAGS_PINNEDVECTOR
+#define _H_GRB_HYPERDAGS_PINNEDVECTOR
+
+#include <graphblas/base/pinnedvector.hpp> // for the PinnedVector base API
+#include <graphblas/iomode.hpp>            // for grb::IOMode
+
+#include "vector.hpp"
+
+
+namespace grb {
+
+ /** \internal No implementation notes. */
+ template< typename IOType >
+ class PinnedVector< IOType, hyperdags > {
+
+ private:
+
+ /** This implementation relies on the sub-backend. */
+ typedef PinnedVector< IOType, grb::_GRB_WITH_HYPERDAGS_USING >
+ MyPinnedVector;
+
+ /** Instance of the underlying backend. */
+ MyPinnedVector pinned_vector;
+
+
+ public:
+
+ /** \internal No implementation notes. */
+ PinnedVector() : pinned_vector() {}
+
+ /** \internal No implementation notes. */
+ PinnedVector(
+ const Vector< IOType, hyperdags, internal::hyperdags::Coordinates > &x,
+ const IOMode mode
+		) : pinned_vector( internal::getVector( x ), mode ) {}
+
+ // default destructor is allowed
+
+ /** \internal No implementation notes. */
+ inline size_t size() const noexcept {
+ return pinned_vector.size();
+ }
+
+ /** \internal No implementation notes. */
+ inline size_t nonzeroes() const noexcept {
+ return pinned_vector.nonzeroes();
+ }
+
+ /** \internal No implementation notes. */
+ template< typename OutputType = IOType >
+ inline OutputType getNonzeroValue(
+ const size_t k,
+ const OutputType one
+ ) const noexcept {
+ return pinned_vector.getNonzeroValue( k, one );
+ }
+
+ /** \internal No implementation notes. */
+ inline IOType getNonzeroValue(
+ const size_t k
+ ) const noexcept {
+ return pinned_vector.getNonzeroValue( k );
+ }
+
+ /** \internal No implementation notes. */
+ inline size_t getNonzeroIndex(
+ const size_t k
+ ) const noexcept {
+ return pinned_vector.getNonzeroIndex( k );
+ }
+
+ };
+
+} // namespace grb
+
+#endif // end ``_H_GRB_HYPERDAGS_PINNEDVECTOR''
+
diff --git a/include/graphblas/hyperdags/properties.hpp b/include/graphblas/hyperdags/properties.hpp
new file mode 100644
index 000000000..ce5f239d2
--- /dev/null
+++ b/include/graphblas/hyperdags/properties.hpp
@@ -0,0 +1,56 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Collects the hyperdags backend properties
+ *
+ * @author A. N. Yzelman
+ * @date 23rd of March, 2023
+ */
+
+#ifndef _H_GRB_HYPERDAGS_PROPERTIES
+#define _H_GRB_HYPERDAGS_PROPERTIES
+
+#include <graphblas/backends.hpp>
+#include <graphblas/base/properties.hpp> // for the Properties base template
+
+
+namespace grb {
+
+ /** All properties are inherited from the underlying backend. */
+ template<>
+ class Properties< hyperdags > {
+
+ public:
+
+ static constexpr const bool writableCaptured =
+ Properties< _GRB_WITH_HYPERDAGS_USING >::writableCaptured;
+
+ static constexpr const bool isBlockingExecution =
+ Properties< _GRB_WITH_HYPERDAGS_USING >::isBlockingExecution;
+
+ static constexpr const bool isNonblockingExecution =
+ Properties< _GRB_WITH_HYPERDAGS_USING >::isNonblockingExecution;
+
+ };
+
+} // namespace grb
+
+#endif // end ``_H_GRB_HYPERDAGS_PROPERTIES''
+
diff --git a/include/graphblas/hyperdags/spmd.hpp b/include/graphblas/hyperdags/spmd.hpp
new file mode 100644
index 000000000..270d7967c
--- /dev/null
+++ b/include/graphblas/hyperdags/spmd.hpp
@@ -0,0 +1,59 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Provides the SPMD API for the HyperDAGs backend
+ *
+ * @author A. Karanasiou
+ * @date 15th of March 2022
+ */
+
+#ifndef _H_GRB_HYPERDAGS_SPMD
+#define _H_GRB_HYPERDAGS_SPMD
+
+#include <cstddef> // size_t
+
+#include
+
+namespace grb {
+
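+	/**
+	 * \internal The HyperDAGs SPMD API simply delegates to that of the
+	 * underlying backend.
+	 */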
+ template<>
+ class spmd< hyperdags > {
+
+ public:
+
+ static inline size_t nprocs() noexcept {
+ return spmd< _GRB_WITH_HYPERDAGS_USING >::nprocs();
+ }
+
+ static inline size_t pid() noexcept {
+ return spmd< _GRB_WITH_HYPERDAGS_USING >::pid();
+ }
+
+ static RC sync(
+ const size_t msgs_in = 0, const size_t msgs_out = 0
+ ) noexcept {
+ return spmd< _GRB_WITH_HYPERDAGS_USING >::sync( msgs_in, msgs_out );
+ }
+
+ static RC barrier() noexcept {
+ return spmd< _GRB_WITH_HYPERDAGS_USING >::barrier();
+ }
+
+	}; // end class ``spmd'' hyperdags implementation
+
+} // namespace grb
+
+#endif // end ``_H_GRB_HYPERDAGS_SPMD''
+
diff --git a/include/graphblas/hyperdags/vector.hpp b/include/graphblas/hyperdags/vector.hpp
new file mode 100644
index 000000000..5f422399e
--- /dev/null
+++ b/include/graphblas/hyperdags/vector.hpp
@@ -0,0 +1,284 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Provides the vector container for the HyperDAGs backend
+ *
+ * @author A. N. Yzelman
+ * @date 31st of January, 2022
+ */
+
+#ifndef _H_GRB_HYPERDAGS_VECTOR
+#define _H_GRB_HYPERDAGS_VECTOR
+
+#include
+#include
+
+
+namespace grb {
+
+ template< typename T, typename RIT, typename CIT, typename NIT >
+ class Matrix< T, hyperdags, RIT, CIT, NIT >;
+
+ namespace internal {
+
+ namespace hyperdags {
+ typedef grb::internal::Coordinates<
+ grb::config::IMPLEMENTATION< grb::hyperdags >::coordinatesBackend()
+ > Coordinates;
+ }
+
+ template< typename T >
+ Vector< T, _GRB_WITH_HYPERDAGS_USING, typename hyperdags::Coordinates > &
+ getVector(
+ Vector< T, grb::hyperdags, typename hyperdags::Coordinates > &
+ );
+
+ template< typename T >
+ const Vector< T, _GRB_WITH_HYPERDAGS_USING, typename hyperdags::Coordinates > &
+ getVector(
+ const Vector< T, grb::hyperdags, typename hyperdags::Coordinates > &x
+ );
+
+ template< typename T>
+ inline const T * getRaw(
+ const Vector<
+ T, grb::hyperdags,
+ typename internal::hyperdags::Coordinates
+ > &x
+ );
+
+ template< typename T>
+ inline T * getRaw(
+ Vector< T, grb::hyperdags, typename internal::hyperdags::Coordinates > &x
+ );
+
+ }
+
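+	/**
+	 * \internal A vector that wraps an instance of the underlying backend's
+	 * vector type and registers itself as a container with the HyperDAG
+	 * generator.
+	 */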
+ template< typename T >
+ class Vector< T, hyperdags, internal::hyperdags::Coordinates > {
+
+ template< typename A >
+ friend Vector<
+ A, _GRB_WITH_HYPERDAGS_USING,
+ internal::hyperdags::Coordinates
+ > & internal::getVector(
+ Vector< A, grb::hyperdags, internal::hyperdags::Coordinates > &
+ );
+
+ template< typename A >
+ friend const Vector<
+ A, _GRB_WITH_HYPERDAGS_USING,
+ internal::hyperdags::Coordinates
+ > & internal::getVector(
+ const Vector< A, grb::hyperdags, internal::hyperdags::Coordinates > &
+ );
+
+ friend class PinnedVector< T, hyperdags >;
+
+
+ private:
+
+ /** \internal My own type */
+ typedef Vector< T, hyperdags, internal::hyperdags::Coordinates > SelfType;
+
+ /** \internal Simply use an underlying implementation */
+ typedef Vector<
+ T, grb::_GRB_WITH_HYPERDAGS_USING,
+ internal::hyperdags::Coordinates
+ > MyVectorType;
+
+ /** \internal Iterator type inherited from underlying backend */
+ template< Backend A >
+ using ConstIterator = typename MyVectorType::template ConstIterator< A >;
+
+ /** \internal Simply wrap around underlying backend */
+ MyVectorType vector;
+
+ /** \internal Registers this vector as a source container */
+ void register_vector() {
+#ifdef _DEBUG
+ std::cout << "\t registering vector with pointer " << this << "\n";
+#endif
+ if( size( vector ) > 0 ) {
+ internal::hyperdags::generator.addContainer( getID( vector ) );
+ }
+ }
+
+
+ public:
+
+ typedef typename MyVectorType::const_iterator const_iterator;
+
+ Vector( const size_t n ) : vector( n ) {
+#ifdef _DEBUG
+ std::cout << "Vector (hyperdags) constructor\n";
+#endif
+ register_vector();
+ }
+
+ Vector() : Vector( 0 ) {
+#ifdef _DEBUG
+ std::cout << "Vector (hyperdags) default constructor\n";
+#endif
+ }
+
+ Vector( const SelfType &x ) : vector( x.vector ) {
+#ifdef _DEBUG
+ std::cout << "Vector (hyperdags) copy constructor\n";
+#endif
+ register_vector();
+ }
+
+ Vector( SelfType &&x ) noexcept {
+#ifdef _DEBUG
+ std::cout << "Vector (hyperdags) move constructor\n";
+#endif
+ vector = std::move( x.vector );
+ register_vector();
+ }
+
+ Vector( const size_t n, const size_t nz ) : vector( n, nz ) {
+#ifdef _DEBUG
+ std::cout << "Vector (hyperdags) capacity constructor\n";
+#endif
+ register_vector();
+ }
+
+ ~Vector() {
+#ifdef _DEBUG
+ std::cout << "Vector (hyperdags) destructor\n";
+#endif
+ }
+
+ SelfType & operator=( const SelfType &x ) {
+#ifdef _DEBUG
+ std::cout << "Vector (hyperdags) copy assignment\n";
+#endif
+ vector = x.vector;
+ return *this;
+ }
+
+ SelfType & operator=( SelfType &&x ) noexcept {
+#ifdef _DEBUG
+ std::cout << "Vector (hyperdags) move assignment\n";
+#endif
+ vector = std::move( x.vector );
+ return *this;
+ }
+
+ template< Backend spmd_backend = reference >
+ ConstIterator< spmd_backend > cbegin(
+ const size_t s = 0, const size_t P = 1
+ ) const {
+ return vector.cbegin( s, P );
+ }
+
+ template< Backend spmd_backend = reference >
+ ConstIterator< spmd_backend > cend(
+ const size_t s = 0, const size_t P = 1
+ ) const {
+ return vector.cend( s, P );
+ }
+
+ template< Backend spmd_backend = reference >
+ ConstIterator< spmd_backend > begin(
+ const size_t s = 0, const size_t P = 1
+ ) const {
+ return vector.begin( s, P );
+ }
+
+ template< Backend spmd_backend = reference >
+ ConstIterator< spmd_backend > end(
+ const size_t s = 0, const size_t P = 1
+ ) const {
+ return vector.end( s, P );
+ }
+
+ T & operator[]( const size_t i ) {
+ return vector[ i ];
+ }
+
+ T & operator[]( const size_t i ) const {
+ return vector[ i ];
+ }
+ /**
+ * Non-standard data accessor for debug purposes.
+ *
+		 * \warning Do not use this function.
+ *
+ * The user promises to never write to this data when GraphBLAS can operate
+ * on it. The user understands that data read out may be subject to incoming
+ * changes caused by preceding GraphBLAS calls.
+ *
+ * \warning This function is only defined for the reference and hyperdags backends--
+ * thus switching backends may cause your code to not compile.
+ *
+ * @return A const reference to the raw data this vector contains.
+ *
+ * \note This function is used internally for testing purposes.
+ */
+ T * raw() const {
+ return vector.raw();
+ }
+
+ };
+
+ namespace internal {
+
+ template< typename T >
+ Vector<
+ T, _GRB_WITH_HYPERDAGS_USING,
+ internal::hyperdags::Coordinates
+ > & getVector(
+ Vector< T, grb::hyperdags, internal::hyperdags::Coordinates > &x
+ ) {
+ return x.vector;
+ }
+
+ template< typename T >
+ const Vector<
+ T, _GRB_WITH_HYPERDAGS_USING,
+ internal::hyperdags::Coordinates
+ > & getVector(
+ const Vector< T, grb::hyperdags, internal::hyperdags::Coordinates > &x
+ ) {
+ return x.vector;
+ }
+
+ template< typename T>
+ inline const T * getRaw(
+ const Vector< T, grb::hyperdags, internal::hyperdags::Coordinates > &x
+ ) {
+ return getRaw(getVector(x));
+ };
+
+ template< typename T>
+ inline T * getRaw(
+ Vector< T, grb::hyperdags, internal::hyperdags::Coordinates > &x
+ ) {
+ return getRaw(getVector(x));
+ };
+
+ }
+
+}
+
+#endif
+
diff --git a/include/graphblas/identities.hpp b/include/graphblas/identities.hpp
index dd48fcf98..fdbb7c7f7 100644
--- a/include/graphblas/identities.hpp
+++ b/include/graphblas/identities.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Provides a set of standard identities for use with ALP.
+ *
* @author A. N. Yzelman
* @date 11th of August, 2016
*/
@@ -25,6 +29,7 @@
#include
+
namespace grb {
/**
@@ -195,3 +200,4 @@ namespace grb {
} // namespace grb
#endif
+
diff --git a/include/graphblas/init.hpp b/include/graphblas/init.hpp
index 2b1af0a52..dd34749ba 100644
--- a/include/graphblas/init.hpp
+++ b/include/graphblas/init.hpp
@@ -26,11 +26,16 @@
#include "backends.hpp"
#include "base/init.hpp"
-
// include all implementations
#ifdef _GRB_WITH_REFERENCE
#include "graphblas/reference/init.hpp"
#endif
+#ifdef _GRB_WITH_HYPERDAGS
+ #include "graphblas/hyperdags/init.hpp"
+#endif
+#ifdef _GRB_WITH_NONBLOCKING
+ #include "graphblas/nonblocking/init.hpp"
+#endif
#ifdef _GRB_WITH_LPF
#include "graphblas/bsp1d/init.hpp"
#endif
diff --git a/include/graphblas/interfaces/pregel.hpp b/include/graphblas/interfaces/pregel.hpp
new file mode 100644
index 000000000..3350b1e0e
--- /dev/null
+++ b/include/graphblas/interfaces/pregel.hpp
@@ -0,0 +1,960 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * This file defines a vertex-centric programming API called ALP/Pregel, which
+ * automatically translates to standard ALP/GraphBLAS primitives.
+ *
+ * @author A. N. Yzelman
+ * @date 2022
+ *
+ * \defgroup Pregel ALP/Pregel
+ * @{
+ *
+ * @brief ALP/Pregel enables vertex-centric programming.
+ *
+ * \par API introduction
+ *
+ * With vertex-centric programming, graph algorithms are written from the
+ * perspective of a vertex within an input graph. Each vertex executes a program
+ * on a round-by-round basis, while between rounds all vertex programs pass
+ * messages to neighbour vertices using the edges of the input graph. Edges may
+ * be directed or undirected; in the former, messages travel from the source
+ * vertex to the destination vertex only. Each vertex program sends the same
+ * message to all of its neighbours -- i.e., it broadcasts a single given
+ * message. In ALP/Pregel, incoming messages are furthermore \em accumulated
+ * using a #grb::Monoid. The accumulation of incoming messages is typically used
+ * by the vertex-centric program during the next round it executes.
+ *
+ * Pregel programs thus execute on a given graph, and hence constructing a
+ * #grb::interfaces::Pregel instance requires passing input iterators
+ * corresponding to the graph on which ALP/Pregel programs are executed. Such an
+ * instance logically corresponds to an execution engine of vertex-centric
+ * programs for a specific graph . Multiple #grb::interfaces::Pregel
+ * instances, each potentially built using a different input graph, may exist
+ * simultaneously.
+ *
+ * ALP/Pregel programs then are executed using #grb::interfaces::Pregel::execute.
+ * The first template argument to this function is the binary operator of the
+ * monoid to be used for accumulating incoming messages, while the second
+ * template argument corresponds to its identity-- see #grb::operators and
+ * #grb::identities for example operators and identities. The remaining
+ * template arguments to #grb::interfaces::Pregel::execute are automatically
+ * inferred.
+ *
+ * The first non-template argument is the vertex-centric program, for example,
+ * #grb::algorithms::pregel::ConnectedComponents-- a vertex-centric program in
+ * ALP/GraphBLAS hence is a class where the program is given as a public static
+ * function named \em program. This function takes five arguments:
+ * 1. the current state of the vertex (read-write),
+ * 2. the incoming message (after accumulation, read only),
+ * 3. the outgoing message (read-write),
+ * 4. the global program parameters (read only), and
+ * 5. the Pregel interface state (read only and read-write).
+ *
+ * The types of arguments 1-4 are defined by the program, but must be plain old
+ * data (POD) types-- similar to the requirements of an ALP operator. An example
+ * of an ALP/Pregel algorithm that has non-trivial algorithm parameters is
+ * #grb::algorithms::pregel::PageRank: #grb::algorithms::pregel::PageRank::Data.
+ *
+ * The type of the 5th argument to #grb::interfaces::Pregel::execute is an
+ * instance of #grb::interfaces::PregelState. Some of the ALP/Pregel state
+ * fields are read-only, such as the current round number
+ * #grb::interfaces::PregelState::round, while others are read-write.
+ * Please see the corresponding documentation for what read-only states may be
+ * inspected during program execution. Some fields are global (such as again the
+ * current round number), while others are specific to the vertex a program is
+ * running on (such as #grb::interfaces::PregelState::indegree).
+ *
+ * Read-write ALP/Pregel state is used for determining termination conditions.
+ * There are two associated flags:
+ * 1. #grb::interfaces::PregelState::active, and
+ * 2. #grb::interfaces::PregelState::voteToHalt.
+ *
+ * Each vertex has its own state of these two flags, with the defaults being
+ * true for the former and false for the latter.
+ *
+ * If, by the end of any round, a vertex sets its active flag to
+ * false, that vertex will not participate in any future rounds. For
+ * any neighbouring vertices it shall be as though the inactive vertex keeps
+ * broadcasting the identity of the given accumulation monoid.
+ *
+ * If at the end of any round all vertices are inactive, the program terminates.
+ * Similarly, if by the end of a round \em all vertices have the
+ * voteToHalt flag set to true, then that Pregel program
+ * terminates as well.
+ *
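+ * For illustration, a minimal sketch of such a program class follows. The
+ * name \em MyProgram, its state types, and the computation itself are
+ * hypothetical; only the five-argument signature follows the description
+ * above:
+ *
+ * \code{.cpp}
+ * struct MyProgram {
+ *
+ *     // global read-only algorithm parameters (may be empty)
+ *     struct Data {};
+ *
+ *     static void program(
+ *         double &state,            // 1. vertex-local state (read-write)
+ *         const double &incoming,   // 2. accumulated incoming message
+ *         double &outgoing,         // 3. message broadcast to neighbours
+ *         const Data &parameters,   // 4. global program parameters
+ *         grb::interfaces::PregelState &pregel // 5. Pregel interface state
+ *     ) {
+ *         (void) parameters;
+ *         outgoing = state += incoming; // a hypothetical computation
+ *         if( pregel.round > 0 ) { pregel.voteToHalt = true; }
+ *     }
+ *
+ * };
+ * \endcode
+ *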
+ * \par Using vertex-centric algorithms
+ *
+ * By convention, ALP/Pregel algorithms allow for a simplified way of executing
+ * them that does not require the Pregel algorithm user to pass the right monoid
+ * to #grb::interfaces::Pregel::execute each time they call one, such as, for
+ * example,
+ * - #grb::algorithms::pregel::ConnectedComponents::execute, or
+ * - #grb::algorithms::pregel::PageRank::execute.
+ *
+ * These functions only take the Pregel instance that is to execute the Pregel
+ * program, as well as a vector of initial states as mandatory input. As usual,
+ * optional parameters indicate the maximum number of rounds allotted to the
+ * program (zero for unbounded), and where to write back the number of rounds
+ * after which the program has terminated (NULL for no write back).
+ *
+ * All pre-defined ALP/Pregel algorithms reside in the #grb::algorithms::pregel
+ * namespace.
+ *
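+ * For illustration, and assuming the above convention, a call to such a
+ * simplified \em execute function could read as follows. The algorithm name
+ * \em SomeVertexCentricAlgorithm is a placeholder, \em pregel is a
+ * #grb::interfaces::Pregel instance, and \em state is a vector of initial
+ * vertex states:
+ *
+ * \code{.cpp}
+ * size_t rounds;
+ * grb::RC rc = SomeVertexCentricAlgorithm::execute(
+ *     pregel, state, // mandatory: the Pregel instance and initial states
+ *     0, &rounds     // optional: max. rounds (zero for unbounded), and
+ *                    //           where to write back the number of rounds
+ * );
+ * \endcode
+ *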
+ * \par Configuration settings
+ *
+ * The ALP/Pregel run-time system manages state for every vertex in the
+ * underlying graph. The execution time of a single round is always proportional
+ * to the number of active vertices. Since inactive vertices stay inactive in
+ * subsequent rounds, their state could be erased. This has two \em potential
+ * benefits:
+ * 1. it \em may (depending on the used backend's performance semantics) reduce
+ * memory use; and/or
+ * 2. it \em may result in faster execution (depending on the used backend's
+ * performance semantics).
+ *
+ * We may opt to always attempt to sparsify state, use some heuristic to
+ * determine when to sparsify, or simply never attempt such sparsification.
+ *
+ * This choice is configurable via #grb::interfaces::config::out_sparsify; see
+ * #grb::interfaces::config::SparsificationStrategy for options and more
+ * details.
+ *
+ * @}
+ */
+
+#ifndef _H_GRB_INTERFACES_PREGEL
+#define _H_GRB_INTERFACES_PREGEL
+
+#include
+#include
+
+#include <stdexcept> // std::runtime_error
+
+
+namespace grb {
+
+ namespace interfaces {
+
+ /**
+ * Contains configurations for programming models that are simulated on top of
+ * ALP/GraphBLAS.
+ */
+ namespace config {
+
+ /**
+ * The set of sparsification strategies supported by the ALP/Pregel
+ * interface.
+ *
+ * \ingroup Pregel
+ */
+ enum SparsificationStrategy {
+
+ /**
+ * No sparsification of internal and user-defined vertex states, beyond that
+ * which is necessary to bound the run-time by the number of active
+ * vertices.
+ */
+ NONE = 0,
+
+ /**
+ * Always applies the sparsification procedure on both internal and user-
+ * defined vertex states.
+ *
+ * Does not consider whether the resulting operation would reduce the number
+ * of vertex entries.
+ *
+			 * This variant was tested against #NONE for #out_sparsify, and was found
+			 * to always be slower.
+ *
+			 * \internal This strategy is necessarily always applied to the
+ * #Pregel::ActiveVertices vector.
+ */
+ ALWAYS,
+
+ /**
+ * Sparsify only when the resulting vector would indeed be sparser.
+ *
+ * While this sounds like it should be a minimal condition to check for
+ * before applying sparsification, this check itself comes at non-trivial
+ * overhead for any backend. The performance of this strategy versus
+ * #ALWAYS hence is a trade-off, one that varies with underlying graphs
+ * as well as with the vertex-centric program chosen.
+ *
+ * \internal
+ * \note This strategy should \em not be applied to #Pregel::ActiveVertices
+ * since doing so requires computing the number of active vertices,
+ * which has the same complexity as actually sparsifying that vector.
+ *
+ * \todo This variant has never been exhaustively tested for
+ * \a out_sparsify.
+ * \endinternal
+ */
+ WHEN_REDUCED,
+
+ /**
+ * Sparsify only when the resulting vector would have half (or less) its
+ * current number of nonzeroes. This is a simple heuristic that balances
+ * the trade-off of \em applying sparsification by amortising its overhead.
+			 * The overhead described at #WHEN_REDUCED, corresponding to determining the
+ * gain of sparsification, however, remains the same.
+ *
+ * \internal
+ * \note This strategy should \em not be applied to #Pregel::ActiveVertices
+ * since doing so requires computing the number of active vertices,
+ * which has the same complexity as actually sparsifying that vector.
+ *
+ * \todo This variant has never been exhaustively tested for
+ * \a out_sparsify.
+ * \endinternal
+ */
+ WHEN_HALVED
+
+ };
+
+ /**
+ * What sparsification strategy should be applied to the outgoing
+ * messages.
+ *
+ * \internal
+ * Only #NONE and #ALWAYS have been tested, with #NONE being faster on all
+ * test cases.
+ * \endinternal
+ *
+ * \ingroup Pregel
+ */
+ constexpr const SparsificationStrategy out_sparsify = NONE;
+
+ } // end namespace grb::interfaces::config
+
+ /**
+		 * The state of the vertex-centric Pregel program that the user may interface
+ * with.
+ *
+ * The state includes global data as well as vertex-centric state. The global
+		 * state is unmodifiable and includes:
+ * - #grb::interfaces::PregelState::num_vertices,
+ * - #grb::interfaces::PregelState::num_edges, and
+ * - #grb::interfaces::PregelState::round.
+ *
+		 * Vertex-centric state can be either constant or modifiable:
+ * - static vertex-centric state: #grb::interfaces::PregelState::indegree,
+ * #grb::interfaces::PregelState::outdegree, and
+ * #grb::interfaces::PregelState::vertexID.
+ * - modifiable vertex-centric state:
+ * #grb::interfaces::PregelState::voteToHalt, and
+ * #grb::interfaces::PregelState::active.
+ *
+ * \ingroup Pregel
+ */
+ struct PregelState {
+
+ /**
+ * Represents whether the current vertex is active.
+ *
+			 * Since this struct is only to be used within the computational phase of a
+			 * vertex-centric program, this always reads true at the start of a
+			 * round.
+ *
+			 * The program may set this field to false, which will cause this
+ * vertex to no longer trigger computational steps during subsequent rounds.
+ *
+ * An inactive vertex will no longer broadcast messages.
+ *
+ * If all vertices are inactive the program terminates.
+ */
+ bool &active;
+
+ /**
+ * Represents whether this (active) vertex votes to terminate the program.
+ *
+			 * On start of a round, this entry is set to false. If all active
+			 * vertices set this to true, the program will terminate after the
+ * current round.
+ */
+ bool &voteToHalt;
+
+ /**
+ * The number of vertices in the global graph.
+ */
+ const size_t &num_vertices;
+
+ /**
+ * The number of edges in the global graph.
+ */
+ const size_t &num_edges;
+
+ /**
+ * The out-degree of this vertex.
+ */
+ const size_t &outdegree;
+
+ /**
+ * The in-degree of this vertex.
+ */
+ const size_t &indegree;
+
+ /**
+			 * The round the vertex-centric program is currently executing.
+ */
+ const size_t &round;
+
+ /**
+ * A unique ID of this vertex.
+ *
+ * This number is an unsigned integer between 0 (inclusive) and
+ * the number of vertices the underlying graph holds (exclusive).
+ */
+ const size_t &vertexID;
+
+ };
+
+ /**
+ * A Pregel run-time instance.
+ *
+ * Pregel wraps around graph data and executes computations on said graph. A
+		 * runtime thus is constructed from a graph, and enables running any Pregel
+ * algorithm on said graph.
+ *
+ * \ingroup Pregel
+ */
+ template<
+ typename MatrixEntryType
+ >
+ class Pregel {
+
+ private:
+
+ /** \internal The number of vertices of the underlying #graph. */
+ const size_t n;
+
+ /** \internal The number of edges of the underlying #graph. */
+ size_t nz;
+
+ /** \internal The graph to run vertex-centric programs over. */
+ grb::Matrix< MatrixEntryType > graph;
+
+ /** \internal Which vertices are still active. */
+ grb::Vector< bool > activeVertices;
+
+ /** \internal Which vertices voted to halt. */
+ grb::Vector< bool > haltVotes;
+
+ /** \internal A buffer used to sparsify #activeVertices. */
+ grb::Vector< bool > buffer;
+
+ /** \internal Pre-computed outdegrees. */
+ grb::Vector< size_t > outdegrees;
+
+				/** \internal Pre-computed indegrees. */
+ grb::Vector< size_t > indegrees;
+
+ /** \internal Global vertex IDs. */
+ grb::Vector< size_t > IDs;
+
+ /**
+ * \internal
+ * Initialises the following fields:
+ * -# outdegrees
+ * -# indegrees
+ * -# IDs
+ * Other fields are set on program start.
+ * \endinternal
+ */
+ void initialize() {
+ grb::Semiring<
+ grb::operators::add< size_t >,
+ grb::operators::right_assign_if< bool, size_t, size_t >,
+ grb::identities::zero,
+ grb::identities::logical_true
+ > ring;
+ grb::Vector< size_t > ones( n );
+ if( grb::set( ones, 1 ) != SUCCESS ) {
+ throw std::runtime_error( "Could not set vector ones" );
+ }
+ if( grb::set( outdegrees, 0 ) != SUCCESS ) {
+ throw std::runtime_error( "Could not initialise outdegrees" );
+ }
+ if( grb::mxv< grb::descriptors::dense >(
+ outdegrees, graph, ones, ring
+ ) != SUCCESS
+ ) {
+ throw std::runtime_error( "Could not compute outdegrees" );
+ }
+ if( grb::set( indegrees, 0 ) != SUCCESS ) {
+ throw std::runtime_error( "Could not initialise indegrees" );
+ }
+ if( grb::mxv<
+ grb::descriptors::dense | grb::descriptors::transpose_matrix
+ >(
+ indegrees, graph, ones, ring
+ ) != SUCCESS ) {
+ throw std::runtime_error( "Could not compute indegrees" );
+ }
+ if( grb::set< grb::descriptors::use_index >(
+ IDs, 0
+ ) != SUCCESS
+ ) {
+ throw std::runtime_error( "Could not compute vertex IDs" );
+ }
+ }
+
+
+ protected:
+
+ /**
+ * \internal
+				 * Internal constructor for the cases where the number of vertex IDs,
+ * \a _n, is already known.
+ * \endinternal
+ */
+ template< typename IType >
+ Pregel(
+ const size_t _n,
+ IType _start, const IType _end,
+ const grb::IOMode _mode
+ ) :
+ n( _n ),
+ graph( _n, _n ),
+ activeVertices( _n ),
+ haltVotes( _n ),
+ buffer( _n ),
+ outdegrees( _n ),
+ indegrees( _n ),
+ IDs( _n )
+ {
+ if( grb::ncols( graph ) != grb::nrows( graph ) ) {
+ throw std::runtime_error( "Input graph is bipartite" );
+ }
+ if( grb::buildMatrixUnique(
+ graph, _start, _end, _mode
+ ) != SUCCESS ) {
+ throw std::runtime_error( "Could not build graph" );
+ }
+ nz = grb::nnz( graph );
+ initialize();
+ }
+
+
+ public:
+
+ /**
+ * Constructs a Pregel instance from input iterators over some graph.
+ *
+ * @tparam IType The type of the input iterator.
+ *
+ * @param[in] _m The maximum vertex ID for excident edges.
+ * @param[in] _n The maximum vertex ID for incident edges.
+ *
+				 * \note This is equivalent to the row- and column-size of an input matrix
+ * which represents the input graph.
+ *
+ * \note If these values are not known, please scan the input iterators to
+				 *       derive these values prior to calling this constructor. Should there
+				 *       be compelling reasons why such functionality would be useful to
+				 *       provide as a standard factory method, please feel welcome to submit
+				 *       an issue.
+ *
+ * \warning The graph is assumed to have contiguous IDs -- i.e., every
+				 *          vertex ID in the range of 0 (inclusive) to the maximum of \a _m
+				 *          and \a _n (exclusive) has at least one excident or at least one
+ * incident edge.
+ *
+				 * @param[in] _start An iterator pointing to the first element of a
+				 *                   collection of edges.
+ * @param[in] _end An iterator matching \a _start in end position.
+ *
+				 * All edges to be ingested thus are contained within \a _start and \a _end.
+ *
+ * @param[in] _mode Whether sequential or parallel I/O is to be used.
+ *
+ * The value of \a _mode only takes effect when there are multiple user
+ * processes, such as for example when executing over a distributed-memory
+				 * cluster. The choice between sequential and parallel I/O should be as follows:
+ * - If the edges pointed to by \a _start and \a _end correspond to the
+ * \em entire set of edges on \em each process, then the I/O mode should
+ * be #grb::SEQUENTIAL;
+ * - If the edges pointed to by \a _start and \a _end correspond to
+ * \em different sets of edges on each different process while their
+ * union represents the graph to be ingested, then the I/O mode should be
+ * #grb::PARALLEL.
+ *
+ * On errors during ingestion, this constructor throws exceptions.
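+				 *
+				 * For illustration, a minimal construction sketch follows; the edge
+				 * container \em edges is hypothetical, and must provide input iterators
+				 * that #grb::buildMatrixUnique accepts:
+				 *
+				 * \code{.cpp}
+				 * const size_t n = 1000; // maximum vertex ID plus one, assumed known
+				 * grb::interfaces::Pregel< double > pregel(
+				 *     n, n, edges.begin(), edges.end(), grb::SEQUENTIAL );
+				 * \endcode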
+ */
+ template< typename IType >
+ Pregel(
+ const size_t _m, const size_t _n,
+ IType _start, const IType _end,
+ const grb::IOMode _mode
+ ) : Pregel( std::max( _m, _n ), _start, _end, _mode ) {}
+
+ /**
+ * Executes a given vertex-centric \a program on this graph.
+ *
+ * The program must be a static function that returns void and takes five
+ * input arguments:
+ * - a reference to a vertex-defined state. The type of this reference may
+ * be defined by the program, but has to match the element type of
+ * \a vertex_state passed to this function.
+ * - a const-reference to an incoming message. The type of this reference
+ * may be defined by the program, but has to match the element type of
+ * \a in passed to this function. It must furthermore be compatible with
+ * the domains of \a Op (see below).
+ * - a reference to an outgoing message. The type of this reference may be
+ * defined by the program, but has to match the element type of \a out
+ * passed to this function. It must furthermore be compatible with the
+ * domains of \a Op (see below).
+ * - a const-reference to a program-defined type. The function of this
+ * argument is to collect global read-only algorithm parameters.
+ * - a reference to an instance of #grb::interfaces::PregelState. The
+				 *   function of this argument is two-fold: 1) to make available global read-
+				 *   only statistics of the graph the algorithm is executing on, and 2) to
+ * control algorithm termination conditions.
+ *
+ * The program will be called during each round of a Pregel computation. The
+ * program is expected to compute something based on the incoming message
+ * and vertex-local state, and (optionally) generate an outgoing message.
+				 * After each round, the outgoing message at each vertex is broadcast to
+				 * all of its neighbours. The Pregel runtime, again for each vertex, reduces
+				 * all incoming messages into a single message, after which the next round
+				 * of computation starts and the procedure is repeated.
+ *
+ * The program terminates in one of two ways:
+ * 1. there are no more active vertices; or
+ * 2. all active vertices vote to halt.
+ *
+ * On program start, i.e., during the first round, all vertices are active.
+ * During the computation phase, any vertex can set itself inactive for
+ * subsequent rounds by setting #grb::interfaces::PregelState::active to
+				 * false. Similarly, any active vertex can vote to halt by setting
+				 * #grb::interfaces::PregelState::voteToHalt to true.
+ *
+				 * Reduction of incoming messages to a vertex will occur through a user-
+ * defined monoid given by:
+ *
+ * @tparam Op The binary operation of the monoid. This includes its domain.
+ * @tparam Id The identity element of the monoid.
+ *
+ * The following template arguments will be automatically inferred:
+ *
+ * @tparam Program The type of the program to-be executed.
+ * @tparam IOType The type of the state of a single vertex.
+ * @tparam GlobalProgramData The type of globally accessible read-only
+ * program data.
+ * @tparam IncomingMessageType The type of an incoming message.
+ * @tparam OutgoingMessageType The type of an outgoing message.
+ *
+ * The arguments to this function are as follows:
+ *
+ * @param[in] program The vertex-centric program to execute.
+ *
+ * The same Pregel runtime instance hence can be re-used to execute multiple
+ * algorithms on the same graph.
+ *
+ * Vertex-centric programs have both vertex-local and global state:
+ *
+ * @param[in] vertex_state A vector that contains the state of each vertex.
+ * @param[in] data Global read-only state for the given \a program.
+ *
+ * The capacity, size, and number of nonzeroes of \a vertex_state must equal
+ * the maximum vertex ID.
+ *
+ * Finally, in the ALP spirit which aims to control all relevant performance
+ * aspects, the workspace required by the Pregel runtime must be pre-
+ * allocated and passed in:
+ *
+ * @param[in] in Where incoming messages are stored. Any initial values may
+ * or may not be ignored, depending on the \a program
+ * behaviour during the first round of computation.
+ *
+ * @param[in] out Where outgoing messages are stored. Any initial values
+ * will be ignored.
+ *
+ * The capacities and sizes of \a in and \a out must equal the maximum vertex
+ * ID. For sparse vectors \a in with more than zero nonzeroes, all initial
+ * contents will be overwritten by the identity of the reduction monoid. Any
+ * initial contents for \a out will always be ignored as every round of
+ * computation starts with the outgoing message set to the monoid identity.
+ *
+ * \note Thus if the program requires some initial incoming messages to be
+ * present during the first round of computation, those may be passed
+				 *       as part of a dense vector \a in.
+ *
+ * The contents of \a in and \a out after termination of a vertex-centric
+ * function are undefined, including when this function returns
+ * #grb::SUCCESS. Output of the program should be part of the vertex-centric
+ * state recorded in \a vertex_state.
+ *
+ * Some statistics are returned after a vertex-centric program terminates:
+ *
+ * @param[out] rounds The number of rounds the Pregel program has executed.
+				 *                    The initial value of \a rounds will be ignored.
+ *
+ * The contents of this field shall be undefined when this function does not
+ * return #grb::SUCCESS.
+ *
+				 * Vertex-programs execute in rounds and could, if the given program does
+				 * not enforce proper termination conditions, run forever. To curb the
+				 * number of rounds, the following \em optional parameters may be given:
+ *
+ * @param[in] out_buffer An optional buffer area that should only be set
+ * whenever the #config::out_sparsify configuration
+ * parameter is not set to #config::NONE. If that is
+ * the case, then \a out_buffer should have size and
+ * capacity equal to the maximum vertex ID.
+ *
+ * @param[in] max_rounds The maximum number of rounds the \a program may
+ * execute. Once reached and not terminated, the
+ * program will forcibly terminate.
+ *
+ * To turn off termination after a maximum number of rounds, \a max_rounds
+ * may be set to zero. This is also the default.
+ *
+ * Executing a Pregel function returns one of the following error codes:
+ *
+ * @returns #grb::SUCCESS The \a program executed (and terminated)
+ * successfully.
+ * @returns #grb::MISMATCH At least one of \a vertex_state, \a in, or \a out
+ * is not of the required size.
+ * @returns #grb::ILLEGAL At least one of \a vertex_state, \a in, or \a out
+ * does not have the required capacity.
+ * @returns #grb::ILLEGAL If \a vertex_state is not dense.
+ * @returns #grb::PANIC In case an unrecoverable error was encountered
+ * during execution.
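+				 *
+				 * For illustration, a hypothetical invocation that accumulates incoming
+				 * messages by addition could read as follows. Here, \em MyProgram is a
+				 * vertex-centric program class as sketched in the module documentation,
+				 * while all vectors are assumed to have size and capacity equal to the
+				 * maximum vertex ID:
+				 *
+				 * \code{.cpp}
+				 * size_t rounds;
+				 * grb::RC rc = pregel.execute<
+				 *     grb::operators::add< double >, grb::identities::zero
+				 * >( &MyProgram::program, vertex_state, params, in, out, rounds );
+				 * \endcode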
+ */
+ template<
+ class Op,
+ template< typename > class Id,
+ class Program,
+ typename IOType,
+ typename GlobalProgramData,
+ typename IncomingMessageType,
+ typename OutgoingMessageType
+ >
+ grb::RC execute(
+ const Program program,
+ grb::Vector< IOType > &vertex_state,
+ const GlobalProgramData &data,
+ grb::Vector< IncomingMessageType > &in,
+ grb::Vector< OutgoingMessageType > &out,
+ size_t &rounds,
+ grb::Vector< OutgoingMessageType > &out_buffer =
+ grb::Vector< OutgoingMessageType >(0),
+ const size_t max_rounds = 0
+ ) {
+ static_assert( grb::is_operator< Op >::value &&
+ grb::is_associative< Op >::value,
+					"The combiner must be an associative operator"
+ );
+ static_assert( std::is_same< typename Op::D1, IncomingMessageType >::value,
+ "The combiner left-hand input domain should match the incoming message "
+ "type." );
+				static_assert( std::is_same< typename Op::D2, IncomingMessageType >::value,
+ "The combiner right-hand input domain should match the incoming message "
+ "type." );
+				static_assert( std::is_same< typename Op::D3, IncomingMessageType >::value,
+ "The combiner output domain should match the incoming message type." );
+
+ // set default output
+ rounds = 0;
+
+ // sanity checks
+ if( grb::size(vertex_state) != n ) {
+ return MISMATCH;
+ }
+ if( grb::size(in) != n ) {
+ return MISMATCH;
+ }
+ if( grb::size(out) != n ) {
+ return MISMATCH;
+ }
+ if( grb::capacity(vertex_state) != n ) {
+ return ILLEGAL;
+ }
+ if( grb::capacity(in) != n ) {
+ return ILLEGAL;
+ }
+ if( grb::capacity(out) != n ) {
+ return ILLEGAL;
+ }
+ if( config::out_sparsify && grb::capacity(out_buffer) != n ) {
+ return ILLEGAL;
+ }
+ if( grb::nnz(vertex_state) != n ) {
+ return ILLEGAL;
+ }
+
+ // define some monoids and semirings
+ grb::Monoid<
+ grb::operators::logical_or< bool >,
+ grb::identities::logical_false
+ > orMonoid;
+
+ grb::Monoid<
+ grb::operators::logical_and< bool >,
+ grb::identities::logical_true
+ > andMonoid;
+
+ grb::Semiring<
+ Op,
+ grb::operators::left_assign_if<
+ IncomingMessageType, bool, IncomingMessageType
+ >,
+ Id,
+ grb::identities::logical_true
+ > ring;
+
+ // set initial round ID
+ size_t step = 0;
+
+ // activate all vertices
+ grb::RC ret = grb::set( activeVertices, true );
+
+ // initialise halt votes to all-false
+ if( ret == SUCCESS ) {
+ ret = grb::set( haltVotes, false );
+ }
+
+ // set default incoming message
+ if( ret == SUCCESS && grb::nnz(in) < n ) {
+#ifdef _DEBUG
+ if( grb::nnz(in) > 0 ) {
+ std::cerr << "Overwriting initial incoming messages since it was not a "
+ << "dense vector\n";
+ }
+#endif
+ ret = grb::set( in, Id< IncomingMessageType >::value() );
+ }
+
+ // reset outgoing buffer
+ size_t out_nnz = n;
+ if( ret == SUCCESS ) {
+ ret = grb::set( out, Id< OutgoingMessageType >::value() );
+ }
+
+ // return if initialisation failed
+ if( ret != SUCCESS ) {
+ assert( ret == FAILED );
+ std::cerr << "Error: initialisation failed, but if workspace holds full "
+ << "capacity, initialisation should never fail. Please submit a bug "
+ << "report.\n";
+ return PANIC;
+ }
+
+ // while there are active vertices, execute
+ while( ret == SUCCESS ) {
+
+ assert( max_rounds == 0 || step < max_rounds );
+ // run one step of the program
+ ret = grb::eWiseLambda(
+ [
+ this,
+ &vertex_state,
+ &in,
+ &out,
+ &program,
+ step,
+ &data
+ ]( const size_t i ) {
+ // create Pregel struct
+ PregelState pregel = {
+ activeVertices[ i ],
+ haltVotes[ i ],
+ n,
+ nz,
+ outdegrees[ i ],
+ indegrees[ i ],
+ step,
+ IDs[ i ]
+ };
+ // only execute program on active vertices
+ assert( activeVertices[ i ] );
+#ifdef _DEBUG
+ std::cout << "Vertex " << i << " remains active in step " << step
+ << "\n";
+#endif
+ program(
+ vertex_state[ i ],
+ in[ i ],
+ out[ i ],
+ data,
+ pregel
+ );
+#ifdef _DEBUG
+ std::cout << "Vertex " << i << " sends out message " << out[ i ]
+ << "\n";
+#endif
+ }, activeVertices, vertex_state, in, out, outdegrees, haltVotes, indegrees, IDs
+ );
+
+ // increment counter
+ (void) ++step;
+
+ // check if everyone voted to halt
+ if( ret == SUCCESS ) {
+ bool halt = true;
+ ret = grb::foldl< grb::descriptors::structural >(
+ halt, haltVotes, activeVertices, andMonoid
+ );
+ assert( ret == SUCCESS );
+ if( ret == SUCCESS && halt ) {
+#ifdef _DEBUG
+ std::cout << "\t All active vertices voted to halt; "
+ << "terminating Pregel program.\n";
+#endif
+ break;
+ }
+ }
+
+ // update active vertices
+ if( ret == SUCCESS ) {
+#ifdef _DEBUG
+ std::cout << "\t Number of active vertices was "
+ << grb::nnz( activeVertices ) << ", and ";
+#endif
+ ret = grb::clear( buffer );
+ ret = ret ? ret : grb::set( buffer, activeVertices, true );
+ std::swap( buffer, activeVertices );
+#ifdef _DEBUG
+ std::cout << " has now become " << grb::nnz( activeVertices ) << "\n";
+#endif
+ }
+
+ // check if there is a next round
+ const size_t curActive = grb::nnz( activeVertices );
+ if( ret == SUCCESS && curActive == 0 ) {
+#ifdef _DEBUG
+ std::cout << "\t All vertices are inactive; "
+ << "terminating Pregel program.\n";
+#endif
+ break;
+ }
+
+ // check if we exceed the maximum number of rounds
+ if( max_rounds > 0 && step > max_rounds ) {
+#ifdef _DEBUG
+ std::cout << "\t Maximum number of Pregel rounds met "
+ << "without the program returning a valid termination condition. "
+ << "Exiting prematurely with a FAILED error code.\n";
+#endif
+ ret = FAILED;
+ break;
+ }
+
+#ifdef _DEBUG
+ std::cout << "\t Starting message exchange\n";
+#endif
+
+ // reset halt votes
+ if( ret == SUCCESS ) {
+ ret = grb::clear( haltVotes );
+ ret = ret ? ret : grb::set< grb::descriptors::structural >(
+ haltVotes, activeVertices, false
+ );
+ }
+
+ // reset incoming buffer
+ if( ret == SUCCESS ) {
+ ret = grb::clear( in );
+ ret = ret ? ret : grb::set< grb::descriptors::structural >(
+ in, activeVertices, Id< IncomingMessageType >::value()
+ );
+ }
+
+ // execute communication
+ if( ret == SUCCESS ) {
+ ret = grb::vxm< grb::descriptors::structural >(
+ in, activeVertices, out, graph, ring
+ );
+ }
+
+ // sparsify and reset outgoing buffer
+ if( config::out_sparsify && ret == SUCCESS ) {
+ if( config::out_sparsify == config::ALWAYS ||
+ (config::out_sparsify == config::WHEN_REDUCED && out_nnz > curActive) ||
+ (config::out_sparsify == config::WHEN_HALVED && curActive <= out_nnz/2)
+ ) {
+ ret = grb::clear( out_buffer );
+ ret = ret ? ret : grb::set< grb::descriptors::structural >(
+ out_buffer, activeVertices, Id< OutgoingMessageType >::value()
+ );
+ std::swap( out, out_buffer );
+ out_nnz = curActive;
+ }
+ }
+
+#ifdef _DEBUG
+ std::cout << "\t Resetting outgoing message fields and "
+ << "starting next compute round\n";
+#endif
+
+ }
+
+#ifdef _DEBUG
+ if( grb::spmd<>::pid() == 0 ) {
+ std::cout << "Info: Pregel exits after " << step
+ << " rounds with error code " << ret
+ << " ( " << grb::toString(ret) << " )\n";
+ }
+#endif
+
+ // done
+ rounds = step;
+ return ret;
+ }
+
+ /**
+ * Queries the maximum vertex ID for programs running on this Pregel
+ * instance.
+ *
+ * @returns The maximum vertex ID.
+ */
+ size_t num_vertices() const noexcept { return n; }
+
+ /**
+ * Queries the number of edges of the graph this Pregel instance has been
+ * constructed over.
+ *
+ * @returns The number of edges within the underlying graph.
+ */
+ size_t num_edges() const noexcept { return nz; }
+
+ /**
+ * Returns the ALP/GraphBLAS matrix representation of the underlying
+ * graph.
+ *
+ * This is useful when an application prefers to sometimes use vertex-
+ * centric algorithms and other times prefers direct ALP/GraphBLAS
+ * algorithms.
+ *
+ * @returns The underlying ALP/GraphBLAS matrix corresponding to the
+ * underlying graph.
+ */
+ const grb::Matrix< MatrixEntryType > & get_matrix() const noexcept {
+ return graph;
+ }
+
+ };
+
+ } // end namespace ``grb::interfaces''
+
+} // end namespace ``grb''
+
+#endif // end ``_H_GRB_INTERFACES_PREGEL''
+
diff --git a/include/graphblas/io.hpp b/include/graphblas/io.hpp
index 9d09b95d2..8fbb70a13 100644
--- a/include/graphblas/io.hpp
+++ b/include/graphblas/io.hpp
@@ -29,6 +29,12 @@
#ifdef _GRB_WITH_REFERENCE
#include
#endif
+#ifdef _GRB_WITH_HYPERDAGS
+ #include
+#endif
+#ifdef _GRB_WITH_NONBLOCKING
+ #include "graphblas/nonblocking/io.hpp"
+#endif
#ifdef _GRB_WITH_LPF
#include
#endif
diff --git a/include/graphblas/iomode.hpp b/include/graphblas/iomode.hpp
index d93a31c63..7bf16559e 100644
--- a/include/graphblas/iomode.hpp
+++ b/include/graphblas/iomode.hpp
@@ -15,7 +15,12 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Defines the various I/O modes a user could employ with ALP data ingestion
+ * or extraction.
+ *
* @author A. N. Yzelman
* @date 21st of February, 2017
*/
@@ -23,6 +28,7 @@
#ifndef _H_GRB_IOMODE
#define _H_GRB_IOMODE
+
namespace grb {
/**
@@ -86,8 +92,7 @@ namespace grb {
PARALLEL
};
- /** @} */
-
} // namespace grb
#endif // end ``_H_GRB_IOMODE''
+
diff --git a/include/graphblas/matrix.hpp b/include/graphblas/matrix.hpp
index 08d715df5..557a40c66 100644
--- a/include/graphblas/matrix.hpp
+++ b/include/graphblas/matrix.hpp
@@ -15,9 +15,9 @@
* limitations under the License.
*/
-/*
+/**
* @author A. N. Yzelman
- * @date 10 of August
+ * @date 10 of August, 2016
*/
#ifndef _H_GRB_MATRIX
@@ -30,6 +30,12 @@
#ifdef _GRB_WITH_REFERENCE
#include
#endif
+#ifdef _GRB_WITH_HYPERDAGS
+ #include
+#endif
+#ifdef _GRB_WITH_NONBLOCKING
+ #include "graphblas/nonblocking/matrix.hpp"
+#endif
#ifdef _GRB_WITH_LPF
#include
#endif
@@ -51,3 +57,4 @@ namespace grb {
#endif
#endif // end ``_H_GRB_MATRIX''
+
diff --git a/include/graphblas/monoid.hpp b/include/graphblas/monoid.hpp
index 56f21b1a8..bd3b65195 100644
--- a/include/graphblas/monoid.hpp
+++ b/include/graphblas/monoid.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Provides an ALP monoid.
+ *
* @author A. N. Yzelman
* @date 15 March, 2016
*/
@@ -37,12 +41,7 @@
#include
#include
-/**
- * The main Sparse Library namespace.
- *
- * All classes, enums, constants, and functions are declared in this namespace.
- * This source file only contains testing code outside this namespace.
- */
+
namespace grb {
/**
@@ -65,6 +64,7 @@ namespace grb {
"one of its input domains" );
public:
+
/** The left-hand side input domain. */
typedef typename _OP::D1 D1;
@@ -81,7 +81,9 @@ namespace grb {
template< typename IdentityType >
using Identity = _ID< IdentityType >;
+
private:
+
/**
* The underlying binary operator.
*
@@ -136,3 +138,4 @@ namespace grb {
} // namespace grb
#endif
+
diff --git a/include/graphblas/nonblocking/alloc.hpp b/include/graphblas/nonblocking/alloc.hpp
new file mode 100644
index 000000000..2938d6755
--- /dev/null
+++ b/include/graphblas/nonblocking/alloc.hpp
@@ -0,0 +1,65 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Allocator functions for the nonblocking backend
+ *
+ * @author Aristeidis Mastoras
+ * @date 16th of May, 2022
+ */
+
+#ifndef _H_GRB_ALLOC_NONBLOCKING
+#define _H_GRB_ALLOC_NONBLOCKING
+
+#include
+
+#include
+
+#include "config.hpp"
+
+
+namespace grb {
+
+ namespace utils {
+
+ namespace internal {
+
+ template<>
+ class Allocator< nonblocking > {
+
+ private:
+
+ /** Prevent initialisation. */
+ Allocator();
+
+ public:
+
+ /** Refer to the standard allocation mechanism. */
+ typedef AllocatorFunctions< reference > functions;
+
+ };
+
+ } // namespace internal
+
+ } // namespace utils
+
+} // namespace grb
+
+#endif
+
diff --git a/include/graphblas/nonblocking/analytic_model.hpp b/include/graphblas/nonblocking/analytic_model.hpp
new file mode 100644
index 000000000..536b3e95b
--- /dev/null
+++ b/include/graphblas/nonblocking/analytic_model.hpp
@@ -0,0 +1,122 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Configurations for the nonblocking backend
+ *
+ * @author Aristeidis Mastoras
+ * @date 16th of May, 2022
+ */
+
+#ifndef _H_GRB_UTILS_ANALYTIC_MODEL
+#define _H_GRB_UTILS_ANALYTIC_MODEL
+
+#include "config.hpp"
+
+
+namespace grb {
+
+ namespace internal {
+
+ /**
+		 * The analytic model used for the automatic selection of the tile size and
+		 * of the number of threads.
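+		 *
+		 * For illustration, a hypothetical use could read:
+		 *
+		 * \code{.cpp}
+		 * // select the number of threads and a tile size for a pipeline that
+		 * // accesses three vectors of one million doubles each
+		 * AnalyticModel am( sizeof( double ), 1000000, 3 );
+		 * const size_t nthreads  = am.getNumThreads();
+		 * const size_t tile_size = am.getTileSize();
+		 * \endcode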
+ */
+ class AnalyticModel {
+
+ private:
+
+ /**
+ * The size of the data type of the containers (may vary between different
+ * containers). The current design uses the maximum size of all used data
+ * types.
+ */
+ size_t size_of_data_type;
+
+ /**
+ * The size of the containers accessed in the pipeline.
+ */
+ size_t size_of_vector;
+
+ /**
+ * The number of vectors accessed in the pipeline.
+ */
+ size_t num_accessed_vectors;
+
+ /**
+ * The number of threads selected by the analytic model.
+ */
+ size_t num_threads;
+
+ /**
+ * The tile size selected by the analytic model.
+ */
+ size_t tile_size;
+
+ /**
+ * The number of total tiles that result from the selected tile size.
+ */
+ size_t num_tiles;
+
+
+ public:
+
+ /**
+ * The default constructor.
+ */
+ AnalyticModel() noexcept;
+
+ /**
+ * The parameterized constructor.
+ */
+ AnalyticModel(
+ const size_t data_type_size,
+ const size_t vector_size,
+ const size_t accessed_vectors
+ ) noexcept;
+
+ /**
+ * A getter function that returns the size of the containers.
+ */
+ size_t getVectorsSize() const noexcept;
+
+ /**
+ * A getter function that returns the number of threads selected by
+ * the analytic model.
+ */
+ size_t getNumThreads() const noexcept;
+
+ /**
+ * A getter function that returns the tile size selected by the
+ * analytic model.
+ */
+ size_t getTileSize() const noexcept;
+
+ /**
+ * A getter function that returns the number of tiles.
+ */
+ size_t getNumTiles() const noexcept;
+
+ };
+
+ }
+}
+
+#endif
+
diff --git a/include/graphblas/nonblocking/benchmark.hpp b/include/graphblas/nonblocking/benchmark.hpp
new file mode 100644
index 000000000..8b62cb016
--- /dev/null
+++ b/include/graphblas/nonblocking/benchmark.hpp
@@ -0,0 +1,95 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Nonblocking implementation of the benchmarker.
+ *
+ * @author Aristeidis Mastoras
+ * @date 16th of May, 2022
+ */
+
+#ifndef _H_GRB_NONBLOCKING_BENCH
+#define _H_GRB_NONBLOCKING_BENCH
+
+#include
+#include
+
+#include "exec.hpp"
+
+
+namespace grb {
+
+ /**
+	 * The Benchmarker class is based on that of the reference backend.
+ *
+ * \internal The public API simply wraps the reference Benchmarker.
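+	 *
+	 * For illustration, a hypothetical use could read as follows, where
+	 * \em grb_program, \em data_in, and \em data_out are user-supplied and the
+	 * #grb::AUTOMATIC execution mode is assumed:
+	 *
+	 * \code{.cpp}
+	 * grb::Benchmarker< grb::AUTOMATIC, grb::nonblocking > bench;
+	 * // ten inner and five outer benchmark repetitions
+	 * grb::RC rc = bench.exec( &grb_program, data_in, data_out, 10, 5 );
+	 * \endcode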
+ */
+ template< enum EXEC_MODE mode >
+ class Benchmarker< mode, nonblocking > {
+
+ private:
+
+ /** \internal Reuse reference benchmarker. */
+ Benchmarker< mode, reference > ref;
+
+
+ public:
+
+ /** \internal Mirror reference constructor. */
+ Benchmarker(
+ size_t process_id = 0,
+ size_t nprocs = 1,
+ std::string hostname = "localhost",
+ std::string port = "0"
+ ) :
+ ref(process_id, nprocs, hostname, port)
+ {}
+
+ /** \internal Mirror reference exec. */
+ template< typename U >
+ RC exec(
+ void ( *grb_program )( const void *, const size_t, U & ),
+ const void * data_in, const size_t in_size,
+ U &data_out,
+ const size_t inner, const size_t outer,
+ const bool broadcast = false
+ ) const {
+ return ref.exec(
+ grb_program, data_in, in_size, data_out, inner, outer, broadcast
+ );
+ }
+
+ /** \internal Mirror reference exec. */
+ template< typename T, typename U >
+ RC exec(
+ void ( *grb_program )( const T &, U & ),
+ const T &data_in, U &data_out,
+ const size_t inner,
+ const size_t outer,
+ const bool broadcast = false
+ ) {
+ return ref.exec( grb_program, data_in, data_out, inner, outer, broadcast );
+ }
+
+ };
+
+} // namespace grb
+
+#endif // end ``_H_GRB_NONBLOCKING_BENCH''
+
diff --git a/include/graphblas/nonblocking/blas1.hpp b/include/graphblas/nonblocking/blas1.hpp
new file mode 100644
index 000000000..f9f14cafc
--- /dev/null
+++ b/include/graphblas/nonblocking/blas1.hpp
@@ -0,0 +1,11489 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Level-1 primitive implementation for nonblocking.
+ *
+ * \internal
+ * \todo Relies significantly on a past reference level-1 implementation. Can we
+ * reuse?
+ * \endinternal
+ *
+ * @author Aristeidis Mastoras
+ * @date 16th of May, 2022
+ */
+
+#ifndef _H_GRB_NONBLOCKING_BLAS1
+#define _H_GRB_NONBLOCKING_BLAS1
+
+#include <iostream> // for printing to stderr
+#include <type_traits> // for std::enable_if
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "coordinates.hpp"
+#include "vector.hpp"
+#include "lazy_evaluation.hpp"
+#include "vector_wrapper.hpp"
+#include "boolean_dispatcher_blas1.hpp"
+
+#define NO_CAST_ASSERT( x, y, z ) \
+ static_assert( x, \
+ "\n\n" \
+ "********************************************************************" \
+ "********************************************************************" \
+ "******************************\n" \
+ "* ERROR | " y " " z ".\n" \
+ "********************************************************************" \
+ "********************************************************************" \
+ "******************************\n" \
+ "* Possible fix 1 | Remove no_casting from the template parameters " \
+ "in this call to " y ".\n" \
+ "* Possible fix 2 | Provide a value that matches the expected type.\n" \
+ "********************************************************************" \
+ "********************************************************************" \
+ "******************************\n" );
+
+#define NO_CAST_OP_ASSERT( x, y, z ) \
+ static_assert( x, \
+ "\n\n" \
+ "********************************************************************" \
+ "********************************************************************" \
+ "******************************\n" \
+ "* ERROR | " y " " z ".\n" \
+ "********************************************************************" \
+ "********************************************************************" \
+ "******************************\n" \
+ "* Possible fix 1 | Remove no_casting from the template parameters " \
+ "in this call to " y ".\n" \
+ "* Possible fix 2 | For all mismatches in the domains of input " \
+ "parameters and the operator domains, as specified in the " \
+ "documentation of the function " y ", supply an input argument of " \
+ "the expected type instead.\n" \
+ "* Possible fix 3 | Provide a compatible operator where all domains " \
+ "match those of the input parameters, as specified in the " \
+ "documentation of the function " y ".\n" \
+ "********************************************************************" \
+ "********************************************************************" \
+ "******************************\n" );
+
+
+namespace grb {
+
+ namespace internal {
+
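+		/** \internal The global lazy-evaluation state of the nonblocking backend. */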
+ extern LazyEvaluation le;
+
+ }
+
+}
+
+namespace grb {
+
+ /**
+ * \defgroup BLAS1_NB The Level-1 ALP/GraphBLAS routines -- nonblocking backend
+ *
+ * @{
+ */
+
+ namespace internal {
+
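+		/**
+		 * \internal Computes a thread-local fold of a dense vector into a scalar
+		 * over the local range [lower_bound, upper_bound), using the monoid's
+		 * array-based folds.
+		 */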
+ template<
+ bool left,
+ class Monoid,
+ typename InputType,
+ class Coords
+ >
+ RC fold_from_vector_to_scalar_dense(
+ typename Monoid::D3 &thread_local_output,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ const Vector< InputType, nonblocking, Coords > &to_fold,
+ const Monoid &monoid
+ ) {
+ const InputType *__restrict__ const raw = internal::getRaw( to_fold );
+
+ const size_t start = lower_bound;
+ const size_t end = upper_bound;
+
+ if( start < end ) {
+ if( left ) {
+ monoid.getOperator().foldlArray(
+ thread_local_output, raw + start, end - start );
+ } else {
+ monoid.getOperator().foldrArray(
+ raw + start, thread_local_output, end - start );
+ }
+ }
+ return SUCCESS;
+ }
+
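+		/**
+		 * \internal Computes a thread-local fold of a sparse vector into a scalar
+		 * by iterating over the nonzeroes of the vector to be folded, optionally
+		 * subject to a mask.
+		 */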
+ template<
+ Descriptor descr,
+ bool masked,
+ bool left,
+#ifdef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_input_to_fold,
+ bool already_dense_mask,
+#endif
+ class Monoid,
+ typename InputType,
+ typename MaskType,
+ class Coords
+ >
+ RC fold_from_vector_to_scalar_vectorDriven(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_input_to_fold,
+ bool already_dense_mask,
+#endif
+ typename Monoid::D3 &thread_local_output,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ const Coords &local_to_fold,
+ const Coords &local_mask,
+ const Vector< InputType, nonblocking, Coords > &to_fold,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Monoid &monoid
+ ) {
+ const size_t n = internal::getCoordinates( to_fold ).size();
+ const size_t local_n = upper_bound - lower_bound;
+ const size_t local_to_fold_nz = ( already_dense_input_to_fold )
+ ? local_n
+ : local_to_fold.nonzeroes();
+
+ assert( n > 0 );
+ assert( !masked || internal::getCoordinates( mask ).size() == n );
+
+#ifdef NDEBUG
+ (void) n;
+ (void) local_n;
+#endif
+
+ RC ret = SUCCESS;
+
+ const size_t start = 0;
+ const size_t end = local_to_fold_nz;
+
+ // compute thread-local partial reduction
+ for( size_t k = start; k < end; ++k ) {
+ const size_t i = ( (already_dense_input_to_fold)
+ ? k
+ : local_to_fold.index( k ) ) + lower_bound;
+ if( masked ) {
+ if( already_dense_mask ) {
+ if( !utils::interpretMask< descr >(
+ internal::getCoordinates( mask ).assigned( i ),
+ internal::getRaw( mask ), i )
+ ) {
+ continue;
+ }
+ } else {
+ if( !utils::interpretMask< descr >(
+ local_mask.assigned( i - lower_bound ), internal::getRaw( mask ), i )
+ ) {
+ continue;
+ }
+ }
+ }
+ RC local_rc;
+ if( left ) {
+ local_rc = foldl< descr >( thread_local_output,
+ internal::getRaw( to_fold )[ i ], monoid.getOperator() );
+ } else {
+ local_rc = foldr< descr >( internal::getRaw( to_fold )[ i ],
+ thread_local_output, monoid.getOperator() );
+ }
+ assert( local_rc == SUCCESS );
+ if( local_rc != SUCCESS ) {
+ ret = local_rc;
+ }
+ }
+
+ return ret;
+ }
+
+ template<
+ Descriptor descr,
+ bool left,
+#ifdef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_input_to_fold,
+ bool already_dense_mask,
+#endif
+ class Monoid,
+ typename InputType,
+ typename MaskType,
+ class Coords
+ >
+ RC fold_from_vector_to_scalar_maskDriven(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_input_to_fold,
+ bool already_dense_mask,
+#endif
+ typename Monoid::D3 &thread_local_output,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ const Coords &local_to_fold,
+ const Coords &local_mask,
+ const Vector< InputType, nonblocking, Coords > &to_fold,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Monoid &monoid
+ ) {
+ const size_t n = internal::getCoordinates( to_fold ).size();
+
+ assert( internal::getCoordinates( mask ).size() == n );
+ assert( n > 0 );
+#ifdef NDEBUG
+ (void) n;
+#endif
+ const size_t local_n = upper_bound - lower_bound;
+ const size_t local_mask_nz = ( already_dense_mask )
+ ? local_n
+ : local_mask.nonzeroes();
+
+ RC ret = SUCCESS;
+
+ const size_t start = 0;
+ const size_t end = local_mask_nz;
+
+ // compute thread-local partial reduction
+ for( size_t k = start; k < end; ++k ) {
+ const size_t i = ( (already_dense_mask)
+ ? k
+ : local_mask.index( k )
+ ) + lower_bound;
+ if( !( already_dense_input_to_fold ||
+ local_to_fold.assigned( i - lower_bound ) )
+ ) {
+ continue;
+ }
+ if( !utils::interpretMask< descr >( true, internal::getRaw( mask ), i ) ) {
+ continue;
+ }
+ RC local_rc;
+ if( left ) {
+ local_rc = foldl< descr >( thread_local_output,
+ internal::getRaw( to_fold )[ i ], monoid.getOperator() );
+ } else {
+ local_rc = foldr< descr >( internal::getRaw( to_fold )[ i ],
+ thread_local_output, monoid.getOperator() );
+ }
+ assert( local_rc == SUCCESS );
+ if( local_rc != SUCCESS ) {
+ ret = local_rc;
+ }
+ }
+
+ return ret;
+ }
+
+ template<
+ Descriptor descr,
+ bool masked,
+ bool left,
+#ifdef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_input_to_fold,
+ bool already_dense_mask,
+#endif
+ class Monoid,
+ typename InputType,
+ typename MaskType,
+ class Coords
+ >
+ RC fold_from_vector_to_scalar_fullLoopSparse(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_input_to_fold,
+ bool already_dense_mask,
+#endif
+ typename Monoid::D3 &thread_local_output,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ const Coords &local_to_fold,
+ const Coords &local_mask,
+ const Vector< InputType, nonblocking, Coords > &to_fold,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Monoid &monoid
+ ) {
+#ifdef _DEBUG
+ std::cout << "Entered fold_from_vector_to_scalar_fullLoopSparse\n";
+#endif
+
+#ifndef NDEBUG
+ const size_t n = internal::getCoordinates( to_fold ).size();
+ const size_t local_n = already_dense_input_to_fold
+ ? upper_bound - lower_bound
+ : local_to_fold.size();
+ assert( local_n > 0 );
+
+ (void) n;
+#endif
+ RC ret = SUCCESS;
+
+ size_t i = lower_bound;
+ const size_t end = upper_bound;
+
+ // some sanity checks
+ assert( i <= end );
+ assert( end <= n );
+
+ // assume current i needs to be processed, forward until we find an index
+ // for which the mask evaluates true
+ bool process_current_i = true;
+ if( masked && i < end ) {
+ process_current_i = utils::interpretMask< descr >(
+ already_dense_mask
+ ? internal::getCoordinates( mask ).assigned( i )
+ : local_mask.assigned( i - lower_bound ),
+ internal::getRaw( mask ), i ) && (
+ already_dense_input_to_fold || local_to_fold.assigned( i - lower_bound )
+ );
+ // if not
+ while( !process_current_i ) {
+ // forward to next element
+ (void) ++i;
+ // check that we are within bounds
+ if( i == end ) {
+ break;
+ }
+ // evaluate whether we should process this i-th element
+ process_current_i = utils::interpretMask< descr >(
+ already_dense_mask
+ ? internal::getCoordinates( mask ).assigned( i )
+ : local_mask.assigned( i - lower_bound ),
+ internal::getRaw( mask ), i ) && (
+ already_dense_input_to_fold || local_to_fold.assigned( i - lower_bound )
+ );
+ }
+ }
+
+ if( !masked && i < end ) {
+ process_current_i = already_dense_input_to_fold ||
+ local_to_fold.assigned( i - lower_bound );
+ while( !process_current_i ) {
+ (void) ++i;
+ if( i == end ) {
+ break;
+ }
+ process_current_i = already_dense_input_to_fold ||
+ local_to_fold.assigned( i - lower_bound );
+ }
+ }
+
+#ifndef NDEBUG
+ if( i < end ) {
+ assert( i < n );
+ }
+#endif
+
+ // declare a local accumulator and initialise it to the first value to be
+ // processed in our block, if any
+ typename Monoid::D3 local =
+ monoid.template getIdentity< typename Monoid::D3 >();
+ if( end > 0 ) {
+ if( i < end ) {
+#ifdef _DEBUG
+ std::cout << "\t processing start index " << i << "\n";
+#endif
+ local = static_cast< typename Monoid::D3 >(
+ internal::getRaw( to_fold )[ i ] );
+ }
+ }
+
+ // if we have more values to fold
+ if( i + 1 < end ) {
+
+ // keep going until we run out of values to fold
+ while( true ) {
+
+ // forward to next variable
+ (void) ++i;
+
+ // forward more (possibly) if in the masked case
+ if( masked && i < end ) {
+ assert( i < n );
+ process_current_i = utils::interpretMask< descr >(
+ already_dense_mask
+ ? internal::getCoordinates( mask ).assigned( i )
+ : local_mask.assigned( i - lower_bound ),
+ internal::getRaw( mask ), i
+ ) && (
+ already_dense_input_to_fold ||
+ local_to_fold.assigned( i - lower_bound )
+ );
+ while( !process_current_i ) {
+ (void) ++i;
+ if( i == end ) {
+ break;
+ }
+ assert( i < end );
+ assert( i < n );
+ process_current_i = utils::interpretMask< descr >(
+ already_dense_mask
+ ? internal::getCoordinates( mask ).assigned( i )
+ : local_mask.assigned( i - lower_bound ),
+ internal::getRaw( mask ), i
+ ) && (
+ already_dense_input_to_fold ||
+ local_to_fold.assigned( i - lower_bound )
+ );
+ }
+ }
+ if( !masked && i < end ) {
+ assert( i < n );
+ process_current_i = already_dense_input_to_fold ||
+ local_to_fold.assigned( i - lower_bound );
+ while( !process_current_i ) {
+ (void) ++i;
+ if( i == end ) {
+ break;
+ }
+ assert( i < end );
+ assert( i < n );
+ process_current_i = already_dense_input_to_fold ||
+ local_to_fold.assigned( i - lower_bound );
+ }
+ }
+
+ // stop if past end
+ if( i >= end ) {
+ break;
+ }
+
+#ifdef _DEBUG
+ std::cout << "\t processing index " << i << "\n";
+#endif
+
+ // do fold
+ assert( i < n );
+ if( left ) {
+ ret = ret ? ret : foldl< descr >( local, internal::getRaw( to_fold )[ i ],
+ monoid.getOperator() );
+ } else {
+ ret = ret ? ret : foldr< descr >( internal::getRaw( to_fold )[ i ], local,
+ monoid.getOperator() );
+ }
+ assert( ret == SUCCESS );
+
+ if( ret != SUCCESS ) {
+ break;
+ }
+ }
+ }
+
+ if( left ) {
+ ret = ret ? ret : foldl< descr >( thread_local_output, local,
+ monoid.getOperator() );
+ } else {
+ ret = ret ? ret : foldr< descr >( local, thread_local_output,
+ monoid.getOperator() );
+ }
+ assert( ret == SUCCESS );
+
+ return ret;
+ }
+
+ /**
+ * Dispatches to any of the four above variants depending on asymptotic cost
+ * analysis.
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ bool masked,
+ bool left, // if this is false, assumes right-looking fold
+ class Monoid,
+ typename IOType,
+ typename InputType,
+ typename MaskType,
+ typename Coords
+ >
+ RC fold_from_vector_to_scalar_generic(
+ IOType &fold_into,
+ const Vector< InputType, nonblocking, Coords > &to_fold,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Monoid &monoid
+ ) {
+ // static sanity checks
+ static_assert( grb::is_monoid< Monoid >::value,
+ "grb::foldl can only be called using monoids. This "
+ "function should not have been called-- please submit a "
+ "bugreport." );
+
+ const size_t n = internal::getCoordinates( to_fold ).size();
+
+ // mask must be of equal size as input vector
+ if( masked && n != size( mask ) ) {
+ return MISMATCH;
+ }
+
+ // handle trivial cases
+ if( n == 0 ) {
+ return SUCCESS;
+ }
+
+ // some globals used during the folding
+ RC ret = SUCCESS;
+ typename Monoid::D3 global =
+ monoid.template getIdentity< typename Monoid::D3 >();
+
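+ // one thread-local reduction slot per hardware thread; slots are spaced a
+ // full cache line apart so that concurrent updates do not falsely share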
+ size_t local_reduced_size = sysconf( _SC_NPROCESSORS_ONLN ) *
+ config::CACHE_LINE_SIZE::value();
+ IOType local_reduced[ local_reduced_size ];
+
+ for(
+ size_t i = 0;
+ i < local_reduced_size;
+ i += config::CACHE_LINE_SIZE::value()
+ ) {
+ local_reduced[ i ] = monoid.template getIdentity< typename Monoid::D3 >();
+ }
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func =
+ [&to_fold, &mask, &monoid, &local_reduced] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound,
+ const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage fold_from_vector_to_scalar_generic "
+ "in the range(" << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC ret = SUCCESS;
+
+ Coords local_to_fold, local_mask;
+ size_t local_n = upper_bound - lower_bound;
+ size_t local_to_fold_nz = local_n;
+ size_t local_mask_nz = local_n;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ bool already_dense_input_to_fold = true;
+ bool already_dense_mask = true;
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_input_to_fold = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( to_fold ) );
+ if( !already_dense_input_to_fold ) {
+#else
+ already_dense_input_to_fold = false;
+#endif
+ local_to_fold = internal::getCoordinates( to_fold ).asyncSubset(
+ lower_bound, upper_bound );
+ local_to_fold_nz = local_to_fold.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ if( masked ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_mask = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( mask ) );
+ if( !already_dense_mask ) {
+#else
+ already_dense_mask = false;
+#endif
+ local_mask = internal::getCoordinates( mask ).asyncSubset( lower_bound,
+ upper_bound );
+ local_mask_nz = local_mask.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+ }
+
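+ // note: thread_id is pre-multiplied by the cache line size, i.e., it is
+ // this thread's offset into the padded local_reduced buffer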
+ unsigned int thread_id = omp_get_thread_num() *
+ config::CACHE_LINE_SIZE::value();
+
+ // dispatch, dense variant
+ if( ( (descr & descriptors::dense) || local_to_fold_nz == local_n ) && (
+ !masked || (
+ (descr & descriptors::structural) &&
+ !(descr & descriptors::invert_mask) &&
+ local_mask_nz == local_n
+ )
+ )
+ ) {
+#ifdef _DEBUG
+ std::cout << "\t dispatching to dense variant\n";
+#endif
+ ret = fold_from_vector_to_scalar_dense< left >(
+ local_reduced[ thread_id ], lower_bound, upper_bound, to_fold, monoid );
+ } else if( masked && (descr & descriptors::invert_mask ) ) {
+ // in this case we are forced to dispatch to O(n)
+#ifdef _DEBUG
+ std::cout << "\t forced dispatch to O(n) sparse variant\n";
+#endif
+
+#ifdef GRB_BOOLEAN_DISPATCHER
+ ret = boolean_dispatcher_fold_from_vector_to_scalar_fullLoopSparse<
+#else
+ ret = fold_from_vector_to_scalar_fullLoopSparse<
+#endif
+ descr, true, left
+ >(
+ already_dense_input_to_fold, already_dense_mask,
+ local_reduced[ thread_id ], lower_bound, upper_bound,
+ local_to_fold, local_mask, to_fold, mask, monoid
+ );
+ } else {
+ constexpr const size_t threeWs =
+ sizeof( typename Coords::StackType ) +
+ sizeof( typename Coords::ArrayType ) +
+ MaskWordSize< descr, MaskType >::value;
+ const size_t fullLoop = masked
+ ? 2 * sizeof( typename Coords::ArrayType ) * local_n +
+ sizeof( MaskType ) * local_mask_nz
+ : sizeof( typename Coords::ArrayType ) * local_n;
+ const size_t vectorLoop = masked
+ ? threeWs * local_to_fold_nz
+ : sizeof( typename Coords::StackType ) * local_to_fold_nz;
+ const size_t maskLoop = masked
+ ? threeWs * local_mask_nz
+ : std::numeric_limits< size_t >::max();
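+ // the three estimates above approximate the number of bytes each loop
+ // strategy must touch; the cheapest strategy is selected below. For
+ // example, unmasked with local_n = 1000 and local_to_fold_nz = 10, the
+ // full loop touches on the order of 1000 ArrayType words whereas the
+ // vector-driven loop touches only about 10 StackType words, hence the
+ // vector-driven variant is chosen.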
+ if( fullLoop >= vectorLoop && maskLoop >= vectorLoop ) {
+#ifdef _DEBUG
+ std::cout << "\t dispatching to vector-driven sparse variant\n";
+#endif
+
+#ifdef GRB_BOOLEAN_DISPATCHER
+ ret = boolean_dispatcher_fold_from_vector_to_scalar_vectorDriven<
+#else
+ ret = fold_from_vector_to_scalar_vectorDriven<
+#endif
+ descr, masked, left
+ >(
+ already_dense_input_to_fold, already_dense_mask,
+ local_reduced[ thread_id ], lower_bound, upper_bound,
+ local_to_fold, local_mask, to_fold, mask, monoid
+ );
+ } else if( vectorLoop >= fullLoop && maskLoop >= fullLoop ) {
+#ifdef _DEBUG
+ std::cout << "\t dispatching to O(n) sparse variant\n";
+#endif
+
+#ifdef GRB_BOOLEAN_DISPATCHER
+ ret = boolean_dispatcher_fold_from_vector_to_scalar_fullLoopSparse<
+#else
+ ret = fold_from_vector_to_scalar_fullLoopSparse<
+#endif
+ descr, masked, left
+ >(
+ already_dense_input_to_fold, already_dense_mask,
+ local_reduced[ thread_id ], lower_bound, upper_bound,
+ local_to_fold, local_mask, to_fold, mask, monoid
+ );
+ } else {
+ assert( maskLoop < fullLoop && maskLoop < vectorLoop );
+ assert( masked );
+#ifdef _DEBUG
+ std::cout << "\t dispatching to mask-driven sparse variant\n";
+#endif
+
+#ifdef GRB_BOOLEAN_DISPATCHER
+ ret = boolean_dispatcher_fold_from_vector_to_scalar_maskDriven<
+#else
+ ret = fold_from_vector_to_scalar_maskDriven<
+#endif
+ descr, left
+ >(
+ already_dense_input_to_fold, already_dense_mask,
+ local_reduced[ thread_id ], lower_bound, upper_bound,
+ local_to_fold, local_mask, to_fold, mask, monoid
+ );
+ }
+ }
+
+ return ret;
+ };
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: "
+ << "fold_from_vector_to_scalar_generic" << std::endl;
+#endif
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_FOLD_VECTOR_SCALAR_GENERIC,
+ n,
+ sizeof( IOType ),
+ dense_descr,
+ true,
+ nullptr, nullptr, nullptr, nullptr,
+ &to_fold,
+ ( masked ) ? &mask : nullptr,
+ nullptr,
+ nullptr,
+ &internal::getCoordinates( to_fold ),
+ (masked) ? &internal::getCoordinates( mask ) : nullptr,
+ nullptr,
+ nullptr,
+ nullptr
+ );
+
+ if( ret == SUCCESS ) {
+ for(
+ size_t i = 0;
+ i < local_reduced_size;
+ i += config::CACHE_LINE_SIZE::value()
+ ) {
+ RC rc;
+ if( left ) {
+ rc = foldl< descr >( global, local_reduced[ i ], monoid.getOperator() );
+ } else {
+ rc = foldr< descr >( local_reduced[ i ], global, monoid.getOperator() );
+ }
+ assert( rc == SUCCESS );
+ if( rc != SUCCESS ) {
+ ret = rc;
+ }
+ }
+ }
+
+ // accumulate
+#ifdef _DEBUG
+ std::cout << "\t accumulating " << global << " into " << fold_into << "\n";
+#endif
+
+ if( ret == SUCCESS ) {
+ if( left ) {
+ ret = foldl< descr >( fold_into, global, monoid.getOperator() );
+ } else {
+ ret = foldr< descr >( global, fold_into, monoid.getOperator() );
+ }
+ }
+
+ return ret;
+ }
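+
+ /**
+ * \internal
+ * A user-level sketch of what reaches this code path (illustrative only,
+ * assuming the nonblocking backend is selected):
+ *
+ * \code
+ * grb::Vector< double > x( n );
+ * // ... fill x ...
+ * double sum = 0.0;
+ * grb::RC rc = grb::foldl( sum, x,
+ * grb::Monoid< grb::operators::add< double >, grb::identities::zero >() );
+ * \endcode
+ *
+ * Note that the scalar result is accumulated immediately after the stage is
+ * added; i.e., a fold into a scalar forces the execution of the pipeline
+ * this stage belongs to.
+ * \endinternal
+ */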
+
+ /**
+ * \internal
+ * @tparam left If false, right-looking fold is assumed (and left-looking
+ * otherwise)
+ * @tparam sparse Whether \a vector was sparse
+ * @tparam monoid Whether \a op is actually a monoid
+ * \endinternal
+ */
+ template<
+ Descriptor descr,
+ bool left,
+ bool sparse,
+ bool masked,
+ bool monoid,
+#ifdef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_output,
+ bool already_dense_mask,
+#endif
+ typename MaskType,
+ typename IOType,
+ typename InputType,
+ typename Coords,
+ class OP
+ >
+ RC fold_from_scalar_to_vector_generic(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_output,
+ bool already_dense_mask,
+#endif
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_vector,
+ const Coords * const local_mask_ptr,
+ Vector< IOType, nonblocking, Coords > &vector,
+ const Vector< MaskType, nonblocking, Coords > * const mask,
+ const InputType &scalar,
+ const OP &op,
+ const Phase &phase
+ ) {
+ constexpr const bool dense_descr = descr & descriptors::dense;
+ assert( !masked || mask != nullptr );
+ assert( !masked || local_mask_ptr != nullptr );
+
+ Coords local_mask;
+ if( masked ) {
+ local_mask = *local_mask_ptr;
+ }
+
+ const size_t local_n = upper_bound - lower_bound;
+ const size_t local_vector_nz = (sparse || !already_dense_output)
+ ? local_vector.nonzeroes() : local_n;
+ const size_t local_mask_nz = ( masked )
+ ? ( ( already_dense_mask )
+ ? local_n
+ : local_mask.nonzeroes()
+ )
+ : 0;
+
+ const size_t n = internal::getCoordinates( vector ).size();
+
+ if( masked && internal::getCoordinates( *mask ).size() != n ) {
+ return MISMATCH;
+ }
+ if( dense_descr && sparse ) {
+ return ILLEGAL;
+ }
+ if( n == 0 ) {
+ return SUCCESS;
+ }
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+
+ assert( phase == EXECUTE );
+ IOType * __restrict__ const x = internal::getRaw( vector );
+ const MaskType * __restrict__ const m = ( masked )
+ ? internal::getRaw( *mask )
+ : nullptr;
+
+ if( sparse && monoid && !masked ) {
+ for( size_t i = lower_bound; i < upper_bound; ++i ) {
+ if( already_dense_output || local_vector.assigned( i - lower_bound ) ) {
+ if( left ) {
+ (void) foldl< descr >( x[ i ], scalar, op );
+ } else {
+ (void) foldr< descr >( scalar, x[ i ], op );
+ }
+ } else {
+ x[ i ] = static_cast< IOType >( scalar );
+ }
+ }
+
+ if( !already_dense_output ) {
+ local_vector.local_assignAllNotAlreadyAssigned();
+ }
+ } else if( sparse && monoid && masked ) {
+ for( size_t i = 0; i < local_mask_nz; ++i ) {
+ const size_t index = ( ( already_dense_mask )
+ ? i
+ : local_mask.index( i ) ) + lower_bound;
+ if( already_dense_mask ) {
+ if( !internal::getCoordinates( *mask ).template mask< descr >(
+ index, m )
+ ) {
+ continue;
+ }
+ } else {
+ if( !local_mask.template mask< descr >( index - lower_bound,
+ m + lower_bound )
+ ) {
+ continue;
+ }
+ }
+ if( already_dense_output || local_vector.assign( index - lower_bound ) ) {
+ if( left ) {
+ (void) foldl< descr >( x[ index ], scalar, op );
+ } else {
+ (void) foldr< descr >( scalar, x[ index ], op );
+ }
+ } else {
+ x[ index ] = static_cast< IOType >( scalar );
+ }
+ }
+ } else if( sparse && !monoid ) {
+ const bool maskDriven = masked ? local_mask_nz < local_vector_nz : false;
+ if( maskDriven ) {
+ for( size_t i = 0; i < local_mask_nz; ++i ) {
+ const size_t index = ( ( already_dense_mask )
+ ? i
+ : local_mask.index( i ) ) + lower_bound;
+ if( already_dense_mask ) {
+ if( !internal::getCoordinates( *mask ).template mask< descr >(
+ index, m )
+ ) {
+ continue;
+ }
+ } else {
+ if( !local_mask.template mask< descr >( index - lower_bound,
+ m + lower_bound )
+ ) {
+ continue;
+ }
+ }
+ if( already_dense_output || local_vector.assign( index - lower_bound ) ) {
+ if( left ) {
+ (void) foldl< descr >( x[ index ], scalar, op );
+ } else {
+ (void) foldr< descr >( scalar, x[ index ], op );
+ }
+ }
+ }
+ } else {
+ for( size_t i = 0; i < local_vector_nz; ++i ) {
+ const size_t index = (already_dense_output
+ ? i
+ : local_vector.index( i )
+ ) + lower_bound;
+ if( masked ) {
+ if( already_dense_mask ) {
+ if( !( internal::getCoordinates( *mask ).template mask< descr >(
+ index, m ) )
+ ) {
+ continue;
+ }
+ } else {
+ if( !local_mask.template mask< descr >( index - lower_bound, m +
+ lower_bound )
+ ) {
+ continue;
+ }
+ }
+ }
+ if( left ) {
+ (void) foldl< descr >( x[ index ], scalar, op );
+ } else {
+ (void) foldr< descr >( scalar, x[ index ], op );
+ }
+ }
+ }
+ } else if( !sparse && masked ) {
+ for( size_t i = 0; i < local_mask_nz; ++i ) {
+ const size_t index = ( ( already_dense_mask )
+ ? i
+ : local_mask.index( i ) ) + lower_bound;
+ if( already_dense_mask ) {
+ if( !( internal::getCoordinates( *mask ).template mask< descr >(
+ index, m ) )
+ ) {
+ continue;
+ }
+ } else {
+ if( !local_mask.template mask< descr >( index - lower_bound, m +
+ lower_bound )
+ ) {
+ continue;
+ }
+ }
+
+ if( left ) {
+ (void) foldl< descr >( x[ index ], scalar, op );
+ } else {
+ (void) foldr< descr >( scalar, x[ index ], op );
+ }
+ }
+ } else {
+ // if target vector is dense and there is no mask, then
+ // there is no difference between monoid or non-monoid behaviour.
+ assert( !sparse );
+ assert( !masked );
+ assert( local_vector_nz == local_n );
+
+ if( local_n > 0 ) {
+ if( left ) {
+ op.eWiseFoldlAS( x + lower_bound, scalar, local_n );
+ } else {
+ op.eWiseFoldrSA( scalar, x + lower_bound, local_n );
+ }
+ }
+ }
+
+ return SUCCESS;
+ }
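+
+ /**
+ * \internal
+ * Semantics sketch for the sparse branches above. Illustrative example:
+ * left-folding the scalar 1 with operators::add into a vector of size 3
+ * with nonzeroes { x[0] = 4, x[2] = 5 }:
+ *   -# with a monoid, x becomes dense, { 5, 1, 6 }: unassigned entries are
+ *      overwritten with the scalar;
+ *   -# with a plain operator, only assigned entries are updated,
+ *      { 5, -, 6 }: the sparsity structure is left unchanged.
+ * \endinternal
+ */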
+
+ template<
+ Descriptor descr,
+ bool left, // if this is false, the right-looking fold is assumed
+ bool sparse,
+ bool masked,
+ bool monoid,
+#ifdef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_output,
+ bool already_dense_input_to_fold,
+ bool already_dense_mask,
+#endif
+ typename MaskType,
+ typename IOType,
+ typename IType,
+ typename Coords,
+ class OP
+ >
+ RC fold_from_vector_to_vector_generic(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_output,
+ bool already_dense_input_to_fold,
+ bool already_dense_mask,
+#endif
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_fold_into,
+ const Coords * const local_m_ptr,
+ const Coords &local_to_fold,
+ Vector< IOType, nonblocking, Coords > &fold_into,
+ const Vector< MaskType, nonblocking, Coords > * const m,
+ const Vector< IType, nonblocking, Coords > &to_fold,
+ const OP &op,
+ const Phase phase
+ ) {
+ constexpr const bool dense_descr = descr & descriptors::dense;
+ assert( !masked || (m != nullptr) );
+
+ Coords local_m;
+ if( masked && !already_dense_mask ) {
+ local_m = *local_m_ptr;
+ }
+
+ const size_t local_n = upper_bound - lower_bound;
+ const size_t local_fold_into_nz = already_dense_output
+ ? local_n
+ : local_fold_into.nonzeroes();
+ const size_t local_to_fold_nz = already_dense_input_to_fold
+ ? local_n
+ : local_to_fold.nonzeroes();
+ const size_t local_m_nz = ( masked )
+ ? ( already_dense_mask
+ ? local_n
+ : local_m.nonzeroes()
+ )
+ : 0;
+
+ const size_t n = size( fold_into );
+ if( n != size( to_fold ) ) {
+ return MISMATCH;
+ }
+ if( masked && size( *m ) != n ) {
+ return MISMATCH;
+ }
+ if( dense_descr && sparse ) {
+ return ILLEGAL;
+ }
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+
+ assert( phase == EXECUTE );
+
+ if( !sparse && !masked ) {
+#ifdef _DEBUG
+ std::cout << "fold_from_vector_to_vector_generic: in dense variant\n";
+ std::cout << "fold_from_vector_to_vector_generic: in sequential variant\n";
+#endif
+ if( left ) {
+ op.eWiseFoldlAA( internal::getRaw( fold_into ) + lower_bound,
+ internal::getRaw( to_fold ) + lower_bound, local_n );
+ } else {
+ op.eWiseFoldrAA( internal::getRaw( to_fold ) + lower_bound,
+ internal::getRaw( fold_into ) + lower_bound, local_n );
+ }
+ } else {
+#ifdef _DEBUG
+ std::cout << "fold_from_vector_to_vector_generic: in sparse variant\n";
+ std::cout << "\tfolding vector of " << local_to_fold_nz << " nonzeroes "
+ << "into a vector of " << local_fold_into_nz << " nonzeroes...\n";
+#endif
+ if(
+ masked &&
+ local_fold_into_nz == local_n &&
+ local_to_fold_nz == local_n
+ ) {
+ // use sparsity structure of mask for this eWiseFold
+ if( left ) {
+#ifdef _DEBUG
+ std::cout << "fold_from_vector_to_vector_generic: foldl, using the "
+ << "mask's sparsity structure\n";
+#endif
+ for( size_t k = 0; k < local_m_nz; ++k ) {
+ const size_t i = ( already_dense_mask
+ ? k
+ : local_m.index( k )
+ ) + lower_bound;
+#ifdef _DEBUG
+ std::cout << "Left-folding " << to_fold[ i ] << " into "
+ << fold_into[ i ];
+#endif
+ (void) foldl< descr >( fold_into[ i ], to_fold[ i ], op );
+#ifdef _DEBUG
+ std::cout << " resulting into " << fold_into[ i ] << "\n";
+#endif
+ }
+ } else {
+#ifdef _DEBUG
+ std::cout << "fold_from_vector_to_vector_generic: foldl, using the "
+ << "mask's sparsity structure\n";
+#endif
+ for( size_t k = 0; k < local_m_nz; ++k ) {
+ const size_t i = ( already_dense_mask
+ ? k
+ : local_m.index( k )
+ ) + lower_bound;
+#ifdef _DEBUG
+ std::cout << "Right-folding " << to_fold[ i ] << " into "
+ << fold_into[ i ];
+#endif
+ (void) foldr< descr >( to_fold[ i ], fold_into[ i ], op );
+#ifdef _DEBUG
+ std::cout << " resulting into " << fold_into[ i ] << "\n";
+#endif
+ }
+ }
+ } else if( !masked && local_fold_into_nz == local_n ) {
+ // use sparsity structure of to_fold for this eWiseFold
+ if( left ) {
+#ifdef _DEBUG
+ std::cout << "fold_from_vector_to_vector_generic: foldl, using "
+ << "to_fold's sparsity\n";
+#endif
+ for( size_t k = 0; k < local_to_fold_nz; ++k ) {
+ const size_t i = ( already_dense_input_to_fold
+ ? k
+ : local_to_fold.index( k )
+ ) + lower_bound;
+#ifdef _DEBUG
+ std::cout << "Left-folding " << to_fold[ i ] << " into "
+ << fold_into[ i ];
+#endif
+ (void) foldl< descr >( fold_into[ i ], to_fold[ i ], op );
+#ifdef _DEBUG
+ std::cout << " resulting into " << fold_into[ i ] << "\n";
+#endif
+ }
+ } else {
+#ifdef _DEBUG
+ std::cout << "fold_from_vector_to_vector_generic: foldl, using "
+ << "to_fold's sparsity\n";
+#endif
+ for( size_t k = 0; k < local_to_fold_nz; ++k ) {
+ const size_t i = ( already_dense_input_to_fold
+ ? k
+ : local_to_fold.index( k )
+ ) + lower_bound;
+#ifdef _DEBUG
+ std::cout << "Right-folding " << to_fold[ i ] << " into "
+ << fold_into[ i ];
+#endif
+ (void) foldr< descr >( to_fold[ i ], fold_into[ i ], op );
+#ifdef _DEBUG
+ std::cout << " resulting into " << fold_into[ i ] << "\n";
+#endif
+ }
+ }
+ } else if( !masked && local_to_fold_nz == local_n ) {
+ // use sparsity structure of fold_into for this eWiseFold
+ if( left ) {
+#ifdef _DEBUG
+ std::cout << "fold_from_vector_to_vector_generic: foldl, using "
+ << "fold_into's sparsity\n";
+#endif
+ for( size_t k = 0; k < local_fold_into_nz; ++k ) {
+ const size_t i = ( already_dense_output
+ ? k
+ : local_fold_into.index( k )
+ ) + lower_bound;
+#ifdef _DEBUG
+ std::cout << "Left-folding " << to_fold[ i ] << " into "
+ << fold_into[ i ];
+#endif
+ (void) foldl< descr >( fold_into[ i ], to_fold[ i ], op );
+#ifdef _DEBUG
+ std::cout << " resulting into " << fold_into[ i ] << "\n";
+#endif
+ }
+ } else {
+#ifdef _DEBUG
+ std::cout << "fold_from_vector_to_vector_generic: foldr, using "
+ << "fold_into's sparsity\n";
+#endif
+ for( size_t k = 0; k < local_fold_into_nz; ++k ) {
+ const size_t i = ( already_dense_output ?
+ k :
+ local_fold_into.index( k )
+ ) + lower_bound;
+#ifdef _DEBUG
+ std::cout << "Right-folding " << to_fold[ i ] << " into " << fold_into[ i ];
+#endif
+ (void) foldr< descr >( to_fold[ i ], fold_into[ i ], op );
+#ifdef _DEBUG
+ std::cout << " resulting into " << fold_into[ i ] << "\n";
+#endif
+ }
+ }
+ } else {
+#ifdef _DEBUG
+ std::cout << "fold_from_vector_to_vector_generic: using specialised "
+ << "code to merge two sparse vectors and, potentially, "
+ << "output masks\n";
+#endif
+ const IType * __restrict__ const tf_raw = internal::getRaw( to_fold );
+ IOType * __restrict__ const fi_raw = internal::getRaw( fold_into );
+#ifdef _DEBUG
+ std::cout << "\tin sequential version...\n";
+#endif
+ for( size_t k = 0; k < local_to_fold_nz; ++k ) {
+ const size_t i = ( already_dense_input_to_fold
+ ? k
+ : local_to_fold.index( k )
+ ) + lower_bound;
+ if( masked ) {
+ if( already_dense_mask ) {
+ if( !internal::getCoordinates( *m ).template mask< descr >( i,
+ internal::getRaw( *m ) )
+ ) {
+ continue;
+ }
+ } else {
+ if( !local_m.template mask< descr >( i - lower_bound,
+ internal::getRaw( *m ) + lower_bound )
+ ) {
+ continue;
+ }
+ }
+ }
+
+ assert( i < n );
+ if( already_dense_output ||
+ local_fold_into.assigned( i - lower_bound )
+ ) {
+ if( left ) {
+#ifdef _DEBUG
+ std::cout << "\tfoldl< descr >( fi_raw[ i ], tf_raw[ i ], op ), i = "
+ << i << ": " << tf_raw[ i ] << " goes into " << fi_raw[ i ];
+#endif
+ (void)foldl< descr >( fi_raw[ i ], tf_raw[ i ], op );
+#ifdef _DEBUG
+ std::cout << " which results in " << fi_raw[ i ] << "\n";
+#endif
+ } else {
+#ifdef _DEBUG
+ std::cout << "\tfoldr< descr >( tf_raw[ i ], fi_raw[ i ], op ), i = "
+ << i << ": " << tf_raw[ i ] << " goes into " << fi_raw[ i ];
+#endif
+ (void) foldr< descr >( tf_raw[ i ], fi_raw[ i ], op );
+#ifdef _DEBUG
+ std::cout << " which results in " << fi_raw[ i ] << "\n";
+#endif
+ }
+ } else if( monoid ) {
+#ifdef _DEBUG
+ std::cout << "\tindex " << i << " is unset. Old value " << fi_raw[ i ]
+ << " will be overwritten with " << tf_raw[ i ] << "\n";
+#endif
+ fi_raw[ i ] = tf_raw[ i ];
+ (void) local_fold_into.assign( i - lower_bound );
+ }
+ }
+ }
+ }
+
+#ifdef _DEBUG
+ std::cout << "\tCall to fold_from_vector_to_vector_generic done. "
+ << "Output now contains " << local_fold_into_nz << " / "
+ << local_n << " nonzeroes.\n";
+#endif
+ return SUCCESS;
+ }
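+
+ /**
+ * \internal
+ * Merge semantics sketch for the final (both-sparse) branch above.
+ * Illustrative example: left-folding to_fold = { -, 2, 3 } into
+ * fold_into = { 1, 4, - } with operators::add and \a monoid <tt>true</tt>
+ * yields { 1, 6, 3 }: overlapping entries are folded, while entries present
+ * only in to_fold are copied and their coordinates newly assigned. With
+ * \a monoid <tt>false</tt> the result is { 1, 6, - }, i.e., an
+ * intersection-style update.
+ * \endinternal
+ */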
+
+ } // namespace internal
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename InputType,
+ typename IOType,
+ typename MaskType,
+ typename Coords
+ >
+ RC foldr(
+ const Vector< InputType, nonblocking, Coords > &x,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ IOType &beta,
+ const Monoid &monoid = Monoid(),
+ const typename std::enable_if<
+ !grb::is_object< InputType >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< MaskType >::value &&
+ grb::is_monoid< Monoid >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< IOType, InputType >::value ), "grb::foldr",
+ "called with a scalar IO type that does not match the input vector type" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType, typename Monoid::D1 >::value ), "grb::foldr",
+ "called with an input vector value type that does not match the first "
+ "domain of the given monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType, typename Monoid::D2 >::value ), "grb::foldr",
+ "called with an input vector type that does not match the second domain of "
+ "the given monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType, typename Monoid::D3 >::value ), "grb::foldr",
+ "called with an input vector type that does not match the third domain of "
+ "the given monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ), "grb::foldr",
+ "called with a vector mask type that is not boolean" );
+
+ if( size( mask ) > 0 ) {
+ return internal::template fold_from_vector_to_scalar_generic<
+ descr, true, false
+ >( beta, x, mask, monoid );
+ } else {
+ return internal::template fold_from_vector_to_scalar_generic<
+ descr, false, false
+ >( beta, x, mask, monoid );
+ }
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename InputType,
+ typename IOType,
+ typename Coords
+ >
+ RC foldr(
+ const Vector< InputType, nonblocking, Coords > &x,
+ IOType &beta,
+ const Monoid &monoid = Monoid(),
+ const typename std::enable_if<
+ !grb::is_object< InputType >::value &&
+ !grb::is_object< IOType >::value &&
+ grb::is_monoid< Monoid >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< IOType, InputType >::value ), "grb::foldr",
+ "called with a scalar IO type that does not match the input vector type" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType, typename Monoid::D1 >::value ), "grb::foldr",
+ "called with an input vector value type that does not match the first "
+ "domain of the given monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType, typename Monoid::D2 >::value ), "grb::foldr",
+ "called with an input vector type that does not match the second domain of "
+ "the given monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType, typename Monoid::D3 >::value ), "grb::foldr",
+ "called with an input vector type that does not match the third domain of "
+ "the given monoid" );
+
+ Vector< bool, nonblocking, Coords > empty_mask( 0 );
+ return internal::template fold_from_vector_to_scalar_generic<
+ descr, false, false
+ >( beta, x, empty_mask, monoid );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename IOType,
+ typename InputType,
+ typename Coords
+ >
+ RC foldr(
+ const InputType &alpha,
+ Vector< IOType, nonblocking, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< InputType >::value &&
+ !grb::is_object< IOType >::value &&
+ grb::is_monoid< Monoid >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D1, IOType >::value ), "grb::foldr",
+ "called with a vector y of a type that does not match the first domain "
+ "of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D2, InputType >::value ), "grb::foldr",
+ "called with a scalar alpha of a type that does not match the second "
+ "domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D3, IOType >::value ), "grb::foldr",
+ "called on a vector y of a type that does not match the third domain "
+ "of the given monoid" );
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func =
+ [alpha, &y, &monoid, phase] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage foldl(alpha, y, monoid) in the range(" << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+ const Coords * const local_null_mask = nullptr;
+
+ Coords local_y;
+ size_t local_y_nz;
+ bool sparse = false;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ bool already_dense_output = true;
+
+ if( !already_dense_vectors ) {
+ const size_t local_n = upper_bound - lower_bound;
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_output ) {
+#else
+ already_dense_output = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+ if( local_y_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( sparse ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic<
+#else
+ rc = internal::fold_from_scalar_to_vector_generic<
+#endif
+ descr, false, true, false, true
+ >(
+ already_dense_output, true,
+ lower_bound, upper_bound, local_y, local_null_mask,
+ y, null_mask, alpha, monoid.getOperator(), phase
+ );
+ } else {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic<
+#else
+ rc = internal::fold_from_scalar_to_vector_generic<
+#endif
+ descr, false, false, false, true
+ >(
+ already_dense_output, true,
+ lower_bound, upper_bound, local_y, local_null_mask,
+ y, null_mask, alpha, monoid.getOperator(), phase
+ );
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#else
+ if( !already_dense_vectors ) {
+#endif
+ internal::getCoordinates( y ).asyncJoinSubset( local_y, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_FOLD_SCALAR_VECTOR_GENERIC,
+ internal::getCoordinates( y ).size(),
+ sizeof( IOType ),
+ dense_descr, true,
+ &y, nullptr,
+ &internal::getCoordinates( y ), nullptr,
+ nullptr, nullptr, nullptr, nullptr,
+ nullptr, nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: foldl(alpha, y, monoid)"
+ << std::endl;
+#endif
+ return ret;
+ }
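+
+ /**
+ * \internal
+ * Usage sketch (illustrative only): fold the scalar 1.5 into every entry of
+ * y; since a monoid is given, unassigned entries are materialised as well:
+ *
+ * \code
+ * grb::Vector< double > y( n );
+ * grb::RC rc = grb::foldr( 1.5, y,
+ * grb::Monoid< grb::operators::add< double >, grb::identities::zero >() );
+ * \endcode
+ * \endinternal
+ */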
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP,
+ typename IOType,
+ typename InputType,
+ typename Coords
+ >
+ RC foldr(
+ const InputType &alpha,
+ Vector< IOType, nonblocking, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< InputType >::value &&
+ !grb::is_object< IOType >::value &&
+ grb::is_operator< OP >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D1, IOType >::value ), "grb::foldr",
+ "called with a vector y of a type that does not match the first domain "
+ "of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D2, InputType >::value ), "grb::foldr",
+ "called with a scalar alpha of a type that does not match the second "
+ "domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D3, IOType >::value ), "grb::foldr",
+ "called on a vector y of a type that does not match the third domain "
+ "of the given operator" );
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func =
+ [alpha, &y, &op, phase] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ {
+ std::cout << "\t\tExecution of stage foldl(alpha, y, op) in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+ }
+#endif
+ RC rc = SUCCESS;
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+ const Coords * const local_null_mask = nullptr;
+
+ Coords local_y;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_y_nz;
+ bool sparse = false;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+ bool already_dense_output = true;
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_output ) {
+#else
+ already_dense_output = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+ if( local_y_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( sparse ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic<
+#else
+ rc = internal::fold_from_scalar_to_vector_generic<
+#endif
+ descr, false, true, false, false
+ >(
+ already_dense_output, true,
+ lower_bound, upper_bound,
+ local_y, local_null_mask, y, null_mask,
+ alpha, op, phase
+ );
+ } else {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic<
+#else
+ rc = internal::fold_from_scalar_to_vector_generic<
+#endif
+ descr, false, false, false, false
+ >(
+ already_dense_output, true,
+ lower_bound, upper_bound, local_y, local_null_mask,
+ y, null_mask, alpha, op, phase
+ );
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#else
+ if( !already_dense_vectors ) {
+#endif
+ internal::getCoordinates( y ).asyncJoinSubset( local_y, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_FOLD_SCALAR_VECTOR_GENERIC,
+ internal::getCoordinates( y ).size(),
+ sizeof( IOType ),
+ dense_descr, true,
+ &y, nullptr,
+ &internal::getCoordinates( y ), nullptr,
+ nullptr, nullptr, nullptr, nullptr,
+ nullptr, nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: foldl(alpha, y, op)"
+ << std::endl;
+#endif
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP,
+ typename IOType,
+ typename InputType,
+ typename Coords
+ >
+ RC foldr(
+ const Vector< InputType, nonblocking, Coords > &x,
+ Vector< IOType, nonblocking, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_operator< OP >::value &&
+ !grb::is_object< InputType >::value &&
+ !grb::is_object< IOType >::value,
+ void
+ >::type * = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D1, InputType >::value ), "grb::eWiseFoldr",
+ "called with a vector x of a type that does not match the first domain "
+ "of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D2, IOType >::value ), "grb::eWiseFoldr",
+ "called on a vector y of a type that does not match the second domain "
+ "of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D3, IOType >::value ), "grb::eWiseFoldr",
+ "called on a vector y of a type that does not match the third domain "
+ "of the given operator" );
+
+ const size_t n = size( x );
+ if( n != size( y ) ) {
+ return MISMATCH;
+ }
+
+#ifdef _DEBUG
+ std::cout << "In foldr ([T]<-[T])\n";
+#endif
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func =
+ [&x, &y, &op, phase] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage foldr(x, y, operator) in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+ const Coords * const local_null_mask = nullptr;
+
+ Coords local_x, local_y;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_x_nz, local_y_nz;
+ bool sparse = false;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+ bool already_dense_output = true;
+ bool already_dense_input = true;
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_output ) {
+#else
+ already_dense_output = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+ if( local_y_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_input ) {
+#else
+ already_dense_input = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( local_x_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( sparse ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic<
+#else
+ rc = internal::fold_from_vector_to_vector_generic<
+#endif
+ descr, false, true, false, false
+ >(
+ already_dense_output, already_dense_input, true,
+ lower_bound, upper_bound,
+ local_y, local_null_mask,
+ local_x, y,
+ null_mask, x,
+ op, phase
+ );
+ } else {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic<
+#else
+ rc = internal::fold_from_vector_to_vector_generic<
+#endif
+ descr, false, false, false, false
+ >(
+ already_dense_output, already_dense_input, true,
+ lower_bound, upper_bound,
+ local_y, local_null_mask,
+ local_x,
+ y, null_mask,
+ x,
+ op, phase
+ );
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#else
+ if( !already_dense_vectors ) {
+#endif
+ internal::getCoordinates( y ).asyncJoinSubset( local_y, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_FOLD_VECTOR_VECTOR_GENERIC,
+ n, sizeof( IOType ),
+ dense_descr, true,
+ &y, nullptr,
+ &internal::getCoordinates( y ), nullptr,
+ &x, nullptr, nullptr, nullptr,
+ &internal::getCoordinates( x ), nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: foldr(x, y, operator)"
+ << std::endl;
+#endif
+ return ret;
+ }
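+
+ /**
+ * \internal
+ * Usage sketch (illustrative only): fold x into y from the right, i.e.,
+ * compute y[ i ] = x[ i ] + y[ i ] wherever both entries are assigned:
+ *
+ * \code
+ * grb::Vector< double > x( n ), y( n );
+ * // ... fill x and y ...
+ * grb::RC rc = grb::foldr( x, y, grb::operators::add< double >() );
+ * // execution is delayed until the containing pipeline is forced,
+ * // e.g., via grb::wait( y ) or an operation with scalar output
+ * \endcode
+ * \endinternal
+ */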
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP,
+ typename IOType,
+ typename MaskType,
+ typename InputType,
+ typename Coords
+ >
+ RC foldr(
+ const Vector< InputType, nonblocking, Coords > &x,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ Vector< IOType, nonblocking, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< grb::is_operator< OP >::value &&
+ !grb::is_object< InputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< IOType >::value, void
+ >::type * = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D1, InputType >::value ), "grb::eWiseFoldr",
+ "called with a vector x of a type that does not match the first domain "
+ "of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D2, IOType >::value ), "grb::eWiseFoldr",
+ "called on a vector y of a type that does not match the second domain "
+ "of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D3, IOType >::value ), "grb::eWiseFoldr",
+ "called on a vector y of a type that does not match the third domain "
+ "of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ), "grb::eWiseFoldr",
+ "called with a non-Boolean mask" );
+
+ if( size( m ) == 0 ) {
+ return foldr< descr >( x, y, op, phase );
+ }
+
+ const size_t n = size( x );
+ if( n != size( y ) || n != size( m ) ) {
+ return MISMATCH;
+ }
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func =
+ [&x, &m, &y, &op, phase] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage foldr(x, m, y, operator) in the range(" << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_x, local_m, local_y;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_x_nz, local_y_nz;
+ bool sparse = false;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+ bool already_dense_output = true;
+ bool already_dense_mask = true;
+ bool already_dense_input = true;
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_output ) {
+#else
+ already_dense_output = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+ if( local_y_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_mask = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( m ) );
+ if( !already_dense_mask ) {
+#else
+ already_dense_mask = false;
+#endif
+ local_m = internal::getCoordinates( m ).asyncSubset( lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_input ) {
+#else
+ already_dense_input = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( local_x_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( sparse ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic<
+#else
+ rc = internal::fold_from_vector_to_vector_generic<
+#endif
+ descr, false, true, true, false
+ >(
+ already_dense_output, already_dense_input, already_dense_mask,
+ lower_bound, upper_bound,
+ local_y, &local_m, local_x,
+ y, &m, x,
+ op, phase
+ );
+ } else {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic<
+#else
+ rc = internal::fold_from_vector_to_vector_generic<
+#endif
+ descr, false, false, true, false
+ >(
+ already_dense_output, already_dense_input, already_dense_mask,
+ lower_bound, upper_bound,
+ local_y, &local_m, local_x,
+ y, &m, x,
+ op, phase
+ );
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#else
+ if( !already_dense_vectors ) {
+#endif
+ internal::getCoordinates( y ).asyncJoinSubset( local_y, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_FOLD_MASKED_VECTOR_VECTOR_GENERIC,
+ n, sizeof( IOType ),
+ dense_descr, true,
+ &y, nullptr, &internal::getCoordinates( y ), nullptr,
+ &x, &m, nullptr, nullptr,
+ &internal::getCoordinates( x ), &internal::getCoordinates( m ),
+ nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: foldr(x, m, y, operator)"
+ << std::endl;
+#endif
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename IOType,
+ typename InputType,
+ typename Coords
+ >
+ RC foldr(
+ const Vector< InputType, nonblocking, Coords > &x,
+ Vector< IOType, nonblocking, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< grb::is_monoid< Monoid >::value &&
+ !grb::is_object< InputType >::value &&
+ !grb::is_object< IOType >::value,
+ void
+ >::type * = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D1, InputType >::value ), "grb::eWiseFoldr",
+ "called with a vector x of a type that does not match the first domain "
+ "of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D2, IOType >::value ), "grb::eWiseFoldr",
+ "called on a vector y of a type that does not match the second domain "
+ "of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D3, IOType >::value ), "grb::eWiseFoldr",
+ "called on a vector y of a type that does not match the third domain "
+ "of the given monoid" );
+
+ // dynamic sanity checks
+ const size_t n = size( x );
+ if( n != size( y ) ) {
+ return MISMATCH;
+ }
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func =
+ [&x, &y, &monoid, phase] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage foldr(x, y, monoid) in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+ const Coords * const local_null_mask = nullptr;
+
+ Coords local_x, local_y;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_x_nz, local_y_nz;
+ bool sparse = false;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+ bool already_dense_output = true;
+ bool already_dense_input = true;
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_output ) {
+#else
+ already_dense_output = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+ if( local_y_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_input ) {
+#else
+ already_dense_input = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( local_x_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( sparse ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic<
+#else
+ rc = internal::fold_from_vector_to_vector_generic<
+#endif
+ descr, false, true, false, true
+ >(
+ already_dense_output, already_dense_input, true,
+ lower_bound, upper_bound,
+ local_y, local_null_mask, local_x,
+ y, null_mask, x,
+ monoid.getOperator(), phase
+ );
+ } else {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic<
+#else
+ rc = internal::fold_from_vector_to_vector_generic<
+#endif
+ descr, false, false, false, true
+ >(
+ already_dense_output, already_dense_input, true,
+ lower_bound, upper_bound,
+ local_y, local_null_mask, local_x,
+ y, null_mask, x,
+ monoid.getOperator(), phase
+ );
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#else
+ if( !already_dense_vectors ) {
+#endif
+ internal::getCoordinates( y ).asyncJoinSubset( local_y, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_FOLD_VECTOR_VECTOR_GENERIC,
+ n, sizeof( IOType ), dense_descr, true,
+ &y, nullptr, &internal::getCoordinates( y ), nullptr,
+ &x, nullptr, nullptr, nullptr,
+ &internal::getCoordinates( x ), nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: foldr(x, y, monoid)"
+ << std::endl;
+#endif
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename IOType,
+ typename MaskType,
+ typename InputType,
+ typename Coords
+ >
+ RC foldr(
+ const Vector< InputType, nonblocking, Coords > &x,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ Vector< IOType, nonblocking, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< grb::is_monoid< Monoid >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType >::value &&
+ !grb::is_object< IOType >::value, void
+ >::type * = nullptr
+ ) {
+ // static sanity checks
+		NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+			std::is_same< typename Monoid::D1, InputType >::value ), "grb::foldr",
+			"called with a vector x of a type that does not match the first domain "
+			"of the given monoid" );
+		NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+			std::is_same< typename Monoid::D2, IOType >::value ), "grb::foldr",
+			"called on a vector y of a type that does not match the second domain "
+			"of the given monoid" );
+		NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+			std::is_same< typename Monoid::D3, IOType >::value ), "grb::foldr",
+			"called on a vector y of a type that does not match the third domain "
+			"of the given monoid" );
+		NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+			std::is_same< bool, MaskType >::value ), "grb::foldr",
+			"called with a mask of non-Boolean type" );
+
+ // check empty mask
+ if( size( m ) == 0 ) {
+ return foldr< descr >( x, y, monoid, phase );
+ }
+
+ // dynamic sanity checks
+ const size_t n = size( x );
+ if( n != size( y ) || n != size( m ) ) {
+ return MISMATCH;
+ }
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func =
+ [&x, &m, &y, &monoid, phase] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage foldr(x, m, y, monoid) in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_x, local_m, local_y;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_x_nz, local_y_nz;
+ bool sparse = false;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+			(void) pipeline;
+			constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ bool already_dense_output = true;
+ bool already_dense_mask = true;
+ bool already_dense_input = true;
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_output ) {
+#else
+ already_dense_output = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+ if( local_y_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_mask = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( m ) );
+ if( !already_dense_mask ) {
+#else
+ already_dense_mask = false;
+#endif
+ local_m = internal::getCoordinates( m ).asyncSubset( lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_input ) {
+#else
+ already_dense_input = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( local_x_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
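+			// dispatch to the generic fold kernel; the sparsity case is passed as a
+			// compile-time boolean so that each case compiles into a specialised loop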
+ if( sparse ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic<
+#else
+ rc = internal::fold_from_vector_to_vector_generic<
+#endif
+ descr, false, true, true, true
+ >(
+ already_dense_output, already_dense_input, already_dense_mask,
+ lower_bound, upper_bound,
+ local_y, &local_m, local_x,
+ y, &m, x,
+ monoid.getOperator(), phase
+ );
+ } else {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic<
+#else
+ rc = internal::fold_from_vector_to_vector_generic<
+#endif
+ descr, false, false, true, true
+ >(
+ already_dense_output, already_dense_input, already_dense_mask,
+ lower_bound, upper_bound,
+ local_y, &local_m, local_x,
+ y, &m, x,
+ monoid.getOperator(), phase
+ );
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#else
+ if( !already_dense_vectors ) {
+#endif
+ internal::getCoordinates( y ).asyncJoinSubset( local_y, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_FOLD_MASKED_VECTOR_VECTOR_GENERIC,
+ n, sizeof( IOType ), dense_descr, true,
+ &y, nullptr, &internal::getCoordinates( y ), nullptr,
+ &x, &m, nullptr, nullptr,
+ &internal::getCoordinates( x ), &internal::getCoordinates( m ),
+ nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: foldr(x, m, y, monoid)"
+ << std::endl;
+#endif
+ return ret;
+ }
+
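+	// foldl( x, beta, op ): folds the scalar beta into every assigned entry of x
+	// using the given operator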
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Op,
+ typename IOType,
+ typename InputType,
+ typename Coords
+ >
+ RC foldl(
+ Vector< IOType, nonblocking, Coords > &x,
+ const InputType beta,
+ const Op &op = Op(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType >::value &&
+ grb::is_operator< Op >::value, void
+ >::type * = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Op::D1, IOType >::value ),
+ "grb::foldl",
+ "called with a vector x of a type that does not match the first domain "
+ "of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Op::D2, InputType >::value ),
+ "grb::foldl",
+			"called with a scalar beta of a type that does not match the second domain "
+ "of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Op::D3, IOType >::value ),
+ "grb::foldl",
+ "called on a vector x of a type that does not match the third domain "
+ "of the given operator" );
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func =
+ [&x, beta, &op, phase] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage foldl(x, beta, op) in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
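+			// this is the unmasked variant: explicit null masks are forwarded to the
+			// generic kernel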
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+ const Coords * const local_null_mask = nullptr;
+
+ Coords local_x;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_x_nz = local_n;
+ bool sparse = false;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+			(void) pipeline;
+			constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ bool already_dense_output = true;
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_output ) {
+#else
+ already_dense_output = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( local_x_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( sparse ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic<
+#else
+ rc = internal::fold_from_scalar_to_vector_generic<
+#endif
+ descr, true, true, false, false
+ >(
+ already_dense_output, true,
+ lower_bound, upper_bound,
+ local_x, local_null_mask,
+ x, null_mask,
+ beta,
+ op, phase
+ );
+ } else {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic<
+#else
+ rc = internal::fold_from_scalar_to_vector_generic<
+#endif
+ descr, true, false, false, false
+ >(
+ already_dense_output, true,
+ lower_bound, upper_bound,
+ local_x, local_null_mask,
+ x, null_mask, beta,
+ op, phase
+ );
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#else
+ if( !already_dense_vectors ) {
+#endif
+				internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound,
+					upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_FOLD_SCALAR_VECTOR_GENERIC,
+ internal::getCoordinates( x ).size(), sizeof( IOType ),
+ dense_descr, true,
+ &x, nullptr,
+ &internal::getCoordinates( x ), nullptr,
+ nullptr, nullptr, nullptr, nullptr,
+ nullptr, nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: foldl(x, beta, op)" << std::endl;
+#endif
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Op,
+ typename IOType,
+ typename MaskType,
+ typename InputType,
+ typename Coords
+ >
+ RC foldl(
+ Vector< IOType, nonblocking, Coords > &x,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const InputType beta,
+ const Op &op = Op(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< IOType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType >::value &&
+ grb::is_operator< Op >::value, void
+ >::type * = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Op::D1, IOType >::value ),
+ "grb::foldl",
+ "called with a vector x of a type that does not match the first domain "
+ "of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Op::D2, InputType >::value ),
+ "grb::foldl",
+			"called with a scalar beta of a type that does not match the second domain "
+ "of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Op::D3, IOType >::value ),
+ "grb::foldl",
+ "called on a vector x of a type that does not match the third domain "
+ "of the given operator" );
+		NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+			std::is_same< bool, MaskType >::value ),
+			"grb::foldl (nonblocking, vector <- scalar, masked)",
+ "provided mask does not have boolean entries" );
+
+ // check empty mask
+ if( size( m ) == 0 ) {
+ return foldl< descr >( x, beta, op, phase );
+ }
+
+ // dynamic checks
+ const size_t n = size( x );
+ if( size( m ) != n ) {
+ return MISMATCH;
+ }
+
+ // catch trivial phase
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func =
+ [&x, &m, beta, &op, phase] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage foldl(x, m, beta, op) in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_x, local_mask;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_x_nz = local_n;
+ bool sparse = false;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+			(void) pipeline;
+			constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ bool already_dense_output = true;
+ bool already_dense_mask = true;
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_output ) {
+#else
+ already_dense_output = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( local_x_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_mask = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( m ) );
+ if( !already_dense_mask ) {
+#else
+ already_dense_mask = false;
+#endif
+ local_mask = internal::getCoordinates( m ).asyncSubset( lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( sparse ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic<
+#else
+ rc = internal::fold_from_scalar_to_vector_generic<
+#endif
+ descr, true, true, true, false
+ >(
+ already_dense_output, already_dense_mask,
+ lower_bound, upper_bound,
+ local_x, &local_mask,
+ x, &m,
+ beta,
+ op, phase
+ );
+ } else {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic<
+#else
+ rc = internal::fold_from_scalar_to_vector_generic<
+#endif
+ descr, true, false, true, false
+ >(
+ already_dense_output, already_dense_mask,
+ lower_bound, upper_bound,
+ local_x, &local_mask,
+ x, &m,
+ beta,
+ op, phase
+ );
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#else
+ if( !already_dense_vectors ) {
+#endif
+ internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_FOLD_MASKED_SCALAR_VECTOR_GENERIC,
+ n, sizeof( IOType ),
+ dense_descr, true,
+ &x, nullptr, &internal::getCoordinates( x ), nullptr,
+ &m, nullptr, nullptr, nullptr,
+ &internal::getCoordinates( m ), nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: foldl(x, m, beta, op)"
+ << std::endl;
+#endif
+ return ret;
+ }
+
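+	// monoid variant of foldl( x, beta ): missing entries of x are interpreted
+	// via the monoid identity, so that sparse x is supported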
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename IOType,
+ typename InputType,
+ typename Coords
+ >
+ RC foldl(
+ Vector< IOType, nonblocking, Coords > &x,
+ const InputType beta,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType >::value &&
+ grb::is_monoid< Monoid >::value, void
+ >::type * = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D1, IOType >::value ), "grb::foldl",
+ "called with a vector x of a type that does not match the first domain "
+ "of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D2, InputType >::value ), "grb::foldl",
+			"called with a scalar beta of a type that does not match the second domain "
+ "of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D3, IOType >::value ), "grb::foldl",
+ "called on a vector x of a type that does not match the third domain "
+ "of the given monoid" );
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func =
+ [&x, beta, &monoid, phase] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage foldl(x, beta, monoid) in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+ const Coords * const local_null_mask = nullptr;
+
+ Coords local_x;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_x_nz = local_n;
+ bool sparse = false;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ bool already_dense_output = true;
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_output ) {
+#else
+ already_dense_output = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( local_x_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( sparse ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic<
+#else
+ rc = internal::fold_from_scalar_to_vector_generic<
+#endif
+ descr, true, true, false, true
+ >(
+ already_dense_output, true,
+ lower_bound, upper_bound,
+ local_x, local_null_mask,
+ x, null_mask,
+ beta,
+ monoid.getOperator(), phase
+ );
+ } else {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic<
+#else
+ rc = internal::fold_from_scalar_to_vector_generic<
+#endif
+ descr, true, false, false, true
+ >(
+ already_dense_output, true,
+ lower_bound, upper_bound,
+ local_x, local_null_mask,
+ x, null_mask,
+ beta,
+ monoid.getOperator(), phase
+ );
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#else
+ if( !already_dense_vectors ) {
+#endif
+ internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_FOLD_SCALAR_VECTOR_GENERIC,
+ internal::getCoordinates( x ).size(), sizeof( IOType ),
+ dense_descr, true,
+ &x, nullptr, &internal::getCoordinates( x ), nullptr,
+ nullptr, nullptr, nullptr, nullptr,
+ nullptr, nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: foldl(x, beta, monoid)"
+ << std::endl;
+#endif
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename IOType,
+ typename MaskType,
+ typename InputType,
+ typename Coords
+ >
+ RC foldl(
+ Vector< IOType, nonblocking, Coords > &x,
+ const Vector< MaskType, nonblocking, Coords > &m,
+		const InputType beta,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType >::value &&
+ grb::is_monoid< Monoid >::value,
+ void
+ >::type * = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D1, IOType >::value ), "grb::foldl",
+ "called with a vector x of a type that does not match the first domain "
+ "of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D2, InputType >::value ), "grb::foldl",
+			"called with a scalar beta of a type that does not match the second domain "
+ "of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D3, IOType >::value ), "grb::foldl",
+ "called on a vector x of a type that does not match the third domain "
+ "of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ),
+ "grb::foldl (nonblocking, vector <- scalar, masked, monoid)",
+ "provided mask does not have boolean entries" );
+
+ // check for empty mask
+ if( size( m ) == 0 ) {
+ return foldl< descr >( x, beta, monoid, phase );
+ }
+
+ // dynamic checks
+ const size_t n = size( x );
+ if( n != size( m ) ) { return MISMATCH; }
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func =
+ [&x, &m, beta, &monoid, phase] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage foldl(x, m, beta, monoid) in the "
+ << "range(" << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_x, local_m;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_x_nz = local_n;
+ bool sparse = false;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ bool already_dense_output = true;
+ bool already_dense_mask = true;
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_output ) {
+#else
+ already_dense_output = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( local_x_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_mask = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( m ) );
+ if( !already_dense_mask ) {
+#else
+ already_dense_mask = false;
+#endif
+ local_m = internal::getCoordinates( m ).asyncSubset( lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( sparse ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic<
+#else
+ rc = internal::fold_from_scalar_to_vector_generic<
+#endif
+ descr, true, true, true, true
+ >(
+ already_dense_output, already_dense_mask,
+ lower_bound, upper_bound,
+ local_x, &local_m,
+ x, &m,
+ beta,
+ monoid.getOperator(), phase
+ );
+ } else {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic<
+#else
+ rc = internal::fold_from_scalar_to_vector_generic<
+#endif
+ descr, true, false, true, true
+ >(
+ already_dense_output, already_dense_mask,
+ lower_bound, upper_bound,
+ local_x, &local_m,
+ x, &m,
+ beta,
+ monoid.getOperator(), phase
+ );
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#else
+ if( !already_dense_vectors ) {
+#endif
+ internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_FOLD_MASKED_SCALAR_VECTOR_GENERIC,
+ internal::getCoordinates( x ).size(), sizeof( IOType ),
+ dense_descr, true,
+ &x, nullptr, &internal::getCoordinates( x ), nullptr,
+ &m, nullptr, nullptr, nullptr,
+ &internal::getCoordinates( m ), nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: foldl(x, m, beta, monoid)"
+ << std::endl;
+#endif
+ return ret;
+ }
+
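+	// foldl( x, y, op ): element-wise fold of y into x, i.e., x_i = x_i op y_i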
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP,
+ typename IOType,
+ typename InputType,
+ typename Coords
+ >
+ RC foldl(
+ Vector< IOType, nonblocking, Coords > &x,
+ const Vector< InputType, nonblocking, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< grb::is_operator< OP >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType >::value, void
+ >::type * = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D1, IOType >::value ), "grb::foldl",
+ "called with a vector x of a type that does not match the first domain "
+ "of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D2, InputType >::value ), "grb::foldl",
+ "called on a vector y of a type that does not match the second domain "
+ "of the given operator" );
+		NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D3, IOType >::value ), "grb::foldl",
+ "called on a vector x of a type that does not match the third domain "
+ "of the given operator" );
+
+ // dynamic sanity checks
+ const size_t n = size( x );
+ if( n != size( y ) ) {
+ return MISMATCH;
+ }
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func =
+ [&x, &y, &op, phase] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage foldl(x, y, operator) in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+ const Coords * const local_null_mask = nullptr;
+
+ Coords local_x, local_y;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_x_nz, local_y_nz;
+ bool sparse = false;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ bool already_dense_output = true;
+ bool already_dense_input = true;
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_output ) {
+#else
+ already_dense_output = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( local_x_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_input ) {
+#else
+ already_dense_input = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+ if( local_y_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( sparse ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic<
+#else
+ rc = internal::fold_from_vector_to_vector_generic<
+#endif
+ descr, true, true, false, false
+ >(
+ already_dense_output, already_dense_input, true,
+ lower_bound, upper_bound,
+ local_x, local_null_mask, local_y,
+ x, null_mask, y,
+ op, phase
+ );
+ } else {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic<
+#else
+ rc = internal::fold_from_vector_to_vector_generic<
+#endif
+ descr, true, false, false, false
+ >(
+ already_dense_output, already_dense_input, true,
+ lower_bound, upper_bound,
+ local_x, local_null_mask, local_y,
+ x, null_mask, y,
+ op, phase
+ );
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#else
+ if( !already_dense_vectors ) {
+#endif
+ internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_FOLD_VECTOR_VECTOR_GENERIC,
+ n, sizeof( IOType ), dense_descr, true,
+ &x, nullptr, &internal::getCoordinates( x ), nullptr,
+ &y, nullptr, nullptr, nullptr,
+ &internal::getCoordinates( y ), nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: foldl(x, y, operator)"
+ << std::endl;
+#endif
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename IOType,
+ typename InputType,
+ typename Coords
+ >
+ RC foldl(
+ Vector< IOType, nonblocking, Coords > &x,
+ const Vector< InputType, nonblocking, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< grb::is_monoid< Monoid >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType >::value, void
+ >::type * = nullptr
+ ) {
+ // static sanity checks
+		NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+			std::is_same< typename Monoid::D1, IOType >::value ), "grb::foldl",
+			"called with a vector x of a type that does not match the first domain "
+			"of the given monoid" );
+		NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+			std::is_same< typename Monoid::D2, InputType >::value ), "grb::foldl",
+			"called on a vector y of a type that does not match the second domain "
+			"of the given monoid" );
+		NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+			std::is_same< typename Monoid::D3, IOType >::value ), "grb::foldl",
+			"called on a vector x of a type that does not match the third domain "
+			"of the given monoid" );
+
+ // dynamic sanity checks
+ const size_t n = size( x );
+ if( n != size( y ) ) {
+ return MISMATCH;
+ }
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func =
+ [&x, &y, &monoid, phase] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage foldl(x, y, monoid) in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+ const Coords * const local_null_mask = nullptr;
+
+ Coords local_x, local_y;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_x_nz, local_y_nz;
+ bool sparse = false;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ bool already_dense_output = true;
+ bool already_dense_input = true;
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_output ) {
+#else
+ already_dense_output = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( local_x_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_input ) {
+#else
+ already_dense_input = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+ if( local_y_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( sparse ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic<
+#else
+ rc = internal::fold_from_vector_to_vector_generic<
+#endif
+ descr, true, true, false, true
+ >(
+ already_dense_output, already_dense_input, true,
+ lower_bound, upper_bound,
+ local_x, local_null_mask, local_y,
+ x, null_mask, y,
+ monoid.getOperator(), phase
+ );
+ } else {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic<
+#else
+ rc = internal::fold_from_vector_to_vector_generic<
+#endif
+ descr, true, false, false, true
+ >(
+ already_dense_output, already_dense_input, true,
+ lower_bound, upper_bound,
+ local_x, local_null_mask, local_y,
+ x, null_mask, y,
+ monoid.getOperator(), phase
+ );
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#else
+ if( !already_dense_vectors ) {
+#endif
+ internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_FOLD_VECTOR_VECTOR_GENERIC,
+ n, sizeof( IOType ), dense_descr, true,
+ &x, nullptr, &internal::getCoordinates( x ), nullptr,
+ &y, nullptr, nullptr, nullptr,
+ &internal::getCoordinates( y ), nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: foldl(x, y, monoid)"
+ << std::endl;
+#endif
+ return ret;
+ }
+
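+	// masked variant of foldl( x, y, op ): entries at which the mask m evaluates
+	// false are left untouched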
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP,
+ typename IOType,
+ typename MaskType,
+ typename InputType,
+ typename Coords
+ >
+ RC foldl(
+ Vector< IOType, nonblocking, Coords > &x,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const Vector< InputType, nonblocking, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< grb::is_operator< OP >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType >::value, void
+ >::type * = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D1, IOType >::value ), "grb::foldl",
+ "called with a vector x of a type that does not match the first domain "
+ "of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D2, InputType >::value ), "grb::foldl",
+ "called on a vector y of a type that does not match the second domain "
+ "of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D3, IOType >::value ), "grb::foldl",
+ "called on a vector x of a type that does not match the third domain "
+ "of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ), "grb::foldl",
+			"called with a mask that does not have boolean entries" );
+
+ // catch empty mask
+ if( size( m ) == 0 ) {
+ return foldl< descr >( x, y, op, phase );
+ }
+
+ // dynamic sanity checks
+ const size_t n = size( x );
+ if( n != size( y ) || n != size( m ) ) {
+ return MISMATCH;
+ }
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func =
+ [&x, &m, &y, &op, phase] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage foldl(x, m, y, op) in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_x, local_y, local_m;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_x_nz = local_n;
+ size_t local_y_nz = local_n;
+ bool sparse = false;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+ bool already_dense_output = true;
+ bool already_dense_input = true;
+ bool already_dense_mask = true;
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_output ) {
+#else
+ already_dense_output = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( local_x_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_mask = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( m ) );
+ if( !already_dense_mask ) {
+#else
+ already_dense_mask = false;
+#endif
+ local_m = internal::getCoordinates( m ).asyncSubset( lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_input ) {
+#else
+ already_dense_input = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+ if( local_y_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( sparse ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic<
+#else
+ rc = internal::fold_from_vector_to_vector_generic<
+#endif
+ descr, true, true, true, false
+ >(
+ already_dense_output, already_dense_input, already_dense_mask,
+ lower_bound, upper_bound,
+ local_x, &local_m, local_y,
+ x, &m, y,
+ op, phase
+ );
+ } else {
+ assert( local_x_nz == local_n );
+ assert( local_y_nz == local_n );
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic<
+#else
+ rc = internal::fold_from_vector_to_vector_generic<
+#endif
+ descr, true, false, true, false
+ >(
+ already_dense_output, already_dense_input, already_dense_mask,
+ lower_bound, upper_bound,
+ local_x, &local_m, local_y,
+ x, &m, y,
+ op, phase
+ );
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#else
+ if( !already_dense_vectors ) {
+#endif
+ internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_FOLD_MASKED_VECTOR_VECTOR_GENERIC,
+ n, sizeof( IOType ), dense_descr, true,
+ &x, nullptr, &internal::getCoordinates( x ), nullptr,
+ &y, &m, nullptr, nullptr,
+			&internal::getCoordinates( y ), &internal::getCoordinates( m ),
+			nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: foldl(x, m, y, op)" << std::endl;
+#endif
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename IOType,
+ typename MaskType,
+ typename InputType,
+ typename Coords
+ >
+ RC foldl(
+ Vector< IOType, nonblocking, Coords > &x,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const Vector< InputType, nonblocking, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< grb::is_monoid< Monoid >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType >::value, void
+ >::type * = nullptr
+ ) {
+ // static sanity checks
+		NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+			std::is_same< typename Monoid::D1, IOType >::value ), "grb::foldl",
+			"called with a vector x of a type that does not match the first domain "
+			"of the given monoid" );
+		NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+			std::is_same< typename Monoid::D2, InputType >::value ), "grb::foldl",
+			"called on a vector y of a type that does not match the second domain "
+			"of the given monoid" );
+		NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+			std::is_same< typename Monoid::D3, IOType >::value ), "grb::foldl",
+			"called on a vector x of a type that does not match the third domain "
+			"of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ), "grb::foldl",
+ "called with a mask that does not have boolean entries" );
+
+ // catch empty mask
+ if( size( m ) == 0 ) {
+ return foldl< descr >( x, y, monoid, phase );
+ }
+
+ // dynamic sanity checks
+ const size_t n = size( x );
+ if( n != size( y ) || n != size( m ) ) {
+ return MISMATCH;
+ }
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func =
+ [&x, &m, &y, &monoid, phase] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage foldl(x, m, y, monoid) in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_x, local_y, local_m;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_x_nz = local_n;
+ size_t local_y_nz = local_n;
+ bool sparse = false;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+ bool already_dense_output = true;
+ bool already_dense_input = true;
+ bool already_dense_mask = true;
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_output ) {
+#else
+ already_dense_output = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( local_x_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_mask = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( m ) );
+ if( !already_dense_mask ) {
+#else
+ already_dense_mask = false;
+#endif
+ local_m = internal::getCoordinates( m ).asyncSubset( lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_input ) {
+#else
+ already_dense_input = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+ if( local_y_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( sparse ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic<
+#else
+ rc = internal::fold_from_vector_to_vector_generic<
+#endif
+ descr, true, true, true, true
+ >(
+ already_dense_output, already_dense_input, already_dense_mask,
+ lower_bound, upper_bound,
+ local_x, &local_m, local_y,
+ x, &m, y,
+ monoid.getOperator(), phase
+ );
+ } else {
+ assert( local_x_nz == local_n );
+ assert( local_y_nz == local_n );
+
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic<
+#else
+ rc = internal::fold_from_vector_to_vector_generic<
+#endif
+ descr, true, false, true, true
+ >(
+ already_dense_output, already_dense_input, already_dense_mask,
+ lower_bound, upper_bound,
+ local_x, &local_m, local_y,
+ x, &m, y,
+ monoid.getOperator(), phase
+ );
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#else
+ if( !already_dense_vectors ) {
+#endif
+ internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_FOLD_MASKED_VECTOR_VECTOR_GENERIC,
+ n, sizeof( IOType ),
+ dense_descr, true,
+ &x, nullptr, &internal::getCoordinates( x ), nullptr,
+ &y, &m, nullptr, nullptr,
+ &internal::getCoordinates( y ), &internal::getCoordinates( m ),
+ nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: foldl(x, m, y, monoid)"
+ << std::endl;
+#endif
+ return ret;
+ }
+
+ namespace internal {
+
+ template<
+ bool left_scalar,
+ bool right_scalar,
+ bool left_sparse,
+ bool right_sparse,
+ Descriptor descr, class OP,
+#ifdef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_input_x,
+ bool already_dense_input_y,
+#endif
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC dense_apply_generic(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_input_x,
+ bool already_dense_input_y,
+#endif
+ const size_t lower_bound,
+ const size_t upper_bound,
+ const Coords &local_x,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &z_vector,
+ const internal::Wrapper< left_scalar, InputType1, Coords > x_wrapper,
+ const internal::Wrapper< right_scalar, InputType2, Coords > y_wrapper,
+ const OP &op
+ ) {
+#ifdef _DEBUG
+ std::cout << "\t internal::dense_apply_generic called\n";
+#endif
+ static_assert( !(left_scalar && left_sparse),
+ "The left-hand side must be scalar OR sparse, but cannot be both!" );
+ static_assert( !(right_scalar && right_sparse),
+ "The right-hand side must be scalar OR sparse, but cannot be both!" );
+ static_assert( !(left_sparse && right_sparse),
+ "If both left- and right-hand sides are sparse, use sparse_apply_generic "
+ "instead." );
+
+ // create local copies of the input const pointers
+ OutputType * __restrict__ const z_p = internal::getRaw( z_vector );
+ const InputType1 * __restrict__ x_p = x_wrapper.getRaw();
+ const InputType2 * __restrict__ y_p = y_wrapper.getRaw();
+
+ const size_t local_n = upper_bound - lower_bound;
+
+ constexpr const size_t block_size = OP::blocksize;
+ const size_t num_blocks = local_n / block_size;
+
+#ifndef NDEBUG
+ const bool has_coda = local_n % block_size > 0;
+#endif
+			size_t i = lower_bound;
+ const size_t start = 0;
+ const size_t end = num_blocks;
+
+ // declare and initialise local buffers for SIMD
+ OutputType z_b[ block_size ];
+ InputType1 x_b[ block_size ];
+ InputType2 y_b[ block_size ];
+ bool x_m[ block_size ];
+ bool y_m[ block_size ];
+ for( size_t k = 0; k < block_size; ++k ) {
+ if( left_scalar ) {
+ x_b[ k ] = x_wrapper.getValue();
+ }
+ if( right_scalar ) {
+ y_b[ k ] = y_wrapper.getValue();
+ }
+ }
+
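+			// main blocked loop: streams full blocks of block_size elements through
+			// the local buffers so that the compiler may vectorise each inner loop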
+ for( size_t block = start; block < end; ++block ) {
+ size_t local_i = i;
+ for( size_t k = 0; k < block_size; ++k ) {
+ if( !left_scalar ) {
+ x_b[ k ] = x_p[ local_i ];
+ }
+ if( !right_scalar ) {
+ y_b[ k ] = y_p[ local_i ];
+ }
+ if( left_sparse ) {
+ x_m[ k ] = already_dense_input_x || local_x.assigned( local_i -
+ lower_bound );
+ }
+ if( right_sparse ) {
+ y_m[ k ] = already_dense_input_y || local_y.assigned( local_i -
+ lower_bound );
+ }
+ (void) ++local_i;
+ }
+ for( size_t k = 0; k < block_size; ++k ) {
+ RC rc = SUCCESS;
+ if( left_sparse && !x_m[ k ] ) {
+ z_b[ k ] = y_b[ k ]; // WARNING: assumes monoid semantics!
+ } else if( right_sparse && !y_m[ k ] ) {
+ z_b[ k ] = x_b[ k ]; // WARNING: assumes monoid semantics!
+ } else {
+ rc = apply( z_b[ k ], x_b[ k ], y_b[ k ], op );
+ }
+ assert( rc == SUCCESS );
+#ifdef NDEBUG
+ (void) rc;
+#endif
+ }
+ for( size_t k = 0; k < block_size; ++k, ++i ) {
+ z_p[ i ] = z_b[ k ];
+ }
+ }
+
+#ifndef NDEBUG
+ if( has_coda ) {
+ assert( i < local_n + lower_bound );
+ } else {
+ assert( i == local_n + lower_bound );
+ }
+#endif
+
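+			// scalar coda: handles the trailing local_n % block_size elements that do
+			// not fill a complete block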
+ i = end * block_size + lower_bound;
+ for( ; i < local_n + lower_bound; ++i ) {
+ RC rc = SUCCESS;
+ if( left_scalar && right_scalar ) {
+ rc = apply( z_p[ i ], x_wrapper.getValue(), y_wrapper.getValue(), op );
+ } else if( left_scalar && !right_scalar ) {
+ if( right_sparse && !( already_dense_input_y || local_y.assigned( i -
+ lower_bound ) )
+ ) {
+ z_p[ i ] = x_wrapper.getValue();
+ } else {
+ rc = apply( z_p[ i ], x_wrapper.getValue(), y_p[ i ], op );
+ }
+ } else if( !left_scalar && right_scalar ) {
+ if( left_sparse && !( already_dense_input_x || local_x.assigned( i -
+ lower_bound ) )
+ ) {
+ z_p[ i ] = y_wrapper.getValue();
+ } else {
+ rc = apply( z_p[ i ], x_p[ i ], y_wrapper.getValue(), op );
+ }
+ } else {
+ assert( !left_scalar && !right_scalar );
+ if( left_sparse && !(already_dense_input_x || local_x.assigned( i -
+ lower_bound ) )
+ ) {
+ z_p[ i ] = y_p[ i ];
+ } else if( right_sparse && !(already_dense_input_y || local_y.assigned( i -
+ lower_bound ) )
+ ) {
+ z_p[ i ] = x_p[ i ];
+ } else {
+ assert( !left_sparse && !right_sparse );
+ rc = apply( z_p[ i ], x_p[ i ], y_p[ i ], op );
+ }
+ }
+ assert( rc == SUCCESS );
+#ifdef NDEBUG
+ (void) rc;
+#endif
+ }
+
+ return SUCCESS;
+ }
+
+ template<
+ bool masked,
+ bool monoid,
+ bool x_scalar,
+ bool y_scalar,
+ Descriptor descr,
+ class OP,
+#ifdef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_mask,
+ bool already_dense_input_x,
+ bool already_dense_input_y,
+#endif
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC sparse_apply_generic(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_mask,
+ bool already_dense_input_x,
+ bool already_dense_input_y,
+#endif
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_z,
+ const Coords * const local_mask_ptr,
+ const Coords &local_x,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &z_vector,
+ const Vector< MaskType, nonblocking, Coords > * const mask_vector,
+ const internal::Wrapper< x_scalar, InputType1, Coords > x_wrapper,
+ const internal::Wrapper< y_scalar, InputType2, Coords > y_wrapper,
+ const OP &op
+ ) {
+#ifndef GRB_NO_NOOP_CHECKS
+ static_assert( !internal::maybe_noop< OP >::value, "Warning: you may be "
+ "generating an output vector with uninitialised values. Define "
+ "the GRB_NO_NOOP_CHECKS macro to disable this check.\n" );
+#endif
+ // create local copies of the input const pointers
+ OutputType * __restrict__ const z_p = internal::getRaw( z_vector );
+ const MaskType * __restrict__ const mask_p = ( masked )
+ ? internal::getRaw( *mask_vector )
+ : nullptr;
+ const InputType1 * __restrict__ x_p = x_wrapper.getRaw();
+ const InputType2 * __restrict__ y_p = y_wrapper.getRaw();
+
+ Coords local_mask;
+ if( masked ) {
+ local_mask = *local_mask_ptr;
+ }
+
+ const size_t local_n = upper_bound - lower_bound;
+ const size_t local_x_nz = already_dense_input_x
+ ? local_n
+ : local_x.nonzeroes();
+ const size_t local_y_nz = already_dense_input_y
+ ? local_n
+ : local_y.nonzeroes();
+
+ // assertions
+ assert( !masked || local_mask_ptr != nullptr );
+ assert( !masked || local_mask_ptr->size() == local_n );
+ assert( x_scalar || local_x_nz <= local_n );
+ assert( y_scalar || local_y_nz <= local_n );
+
+#ifdef _DEBUG
+ std::cout << "\tinternal::sparse_apply_generic called\n";
+#endif
+ constexpr const size_t block_size = OP::blocksize;
+
+ // swap so that we do the expensive pass over the container with the fewest
+ // nonzeroes first
+ assert( !x_scalar || !y_scalar );
+ const bool swap = ( ( x_scalar || already_dense_input_x )
+ ? local_n
+ : local_x_nz
+ ) > ( ( y_scalar || already_dense_input_y )
+ ? local_n
+ : local_y_nz
+ );
+ const Coordinates< nonblocking > &loop_coors = swap ? local_y : local_x;
+ const Coordinates< nonblocking > &chk_coors = swap ? local_x : local_y;
+ const bool already_dense_loop = swap
+ ? already_dense_input_y
+ : already_dense_input_x;
+ const bool already_dense_chk = swap
+ ? already_dense_input_x
+ : already_dense_input_y;
+
+ const size_t loop_coors_nz = swap ? local_y_nz : local_x_nz;
+ const size_t chk_coors_nz = swap ? local_x_nz : local_y_nz;
+#ifdef _DEBUG
+			std::cout << "\t\tfirst-phase loop of size " << loop_coors_nz << "\n";
+ if( x_scalar || y_scalar ) {
+ std::cout << "\t\tthere will be no second phase because one of the inputs "
+ << "is scalar\n";
+ } else {
+				std::cout << "\t\tsecond-phase loop of size " << chk_coors_nz << "\n";
+ }
+#endif
+ // declare buffers for vectorisation
+ size_t offsets[ block_size ];
+ OutputType z_b[ block_size ];
+ InputType1 x_b[ block_size ];
+ InputType2 y_b[ block_size ];
+ bool mask[ block_size ];
+ bool x_m[ block_size ];
+ bool y_m[ block_size ];
+
+ if( x_scalar ) {
+ for( size_t k = 0; k < block_size; ++k ) {
+ x_b[ k ] = x_wrapper.getValue();
+ }
+ }
+ if( y_scalar ) {
+ for( size_t k = 0; k < block_size; ++k ) {
+ y_b[ k ] = y_wrapper.getValue();
+ }
+ }
+
+ // expensive pass #1
+ size_t start = 0;
+ size_t end = loop_coors_nz / block_size;
+ size_t k = 0;
+ for( size_t b = start; b < end; ++b ) {
+ // perform gathers
+ for( size_t i = 0; i < block_size; ++i ) {
+ const size_t index = ( already_dense_loop )
+ ? ( ( k++ ) + lower_bound )
+ : ( loop_coors.index( k++ ) + lower_bound );
+ offsets[ i ] = index;
+ assert( index < local_n + lower_bound );
+ if( masked ) {
+ if( already_dense_mask ) {
+ mask[ i ] = internal::getCoordinates( *mask_vector ).template
+ mask< descr >( index, mask_p );
+ } else {
+ mask[ i ] = local_mask.template mask< descr >( index - lower_bound,
+ mask_p + lower_bound );
+ }
+ }
+ }
+				// gather the input values and the check-side sparsity structure
+ for( size_t i = 0; i < block_size; ++i ) {
+ if( !masked || mask[ i ] ) {
+ if( !x_scalar ) {
+ x_b[ i ] = x_p[ offsets[ i ] ];
+ }
+ if( !x_scalar && !y_scalar ) {
+ y_m[ i ] = already_dense_chk || chk_coors.assigned( offsets[ i ] -
+ lower_bound );
+ } else {
+ y_m[ i ] = true;
+ }
+ if( !y_scalar ) {
+ y_b[ i ] = y_p[ offsets[ i ] ];
+ }
+ } else {
+ y_m[ i ] = false;
+ }
+ }
+ // perform compute
+ for( size_t i = 0; i < block_size; ++i ) {
+ RC rc = SUCCESS;
+ if( y_m[ i ] ) {
+ rc = apply( z_b[ i ], x_b[ i ], y_b[ i ], op );
+					} else if( monoid ) {
+						// the chk-side input is missing here; under monoid semantics the
+						// result then equals the loop-side value
+						if( swap ) {
+							z_b[ i ] = static_cast< typename OP::D3 >( y_b[ i ] );
+						} else {
+							z_b[ i ] = static_cast< typename OP::D3 >( x_b[ i ] );
+						}
+					}
+ assert( rc == SUCCESS );
+#ifdef NDEBUG
+ (void) rc;
+#endif
+ }
+				// the following may or may not vectorise: the data-dependent updates to
+				// local_z are scatter-like
+ for( size_t i = 0; i < block_size; ++i ) {
+ if( !masked || mask[ i ] ) {
+ if( y_m[ i ] || monoid ) {
+ (void) local_z.assign( offsets[ i ] - lower_bound );
+ }
+ }
+ }
+ // perform scatter
+ for( size_t i = 0; i < block_size; ++i ) {
+ if( !masked || mask[ i ] ) {
+ if( monoid || y_m[ i ] ) {
+ GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // the only way the below could write
+ // an uninitialised value is if the
+ // static_assert at the top of this
+ z_p[ offsets[ i ] ] = z_b[ i ]; // function had triggered. See also
+ GRB_UTIL_RESTORE_WARNINGS // internal issue #321.
+ }
+ }
+ }
+ }
+
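+			// coda of pass #1: handles the remaining loop-side nonzeroes one at a time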
+ for( ; k < loop_coors_nz; ++k ) {
+ const size_t index = ( already_dense_loop )
+ ? k + lower_bound
+ : loop_coors.index( k ) + lower_bound;
+ if( masked ) {
+ if( already_dense_mask ) {
+ if( !internal::getCoordinates( *mask_vector ).template mask< descr >(
+ index, mask_p )
+ ) {
+ continue;
+ }
+ } else {
+ if( !local_mask.template mask< descr >( index - lower_bound, mask_p +
+ lower_bound )
+ ) {
+ continue;
+ }
+ }
+ }
+ RC rc = SUCCESS;
+ (void) local_z.assign( index - lower_bound );
+ if( x_scalar || y_scalar || already_dense_chk || chk_coors.assigned(
+ index - lower_bound )
+ ) {
+ rc = apply(
+ z_p[ index ],
+ ( x_scalar )
+ ? x_wrapper.getValue()
+ : x_p[ index ],
+ ( y_scalar )
+ ? y_wrapper.getValue()
+ : y_p[ index ],
+ op
+ );
+				} else if( monoid ) {
+					// only the loop-side input is present here; monoid semantics then
+					// yield the loop-side value as the result
+					if( swap ) {
+						z_p[ index ] = y_scalar ?
+							static_cast< typename OP::D3 >( y_wrapper.getValue() ) :
+							static_cast< typename OP::D3 >( y_p[ index ] );
+					} else {
+						z_p[ index ] = x_scalar ?
+							static_cast< typename OP::D3 >( x_wrapper.getValue() ) :
+							static_cast< typename OP::D3 >( x_p[ index ] );
+					}
+				}
+ assert( rc == SUCCESS );
+#ifdef NDEBUG
+ (void) rc;
+#endif
+ }
+
+ // cheaper pass #2, only required if we are using monoid semantics
+ // AND if both inputs are vectors
+ if( monoid && !x_scalar && !y_scalar ) {
+ start = 0;
+ end = chk_coors_nz / block_size;
+ k = 0;
+ for( size_t b = start; b < end; ++b ) {
+ // streaming load
+ for( size_t i = 0; i < block_size; i++ ) {
+ offsets[ i ] = ( already_dense_chk )
+ ? ( ( k++ ) + lower_bound )
+ : ( chk_coors.index( k++ ) + lower_bound );
+ assert( offsets[ i ] < local_n + lower_bound );
+ }
+ // pure gather
+ for( size_t i = 0; i < block_size; i++ ) {
+ x_m[ i ] = already_dense_loop || loop_coors.assigned( offsets[ i ] -
+ lower_bound );
+ }
+ // gather-like
+ for( size_t i = 0; i < block_size; i++ ) {
+ if( masked ) {
+ if( already_dense_mask ) {
+ mask[ i ] = utils::interpretMask< descr >(
+ internal::getCoordinates( *mask_vector ).assigned( offsets[ i ] ),
+ mask_p, offsets[ i ]
+ );
+ } else {
+ mask[ i ] = utils::interpretMask< descr >(
+ local_mask.assigned( offsets[ i ] - lower_bound ),
+ mask_p, offsets[ i ]
+ );
+ }
+ }
+ }
+ // SIMD
+ for( size_t i = 0; i < block_size; i++ ) {
+ x_m[ i ] = ! x_m[ i ];
+ }
+ // SIMD
+ for( size_t i = 0; i < block_size; i++ ) {
+ if( masked ) {
+ mask[ i ] = mask[ i ] && x_m[ i ];
+ }
+ }
+ if( !swap ) {
+ // gather
+ for( size_t i = 0; i < block_size; ++i ) {
+ if( masked ) {
+ if( mask[ i ] ) {
+ y_b[ i ] = y_p[ offsets[ i ] ];
+ }
+ } else {
+ if( x_m[ i ] ) {
+ y_b[ i ] = y_p[ offsets[ i ] ];
+ }
+ }
+ }
+ // SIMD
+ for( size_t i = 0; i < block_size; i++ ) {
+ if( masked ) {
+ if( mask[ i ] ) {
+ z_b[ i ] = y_b[ i ];
+ }
+ } else {
+ if( x_m[ i ] ) {
+ z_b[ i ] = y_b[ i ];
+ }
+ }
+ }
+ } else {
+ // gather
+ for( size_t i = 0; i < block_size; ++i ) {
+ if( masked ) {
+ if( mask[ i ] ) {
+ x_b[ i ] = x_p[ offsets[ i ] ];
+ }
+ } else {
+ if( x_m[ i ] ) {
+ x_b[ i ] = x_p[ offsets[ i ] ];
+ }
+ }
+ }
+ // SIMD
+ for( size_t i = 0; i < block_size; i++ ) {
+ if( masked ) {
+ if( mask[ i ] ) {
+ z_b[ i ] = static_cast< typename OP::D3 >( x_b[ i ] );
+ }
+ } else {
+ if( x_m[ i ] ) {
+ z_b[ i ] = static_cast< typename OP::D3 >( x_b[ i ] );
+ }
+ }
+ }
+ }
+ // SIMD-like
+ for( size_t i = 0; i < block_size; i++ ) {
+ if( masked ) {
+ if( mask[ i ] ) {
+							(void) local_z.assign( offsets[ i ] - lower_bound );
+ }
+ } else {
+ if( x_m[ i ] ) {
+							(void) local_z.assign( offsets[ i ] - lower_bound );
+ }
+ }
+ }
+ // scatter
+ for( size_t i = 0; i < block_size; i++ ) {
+ if( masked ) {
+ if( mask[ i ] ) {
+ GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED
+
+ z_p[ offsets[ i ] ] = z_b[ i ];
+
+ GRB_UTIL_RESTORE_WARNINGS
+ }
+ } else {
+ if( x_m[ i ] ) {
+#ifdef _DEBUG
+ std::cout << "\t\t writing out " << z_b[ i ] << " to index "
+ << offsets[ i ] << "\n";
+#endif
+ GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // the only way the below could write
+ // an uninitialised value is if the
+ // static_assert at the top of this
+ z_p[ offsets[ i ] ] = z_b[ i ]; // function had triggered. See also
+ GRB_UTIL_RESTORE_WARNINGS // internal issue #321.
+ }
+ }
+ }
+ }
+ for( ; k < chk_coors_nz; ++k ) {
+ const size_t index = ( ( already_dense_chk )
+ ? k
+ : chk_coors.index( k ) ) + lower_bound;
+ assert( index < local_n + lower_bound );
+ if( already_dense_loop || loop_coors.assigned( index - lower_bound ) ) {
+ continue;
+ }
+ if( masked ) {
+ if( already_dense_mask ) {
+ if( !internal::getCoordinates( *mask_vector ).template mask< descr >(
+ index, mask_p )
+ ) {
+ continue;
+ }
+ } else {
+ if( !local_mask.template mask< descr >( index - lower_bound, mask_p +
+ lower_bound )
+ ) {
+ continue;
+ }
+ }
+ }
+ (void) local_z.assign( index - lower_bound );
+ z_p[ index ] = swap ? x_p[ index ] : y_p[ index ];
+ }
+ }
+
+ return SUCCESS;
+ }
+
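+ // Computes z = x .op. y at every index where the mask evaluates to true.
+ // Each input may independently be a scalar (left_scalar/right_scalar) or
+ // a sparse vector (left_sparse/right_sparse); for a sparse input, the
+ // given left/right identity is substituted at unassigned positions. An
+ // input may not be both scalar and sparse. Depending on the mask density
+ // and the invert_mask descriptor, either a Theta(n) loop over the full
+ // local range or a Theta(nnz(mask)) loop over the mask pattern is chosen.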
+ template<
+ bool left_scalar,
+ bool right_scalar,
+ bool left_sparse,
+ bool right_sparse,
+ Descriptor descr,
+ class OP,
+#ifdef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_mask,
+ bool already_dense_input_x,
+ bool already_dense_input_y,
+#endif
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC masked_apply_generic(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_mask,
+ bool already_dense_input_x,
+ bool already_dense_input_y,
+#endif
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_z,
+ const Coords &local_mask,
+ const Coords &local_x,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &z_vector,
+ const Vector< MaskType, nonblocking, Coords > &mask_vector,
+ const internal::Wrapper< left_scalar, InputType1, Coords > x_wrapper,
+ const internal::Wrapper< right_scalar, InputType2, Coords > y_wrapper,
+ const OP &op,
+#ifdef GRB_BOOLEAN_DISPATCHER
+ const InputType1 * const left_identity,
+ const InputType2 * const right_identity
+#else
+ const InputType1 * const left_identity = nullptr,
+ const InputType2 * const right_identity = nullptr
+#endif
+ ) {
+#ifdef _DEBUG
+ std::cout << "In masked_apply_generic< " << left_scalar << ", "
+ << right_scalar << ", " << left_sparse << ", " << right_sparse << ", "
+ << descr << " > with lower_bound = " << lower_bound << " and upper_bound = "
+ << upper_bound << "\n";
+#endif
+ // assertions
+ static_assert( !(left_scalar && left_sparse),
+ "left_scalar and left_sparse cannot both be set!"
+ );
+ static_assert( !(right_scalar && right_sparse),
+ "right_scalar and right_sparse cannot both be set!"
+ );
+ assert( !left_sparse || left_identity != nullptr );
+ assert( !right_sparse || right_identity != nullptr );
+
+ // create local copies of the input const pointers
+ OutputType * __restrict__ const z_p = internal::getRaw( z_vector );
+ const MaskType * __restrict__ const mask_p = internal::getRaw( mask_vector );
+ const InputType1 * __restrict__ x_p = x_wrapper.getRaw();
+ const InputType2 * __restrict__ y_p = y_wrapper.getRaw();
+
+ const size_t local_n = upper_bound - lower_bound;
+ const size_t local_mask_nz = ( already_dense_mask )
+ ? local_n
+ : local_mask.nonzeroes();
+#ifdef _DEBUG
+ std::cout << "\tinternal::masked_apply_generic called with nnz(mask)="
+ << local_mask_nz << " and descriptor " << descr << "\n";
+ if( local_mask_nz > 0 ) {
+ std::cout << "\t\tNonzero mask indices: "
+ << ( already_dense_mask ? 0 : local_mask.index( 0 ) );
+ assert( already_dense_mask ||
+ local_mask.assigned( local_mask.index( 0 ) )
+ );
+ for( size_t k = 1; k < local_mask_nz; ++k ) {
+ std::cout << ", "
+ << ( ( already_dense_mask ) ? k : local_mask.index( k ) );
+ assert(
+ already_dense_mask ||
+ local_mask.assigned( local_mask.index( k ) )
+ );
+ }
+ std::cout << "\n";
+ }
+
+ size_t unset = 0;
+ for( size_t i = 0; i < local_n; ++i ) {
+ if( !( already_dense_mask || local_mask.assigned( i ) ) ) {
+ (void) ++unset;
+ }
+ }
+ assert( unset == local_n - local_mask_nz );
+#endif
+ // whether to use a Theta(n) or a Theta(nnz(mask)) loop
+ const bool bigLoop = local_mask_nz == local_n ||
+ (descr & descriptors::invert_mask);
+
+ // get block size
+ constexpr size_t size_t_block_size = config::SIMD_SIZE::value() /
+ sizeof( size_t );
+ constexpr size_t op_block_size = OP::blocksize;
+ constexpr size_t min_block_size = op_block_size > size_t_block_size
+ ? size_t_block_size
+ : op_block_size;
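+ // for example, assuming a 64-byte SIMD width (config::SIMD_SIZE) and an
+ // 8-byte size_t, size_t_block_size equals 8; if OP::blocksize were 4,
+ // min_block_size would be min( 4, 8 ) = 4. Both quantities are platform-
+ // and operator-dependent, so these numbers are purely illustrative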
+
+ if( bigLoop ) {
+#ifdef _DEBUG
+ std::cerr << "\t in bigLoop variant\n";
+#endif
+ size_t i = lower_bound;
+
+ constexpr const size_t block_size = op_block_size;
+ const size_t num_blocks = local_n / block_size;
+ const size_t start = 0;
+ const size_t end = num_blocks;
+
+ // declare buffers that fit in a single SIMD register and initialise if
+ // needed
+ bool mask_b[ block_size ];
+ OutputType z_b[ block_size ];
+ InputType1 x_b[ block_size ];
+ InputType2 y_b[ block_size ];
+ for( size_t k = 0; k < block_size; ++k ) {
+ if( left_scalar ) {
+ x_b[ k ] = x_wrapper.getValue();
+ }
+ if( right_scalar ) {
+ y_b[ k ] = y_wrapper.getValue();
+ }
+ }
+ for( size_t b = start; b < end; ++b ) {
+ for( size_t k = 0; k < block_size; ++k ) {
+ const size_t index = i + k;
+ assert( index < local_n + lower_bound );
+ if( already_dense_mask ) {
+ mask_b[ k ] = internal::getCoordinates( mask_vector ).template
+ mask< descr >( index, mask_p );
+ } else {
+ mask_b[ k ] = local_mask.template
+ mask< descr >( index - lower_bound, mask_p + lower_bound );
+ }
+ }
+ // check for no output
+ if( left_sparse && right_sparse ) {
+ for( size_t k = 0; k < block_size; ++k ) {
+ const size_t index = i + k;
+ assert( index < local_n + lower_bound );
+ if( mask_b[ k ] ) {
+ if( !( already_dense_input_x ||
+ local_x.assigned( index - lower_bound )
+ ) && !(
+ already_dense_input_y ||
+ local_y.assigned( index - lower_bound )
+ )
+ ) {
+ mask_b[ k ] = false;
+ }
+ }
+ }
+ }
+ for( size_t k = 0; k < block_size; ++k ) {
+ const size_t index = i + k;
+ assert( index < local_n + lower_bound );
+ if( mask_b[ k ] ) {
+ if( !left_scalar ) {
+ if( left_sparse && !(
+ already_dense_input_x || local_x.assigned( index - lower_bound )
+ ) ) {
+ x_b[ k ] = *left_identity;
+ } else {
+ x_b[ k ] = *( x_p + index );
+ }
+ }
+ if( !right_scalar ) {
+ if( right_sparse && !(
+ already_dense_input_y || local_y.assigned( index - lower_bound )
+ ) ) {
+ y_b[ k ] = *right_identity;
+ } else {
+ y_b[ k ] = *( y_p + index );
+ }
+ }
+ }
+ }
+ for( size_t k = 0; k < block_size; ++k ) {
+ if( mask_b[ k ] ) {
+ apply( z_b[ k ], x_b[ k ], y_b[ k ], op );
+ }
+ }
+ for( size_t k = 0; k < block_size; ++k ) {
+ const size_t index = i + k;
+ assert( index < local_n + lower_bound );
+ if( mask_b[ k ] ) {
+ (void) local_z.assign( index - lower_bound );
+ GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // This is only triggered with
+ *( z_p + index ) = z_b[ k ]; // mask_b[ k ], which in the above
+ GRB_UTIL_RESTORE_WARNINGS // loop also triggers initialising
+ // z_b[ k ]
+ }
+ }
+
+ i += block_size;
+ }
+ // scalar coda
+ for(
+ size_t i = end * block_size + lower_bound;
+ i < local_n + lower_bound;
+ ++i
+ ) {
+ if( already_dense_mask ) {
+ if( !internal::getCoordinates( mask_vector ).template mask< descr >( i,
+ mask_p )
+ ) {
+ continue;
+ }
+ } else {
+ if( !local_mask.template mask< descr >( i - lower_bound, mask_p +
+ lower_bound )
+ ) {
+ continue;
+ }
+ }
+
+ if( left_sparse && right_sparse ) {
+ if( !( already_dense_input_x || local_x.assigned( i - lower_bound ) ) &&
+ !( already_dense_input_y || local_y.assigned( i - lower_bound ) )
+ ) {
+ continue;
+ }
+ }
+ (void) local_z.assign( i - lower_bound );
+ const InputType1 x_e = left_scalar
+ ? x_wrapper.getValue()
+ : ( (!left_sparse || already_dense_input_x ||
+ local_x.assigned( i - lower_bound ))
+ ? *(x_p + i)
+ : *left_identity
+ );
+ const InputType2 y_e = right_scalar
+ ? y_wrapper.getValue()
+ : ( (!right_sparse || already_dense_input_y ||
+ local_y.assigned( i - lower_bound ))
+ ? *(y_p + i)
+ : *right_identity
+ );
+ OutputType * const z_e = z_p + i;
+ apply( *z_e, x_e, y_e, op );
+ }
+ } else {
+#ifdef _DEBUG
+ std::cerr << "\t in smallLoop variant\n";
+#endif
+ // declare buffers that fit in a single SIMD register and initialise if
+ // needed
+ constexpr const size_t block_size = size_t_block_size > 0
+ ? min_block_size
+ : op_block_size;
+ bool mask_b[ block_size ];
+ OutputType z_b[ block_size ];
+ InputType1 x_b[ block_size ];
+ InputType2 y_b[ block_size ];
+ size_t indices[ block_size ];
+ for( size_t k = 0; k < block_size; ++k ) {
+ if( left_scalar ) {
+ x_b[ k ] = x_wrapper.getValue();
+ }
+ if( right_scalar ) {
+ y_b[ k ] = y_wrapper.getValue();
+ }
+ }
+
+ // loop over mask pattern
+ const size_t mask_nnz = local_mask_nz;
+ const size_t num_blocks = mask_nnz / block_size;
+ const size_t start = 0;
+ const size_t end = num_blocks;
+
+ size_t k = 0;
+
+ // vectorised code
+ for( size_t b = start; b < end; ++b ) {
+ for( size_t t = 0; t < block_size; ++t ) {
+ indices[ t ] = ( already_dense_mask ) ? k + t : local_mask.index( k + t );
+ }
+ for( size_t t = 0; t < block_size; ++t ) {
+ if( already_dense_mask ) {
+ mask_b[ t ] = internal::getCoordinates( mask_vector ).template
+ mask< descr >( indices[ t ], mask_p );
+ } else {
+ mask_b[ t ] = local_mask.template
+ mask< descr >( indices[ t ], mask_p + lower_bound );
+ }
+ }
+ for( size_t t = 0; t < block_size; ++t ) {
+ if( mask_b[ t ] ) {
+ if( !left_scalar ) {
+ if( left_sparse && !( already_dense_input_x ||
+ local_x.assigned( indices[ t ] ) )
+ ) {
+ x_b[ t ] = *left_identity;
+ } else {
+ x_b[ t ] = *( x_p + indices[ t ] + lower_bound );
+ }
+ }
+ if( !right_scalar ) {
+ if( right_sparse && !( already_dense_input_y ||
+ local_y.assigned( indices[ t ] ) )
+ ) {
+ y_b[ t ] = *right_identity;
+ } else {
+ y_b[ t ] = *( y_p + indices[ t ] + lower_bound );
+ }
+ }
+ }
+ }
+ // check for no output
+ if( left_sparse && right_sparse ) {
+ for( size_t t = 0; t < block_size; ++t ) {
+ const size_t index = indices[ t ];
+ assert( index < local_n + lower_bound );
+ if( mask_b[ t ] ) {
+ if( !( already_dense_input_x || local_x.assigned( index ) ) &&
+ !( already_dense_input_y || local_y.assigned( index ) )
+ ) {
+ mask_b[ t ] = false;
+ }
+ }
+ }
+ }
+ for( size_t t = 0; t < block_size; ++t ) {
+ if( mask_b[ t ] ) {
+ apply( z_b[ t ], x_b[ t ], y_b[ t ], op );
+ }
+ }
+ for( size_t t = 0; t < block_size; ++t ) {
+ if( mask_b[ t ] ) {
+ (void) local_z.assign( indices[ t ] );
+ GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // z_b is computed from
+ *( z_p + indices[ t ] + lower_bound ) = z_b[ t ]; // x_b and y_b, which
+ GRB_UTIL_RESTORE_WARNINGS // are both initialised
+ // if mask_b is true
+ }
+ }
+
+ k += block_size;
+ }
+
+ // scalar coda
+ for( size_t k = end * block_size; k < mask_nnz; ++k ) {
+ const size_t i = already_dense_mask
+ ? k + lower_bound
+ : local_mask.index( k ) + lower_bound;
+ // consult either the dense mask view or the local subset, never both
+ if( already_dense_mask
+ ? internal::getCoordinates( mask_vector ).template mask< descr >(
+ i, mask_p
+ )
+ : local_mask.template mask< descr >(
+ i - lower_bound, mask_p + lower_bound
+ )
+ ) {
+ if( left_sparse && right_sparse ) {
+ if( !( already_dense_input_x || local_x.assigned( i - lower_bound ) ) &&
+ !( already_dense_input_y || local_y.assigned( i - lower_bound ) )
+ ) {
+ continue;
+ }
+ }
+ (void) local_z.assign( i - lower_bound );
+ const InputType1 x_e = left_scalar
+ ? x_wrapper.getValue()
+ : (
+ (!left_sparse || already_dense_input_x ||
+ local_x.assigned( i - lower_bound ) )
+ ? *(x_p + i)
+ : *left_identity
+ );
+ const InputType2 y_e = right_scalar
+ ? y_wrapper.getValue()
+ : (
+ (!right_sparse || already_dense_input_y ||
+ local_y.assigned( i - lower_bound ) )
+ ? *(y_p + i)
+ : *right_identity
+ );
+ OutputType * const z_e = z_p + i;
+ apply( *z_e, x_e, y_e, op );
+ }
+ }
+ }
+ return SUCCESS;
+ }
+
+ } // end namespace ``grb::internal''
+
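+ // The below operator variant computes z = x .op. beta element-wise and
+ // registers the computation as a lazily-executed nonblocking pipeline
+ // stage. A minimal usage sketch (names and sizes illustrative only,
+ // assuming the standard add operator over doubles):
+ //
+ //   grb::Vector< double, grb::nonblocking > x( n ), z( n );
+ //   grb::operators::add< double > op;
+ //   grb::RC rc = grb::eWiseApply( z, x, 1.5, op );
+ //
+ // The call only enqueues the stage; execution is deferred until the
+ // nonblocking backend must materialise z.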
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const InputType2 beta,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value, void
+ >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D1, InputType1 >::value ), "grb::eWiseApply",
+ "called with a left-hand input element type that does not match the "
+ "first domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D2, InputType2 >::value ), "grb::eWiseApply",
+ "called with a right-hand input element type that does not match the "
+ "second domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D3, OutputType >::value ), "grb::eWiseApply",
+ "called with an output element type that does not match the "
+ "third domain of the given operator" );
+#ifdef _DEBUG
+ std::cout << "In eWiseApply ([T1]<-[T2]<-T3), operator variant\n";
+#endif
+ // sanity check
+ auto &z_coors = internal::getCoordinates( z );
+ const size_t n = z_coors.size();
+ if( internal::getCoordinates( x ).size() != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func =
+ [&z, &x, beta, &op] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage eWiseApply(z, x, beta, operator) in "
+ << "the range(" << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+ const Coords * const local_null_mask = nullptr;
+
+ Coords local_mask, local_x, local_y, local_z;
+ const size_t local_n = upper_bound - lower_bound;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ bool already_dense_input_x = true;
+
+ size_t local_x_nz = local_n;
+
+ if( !already_dense_vectors ) {
+ local_z = internal::getCoordinates( z ).asyncSubset( lower_bound,
+ upper_bound );
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_input_x = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_input_x ) {
+#else
+ already_dense_input_x = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ const internal::Wrapper< false, InputType1, Coords > x_wrapper( x );
+ const internal::Wrapper< true, InputType2, Coords > y_wrapper( beta );
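+
+ // the boolean template argument of internal::Wrapper selects scalar
+ // semantics: x_wrapper exposes the vector x via getRaw(), whereas
+ // y_wrapper broadcasts the scalar beta via getValue()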
+
+ // the global stack counter must be reset to 0 unless it is guaranteed
+ // that neither local_clear nor local_assignAll will be invoked:
+ // - local_clear is not invoked when the dense descriptor is given,
+ //   since the output vector will eventually become dense
+ // - local_assignAll is not invoked when the output vector is already dense
+ // the condition below therefore relies on global information only,
+ // i.e., the dense descriptor and the already-dense status of the output
+ if( !already_dense_vectors ) {
+ if( lower_bound == 0 ) {
+ internal::getCoordinates( z ).reset_global_nnz_counter();
+ }
+ }
+
+ if( local_x_nz == local_n ) {
+ if( !already_dense_vectors ) {
+ local_z.local_assignAll();
+ }
+
+ // call dense apply
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_dense_apply_generic<
+#else
+ rc = internal::dense_apply_generic<
+#endif
+ false, true, false, false, descr | descriptors::dense, OP,
+ OutputType, InputType1, InputType2, Coords
+ >(
+ already_dense_input_x, true,
+ lower_bound, upper_bound,
+ local_x, local_y,
+ z, x_wrapper, y_wrapper,
+ op
+ );
+ } else {
+ if( !already_dense_vectors ) {
+ local_z.local_clear();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) );
+#endif
+ }
+
+ // since z and x may not perfectly overlap, and since the intersection is
+ // unknown a priori, we must iterate over the nonzeroes of x
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_sparse_apply_generic<
+#else
+ rc = internal::sparse_apply_generic<
+#endif
+ false, false, false, true, descr, OP,
+ OutputType, bool, InputType1, InputType2, Coords
+ >(
+ true, already_dense_input_x, true,
+ lower_bound, upper_bound,
+ local_z, local_null_mask, local_x, local_y,
+ z, null_mask, x_wrapper, y_wrapper, op
+ );
+ }
+
+ if( !already_dense_vectors ) {
+ internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_EWISEAPPLY,
+ n, sizeof( OutputType ), dense_descr, true,
+ &z, nullptr, &internal::getCoordinates( z ), nullptr,
+ &x, nullptr, nullptr, nullptr,
+ &internal::getCoordinates( x ), nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: eWiseApply(z, x, beta, operator)"
+ << std::endl;
+#endif
+
+ return ret;
+ }
+
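+ // the below variant takes two scalar inputs: it computes alpha .op. beta
+ // once and then broadcasts the result to z by delegating to set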
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value, void
+ >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D1, InputType1 >::value ), "grb::eWiseApply",
+ "called with a left-hand input element type that does not match the "
+ "first domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D2, InputType2 >::value ), "grb::eWiseApply",
+ "called with a right-hand input element type that does not match the "
+ "second domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D3, OutputType >::value ), "grb::eWiseApply",
+ "called with an output element type that does not match the "
+ "third domain of the given operator" );
+#ifdef _DEBUG
+ std::cout << "In eWiseApply ([T1]<-T2<-T3), operator variant\n";
+#endif
+ if( (descr & descriptors::dense) && nnz( z ) < size( z ) ) {
+ return ILLEGAL;
+ }
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ typename OP::D3 val;
+ RC ret = apply< descr >( val, alpha, beta, op );
+ ret = ret ? ret : set< descr >( z, val );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value, void
+ >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D1, InputType1 >::value ), "grb::eWiseApply",
+ "called with a left-hand input element type that does not match the "
+ "first domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D2, InputType2 >::value ), "grb::eWiseApply",
+ "called with a right-hand input element type that does not match the "
+ "second domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D3, OutputType >::value ), "grb::eWiseApply",
+ "called with an output element type that does not match the "
+ "third domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ), "grb::eWiseApply",
+ "called with an output mask element type that is not Boolean " );
+#ifdef _DEBUG
+ std::cout << "In masked eWiseApply ([T1]<-T2<-T3), operator variant\n";
+#endif
+ // check trivial dispatch
+ if( size( mask ) == 0 ) {
+ return eWiseApply< descr >( z, alpha, beta, op, phase );
+ }
+
+ // dynamic checks
+ if( size( mask ) != size( z ) ) {
+ return MISMATCH;
+ }
+ if( (descr & descriptors::dense) &&
+ ( nnz( z ) < size( z ) || nnz( mask ) < size( mask ) )
+ ) {
+ return ILLEGAL;
+ }
+
+ // check trivial dispatch
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ typename OP::D3 val;
+ RC ret = apply< descr >( val, alpha, beta, op );
+ ret = ret ? ret : set< descr >( z, mask, val );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D1, InputType1 >::value ), "grb::eWiseApply",
+ "called with a left-hand input element type that does not match the "
+ "first domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D2, InputType2 >::value ), "grb::eWiseApply",
+ "called with a right-hand input element type that does not match the "
+ "second domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D3, OutputType >::value ), "grb::eWiseApply",
+ "called with an output element type that does not match the "
+ "third domain of the given monoid" );
+#ifdef _DEBUG
+ std::cout << "In eWiseApply ([T1]<-T2<-T3), monoid variant\n";
+#endif
+ // simply delegate to operator variant
+ return eWiseApply< descr >( z, alpha, beta, monoid.getOperator(), phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value, void
+ >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D1, InputType1 >::value ), "grb::eWiseApply",
+ "called with a left-hand input element type that does not match the "
+ "first domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D2, InputType2 >::value ), "grb::eWiseApply",
+ "called with a right-hand input element type that does not match the "
+ "second domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D3, OutputType >::value ), "grb::eWiseApply",
+ "called with an output element type that does not match the "
+ "third domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ), "grb::eWiseApply",
+ "called with an output mask element type that is not Boolean " );
+#ifdef _DEBUG
+ std::cout << "In masked eWiseApply ([T1]<-T2<-T3), monoid variant\n";
+#endif
+ // simply delegate to operator variant
+ return eWiseApply< descr >( z, mask, alpha, beta, monoid.getOperator(),
+ phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP,
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const InputType2 beta,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D1, InputType1 >::value ), "grb::eWiseApply",
+ "called with a left-hand input element type that does not match the "
+ "first domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D2, InputType2 >::value ), "grb::eWiseApply",
+ "called with a right-hand input element type that does not match the "
+ "second domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D3, OutputType >::value ), "grb::eWiseApply",
+ "called with an output element type that does not match the "
+ "third domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ), "grb::eWiseApply",
+ "called with an output mask element type that is not Boolean " );
+#ifdef _DEBUG
+ std::cout << "In masked eWiseApply ([T1]<-[T2]<-T3, using operator)\n";
+#endif
+ // check for empty mask
+ if( size( mask ) == 0 ) {
+ return eWiseApply< descr >( z, x, beta, op, phase );
+ }
+
+ // other run-time checks
+ const size_t n = internal::getCoordinates( z ).size();
+ if( internal::getCoordinates( x ).size() != n ) {
+ return MISMATCH;
+ }
+ if( internal::getCoordinates( mask ).size() != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+ constexpr const bool dense_mask = dense_descr &&
+ (descr & descriptors::structural) && !(descr & descriptors::invert_mask);
+
+ internal::Pipeline::stage_type func =
+ [&z, &mask, &x, beta, &op] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage eWiseApply(z, mask, x, beta, "
+ << "operator) in the range(" << lower_bound << ", " << upper_bound << ")"
+ << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_mask, local_x, local_y, local_z;
+ const size_t local_n = upper_bound - lower_bound;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ const bool mask_is_dense = (descr & descriptors::structural) &&
+ !(descr & descriptors::invert_mask) && already_dense_vectors;
+
+ bool already_dense_mask = true;
+ bool already_dense_input_x = true;
+
+ size_t local_mask_nz = local_n;
+ size_t local_x_nz = local_n;
+
+ if( !mask_is_dense ) {
+ local_z = internal::getCoordinates( z ).asyncSubset( lower_bound,
+ upper_bound );
+ if( dense_descr && local_z.nonzeroes() < local_n ) {
+ return ILLEGAL;
+ }
+ }
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_mask = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( mask ) );
+ if( !already_dense_mask ) {
+#else
+ already_dense_mask = false;
+#endif
+ local_mask = internal::getCoordinates( mask ).asyncSubset( lower_bound,
+ upper_bound );
+ local_mask_nz = local_mask.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input_x = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_input_x ) {
+#else
+ already_dense_input_x = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ const internal::Wrapper< false, InputType1, Coords > x_wrapper( x );
+ const internal::Wrapper< true, InputType2, Coords > y_wrapper( beta );
+
+ if( !mask_is_dense ) {
+ // the output sparsity structure is implied by mask and descr
+ local_z.local_clear();
+ if( lower_bound == 0 ) {
+ internal::getCoordinates( z ).reset_global_nnz_counter();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) );
+#endif
+ if( dense_descr ) {
+ pipeline.markMaybeSparseDenseDescriptorVerification(
+ &internal::getCoordinates( z ) );
+ }
+ }
+ }
+
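+ // dispatch heuristic: if the dense descriptor is given, if x is locally
+ // dense, or if the mask holds no more nonzeroes than x, loop over the
+ // mask via masked_apply_generic; otherwise looping over the nonzeroes
+ // of x via sparse_apply_generic is the cheaper option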
+ if(
+ (descr & descriptors::dense) ||
+ (local_x_nz == local_n) ||
+ (local_mask_nz <= local_x_nz)
+ ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_masked_apply_generic<
+#else
+ rc = internal::masked_apply_generic<
+#endif
+ false, true, false, false, descr, OP,
+ OutputType, MaskType, InputType1, InputType2, Coords
+ >(
+ already_dense_mask, already_dense_input_x, true,
+ lower_bound, upper_bound,
+ local_z, local_mask, local_x, local_y,
+ z, mask, x_wrapper, y_wrapper,
+ op
+ );
+ } else {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_sparse_apply_generic<
+#else
+ rc = internal::sparse_apply_generic<
+#endif
+ true, false, false, true, descr, OP,
+ OutputType, bool, InputType1, InputType2, Coords
+ >(
+ already_dense_mask, already_dense_input_x, true,
+ lower_bound, upper_bound,
+ local_z, &local_mask, local_x, local_y,
+ z, &mask, x_wrapper, y_wrapper,
+ op
+ );
+ }
+
+ if( !mask_is_dense ) {
+ internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_MASKED_EWISEAPPLY,
+ n, sizeof( OutputType ), dense_descr, dense_mask,
+ &z, nullptr, &internal::getCoordinates( z ), nullptr,
+ &x, &mask, nullptr, nullptr,
+ &internal::getCoordinates( x ), &internal::getCoordinates( mask ),
+ nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: eWiseApply(z, mask, x, beta, "
+ << "operator)" << std::endl;
+#endif
+ return ret;
+ }
+
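+ // note on the below monoid variant: under monoid semantics the output
+ // pattern is the union of the two input patterns, with the monoid
+ // identity standing in for a missing operand; the dispatch to
+ // sparse_apply_generic below therefore covers both input patterns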
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D1, InputType1 >::value ), "grb::eWiseApply",
+ "called with a left-hand input element type that does not match the "
+ "first domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D2, InputType2 >::value ), "grb::eWiseApply",
+ "called with a right-hand input element type that does not match the "
+ "second domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D3, OutputType >::value ), "grb::eWiseApply",
+ "called with an output element type that does not match the "
+ "third domain of the given monoid" );
+#ifdef _DEBUG
+ std::cout << "In unmasked eWiseApply ([T1]<-[T2]<-[T3], using monoid)\n";
+#endif
+ // other run-time checks
+ const size_t n = internal::getCoordinates( z ).size();
+ if( internal::getCoordinates( x ).size() != n ) {
+ return MISMATCH;
+ }
+ if( internal::getCoordinates( y ).size() != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ // check if we can dispatch to dense variant
+ if( (descr & descriptors::dense) ) {
+ return eWiseApply< descr >( z, x, y, monoid.getOperator() );
+ }
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func = [&z, &x, &y, &monoid] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage eWiseApply(z, x, y, monoid) in the "
+ << "range(" << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+ const Coords * const local_null_mask = nullptr;
+
+ Coords local_x, local_y, local_z;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ bool already_dense_input_x = true;
+ bool already_dense_input_y = true;
+
+ if( !already_dense_vectors ) {
+ local_z = internal::getCoordinates( z ).asyncSubset( lower_bound,
+ upper_bound );
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_input_x = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_input_x ) {
+#else
+ already_dense_input_x = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input_y = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_input_y ) {
+#else
+ already_dense_input_y = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ const internal::Wrapper< false, InputType1, Coords > x_wrapper( x );
+ const internal::Wrapper< false, InputType2, Coords > y_wrapper( y );
+
+ // we are in the unmasked sparse variant
+ const auto op = monoid.getOperator();
+
+ if( !already_dense_vectors ) {
+ // z will have an a-priori unknown sparsity structure
+ local_z.local_clear();
+ if( lower_bound == 0 ) {
+ internal::getCoordinates( z ).reset_global_nnz_counter();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) );
+#endif
+ }
+ }
+
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_sparse_apply_generic<
+#else
+ rc = internal::sparse_apply_generic<
+#endif
+ false, true, false, false, descr, typename Monoid::Operator,
+ OutputType, bool, InputType1, InputType2, Coords
+ >(
+ true, already_dense_input_x, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_z, local_null_mask, local_x, local_y,
+ z, null_mask, x_wrapper, y_wrapper,
+ op
+ );
+
+ if( !already_dense_vectors ) {
+ internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_EWISEAPPLY,
+ n, sizeof( OutputType ), dense_descr, true,
+ &z, nullptr, &internal::getCoordinates( z ), nullptr,
+ &x, &y, nullptr, nullptr,
+ &internal::getCoordinates( x ), &internal::getCoordinates( y ),
+ nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: eWiseApply(z, x, y, monoid)"
+ << std::endl;
+#endif
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const InputType1 alpha,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D1, InputType1 >::value ), "grb::eWiseApply",
+ "called with a left-hand input element type that does not match the "
+ "first domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D2, InputType2 >::value ), "grb::eWiseApply",
+ "called with a right-hand input element type that does not match the "
+ "second domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D3, OutputType >::value ), "grb::eWiseApply",
+ "called with an output element type that does not match the "
+ "third domain of the given monoid" );
+#ifdef _DEBUG
+ std::cout << "In unmasked eWiseApply ([T1]<-T2<-[T3], using monoid)\n";
+#endif
+ // other run-time checks
+ const size_t n = internal::getCoordinates( z ).size();
+ if( internal::getCoordinates( y ).size() != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ RC ret = SUCCESS;
+
+ // check if we can dispatch to dense variant
+ if( (descr & descriptors::dense) ) {
+ return eWiseApply< descr >( z, alpha, y, monoid.getOperator() );
+ }
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func = [&z, alpha, &y, &monoid] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage eWiseApply(z, alpha, y, monoid) in the "
+ << "range(" << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_x, local_y, local_z;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ bool already_dense_output = true;
+#endif
+ bool already_dense_input_y = true;
+
+ // since the output is guaranteed to become dense, the only criterion
+ // for skipping the read of the local coordinates is whether the output
+ // is already dense
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( z ) );
+ if( !already_dense_output ) {
+#endif
+ local_z = internal::getCoordinates( z ).asyncSubset( lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_input_y = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_input_y ) {
+#else
+ already_dense_input_y = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ const internal::Wrapper< true, InputType1, Coords > x_wrapper( alpha );
+ const internal::Wrapper< false, InputType2, Coords > y_wrapper( y );
+
+ // we are in the unmasked sparse variant
+ const auto &op = monoid.getOperator();
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#endif
+ local_z.local_assignAllNotAlreadyAssigned();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+
+ // dispatch to generic function
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_dense_apply_generic<
+#else
+ rc = internal::dense_apply_generic<
+#endif
+ true, false, false, true, descr, typename Monoid::Operator,
+ OutputType, InputType1, InputType2, Coords
+ >(
+ true, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_x, local_y,
+ z, x_wrapper, y_wrapper, op
+ );
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#else
+ if( !already_dense_vectors ) {
+#endif
+ internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_EWISEAPPLY,
+ n, sizeof( OutputType ), dense_descr, true,
+ &z, nullptr, &internal::getCoordinates( z ), nullptr,
+ &y, nullptr, nullptr, nullptr,
+ &internal::getCoordinates( y ), nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: eWiseApply(z, alpha, y, monoid)"
+ << std::endl;
+#endif
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const InputType2 beta,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value,
+ void >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D1, InputType1 >::value ), "grb::eWiseApply",
+ "called with a left-hand input element type that does not match the "
+ "first domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D2, InputType2 >::value ), "grb::eWiseApply",
+ "called with a right-hand input element type that does not match the "
+ "second domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D3, OutputType >::value ), "grb::eWiseApply",
+ "called with an output element type that does not match the "
+ "third domain of the given monoid" );
+#ifdef _DEBUG
+ std::cout << "In unmasked eWiseApply ([T1]<-[T2]<-T3, using monoid)\n";
+#endif
+ // other run-time checks
+ const size_t n = internal::getCoordinates( z ).size();
+ if( internal::getCoordinates( x ).size() != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ // check if we can dispatch to dense variant
+ if( (descr & descriptors::dense) ) {
+ return eWiseApply< descr >( z, x, beta, monoid.getOperator() );
+ }
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func = [&z, &x, beta, &monoid] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage eWiseApply(z, x, beta, monoid) in the "
+ << "range(" << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_x, local_y, local_z;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ bool already_dense_output = true;
+#endif
+ bool already_dense_input_x = true;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( z ) );
+ if( !already_dense_output ) {
+#endif
+ local_z = internal::getCoordinates( z ).asyncSubset( lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_input_x = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_input_x ) {
+#else
+ already_dense_input_x = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ const internal::Wrapper< false, InputType1, Coords > x_wrapper( x );
+ const internal::Wrapper< true, InputType2, Coords > y_wrapper( beta );
+
+ // we are in the unmasked sparse variant
+ const auto &op = monoid.getOperator();
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#endif
+ // the result will always be dense
+ local_z.local_assignAllNotAlreadyAssigned();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+
+ // dispatch
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_dense_apply_generic<
+#else
+ rc = internal::dense_apply_generic<
+#endif
+ false, true, true, false, descr, typename Monoid::Operator,
+ OutputType, InputType1, InputType2, Coords
+ >(
+ already_dense_input_x, true,
+ lower_bound, upper_bound,
+ local_x, local_y,
+ z, x_wrapper, y_wrapper,
+ op
+ );
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#else
+ if( !already_dense_vectors ) {
+#endif
+ internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_EWISEAPPLY,
+ n, sizeof( OutputType ), dense_descr, true,
+ &z, nullptr, &internal::getCoordinates( z ), nullptr,
+ &x, nullptr, nullptr, nullptr,
+ &internal::getCoordinates( x ), nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: eWiseApply(z, x, beta, monoid)"
+ << std::endl;
+#endif
+ return ret;
+ }
+
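+ // the below masked monoid variant extracts both identities up front so
+ // that masked_apply_generic may substitute them at unassigned entries of
+ // a sparse input; which identity is required depends on the local
+ // sparsity of x and y, as decided by the dispatch within the stage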
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D1, InputType1 >::value ), "grb::eWiseApply",
+ "called with a left-hand input element type that does not match the "
+ "first domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D2, InputType2 >::value ), "grb::eWiseApply",
+ "called with a right-hand input element type that does not match the "
+ "second domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D3, OutputType >::value ), "grb::eWiseApply",
+ "called with an output element type that does not match the "
+ "third domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ), "grb::eWiseApply",
+ "called with an output mask element type that is not Boolean " );
+#ifdef _DEBUG
+ std::cout << "In masked eWiseApply ([T1]<-[T2]<-[T3], using monoid)\n";
+#endif
+ if( size( mask ) == 0 ) {
+ return eWiseApply< descr >( z, x, y, monoid, phase );
+ }
+
+ // other run-time checks
+ const size_t n = internal::getCoordinates( z ).size();
+ if( internal::getCoordinates( x ).size() != n ) {
+ return MISMATCH;
+ }
+ if( internal::getCoordinates( y ).size() != n ) {
+ return MISMATCH;
+ }
+ if( internal::getCoordinates( mask ).size() != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ // check if we can dispatch to dense variant
+ if( (descr & descriptors::dense) ) {
+ return eWiseApply< descr >( z, mask, x, y, monoid.getOperator() );
+ }
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+ constexpr const bool dense_mask = dense_descr &&
+ (descr & descriptors::structural) && !(descr & descriptors::invert_mask);
+
+ internal::Pipeline::stage_type func = [&z, &mask, &x, &y, &monoid] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage eWiseApply(z, mask, x, y, monoid) in "
+ << "the range(" << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_mask, local_x, local_y, local_z;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_mask_nz = local_n;
+ size_t local_x_nz = local_n;
+ size_t local_y_nz = local_n;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ const bool mask_is_dense = (descr & descriptors::structural) &&
+ !(descr & descriptors::invert_mask) && already_dense_vectors;
+
+ bool already_dense_mask = true;
+ bool already_dense_input_x = true;
+ bool already_dense_input_y = true;
+
+ if( !mask_is_dense ) {
+ local_z = internal::getCoordinates( z ).asyncSubset( lower_bound,
+ upper_bound );
+ if( dense_descr && local_z.nonzeroes() < local_n ) {
+ return ILLEGAL;
+ }
+ }
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_mask = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( mask ) );
+ if( !already_dense_mask ) {
+#else
+ already_dense_mask = false;
+#endif
+ local_mask = internal::getCoordinates( mask ).asyncSubset( lower_bound,
+ upper_bound );
+ local_mask_nz = local_mask.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input_x = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_input_x ) {
+#else
+ already_dense_input_x = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input_y = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_input_y ) {
+#else
+ already_dense_input_y = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ const internal::Wrapper< false, InputType1, Coords > x_wrapper( x );
+ const internal::Wrapper< false, InputType2, Coords > y_wrapper( y );
+
+ // we are in the masked sparse variant
+ const InputType1 left_identity = monoid.template getIdentity< InputType1 >();
+ const InputType2 right_identity =
+ monoid.template getIdentity< InputType2 >();
+ const auto &op = monoid.getOperator();
+
+ if( !mask_is_dense ) {
+ // z will have an a priori unknown sparsity structure
+ local_z.local_clear();
+ if( lower_bound == 0 ) {
+ internal::getCoordinates( z ).reset_global_nnz_counter();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) );
+#endif
+ if( dense_descr ) {
+ pipeline.markMaybeSparseDenseDescriptorVerification(
+ &internal::getCoordinates( z ) );
+ }
+ }
+ }
+
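+ // dispatch heuristic: if both inputs are locally sparse and together
+ // hold fewer nonzeroes than the mask, loop over the union of the input
+ // patterns via sparse_apply_generic; otherwise loop over the mask,
+ // passing the monoid identity for each input that may still be sparse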
+ if( local_x_nz < local_n &&
+ local_y_nz < local_n &&
+ local_x_nz + local_y_nz < local_mask_nz
+ ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_sparse_apply_generic<
+#else
+ rc = internal::sparse_apply_generic<
+#endif
+ true, true, false, false, descr, typename Monoid::Operator,
+ OutputType, bool, InputType1, InputType2, Coords
+ >(
+ already_dense_mask, already_dense_input_x, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_z, &local_mask, local_x, local_y,
+ z, &mask, x_wrapper, y_wrapper,
+ op
+ );
+ } else if( local_x_nz < local_n && local_y_nz == local_n ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_masked_apply_generic<
+#else
+ rc = internal::masked_apply_generic<
+#endif
+ false, false, true, false, descr, typename Monoid::Operator,
+ OutputType, MaskType, InputType1, InputType2, Coords
+ >(
+ already_dense_mask, already_dense_input_x, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_z, local_mask, local_x, local_y,
+ z, mask, x_wrapper, y_wrapper,
+ op,
+ &left_identity, nullptr
+ );
+ } else if( local_y_nz < local_n && local_x_nz == local_n ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_masked_apply_generic<
+#else
+ rc = internal::masked_apply_generic<
+#endif
+ false, false, false, true, descr, typename Monoid::Operator,
+ OutputType, MaskType, InputType1, InputType2, Coords
+ >(
+ already_dense_mask, already_dense_input_x, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_z, local_mask, local_x, local_y,
+ z, mask, x_wrapper, y_wrapper,
+ op,
+ nullptr, &right_identity
+ );
+ } else {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_masked_apply_generic<
+#else
+ rc = internal::masked_apply_generic<
+#endif
+ false, false, true, true, descr, typename Monoid::Operator,
+ OutputType, MaskType, InputType1, InputType2, Coords
+ >(
+ already_dense_mask, already_dense_input_x, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_z, local_mask, local_x, local_y,
+ z, mask, x_wrapper, y_wrapper,
+ op,
+ &left_identity, &right_identity
+ );
+ }
+
+ if( !mask_is_dense ) {
+ internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_MASKED_EWISEAPPLY,
+ n, sizeof( OutputType ), dense_descr, dense_mask,
+ &z, nullptr, &internal::getCoordinates( z ), nullptr,
+ &x, &y, &mask, nullptr,
+ &internal::getCoordinates( x ), &internal::getCoordinates( y ),
+ &internal::getCoordinates( mask ), nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: eWiseApply(z, mask, x, y, "
+ << "monoid)" << std::endl;
+#endif
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const InputType1 alpha,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value,
+ void >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D1, InputType1 >::value ), "grb::eWiseApply",
+ "called with a left-hand input element type that does not match the "
+ "first domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D2, InputType2 >::value ), "grb::eWiseApply",
+ "called with a right-hand input element type that does not match the "
+ "second domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D3, OutputType >::value ), "grb::eWiseApply",
+ "called with an output element type that does not match the "
+ "third domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ), "grb::eWiseApply",
+ "called with an output mask element type that is not Boolean " );
+#ifdef _DEBUG
+ std::cout << "In masked eWiseApply ([T1]<-T2<-[T3], using monoid)\n";
+#endif
+ if( size( mask ) == 0 ) {
+ return eWiseApply< descr >( z, alpha, y, monoid );
+ }
+
+ // other run-time checks
+ const size_t n = internal::getCoordinates( z ).size();
+ if( internal::getCoordinates( y ).size() != n ) {
+ return MISMATCH;
+ }
+ if( internal::getCoordinates( mask ).size() != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ // check if we can dispatch to dense variant
+ if( descr & descriptors::dense ) {
+ return eWiseApply< descr >( z, mask, alpha, y, monoid.getOperator() );
+ }
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+ constexpr const bool dense_mask = dense_descr &&
+ (descr & descriptors::structural) && !(descr & descriptors::invert_mask);
+
+ internal::Pipeline::stage_type func = [&z, &mask, alpha, &y, &monoid] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage eWiseApply(z, mask, alpha, y, monoid) "
+ << "in the range(" << lower_bound << ", " << upper_bound << ")"
+ << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_mask, local_x, local_y, local_z;
+ const size_t local_n = upper_bound - lower_bound;
+
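+			// when the already-dense optimisation is compiled in, the pipeline may
+			// detect at run time that all operands are dense, in which case the
+			// local coordinate bookkeeping below is skipped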
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ const bool mask_is_dense = (descr & descriptors::structural) &&
+ !(descr & descriptors::invert_mask) && already_dense_vectors;
+
+ bool already_dense_mask = true;
+ bool already_dense_input_y = true;
+
+ if( !mask_is_dense ) {
+ local_z = internal::getCoordinates( z ).asyncSubset( lower_bound,
+ upper_bound );
+ if( dense_descr && local_z.nonzeroes() < local_n ) {
+ return ILLEGAL;
+ }
+ }
+
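+			// take local views of the mask and input coordinates only when they are
+			// not known to be dense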
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_mask = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( mask ) );
+ if( !already_dense_mask ) {
+#else
+ already_dense_mask = false;
+#endif
+ local_mask = internal::getCoordinates( mask ).asyncSubset( lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input_y = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_input_y ) {
+#else
+ already_dense_input_y = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ const internal::Wrapper< true, InputType1, Coords > x_wrapper( alpha );
+ const internal::Wrapper< false, InputType2, Coords > y_wrapper( y );
+
+ // we are in the masked sparse variant
+ const InputType2 right_identity =
+ monoid.template getIdentity< InputType2 >();
+ const auto &op = monoid.getOperator();
+
+ if( !mask_is_dense ) {
+ // the sparsity structure of z will be a result of the given mask and descr
+ local_z.local_clear();
+ if( lower_bound == 0 ) {
+ internal::getCoordinates( z ).reset_global_nnz_counter();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) );
+#endif
+ if( dense_descr ) {
+ pipeline.markMaybeSparseDenseDescriptorVerification(
+ &internal::getCoordinates( z ) );
+ }
+ }
+ }
+
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_masked_apply_generic<
+#else
+ rc = internal::masked_apply_generic<
+#endif
+ true, false, false, true, descr, typename Monoid::Operator,
+ OutputType, MaskType, InputType1, InputType2, Coords
+ >(
+ already_dense_mask, true, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_z, local_mask, local_x, local_y,
+ z, mask, x_wrapper, y_wrapper,
+ op,
+ nullptr, &right_identity
+ );
+
+ if( !mask_is_dense ) {
+ internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_MASKED_EWISEAPPLY,
+ n, sizeof( OutputType ), dense_descr, dense_mask,
+ &z, nullptr, &internal::getCoordinates( z ), nullptr,
+ &y, &mask, nullptr, nullptr,
+ &internal::getCoordinates( y ), &internal::getCoordinates( mask ),
+ nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: eWiseApply(z, mask, alpha, y, "
+ << "monoid)" << std::endl;
+#endif
+ return ret;
+ }
+
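+	// masked monoid variant with a scalar right-hand input:
+	// z[i] = x[i] (op) beta under the given mask, substituting the left
+	// identity of the monoid wherever x has no entry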
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const InputType2 beta,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D1, InputType1 >::value ), "grb::eWiseApply",
+ "called with a left-hand input element type that does not match the "
+ "first domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D2, InputType2 >::value ), "grb::eWiseApply",
+ "called with a right-hand input element type that does not match the "
+ "second domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D3, OutputType >::value ), "grb::eWiseApply",
+ "called with an output element type that does not match the "
+ "third domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ), "grb::eWiseApply",
+ "called with an output mask element type that is not Boolean " );
+#ifdef _DEBUG
+ std::cout << "In masked eWiseApply ([T1]<-[T2]<-T3, using monoid)\n";
+#endif
+ if( size( mask ) == 0 ) {
+ return eWiseApply< descr >( z, x, beta, monoid );
+ }
+
+ // other run-time checks
+ const size_t n = internal::getCoordinates( z ).size();
+ if( internal::getCoordinates( x ).size() != n ) {
+ return MISMATCH;
+ }
+ if( internal::getCoordinates( mask ).size() != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ // check if we can dispatch to dense variant
+ if( (descr & descriptors::dense) ) {
+ return eWiseApply< descr >( z, mask, x, beta, monoid.getOperator() );
+ }
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+ constexpr const bool dense_mask = dense_descr &&
+ (descr & descriptors::structural) && !(descr & descriptors::invert_mask);
+
+ internal::Pipeline::stage_type func = [&z, &mask, &x, beta, &monoid] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage eWiseApply(z, mask, x, beta, monoid) "
+ << "in the range(" << lower_bound << ", " << upper_bound << ")"
+ << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_mask, local_x, local_y, local_z;
+ const size_t local_n = upper_bound - lower_bound;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ const bool mask_is_dense = (descr & descriptors::structural) &&
+ !(descr & descriptors::invert_mask) && already_dense_vectors;
+
+ bool already_dense_mask = true;
+ bool already_dense_input_x = true;
+
+ if( !mask_is_dense ) {
+ local_z = internal::getCoordinates( z ).asyncSubset( lower_bound,
+ upper_bound );
+ if( dense_descr && local_z.nonzeroes() < local_n ) {
+ return ILLEGAL;
+ }
+ }
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_mask = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( mask ) );
+ if( !already_dense_mask ) {
+#else
+ already_dense_mask = false;
+#endif
+ local_mask = internal::getCoordinates( mask ).asyncSubset( lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input_x = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_input_x ) {
+#else
+ already_dense_input_x = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ const internal::Wrapper< false, InputType1, Coords > x_wrapper( x );
+ const internal::Wrapper< true, InputType2, Coords > y_wrapper( beta );
+
+ // we are in the masked sparse variant
+ const InputType1 left_identity = monoid.template getIdentity< InputType1 >();
+ const auto &op = monoid.getOperator();
+
+ if( !mask_is_dense ) {
+ local_z.local_clear();
+ if( lower_bound == 0 ) {
+ internal::getCoordinates( z ).reset_global_nnz_counter();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) );
+#endif
+ if( dense_descr ) {
+ pipeline.markMaybeSparseDenseDescriptorVerification(
+ &internal::getCoordinates( z ) );
+ }
+ }
+ }
+
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_masked_apply_generic<
+#else
+ rc = internal::masked_apply_generic<
+#endif
+ false, true, true, false, descr, typename Monoid::Operator,
+ OutputType, MaskType, InputType1, InputType2, Coords
+ >(
+ already_dense_mask, already_dense_input_x, true,
+ lower_bound, upper_bound,
+ local_z, local_mask, local_x, local_y,
+ z, mask, x_wrapper, y_wrapper,
+ op,
+ &left_identity
+ );
+
+ if( !mask_is_dense ) {
+ internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_MASKED_EWISEAPPLY,
+ n, sizeof( OutputType ), dense_descr, dense_mask,
+ &z, nullptr, &internal::getCoordinates( z ), nullptr,
+ &x, &mask, nullptr, nullptr,
+ &internal::getCoordinates( x ), &internal::getCoordinates( mask ),
+ nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: eWiseApply(z, mask, x, beta, "
+ << "monoid)" << std::endl;
+#endif
+ return ret;
+ }
+
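+	// unmasked operator variant with a scalar left-hand input:
+	// z[i] = alpha (op) y[i]; in the sparse case, only positions where y has
+	// an entry are written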
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const InputType1 alpha,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D1, InputType1 >::value ), "grb::eWiseApply",
+ "called with a left-hand input element type that does not match the "
+ "first domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D2, InputType2 >::value ), "grb::eWiseApply",
+ "called with a right-hand input element type that does not match the "
+ "second domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D3, OutputType >::value ), "grb::eWiseApply",
+ "called with an output element type that does not match the "
+ "third domain of the given operator" );
+#ifdef _DEBUG
+ std::cout << "In eWiseApply ([T1]<-T2<-[T3]), operator variant\n";
+#endif
+ // sanity check
+ const size_t n = internal::getCoordinates( z ).size();
+ if( internal::getCoordinates( y ).size() != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ // check if we can dispatch
+ if( static_cast< const void * >( &z ) ==
+ static_cast< const void * >( &y )
+ ) {
+ return foldr< descr >( alpha, z, op );
+ }
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func = [&z, alpha, &y, &op] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage eWiseApply(z, alpha, y, operator) in "
+ << "the range(" << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+ const Coords * const local_null_mask = nullptr;
+
+ Coords local_mask, local_x, local_y, local_z;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_y_nz = local_n;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ bool already_dense_input_y = true;
+
+ if( !already_dense_vectors ) {
+ local_z = internal::getCoordinates( z ).asyncSubset( lower_bound,
+ upper_bound );
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_input_y = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_input_y ) {
+#else
+ already_dense_input_y = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ const internal::Wrapper< true, InputType1, Coords > x_wrapper( alpha );
+ const internal::Wrapper< false, InputType2, Coords > y_wrapper( y );
+
+ if( !already_dense_vectors ) {
+ if( lower_bound == 0 ) {
+ internal::getCoordinates( z ).reset_global_nnz_counter();
+ }
+ }
+
+ // check for dense variant
+ if( (descr & descriptors::dense) || local_y_nz == local_n ) {
+ if( !already_dense_vectors ) {
+ local_z.local_assignAll( );
+ }
+
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_dense_apply_generic<
+#else
+ rc = internal::dense_apply_generic<
+#endif
+ true, false, false, false, descr, OP,
+ OutputType, InputType1, InputType2, Coords
+ >(
+ true, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_x, local_y, z,
+ x_wrapper, y_wrapper,
+ op
+ );
+ } else {
+ if( !already_dense_vectors ) {
+ local_z.local_clear();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) );
+#endif
+ }
+
+ // we are in the sparse variant
+#ifdef GRB_BOOLEAN_DISPATCHER
+			rc = internal::boolean_dispatcher_sparse_apply_generic<
+#else
+			rc = internal::sparse_apply_generic<
+#endif
+				false, false, true, false, descr, OP,
+				OutputType, bool, InputType1, InputType2, Coords
+ >(
+ true, true, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_z, local_null_mask, local_x, local_y,
+ z, null_mask, x_wrapper, y_wrapper,
+ op
+ );
+ }
+
+ if( !already_dense_vectors ) {
+ internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_EWISEAPPLY,
+ n, sizeof( OutputType ), dense_descr, true,
+ &z, nullptr, &internal::getCoordinates( z ), nullptr,
+ &y, nullptr, nullptr, nullptr,
+ &internal::getCoordinates( y ), nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: eWiseApply(z, alpha, y, "
+ << "operator)" << std::endl;
+#endif
+ return ret;
+ }
+
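+	// masked operator variant with a scalar left-hand input; selects between a
+	// mask-driven and an input-driven loop based on the local nonzero counts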
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP,
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const InputType1 alpha,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D1, InputType1 >::value ), "grb::eWiseApply",
+ "called with a left-hand input element type that does not match the "
+ "first domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D2, InputType2 >::value ), "grb::eWiseApply",
+ "called with a right-hand input element type that does not match the "
+ "second domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D3, OutputType >::value ), "grb::eWiseApply",
+ "called with an output element type that does not match the "
+ "third domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ), "grb::eWiseApply",
+ "called with an output mask element type that is not Boolean " );
+#ifdef _DEBUG
+ std::cout << "In masked eWiseApply ([T1]<-T2<-[T3], operator variant)\n";
+#endif
+ // check for empty mask
+ if( size( mask ) == 0 ) {
+ return eWiseApply< descr >( z, alpha, y, op );
+ }
+
+ // sanity check
+ const size_t n = internal::getCoordinates( z ).size();
+ if( internal::getCoordinates( y ).size() != n ) {
+ return MISMATCH;
+ }
+ if( internal::getCoordinates( mask ).size() != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+ constexpr const bool dense_mask = dense_descr &&
+ (descr & descriptors::structural) && !(descr & descriptors::invert_mask);
+
+ internal::Pipeline::stage_type func = [&z, &mask, alpha, &y, &op] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage eWiseApply(z, mask, alpha, y, "
+ << "operator) in the range(" << lower_bound << ", " << upper_bound << ")"
+ << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_mask, local_x, local_y, local_z;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_mask_nz = local_n;
+ size_t local_y_nz = local_n;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ const bool mask_is_dense = (descr & descriptors::structural) &&
+ !(descr & descriptors::invert_mask) && already_dense_vectors;
+
+ bool already_dense_mask = true;
+ bool already_dense_input_y = true;
+
+ if( !mask_is_dense ) {
+ local_z = internal::getCoordinates( z ).asyncSubset( lower_bound,
+ upper_bound );
+ if( dense_descr && local_z.nonzeroes() < local_n ) {
+ return ILLEGAL;
+ }
+ }
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_mask = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( mask ) );
+ if( !already_dense_mask ) {
+#else
+ already_dense_mask = false;
+#endif
+ local_mask = internal::getCoordinates( mask ).asyncSubset( lower_bound,
+ upper_bound );
+ local_mask_nz = local_mask.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input_y = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_input_y ) {
+#else
+ already_dense_input_y = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ const internal::Wrapper< true, InputType1, Coords > x_wrapper( alpha );
+ const internal::Wrapper< false, InputType2, Coords > y_wrapper( y );
+
+ if( !mask_is_dense ) {
+ local_z.local_clear();
+ if( lower_bound == 0 ) {
+ internal::getCoordinates( z ).reset_global_nnz_counter();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) );
+#endif
+ if( dense_descr ) {
+ pipeline.markMaybeSparseDenseDescriptorVerification(
+ &internal::getCoordinates( z ) );
+ }
+ }
+ }
+
+ if( (descr & descriptors::dense) ||
+ (local_y_nz == local_n) ||
+ local_mask_nz <= local_y_nz
+ ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_masked_apply_generic<
+#else
+ rc = internal::masked_apply_generic<
+#endif
+ true, false, false, false, descr, OP,
+ OutputType, bool, InputType1, InputType2, Coords
+ >(
+ already_dense_mask, true, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_z, local_mask, local_x, local_y,
+ z, mask, x_wrapper, y_wrapper,
+ op
+ );
+ } else {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_sparse_apply_generic<
+#else
+ rc = internal::sparse_apply_generic<
+#endif
+ true, false, true, false, descr, OP,
+ OutputType, bool, InputType1, InputType2, Coords
+ >(
+ already_dense_mask, true, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_z, &local_mask, local_x, local_y,
+ z, &mask, x_wrapper, y_wrapper,
+ op
+ );
+ }
+
+ if( !mask_is_dense ) {
+ internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_MASKED_EWISEAPPLY,
+ n, sizeof( OutputType ), dense_descr, dense_mask,
+ &z, nullptr, &internal::getCoordinates( z ), nullptr,
+ &y, &mask, nullptr, nullptr,
+ &internal::getCoordinates( y ), &internal::getCoordinates( mask ),
+ nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: eWiseApply(z, mask, alpha, y, "
+ << "operator)" << std::endl;
+#endif
+ return ret;
+ }
+
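+	// unmasked operator variant on two input vectors; a locally dense tile
+	// takes the vectorised path below, while any sparsity falls back to the
+	// generic sparse kernel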
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D1, InputType1 >::value ), "grb::eWiseApply",
+ "called with a left-hand input element type that does not match the "
+ "first domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D2, InputType2 >::value ), "grb::eWiseApply",
+ "called with a right-hand input element type that does not match the "
+ "second domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D3, OutputType >::value ), "grb::eWiseApply",
+ "called with an output element type that does not match the "
+ "third domain of the given operator" );
+#ifdef _DEBUG
+ std::cout << "In eWiseApply ([T1]<-[T2]<-[T3]), operator variant\n";
+#endif
+ // sanity check
+ const size_t n = internal::getCoordinates( z ).size();
+ if( internal::getCoordinates( x ).size() != n ||
+ internal::getCoordinates( y ).size() != n
+ ) {
+#ifdef _DEBUG
+ std::cerr << "\tinput vectors mismatch in dimensions!\n";
+#endif
+ return MISMATCH;
+ }
+
+ // check for possible shortcuts
+ // trivial dispatch
+ if( n == 0 ) {
+ return SUCCESS;
+ }
+
+ // check for possible shortcuts, after dynamic checks
+ if( getID( x ) == getID( y ) && is_idempotent< OP >::value ) {
+ return set< descr >( z, x, phase );
+ }
+ if( getID( x ) == getID( z ) ) {
+ return foldl< descr >( z, y, op, phase );
+ }
+ if( getID( y ) == getID( z ) ) {
+ return foldr< descr >( x, z, op, phase );
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func = [&z, &x, &y, &op] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage eWiseApply(z, x, y, operator) in the "
+ << "range(" << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+ const Coords * const local_null_mask = nullptr;
+
+ Coords local_x, local_y, local_z;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_x_nz = local_n;
+ size_t local_y_nz = local_n;
+ bool sparse = false;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ bool already_dense_input_x = true;
+ bool already_dense_input_y = true;
+
+ if( !already_dense_vectors ) {
+ local_z = internal::getCoordinates( z ).asyncSubset( lower_bound,
+ upper_bound );
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_input_x = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_input_x ) {
+#else
+ already_dense_input_x = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( local_x_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input_y = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_input_y ) {
+#else
+ already_dense_input_y = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+ if( local_y_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( !already_dense_vectors ) {
+ if( lower_bound == 0 ) {
+ internal::getCoordinates( z ).reset_global_nnz_counter();
+ }
+ }
+
+ if( sparse ) {
+ if( !already_dense_vectors ) {
+ local_z.local_clear();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) );
+#endif
+ }
+
+ const internal::Wrapper< false, InputType1, Coords > x_wrapper( x );
+ const internal::Wrapper< false, InputType2, Coords > y_wrapper( y );
+
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_sparse_apply_generic<
+#else
+ rc = internal::sparse_apply_generic<
+#endif
+ false, false, false, false, descr | descriptors::dense, OP,
+ OutputType, bool, InputType1, InputType2, Coords
+ >(
+ true, already_dense_input_x, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_z, local_null_mask, local_x, local_y,
+ z, null_mask, x_wrapper, y_wrapper,
+ op
+ );
+ } else {
+ if( !already_dense_vectors ) {
+ local_z.local_assignAll( );
+ }
+
+ if( upper_bound > lower_bound ) {
+ const InputType1 * __restrict__ a = internal::getRaw( x );
+ const InputType2 * __restrict__ b = internal::getRaw( y );
+ OutputType * __restrict__ c = internal::getRaw( z );
+
+ // this function is vectorised
+					op.eWiseApply( a + lower_bound, b + lower_bound, c + lower_bound, local_n );
+ }
+ }
+
+ if( !already_dense_vectors ) {
+ internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_EWISEAPPLY,
+ n, sizeof( OutputType ), dense_descr, true,
+ &z, nullptr, &internal::getCoordinates( z ), nullptr,
+ &x, &y, nullptr, nullptr,
+ &internal::getCoordinates( x ), &internal::getCoordinates( y ),
+ nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: eWiseApply(z, x, y, operator)"
+ << std::endl;
+#endif
+ return ret;
+ }
+
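+	// masked operator variant on two input vectors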
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D1, InputType1 >::value ), "grb::eWiseApply",
+ "called with a left-hand input element type that does not match the "
+ "first domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D2, InputType2 >::value ), "grb::eWiseApply",
+ "called with a right-hand input element type that does not match the "
+ "second domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D3, OutputType >::value ), "grb::eWiseApply",
+ "called with an output element type that does not match the "
+ "third domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ), "grb::eWiseApply",
+ "called with an output mask element type that is not Boolean " );
+#ifdef _DEBUG
+ std::cout << "In masked eWiseApply ([T1]<-[T2]<-[T3], using operator)\n";
+#endif
+ // check for empty mask
+ if( size( mask ) == 0 ) {
+ return eWiseApply< descr >( z, x, y, op, phase );
+ }
+
+ // other run-time checks
+ const size_t n = internal::getCoordinates( z ).size();
+ if( internal::getCoordinates( x ).size() != n ) {
+ return MISMATCH;
+ }
+ if( internal::getCoordinates( y ).size() != n ) {
+ return MISMATCH;
+ }
+ if( internal::getCoordinates( mask ).size() != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+ constexpr const bool dense_mask = dense_descr &&
+ (descr & descriptors::structural) && !(descr & descriptors::invert_mask);
+
+ internal::Pipeline::stage_type func = [&z, &mask, &x, &y, &op] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage eWiseApply(z, mask, x, y, operator) in "
+ << "the range(" << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_mask, local_x, local_y, local_z;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_mask_nz = local_n;
+ size_t local_x_nz = local_n;
+ size_t local_y_nz = local_n;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ const bool mask_is_dense = (descr & descriptors::structural) &&
+ !(descr & descriptors::invert_mask) && already_dense_vectors;
+
+ bool already_dense_mask = true;
+ bool already_dense_input_x = true;
+ bool already_dense_input_y = true;
+
+ if( !mask_is_dense ) {
+ local_z = internal::getCoordinates( z ).asyncSubset( lower_bound,
+ upper_bound );
+ if( dense_descr && local_z.nonzeroes() < local_n ) {
+ return ILLEGAL;
+ }
+ }
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_mask = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( mask ) );
+ if( !already_dense_mask ) {
+#else
+ already_dense_mask = false;
+#endif
+ local_mask = internal::getCoordinates( mask ).asyncSubset( lower_bound,
+ upper_bound );
+ local_mask_nz = local_mask.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input_x = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_input_x ) {
+#else
+ already_dense_input_x = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input_y = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_input_y ) {
+#else
+ already_dense_input_y = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ const internal::Wrapper< false, InputType1, Coords > x_wrapper( x );
+ const internal::Wrapper< false, InputType2, Coords > y_wrapper( y );
+
+ const size_t sparse_loop = std::min( local_x_nz, local_y_nz );
+
+ if( !mask_is_dense ) {
+ local_z.local_clear();
+ if( lower_bound == 0 ) {
+ internal::getCoordinates( z ).reset_global_nnz_counter();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) );
+#endif
+ if( dense_descr ) {
+ pipeline.markMaybeSparseDenseDescriptorVerification(
+ &internal::getCoordinates( z ) );
+ }
+ }
+ }
+
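+			// heuristic: if both inputs are locally dense, or a non-inverted mask
+			// has no more nonzeroes than the sparser input, loop over the mask;
+			// otherwise loop over the sparse inputs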
+ if( (descr & descriptors::dense) ||
+ (local_x_nz == local_n && local_y_nz == local_n) ||
+ ( !(descr & descriptors::invert_mask) && sparse_loop >= local_mask_nz )
+ ) {
+ // use loop over mask
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_masked_apply_generic<
+#else
+ rc = internal::masked_apply_generic<
+#endif
+ false, false, false, false, descr, OP,
+ OutputType, bool, InputType1, InputType2, Coords
+ >(
+ already_dense_mask, already_dense_input_x, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_z, local_mask, local_x, local_y,
+ z, mask, x_wrapper, y_wrapper,
+ op
+ );
+
+ } else {
+ // use loop over sparse inputs
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_sparse_apply_generic<
+#else
+ rc = internal::sparse_apply_generic<
+#endif
+ true, false, false, false, descr, OP,
+ OutputType, bool, InputType1, InputType2, Coords
+ >(
+ already_dense_mask, already_dense_input_x, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_z, &local_mask, local_x, local_y,
+ z, &mask, x_wrapper, y_wrapper,
+ op
+ );
+ }
+
+ if( !mask_is_dense ) {
+ internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_MASKED_EWISEAPPLY,
+ n, sizeof( OutputType ), dense_descr, dense_mask,
+ &z, nullptr, &internal::getCoordinates( z ), nullptr,
+ &x, &y, &mask, nullptr,
+ &internal::getCoordinates( x ), &internal::getCoordinates( y ),
+ &internal::getCoordinates( mask ), nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: eWiseApply(z, mask, x, y, "
+ << "operator)" << std::endl;
+#endif
+ return ret;
+ }
+
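+	// the eWiseAdd family below implements no kernels of its own: every variant
+	// dispatches to (masked) foldl calls using the additive monoid of the given
+	// semiring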
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC eWiseAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, OutputType >::value ), "grb::eWiseAdd",
+ "called with an output vector with element type that does not match the "
+ "fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, InputType1 >::value ), "grb::eWiseAdd",
+ "called with a left-hand side input vector with element type that does not "
+ "match the third domain of the given semiring" );
+		NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+			std::is_same< typename Ring::D4, InputType2 >::value ), "grb::eWiseAdd",
+ "called with a right-hand side input vector with element type that does "
+ "not match the fourth domain of the given semiring" );
+#ifdef _DEBUG
+ std::cout << "eWiseAdd (nonblocking, vector <- vector + vector) dispatches to "
+ << "two folds using the additive monoid\n";
+#endif
+ RC ret = foldl< descr >( z, x, ring.getAdditiveMonoid(), phase );
+ ret = ret ? ret : foldl< descr >( z, y, ring.getAdditiveMonoid(), phase );
+ return ret;
+ }
+
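+	// a minimal usage sketch (names and the semiring choice are illustrative
+	// only, assuming the conventional plus-times semiring over doubles):
+	//
+	//   grb::Vector< double > z( n ), x( n ), y( n );
+	//   grb::Semiring<
+	//     grb::operators::add< double >, grb::operators::mul< double >,
+	//     grb::identities::zero, grb::identities::one
+	//   > ring;
+	//   grb::RC rc = grb::eWiseAdd( z, x, y, ring );
+	//
+	// under this nonblocking backend the call above merely queues two fold
+	// stages; they execute when their pipeline is forced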
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename OutputType,
+ typename Coords
+ >
+ RC eWiseAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const InputType1 alpha,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ), "grb::eWiseAdd",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ), "grb::eWiseAdd",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ), "grb::eWiseAdd",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+#ifdef _DEBUG
+ std::cout << "eWiseAdd (nonblocking, vector <- scalar + vector) dispatches to "
+ << "two folds with the additive monoid\n";
+#endif
+ RC ret = foldl< descr >( z, alpha, ring.getAdditiveMonoid(), phase );
+ ret = ret ? ret : foldl< descr >( z, y, ring.getAdditiveMonoid(), phase );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename OutputType,
+ typename Coords
+ >
+ RC eWiseAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ), "grb::eWiseAdd",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ), "grb::eWiseAdd",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ), "grb::eWiseAdd",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+#ifdef _DEBUG
+ std::cout << "eWiseAdd (nonblocking, vector <- vector + scalar) dispatches to "
+ << "two folds with the additive monoid\n";
+#endif
+ RC ret = foldl< descr >( z, x, ring.getAdditiveMonoid(), phase );
+ ret = ret ? ret : foldl< descr >( z, beta, ring.getAdditiveMonoid(), phase );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename OutputType,
+ typename Coords
+ >
+ RC eWiseAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ), "grb::eWiseAdd",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ), "grb::eWiseAdd",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ), "grb::eWiseAdd",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+#ifdef _DEBUG
+ std::cout << "eWiseAdd (nonblocking, vector <- scalar + scalar) dispatches to "
+ << "foldl with precomputed scalar and additive monoid\n";
+#endif
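+			// precompute alpha + beta once, then fold the resulting scalar into z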
+			typename Ring::D4 add;
+ (void) apply( add, alpha, beta, ring.getAdditiveOperator() );
+ return foldl< descr >( z, add, ring.getAdditiveMonoid(), phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC eWiseAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, OutputType >::value ), "grb::eWiseAdd",
+ "called with an output vector with element type that does not match the "
+ "fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, InputType1 >::value ), "grb::eWiseAdd",
+ "called with a left-hand side input vector with element type that does not "
+ "match the third domain of the given semiring" );
+		NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+			std::is_same< typename Ring::D4, InputType2 >::value ), "grb::eWiseAdd",
+ "called with a right-hand side input vector with element type that does "
+ "not match the fourth domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< MaskType, bool >::value ),
+ "grb::eWiseAdd (vector <- vector + vector, masked)",
+ "called with non-bool mask element types" );
+#ifdef _DEBUG
+ std::cout << "eWiseAdd (nonblocking, vector <- vector + vector, masked) "
+ << "dispatches to two folds using the additive monoid\n";
+#endif
+ RC ret = foldl< descr >( z, m, x, ring.getAdditiveMonoid(), phase );
+ ret = ret ? ret : foldl< descr >( z, m, y, ring.getAdditiveMonoid(), phase );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename OutputType,
+ typename MaskType,
+ typename Coords
+ >
+ RC eWiseAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const InputType1 alpha,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ), "grb::eWiseAdd",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ), "grb::eWiseAdd",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ), "grb::eWiseAdd",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< MaskType, bool >::value ),
+ "grb::eWiseAdd (vector <- scalar + vector, masked)",
+ "called with non-bool mask element types" );
+#ifdef _DEBUG
+ std::cout << "eWiseAdd (nonblocking, vector <- scalar + vector, masked) "
+ << "dispatches to two folds using the additive monoid\n";
+#endif
+ RC ret = foldl< descr >( z, m, alpha, ring.getAdditiveMonoid(), phase );
+ ret = ret ? ret : foldl< descr >( z, m, y, ring.getAdditiveMonoid(), phase );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename OutputType,
+ typename MaskType,
+ typename Coords
+ >
+ RC eWiseAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ), "grb::eWiseAdd",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ), "grb::eWiseAdd",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ), "grb::eWiseAdd",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< MaskType, bool >::value ),
+ "grb::eWiseAdd (vector <- vector + scalar, masked)",
+ "called with non-bool mask element types" );
+#ifdef _DEBUG
+ std::cout << "eWiseAdd (nonblocking, vector <- vector + scalar, masked) "
+ << "dispatches to eWiseApply using the additive monoid\n";
+#endif
+ RC ret = foldl< descr >( z, m, x, ring.getAdditiveMonoid(), phase );
+ ret = ret ? ret : foldl< descr >( z, m, beta, ring.getAdditiveMonoid(),
+ phase );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename OutputType,
+ typename MaskType,
+ typename Coords
+ >
+ RC eWiseAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+		const Vector< MaskType, nonblocking, Coords > &m,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ), "grb::eWiseAdd",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ), "grb::eWiseAdd",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ), "grb::eWiseAdd",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< MaskType, bool >::value ),
+ "grb::eWiseAdd (vector <- scalar + scalar, masked)",
+ "called with non-bool mask element types" );
+#ifdef _DEBUG
+ std::cout << "eWiseAdd (nonblocking, vector <- scalar + scalar, masked) "
+ << "dispatches to foldl with precomputed scalar and additive monoid\n";
+#endif
+			typename Ring::D4 add;
+ (void) apply( add, alpha, beta, ring.getAdditiveOperator() );
+ return foldl< descr >( z, m, add, ring.getAdditiveMonoid(), phase );
+ }
+
+ // declare an internal version of eWiseMulAdd containing the full sparse &
+ // dense implementations
+ namespace internal {
+
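+		// mask-driven kernel: iterates over the nonzeroes of the local mask and
+		// computes z[i] += a[i] * x[i] + y[i], substituting scalars and skipping
+		// missing entries as dictated by the template flags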
+ template<
+ Descriptor descr,
+ bool a_scalar,
+ bool x_scalar,
+ bool y_scalar,
+ bool y_zero,
+#ifdef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_output,
+ bool already_dense_mask,
+ bool already_dense_input_a,
+ bool already_dense_input_x,
+ bool already_dense_input_y,
+#endif
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename Coords,
+ class Ring
+ >
+ RC sparse_eWiseMulAdd_maskDriven(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_output,
+ bool already_dense_mask,
+ bool already_dense_input_a,
+ bool already_dense_input_x,
+ bool already_dense_input_y,
+#endif
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_z,
+ const Coords &local_m,
+ const Coords &local_a,
+ const Coords &local_x,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &z_vector,
+ const Vector< MaskType, nonblocking, Coords > &m_vector,
+ const internal::Wrapper< a_scalar, InputType1, Coords > &a_wrapper,
+ const internal::Wrapper< x_scalar, InputType2, Coords > &x_wrapper,
+ const internal::Wrapper< y_scalar, InputType3, Coords > &y_wrapper,
+ const Ring &ring
+ ) {
+ static_assert( !(descr & descriptors::invert_mask),
+ "Cannot loop over mask nonzeroes if invert_mask is given. "
+ "Please submit a bug report" );
+			static_assert( !a_scalar || !x_scalar,
+				"If both a and x are scalars, this operation is a simple eWiseApply "
+				"with the additive operator of the semiring." );
+ static_assert( !y_zero || y_scalar,
+ "If y_zero is given, then y_scalar must be given also." );
+
+ OutputType * __restrict__ z = internal::getRaw( z_vector );
+ const MaskType * __restrict__ const m = internal::getRaw( m_vector );
+
+ // create local copies of the input const pointers
+ const InputType1 * __restrict__ const a = a_wrapper.getRaw();
+ const InputType2 * __restrict__ const x = x_wrapper.getRaw();
+ const InputType3 * __restrict__ const y = y_wrapper.getRaw();
+
+ const size_t local_n = upper_bound - lower_bound;
+ const size_t local_m_nz = already_dense_mask ? local_n : local_m.nonzeroes();
+
+ const size_t local_start = 0;
+ const size_t local_end = local_m_nz;
+
+ size_t k = local_start;
+
+			// sequential loop over the nonzeroes of the local mask
+ for( ; k < local_end; ++k ) {
+ const size_t index = ( already_dense_mask ? k : local_m.index( k ) ) +
+ lower_bound;
+ assert( index - lower_bound < local_n );
+ if( already_dense_mask ) {
+ if( !internal::getCoordinates( m_vector ).template mask< descr >(
+ index, m )
+ ) {
+ continue;
+ }
+ } else {
+ if( !local_m.template mask< descr >( index - lower_bound, m +
+ lower_bound )
+ ) {
+ continue;
+ }
+ }
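+				// t accumulates the multiplicative term a * x at this index; it stays
+				// at the additive identity if either factor is missing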
+ typename Ring::D3 t = ring.template getZero< typename Ring::D3 >();
+ if(
+ (
+ a_scalar || already_dense_input_a ||
+ local_a.assigned( index - lower_bound )
+ ) && (
+ x_scalar || already_dense_input_x ||
+ local_x.assigned( index - lower_bound)
+ )
+ ) {
+ const InputType1 a_p = ( a_scalar )
+ ? a_wrapper.getValue()
+ : *( a + index );
+ const InputType2 x_p = ( x_scalar )
+ ? x_wrapper.getValue()
+ : *( x + index );
+ (void) apply( t, a_p, x_p, ring.getMultiplicativeOperator() );
+ if( !y_zero && (
+ y_scalar || already_dense_input_y ||
+ local_y.assigned( index - lower_bound ) )
+ ) {
+ const InputType3 y_p = ( y_scalar )
+ ? y_wrapper.getValue()
+ : *( y + index );
+ typename Ring::D4 b;
+ (void) apply( b, t, y_p, ring.getAdditiveOperator() );
+ if( already_dense_output || local_z.assigned( index - lower_bound ) ) {
+ typename Ring::D4 out = static_cast< typename Ring::D4 >( z[ index ] );
+ (void) foldr( b, out, ring.getAdditiveOperator() );
+ z[ index ] = static_cast< OutputType >( out );
+ } else {
+ (void) local_z.assign( index - lower_bound );
+ z[ index ] = static_cast< OutputType >( b );
+ }
+ } else if( already_dense_output ||
+ local_z.assigned( index - lower_bound )
+ ) {
+ typename Ring::D4 out = static_cast< typename Ring::D4 >( z[ index ] );
+ (void) foldr( t, out, ring.getAdditiveOperator() );
+ z[ index ] = static_cast< OutputType >( out );
+ } else {
+ (void) local_z.assign( index - lower_bound );
+ z[ index ] = static_cast< OutputType >( t );
+ }
+				} else if( !y_zero && (
+					already_dense_input_y || y_scalar ||
+					local_y.assigned( index - lower_bound ) )
+				) {
+					// no multiplicative contribution here: fold in (or assign) the y term
+					const InputType3 y_p = ( y_scalar )
+						? y_wrapper.getValue()
+						: *( y + index );
+					if( already_dense_output || local_z.assigned( index - lower_bound ) ) {
+						typename Ring::D4 out = static_cast< typename Ring::D4 >( z[ index ] );
+						(void) foldr( y_p, out, ring.getAdditiveOperator() );
+						z[ index ] = static_cast< OutputType >( out );
+					} else {
+						(void) local_z.assign( index - lower_bound );
+						z[ index ] = static_cast< OutputType >( y_p );
+					}
+				}
+ }
+
+ return SUCCESS;
+ }
+
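+		// multiplication-driven kernel: first loops over the nonzeroes of a,
+		// folding the products a[i] * x[i] (or x[i] * a[i] when mulSwitched) into
+		// z, and then handles the additive y term in a second phase via the
+		// generic fold kernels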
+ template<
+ Descriptor descr,
+ bool masked,
+ bool x_scalar,
+ bool y_scalar,
+ bool y_zero,
+ bool mulSwitched,
+#ifdef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_output,
+ bool already_dense_mask,
+ bool already_dense_input_a,
+ bool already_dense_input_x,
+ bool already_dense_input_y,
+#endif
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename Coords,
+ class Ring
+ >
+ RC twoPhase_sparse_eWiseMulAdd_mulDriven(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_output,
+ bool already_dense_mask,
+ bool already_dense_input_a,
+ bool already_dense_input_x,
+ bool already_dense_input_y,
+#endif
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_z,
+ const Coords * const local_m,
+ const Coords &local_a,
+ const Coords &local_x,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &z_vector,
+ const Vector< MaskType, nonblocking, Coords > * const m_vector,
+ const Vector< InputType1, nonblocking, Coords > &a_vector,
+ const internal::Wrapper< x_scalar, InputType2, Coords > &x_wrapper,
+ const internal::Wrapper< y_scalar, InputType3, Coords > &y_wrapper,
+ const Ring &ring
+ ) {
+ OutputType * __restrict__ z = internal::getRaw( z_vector );
+ const MaskType * __restrict__ const m = masked
+ ? internal::getRaw( *m_vector )
+ : nullptr;
+ const InputType1 * __restrict__ const a = internal::getRaw( a_vector );
+
+ // create local copies of the input const pointers
+ const InputType2 * __restrict__ const x = x_wrapper.getRaw();
+
+ const size_t local_n = upper_bound - lower_bound;
+ const size_t local_a_nz = already_dense_input_a
+ ? local_n
+ : local_a.nonzeroes();
+
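+ // drive the multiplication by the nonzeroes of a: for sparse a, this
+ // skips every position where a( i ) * x( i ) cannot contribute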
+ for( size_t i = 0; i < local_a_nz; ++i ) {
+ const size_t index = ( already_dense_input_a ? i : local_a.index( i ) ) +
+ lower_bound;
+ if( masked ) {
+ if( already_dense_mask ) {
+ if( !internal::getCoordinates( *m_vector ).template mask< descr >(
+ index, m )
+ ) {
+ continue;
+ }
+ } else {
+ if( !local_m->template mask< descr >( index - lower_bound,
+ m + lower_bound )
+ ) {
+ continue;
+ }
+ }
+ }
+
+ if( x_scalar || already_dense_input_x ||
+ local_x.assigned( index - lower_bound )
+ ) {
+ typename Ring::D3 t;
+ const InputType1 a_p = *( a + index );
+ const InputType2 x_p = ( x_scalar )
+ ? x_wrapper.getValue()
+ : *( x + index );
+
+ if( mulSwitched ) {
+ (void) apply( t, x_p, a_p, ring.getMultiplicativeOperator() );
+ } else {
+ (void) apply( t, a_p, x_p, ring.getMultiplicativeOperator() );
+ }
+
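+ // note: Coords::assign both marks the local coordinate as taken and
+ // returns whether it was already assigned, so the branch below separates
+ // folding into an existing value from writing a fresh one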
+ if( already_dense_output || local_z.assign( index - lower_bound ) ) {
+ typename Ring::D4 b = static_cast< typename Ring::D4 >( z[ index ] );
+ (void) foldr( t, b, ring.getAdditiveOperator() );
+ z[ index ] = static_cast< OutputType >( b );
+ } else {
+ z[ index ] = static_cast< OutputType >(
+ static_cast< typename Ring::D4 >( t )
+ );
+ }
+ }
+ }
+
+ RC rc = SUCCESS;
+
+ // now handle addition
+ if( !y_zero ) {
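+ // the four cases below differ only in whether the output is masked and in
+ // whether y is a scalar; each dispatches to a fold kernel that adds y
+ // into z under the additive monoid of the semiring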
+ if( masked ) {
+ if( y_scalar ) {
+ rc = fold_from_scalar_to_vector_generic<
+#ifdef GRB_BOOLEAN_DISPATCHER
+ descr, true, true, true, true,
+ already_dense_output, already_dense_mask
+#else
+ descr, true, true, true, true
+#endif
+ >(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ already_dense_output, already_dense_mask,
+#endif
+ lower_bound, upper_bound, local_z, local_m,
+ z_vector, m_vector, y_wrapper.getValue(),
+ ring.getAdditiveMonoid().getOperator(), EXECUTE
+ );
+ } else {
+ rc = fold_from_vector_to_vector_generic<
+#ifdef GRB_BOOLEAN_DISPATCHER
+ descr, true, true, true, true,
+ already_dense_output, already_dense_input_y, already_dense_mask
+#else
+ descr, true, true, true, true
+#endif
+ >(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ already_dense_output, already_dense_input_y, already_dense_mask,
+#endif
+ lower_bound, upper_bound,
+ local_z, local_m, local_y,
+ z_vector, m_vector, *( y_wrapper.getPointer() ),
+ ring.getAdditiveMonoid().getOperator(), EXECUTE
+ );
+ }
+ } else {
+ if( y_scalar ) {
+ rc = fold_from_scalar_to_vector_generic<
+#ifdef GRB_BOOLEAN_DISPATCHER
+ descr, true, true, false, true,
+ already_dense_output, already_dense_mask
+#else
+ descr, true, true, false, true
+#endif
+ >(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ already_dense_output, already_dense_mask,
+#endif
+ lower_bound, upper_bound,
+ local_z, local_m,
+ z_vector, m_vector, y_wrapper.getValue(),
+ ring.getAdditiveMonoid().getOperator(), EXECUTE
+ );
+ } else {
+ rc = fold_from_vector_to_vector_generic<
+#ifdef GRB_BOOLEAN_DISPATCHER
+ descr, true, true, false, true,
+ already_dense_output, already_dense_input_y, already_dense_mask
+#else
+ descr, true, true, false, true
+#endif
+ >(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ already_dense_output, already_dense_input_y, already_dense_mask,
+#endif
+ lower_bound, upper_bound,
+ local_z, local_m, local_y,
+ z_vector, m_vector, *( y_wrapper.getPointer() ),
+ ring.getAdditiveMonoid().getOperator(), EXECUTE
+ );
+ }
+ }
+ }
+
+ // done
+ return rc;
+ }
+
+ template<
+ Descriptor descr,
+ bool a_scalar,
+ bool x_scalar,
+ bool y_scalar,
+ bool y_zero,
+ bool assign_z,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename Coords,
+ class Ring
+ >
+ RC dense_eWiseMulAdd(
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Vector< OutputType, nonblocking, Coords > &z_vector,
+ const internal::Wrapper< a_scalar, InputType1, Coords > &a_wrapper,
+ const internal::Wrapper< x_scalar, InputType2, Coords > &x_wrapper,
+ const internal::Wrapper< y_scalar, InputType3, Coords > &y_wrapper,
+ const Ring &ring = Ring()
+ ) {
+#ifdef _DEBUG
+ std::cout << "\tdense_eWiseMulAdd: loop size will be "
+ << (upper_bound - lower_bound) << " in the range(" << lower_bound << ", "
+ << upper_bound << ")\n";
+#endif
+ const size_t start = lower_bound;
+ const size_t end = upper_bound;
+
+ OutputType * __restrict__ z = internal::getRaw( z_vector );
+
+ // create local copies of the input const pointers
+ const InputType1 * __restrict__ a = a_wrapper.getRaw();
+ const InputType2 * __restrict__ x = x_wrapper.getRaw();
+ const InputType3 * __restrict__ y = y_wrapper.getRaw();
+
+ assert( z != a );
+ assert( z != x );
+ assert( z != y );
+ assert( a != x || a == nullptr );
+ assert( a != y || a == nullptr );
+ assert( x != y || x == nullptr );
+
+ // vector registers
+ typename Ring::D1 aa[ Ring::blocksize ];
+ typename Ring::D2 xx[ Ring::blocksize ];
+ typename Ring::D3 tt[ Ring::blocksize ];
+ typename Ring::D4 bb[ Ring::blocksize ];
+ typename Ring::D4 yy[ Ring::blocksize ];
+ typename Ring::D4 zz[ Ring::blocksize ];
+
+ if( a_scalar ) {
+ for( size_t b = 0; b < Ring::blocksize; ++b ) {
+ aa[ b ] = a_wrapper.getValue();
+ }
+ }
+ if( x_scalar ) {
+ for( size_t b = 0; b < Ring::blocksize; ++b ) {
+ xx[ b ] = x_wrapper.getValue();
+ }
+ }
+ if( y_scalar ) {
+ if( y_zero ) {
+ for( size_t b = 0; b < Ring::blocksize; ++b ) {
+ yy[ b ] = ring.template getZero< typename Ring::D4 >();
+ }
+ } else {
+ for( size_t b = 0; b < Ring::blocksize; ++b ) {
+ yy[ b ] = y_wrapper.getValue();
+ }
+ }
+ }
+
+ // do vectorised out-of-place operations. Allows for aligned overlap.
+ // Non-aligned overlap is not possible due to GraphBLAS semantics.
+ size_t i = start;
+ // note: read the tail code (under this while loop) comments first for
+ // greater understanding
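+ // each iteration of the below loop processes one block of Ring::blocksize
+ // elements: gather the operands into the register arrays, apply the
+ // multiplicative and additive operators element-wise, and write back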
+ while( i + Ring::blocksize <= end ) {
+#ifdef _DEBUG
+ std::cout << "\tdense_eWiseMulAdd: handling block of size "
+ << Ring::blocksize << " starting at index " << i << "\n";
+#endif
+ // read-in
+ if( !a_scalar ) {
+ for( size_t b = 0; b < Ring::blocksize; ++b ) {
+ aa[ b ] = static_cast< typename Ring::D1 >( a[ i + b ] );
+ }
+ }
+ if( !x_scalar ) {
+ for( size_t b = 0; b < Ring::blocksize; ++b ) {
+ xx[ b ] = static_cast< typename Ring::D2 >( x[ i + b ] );
+ }
+ }
+ if( !y_scalar ) {
+ for( size_t b = 0; b < Ring::blocksize; ++b ) {
+ yy[ b ] = static_cast< typename Ring::D4 >( y[ i + b ] );
+ }
+ }
+ if( !assign_z ) {
+ for( size_t b = 0; b < Ring::blocksize; ++b ) {
+ zz[ b ] = static_cast< typename Ring::D4 >( z[ i + b ] );
+ }
+ }
+
+ // operate
+ if( !y_zero ) {
+ for( size_t b = 0; b < Ring::blocksize; ++b ) {
+ apply( tt[ b ], aa[ b ], xx[ b ], ring.getMultiplicativeOperator() );
+ apply( bb[ b ], tt[ b ], yy[ b ], ring.getAdditiveOperator() );
+ }
+ } else {
+ assert( y_scalar );
+ for( size_t b = 0; b < Ring::blocksize; ++b ) {
+ apply( bb[ b ], aa[ b ], xx[ b ], ring.getMultiplicativeOperator() );
+ }
+ }
+ if( !assign_z ) {
+ for( size_t b = 0; b < Ring::blocksize; ++b ) {
+ foldr( bb[ b ], zz[ b ], ring.getAdditiveOperator() );
+ }
+ }
+
+ // write-out
+ if( assign_z ) {
+ for( size_t b = 0; b < Ring::blocksize; ++b, ++i ) {
+ z[ i ] = static_cast< OutputType >( bb[ b ] );
+ }
+ } else {
+ for( size_t b = 0; b < Ring::blocksize; ++b, ++i ) {
+ z[ i ] = static_cast< OutputType >( zz[ b ] );
+ }
+ }
+ }
+
+ // perform tail
+ if( !a_scalar ) {
+ a += i;
+ }
+ if( !x_scalar ) {
+ x += i;
+ }
+ if( !y_scalar ) {
+ y += i;
+ }
+ z += i;
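+ // a, x, y, and z now point at the first element the blocked loop left
+ // untreated; the remaining end - i < Ring::blocksize elements are handled
+ // one by one below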
+ for( ; i < end; ++i ) {
+ // do multiply
+ const typename Ring::D1 &as = ( a_scalar )
+ ? static_cast< typename Ring::D1 >( a_wrapper.getValue() )
+ : static_cast< typename Ring::D1 >( *a );
+ const typename Ring::D2 &xs = ( x_scalar )
+ ? static_cast< typename Ring::D2 >( x_wrapper.getValue() )
+ : static_cast< typename Ring::D2 >( *x );
+ typename Ring::D4 ys = ( y_scalar )
+ ? static_cast< typename Ring::D4 >( y_wrapper.getValue() )
+ : static_cast< typename Ring::D4 >( *y );
+ typename Ring::D3 ts;
+
+ if( !y_zero ) {
+ RC always_succeeds = apply( ts, as, xs, ring.getMultiplicativeOperator() );
+ assert( always_succeeds == SUCCESS );
+ always_succeeds = foldr( ts, ys, ring.getAdditiveOperator() );
+ assert( always_succeeds == SUCCESS );
+#ifdef NDEBUG
+ (void) always_succeeds;
+#endif
+ } else {
+ RC always_succeeds = apply( ys, as, xs, ring.getMultiplicativeOperator() );
+ assert( always_succeeds == SUCCESS );
+#ifdef NDEBUG
+ (void) always_succeeds;
+#endif
+ }
+
+ // write out
+ if( assign_z ) {
+ *z = static_cast< OutputType >( ys );
+ } else {
+ RC always_succeeds = foldr( ys, *z, ring.getAdditiveOperator() );
+ assert( always_succeeds == SUCCESS );
+#ifdef NDEBUG
+ (void) always_succeeds;
+#endif
+ }
+
+ // move pointers
+ if( !a_scalar ) {
+ (void)a++;
+ }
+ if( !x_scalar ) {
+ (void)x++;
+ }
+ if( !y_scalar ) {
+ (void)y++;
+ }
+ (void)z++;
+ }
+
+ // done
+ return SUCCESS;
+ }
+
+ template<
+ Descriptor descr,
+ bool masked,
+ bool a_scalar,
+ bool x_scalar,
+ bool y_scalar,
+ bool y_zero,
+ typename MaskType,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename OutputType,
+ typename Coords
+ >
+ RC eWiseMulAdd_dispatch(
+ Vector< OutputType, nonblocking, Coords > &z_vector,
+ const Vector< MaskType, nonblocking, Coords > * const m_vector,
+ const internal::Wrapper< a_scalar, InputType1, Coords > &a_wrapper,
+ const internal::Wrapper< x_scalar, InputType2, Coords > &x_wrapper,
+ const internal::Wrapper< y_scalar, InputType3, Coords > &y_wrapper,
+ const size_t n,
+ const Ring &ring
+ ) {
+ static_assert( !y_zero || y_scalar, "If y is zero, y_scalar must be true. "
+ "Triggering this assertion indicates an incorrect call to this "
+ "function; please submit a bug report" );
+#ifdef _DEBUG
+ std::cout << "\t in eWiseMulAdd_dispatch\n";
+#endif
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func =
+ [&z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, &ring] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage eWiseMulAdd_dispatch in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_z, local_m, local_a, local_x, local_y;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_z_nz = local_n;
+ size_t local_m_nz = local_n;
+ size_t local_a_nz = local_n;
+ size_t local_x_nz = local_n;
+ size_t local_y_nz = local_n;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+ bool already_dense_output = true;
+ bool already_dense_mask = true;
+ bool already_dense_input_a = true;
+ bool already_dense_input_x = true;
+ bool already_dense_input_y = true;
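+ // the above flags are refined below; a vector the pipeline already knows
+ // to be dense needs no local coordinate view, so the kernels can skip all
+ // assigned()-checks for it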
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( z_vector ) );
+ if( !already_dense_output ) {
+#else
+ already_dense_output = false;
+#endif
+ local_z = internal::getCoordinates( z_vector ).asyncSubset( lower_bound,
+ upper_bound );
+ local_z_nz = local_z.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ if( masked ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_mask = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( *m_vector ) );
+ if( !already_dense_mask ) {
+#else
+ already_dense_mask = false;
+#endif
+ local_m = internal::getCoordinates( *m_vector ).asyncSubset(
+ lower_bound, upper_bound );
+ local_m_nz = local_m.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( !a_scalar ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_input_a = pipeline.containsAlreadyDenseVector(
+ a_wrapper.getCoordinates() );
+ if( !already_dense_input_a ) {
+#else
+ already_dense_input_a = false;
+#endif
+ local_a = a_wrapper.getCoordinates()->asyncSubset( lower_bound,
+ upper_bound );
+ local_a_nz = local_a.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( !x_scalar ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_input_x = pipeline.containsAlreadyDenseVector(
+ x_wrapper.getCoordinates() );
+ if( !already_dense_input_x ) {
+#else
+ already_dense_input_x = false;
+#endif
+ local_x = x_wrapper.getCoordinates()->asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( !y_scalar ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_input_y = pipeline.containsAlreadyDenseVector(
+ y_wrapper.getCoordinates() );
+ if( !already_dense_input_y ) {
+#else
+ already_dense_input_y = false;
+#endif
+ local_y = y_wrapper.getCoordinates()->asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+ }
+
+ // check whether we are in the sparse or dense case
+ const bool mask_is_dense = !masked || (
+ (descr & descriptors::structural) &&
+ !(descr & descriptors::invert_mask) &&
+ local_m_nz == local_n
+ );
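+ // (a structural, non-inverted mask with a nonzero at every local position
+ // lets every element through, and is hence equivalent to having no mask)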
+ const size_t z_nns = local_z_nz;
+
+ // the below Boolean shall be false only if the inputs a, x, and y generate
+ // a dense output vector while the output vector itself is either empty or
+ // already fully dense; only in that case can the dense variant of the
+ // eWiseMulAdd implementations be used
+ const bool sparse = ( a_scalar ? false : ( local_a_nz < local_n ) ) ||
+ ( x_scalar ? false : ( local_x_nz < local_n ) ) ||
+ ( y_scalar ? false : ( local_y_nz < local_n ) ) ||
+ ( z_nns > 0 && z_nns < local_n ) ||
+ ( masked && !mask_is_dense );
+ assert( !(sparse && dense_descr) );
+#ifdef _DEBUG
+ std::cout << "\t\t (sparse, dense)=(" << sparse << ", " << dense_descr
+ << ")\n";
+#endif
+ // pre-assign coordinates if the output is dense but was previously empty
+ const bool assign_z = z_nns == 0 && !sparse;
+
+ if( assign_z ) {
+#ifdef _DEBUG
+ std::cout << "\t\t detected output will be dense while "
+ << "the output vector presently is completely empty. We therefore "
+ << "pre-assign all output coordinates\n";
+#endif
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#endif
+ // the result will always be dense
+ local_z.local_assignAllNotAlreadyAssigned();
+ local_z_nz = local_z.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( !dense_descr && sparse ) {
+ // the below computes loop sizes multiplied with the number of vectors that
+ // each loop needs to touch. Possible vectors are: z, m, a, x, and y.
+ const size_t mask_factor = masked ? 1 : 0;
+ const size_t mul_loop_size = ( 3 + mask_factor ) * std::min(
+ ( a_scalar ? local_n : local_a_nz ),
+ ( x_scalar ? local_n : local_x_nz )
+ ) + ( y_zero ? 0 :
+ (2 + mask_factor) * ( y_scalar ? local_n : local_y_nz )
+ );
+#ifdef _DEBUG
+ std::cout << "\t\t mul_loop_size = " << mul_loop_size << "\n";
+#endif
+
+ const size_t mask_loop_size = ( y_zero ? 4 : 5 ) * local_m_nz;
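+ // e.g., for a masked call with nnz( a ) = nnz( x ) = n / 2 and a fully
+ // dense vector y, this gives mul_loop_size = 4 * ( n / 2 ) + 3 * n = 5n,
+ // to be compared against mask_loop_size = 5 * nnz( m ) below; the
+ // mask-driven kernel hence wins whenever nnz( m ) < n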
+
+ if( masked && mask_loop_size < mul_loop_size ) {
+#ifdef _DEBUG
+ std::cout << "\t\t mask_loop_size= " << mask_loop_size << "\n";
+ std::cout << "\t\t will be driven by output mask\n";
+#endif
+
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = boolean_dispatcher_sparse_eWiseMulAdd_maskDriven<
+#else
+ rc = sparse_eWiseMulAdd_maskDriven<
+#endif
+ descr, a_scalar, x_scalar, y_scalar, y_zero
+ >(
+ already_dense_output, already_dense_mask, already_dense_input_a,
+ already_dense_input_x, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_z, local_m, local_a, local_x, local_y,
+ z_vector, *m_vector, a_wrapper, x_wrapper, y_wrapper,
+ ring
+ );
+ } else {
+#ifdef _DEBUG
+ std::cout << "\t\t will be driven by the multiplication a*x\n";
+#endif
+ static_assert( !(a_scalar && x_scalar),
+ "The case of the multiplication being between two scalars should have "
+ "been caught earlier. Please submit a bug report." );
+
+ if( a_scalar ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = boolean_dispatcher_twoPhase_sparse_eWiseMulAdd_mulDriven<
+#else
+ rc = twoPhase_sparse_eWiseMulAdd_mulDriven<
+#endif
+ descr, masked, a_scalar, y_scalar, y_zero, true
+ >(
+ already_dense_output, already_dense_mask, already_dense_input_x,
+ already_dense_input_a, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_z, &local_m, local_x, local_a, local_y,
+ z_vector, m_vector, *(x_wrapper.getPointer()), a_wrapper, y_wrapper,
+ ring
+ );
+ } else if( x_scalar ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = boolean_dispatcher_twoPhase_sparse_eWiseMulAdd_mulDriven<
+#else
+ rc = twoPhase_sparse_eWiseMulAdd_mulDriven<
+#endif
+ descr, masked, x_scalar, y_scalar, y_zero, false
+ >(
+ already_dense_output, already_dense_mask, already_dense_input_a,
+ already_dense_input_x, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_z, &local_m, local_a, local_x, local_y,
+ z_vector, m_vector, *(a_wrapper.getPointer()), x_wrapper, y_wrapper,
+ ring
+ );
+ } else if( local_a_nz <= local_x_nz ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = boolean_dispatcher_twoPhase_sparse_eWiseMulAdd_mulDriven<
+#else
+ rc = twoPhase_sparse_eWiseMulAdd_mulDriven<
+#endif
+ descr, masked, x_scalar, y_scalar, y_zero, false
+ >(
+ already_dense_output, already_dense_mask, already_dense_input_a,
+ already_dense_input_x, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_z, &local_m, local_a, local_x, local_y,
+ z_vector, m_vector, *(a_wrapper.getPointer()), x_wrapper, y_wrapper,
+ ring
+ );
+ } else {
+ assert( local_x_nz < local_a_nz );
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = boolean_dispatcher_twoPhase_sparse_eWiseMulAdd_mulDriven<
+#else
+ rc = twoPhase_sparse_eWiseMulAdd_mulDriven<
+#endif
+ descr, masked, a_scalar, y_scalar, y_zero, true
+ >(
+ already_dense_output, already_dense_mask, already_dense_input_x,
+ already_dense_input_a, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_z, &local_m, local_x, local_a, local_y,
+ z_vector, m_vector, *(x_wrapper.getPointer()), a_wrapper, y_wrapper,
+ ring
+ );
+ }
+ }
+ } else {
+ // all that remains is the dense case
+ assert( a_scalar || local_a_nz == local_n );
+ assert( x_scalar || local_x_nz == local_n );
+ assert( y_scalar || local_y_nz == local_n );
+ assert( !masked || mask_is_dense );
+ assert( local_z_nz == local_n );
+#ifdef _DEBUG
+ std::cout << "\t\t will perform a dense eWiseMulAdd\n";
+#endif
+ if( assign_z ) {
+ rc = dense_eWiseMulAdd<
+ descr, a_scalar, x_scalar, y_scalar, y_zero, true
+ >(
+ lower_bound, upper_bound,
+ z_vector, a_wrapper, x_wrapper, y_wrapper,
+ ring
+ );
+ } else {
+ rc = dense_eWiseMulAdd<
+ descr, a_scalar, x_scalar, y_scalar, y_zero, false
+ >(
+ lower_bound, upper_bound,
+ z_vector, a_wrapper, x_wrapper, y_wrapper,
+ ring
+ );
+ }
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#else
+ if( !already_dense_vectors ) {
+#endif
+ internal::getCoordinates( z_vector ).asyncJoinSubset( local_z,
+ lower_bound, upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_EWISEMULADD_DISPATCH,
+ n, sizeof( OutputType ), dense_descr, true,
+ &z_vector, nullptr, &internal::getCoordinates( z_vector ), nullptr,
+ masked ? m_vector : nullptr, a_wrapper.getPointer(),
+ x_wrapper.getPointer(), y_wrapper.getPointer(),
+ masked ? &internal::getCoordinates( *m_vector ) : nullptr,
+ a_wrapper.getCoordinates(), x_wrapper.getCoordinates(),
+ y_wrapper.getCoordinates(),
+ nullptr
+ );
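+ // note: the above lambda is not executed here; it is recorded as a stage
+ // of the nonblocking pipeline, while the vectors and coordinate instances
+ // passed to addStage allow the pipeline to track data dependences between
+ // stages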
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: eWiseMulAdd_dispatch"
+ << std::endl;
+#endif
+ return ret;
+ }
+
+ } // namespace internal
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename OutputType,
+ typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const InputType1 alpha,
+ const Vector< InputType2, nonblocking, Coords > &x,
+ const Vector< InputType3, nonblocking, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a left-hand scalar alpha of an element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a right-hand vector x with an element type that does not "
+ "match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, InputType3 >::value ),
+ "grb::eWiseMulAdd",
+ "called with an additive vector y with an element type that does not "
+ "match the fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, OutputType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a result vector z with an element type that does not match "
+ "the fourth domain of the given semiring" );
+
+ // dynamic sanity checks
+ const size_t n = size( z );
+ if( size( x ) != n || size( y ) != n ) {
+ return MISMATCH;
+ }
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ // catch trivial cases
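+ // if alpha equals the additive identity then alpha * x contributes
+ // nothing, since zero annihilates under multiplication and is the
+ // identity under addition; the call hence reduces to folding y into z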
+ const InputType1 zeroIT1 = ring.template getZero< InputType1 >();
+ if( alpha == zeroIT1 ) {
+ return foldl< descr >( z, y, ring.getAdditiveMonoid() );
+ }
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+
+ const internal::Wrapper< true, InputType1, Coords > a_wrapper( alpha );
+ const internal::Wrapper< false, InputType2, Coords > x_wrapper( x );
+ const internal::Wrapper< false, InputType3, Coords > y_wrapper( y );
+
+ // sparse or dense case
+ return internal::eWiseMulAdd_dispatch<
+ descr, false, true, false, false, false,
+ bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords
+ >( z, null_mask, a_wrapper, x_wrapper, y_wrapper, n, ring );
+ }
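+ // a minimal usage sketch of the above variant, assuming a plus-times
+ // semiring over doubles: grb::eWiseMulAdd( z, 2.0, x, y, ring ) computes
+ // z( i ) = 2.0 * x( i ) + y( i ), dispatching with a_scalar = true so
+ // that alpha is broadcast via an internal::Wrapper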
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename OutputType,
+ typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< InputType1, nonblocking, Coords > &a,
+ const InputType2 chi,
+ const Vector< InputType3, nonblocking, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a left-hand scalar alpha of an element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a right-hand vector x with an element type that does not "
+ "match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, InputType3 >::value ),
+ "grb::eWiseMulAdd",
+ "called with an additive vector y with an element type that does not "
+ "match the fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, OutputType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a result vector z with an element type that does not match "
+ "the fourth domain of the given semiring" );
+
+ // dynamic sanity checks
+ const size_t n = size( z );
+ if( size( a ) != n || size( y ) != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ // catch trivial cases
+ const InputType2 zeroIT2 = ring.template getZero< InputType2 >();
+ if( chi == zeroIT2 ) {
+ return foldl< descr >( z, y, ring.getAdditiveMonoid() );
+ }
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+
+ const internal::Wrapper< false, InputType1, Coords > a_wrapper( a );
+ const internal::Wrapper< true, InputType2, Coords > x_wrapper( chi );
+ const internal::Wrapper< false, InputType3, Coords > y_wrapper( y );
+
+ // sparse or dense case
+ return internal::eWiseMulAdd_dispatch<
+ descr, false, false, true, false, false,
+ bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords
+ >( z, null_mask, a_wrapper, x_wrapper, y_wrapper, n, ring );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ bool y_zero = false,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename OutputType,
+ typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< InputType1, nonblocking, Coords > &a,
+ const Vector< InputType2, nonblocking, Coords > &x,
+ const InputType3 gamma,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a left-hand scalar alpha of an element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a right-hand vector x with an element type that does not "
+ "match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, InputType3 >::value ),
+ "grb::eWiseMulAdd",
+ "called with an additive vector y with an element type that does not "
+ "match the fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, OutputType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a result vector z with an element type that does not match "
+ "the fourth domain of the given semiring" );
+
+ // dynamic sanity checks
+ const size_t n = size( z );
+ if( size( a ) != n || size( x ) != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+
+ const internal::Wrapper< false, InputType1, Coords > a_wrapper( a );
+ const internal::Wrapper< false, InputType2, Coords > x_wrapper( x );
+ const internal::Wrapper< true, InputType3, Coords > y_wrapper( gamma );
+
+ // sparse or dense case
+ return internal::eWiseMulAdd_dispatch<
+ descr, false, false, false, true, y_zero,
+ bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords
+ >( z, null_mask, a_wrapper, x_wrapper, y_wrapper, n, ring );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ bool y_zero = false,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename OutputType,
+ typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< InputType1, nonblocking, Coords > &a,
+ const InputType2 beta,
+ const InputType3 gamma,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a left-hand scalar alpha of an element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a right-hand vector x with an element type that does not "
+ "match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, InputType3 >::value ),
+ "grb::eWiseMulAdd",
+ "called with an additive vector y with an element type that does not "
+ "match the fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, OutputType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a result vector z with an element type that does not match "
+ "the fourth domain of the given semiring" );
+
+ // dynamic sanity checks
+ const size_t n = size( z );
+ if( size( a ) != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ // catch trivial dispatches
+ const InputType2 zeroIT2 = ring.template getZero< InputType2 >();
+ if( beta == zeroIT2 ) {
+ return foldl< descr >( z, gamma, ring.getAdditiveMonoid() );
+ }
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+
+ const internal::Wrapper< false, InputType1, Coords > a_wrapper( a );
+ const internal::Wrapper< true, InputType2, Coords > x_wrapper( beta );
+ const internal::Wrapper< true, InputType3, Coords > y_wrapper( gamma );
+
+ // sparse or dense case
+ return internal::eWiseMulAdd_dispatch<
+ descr, false, false, true, true, y_zero,
+ bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords
+ >( z, null_mask, a_wrapper, x_wrapper, y_wrapper, n, ring );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ bool y_zero = false,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename OutputType,
+ typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const InputType1 alpha,
+ const Vector< InputType2, nonblocking, Coords > &x,
+ const InputType3 gamma,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a left-hand scalar alpha of an element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a right-hand vector x with an element type that does not "
+ "match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, InputType3 >::value ),
+ "grb::eWiseMulAdd",
+ "called with an additive vector y with an element type that does not "
+ "match the fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, OutputType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a result vector z with an element type that does not match "
+ "the fourth domain of the given semiring" );
+
+ // dynamic sanity checks
+ const size_t n = size( z );
+ if( size( x ) != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ // catch trivial cases
+ const InputType1 zeroIT1 = ring.template getZero< InputType1 >();
+ if( alpha == zeroIT1 ) {
+ return foldl< descr >( z, gamma, ring.getAdditiveMonoid() );
+ }
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+
+ const internal::Wrapper< true, InputType1, Coords > a_wrapper( alpha );
+ const internal::Wrapper< false, InputType2, Coords > x_wrapper( x );
+ const internal::Wrapper< true, InputType3, Coords > y_wrapper( gamma );
+
+ // sparse or dense case
+ return internal::eWiseMulAdd_dispatch<
+ descr, false, true, false, true, y_zero,
+ bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords
+ >( z, null_mask, a_wrapper, x_wrapper, y_wrapper, n, ring );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Vector< InputType3, nonblocking, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMulAdd(vector,scalar,scalar,scalar)",
+ "First domain of semiring does not match first input type" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMulAdd(vector,scalar,scalar,scalar)",
+ "Second domain of semiring does not match second input type" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, InputType3 >::value ),
+ "grb::eWiseMulAdd(vector,scalar,scalar,scalar)",
+ "Fourth domain of semiring does not match third input type" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, OutputType >::value ),
+ "grb::eWiseMulAdd(vector,scalar,scalar,scalar)",
+ "Fourth domain of semiring does not match output type" );
+#ifdef _DEBUG
+ std::cout << "eWiseMulAdd (nonblocking, vector <- scalar x scalar + vector) "
+ << "precomputes scalar multiply and dispatches to eWiseAdd (nonblocking, "
+ << "vector <- scalar + vector)\n";
+#endif
+ // dynamic checks
+ const size_t n = size( z );
+ if( size( y ) != n ) { return MISMATCH; }
+
+ typename Ring::D3 mul_result;
+ RC rc = grb::apply( mul_result, alpha, beta,
+ ring.getMultiplicativeOperator() );
+#ifdef NDEBUG
+ (void) rc;
+#else
+ assert( rc == SUCCESS );
+#endif
+ return eWiseAdd< descr >( z, mul_result, y, ring, phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const InputType3 gamma,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMulAdd(vector,scalar,scalar,scalar)",
+ "First domain of semiring does not match first input type" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMulAdd(vector,scalar,scalar,scalar)",
+ "Second domain of semiring does not match second input type" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, InputType3 >::value ),
+ "grb::eWiseMulAdd(vector,scalar,scalar,scalar)",
+ "Fourth domain of semiring does not match third input type" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, OutputType >::value ),
+ "grb::eWiseMulAdd(vector,scalar,scalar,scalar)",
+ "Fourth domain of semiring does not match output type" );
+#ifdef _DEBUG
+ std::cout << "eWiseMulAdd (nonblocking, vector <- scalar x scalar + scalar) "
+ << "precomputes scalar operations and dispatches to set (nonblocking)\n";
+#endif
+ typename Ring::D3 mul_result;
+ RC rc = grb::apply( mul_result, alpha, beta,
+ ring.getMultiplicativeOperator() );
+#ifdef NDEBUG
+ (void) rc;
+#else
+ assert( rc == SUCCESS );
+#endif
+ typename Ring::D4 add_result;
+ rc = grb::apply( add_result, mul_result, gamma, ring.getAdditiveOperator() );
+#ifdef NDEBUG
+ (void) rc;
+#else
+ assert( rc == SUCCESS );
+#endif
+ return grb::foldl< descr >( z, add_result, ring.getAdditiveMonoid(), phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename OutputType,
+ typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< InputType1, nonblocking, Coords > &a,
+ const Vector< InputType2, nonblocking, Coords > &x,
+ const Vector< InputType3, nonblocking, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a left-hand vector a with an element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a right-hand vector x with an element type that does not "
+ "match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, InputType3 >::value ),
+ "grb::eWiseMulAdd",
+ "called with an additive vector y with an element type that does not "
+ "match the fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, OutputType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a result vector z with an element type that does not match "
+ "the fourth domain of the given semiring" );
+
+ // dynamic sanity checks
+ const size_t n = size( z );
+ if( size( x ) != n || size( y ) != n || size( a ) != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+
+ const internal::Wrapper< false, InputType1, Coords > a_wrapper( a );
+ const internal::Wrapper< false, InputType2, Coords > x_wrapper( x );
+ const internal::Wrapper< false, InputType3, Coords > y_wrapper( y );
+
+ // sparse or dense case
+ return internal::eWiseMulAdd_dispatch<
+ descr, false, false, false, false, false,
+ bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords
+ >( z, null_mask, a_wrapper, x_wrapper, y_wrapper, n, ring );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename OutputType,
+ typename MaskType,
+ typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const InputType1 alpha,
+ const Vector< InputType2, nonblocking, Coords > &x,
+ const Vector< InputType3, nonblocking, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value &&
+ !grb::is_object< MaskType >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a left-hand scalar alpha of an element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a right-hand vector x with an element type that does not "
+ "match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, InputType3 >::value ),
+ "grb::eWiseMulAdd",
+ "called with an additive vector y with an element type that does not "
+ "match the fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, OutputType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a result vector z with an element type that does not match "
+ "the fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a mask vector m with a non-bool element type" );
+
+ // catch empty mask
+ if( size( m ) == 0 ) {
+ return eWiseMulAdd< descr >( z, alpha, x, y, ring, phase );
+ }
+
+ // dynamic sanity checks
+ const size_t n = size( z );
+ if( size( x ) != n || size( y ) != n || size( m ) != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ // catch trivial cases
+ const InputType1 zeroIT1 = ring.template getZero< InputType1 >();
+ if( alpha == zeroIT1 ) {
+ return foldl< descr >( z, m, y, ring.getAdditiveMonoid() );
+ }
+
+ const internal::Wrapper< true, InputType1, Coords > a_wrapper( alpha );
+ const internal::Wrapper< false, InputType2, Coords > x_wrapper( x );
+ const internal::Wrapper< false, InputType3, Coords > y_wrapper( y );
+
+ // sparse or dense case
+ return internal::eWiseMulAdd_dispatch<
+ descr, true, true, false, false, false,
+ bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords
+ >( z, &m, a_wrapper, x_wrapper, y_wrapper, n, ring );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename OutputType,
+ typename MaskType,
+ typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const Vector< InputType1, nonblocking, Coords > &a,
+ const InputType2 chi,
+ const Vector< InputType3, nonblocking, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value &&
+ !grb::is_object< MaskType >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a left-hand scalar alpha of an element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a right-hand vector x with an element type that does not "
+ "match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, InputType3 >::value ),
+ "grb::eWiseMulAdd",
+ "called with an additive vector y with an element type that does not "
+ "match the fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, OutputType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a result vector z with an element type that does not match "
+ "the fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a mask vector m with a non-bool element type" );
+
+ // catch empty mask
+ if( size( m ) == 0 ) {
+ return eWiseMulAdd< descr >( z, a, chi, y, ring, phase );
+ }
+
+ // dynamic sanity checks
+ const size_t n = size( z );
+ if( size( a ) != n || size( y ) != n || size( m ) != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ // catch trivial cases
+ const InputType2 zeroIT2 = ring.template getZero< InputType2 >();
+ if( chi == zeroIT2 ) {
+ return foldl< descr >( z, m, y, ring.getAdditiveMonoid() );
+ }
+
+ const internal::Wrapper< false, InputType1, Coords > a_wrapper( a );
+ const internal::Wrapper< true, InputType2, Coords > x_wrapper( chi );
+ const internal::Wrapper< false, InputType3, Coords > y_wrapper( y );
+
+ // sparse or dense case
+ return internal::eWiseMulAdd_dispatch<
+ descr, true, false, true, false, false,
+ bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords
+ >( z, &m, a_wrapper, x_wrapper, y_wrapper, n, ring );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ bool y_zero = false,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename OutputType,
+ typename MaskType,
+ typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const Vector< InputType1, nonblocking, Coords > &a,
+ const Vector< InputType2, nonblocking, Coords > &x,
+ const InputType3 gamma,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value &&
+ !grb::is_object< MaskType >::value, void
+ >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a left-hand scalar alpha of an element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a right-hand vector x with an element type that does not "
+ "match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, InputType3 >::value ),
+ "grb::eWiseMulAdd",
+ "called with an additive vector y with an element type that does not "
+ "match the fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, OutputType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a result vector z with an element type that does not match "
+ "the fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a mask vector m with a non-bool element type" );
+
+ // catch empty mask
+ if( size( m ) == 0 ) {
+ return eWiseMulAdd< descr, y_zero >( z, a, x, gamma, ring, phase );
+ }
+
+ // dynamic sanity checks
+ const size_t n = size( z );
+ if( size( a ) != n || size( x ) != n || size( m ) != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ const internal::Wrapper< false, InputType1, Coords > a_wrapper( a );
+ const internal::Wrapper< false, InputType2, Coords > x_wrapper( x );
+ const internal::Wrapper< true, InputType3, Coords > y_wrapper( gamma );
+
+ // sparse or dense case
+ return internal::eWiseMulAdd_dispatch<
+ descr, true, false, false, true, y_zero,
+ bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords
+ >( z, &m, a_wrapper, x_wrapper, y_wrapper, n, ring );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ bool y_zero = false,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename OutputType,
+ typename MaskType,
+ typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const Vector< InputType1, nonblocking, Coords > &a,
+ const InputType2 beta,
+ const InputType3 gamma,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value &&
+ !grb::is_object< MaskType >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a left-hand scalar alpha of an element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a right-hand vector x with an element type that does not "
+ "match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, InputType3 >::value ),
+ "grb::eWiseMulAdd",
+ "called with an additive vector y with an element type that does not "
+ "match the fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, OutputType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a result vector z with an element type that does not match "
+ "the fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a mask vector m with a non-bool element type" );
+
+ // catch empty mask
+ if( size( m ) == 0 ) {
+ return eWiseMulAdd< descr, y_zero >( z, a, beta, gamma, ring, phase );
+ }
+
+ // dynamic sanity checks
+ const size_t n = size( z );
+ if( size( a ) != n || size( m ) != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ // catch trivial dispatch
+ const InputType2 zeroIT2 = ring.template getZero< InputType2 >();
+ if( zeroIT2 == beta ) {
+#ifdef _DEBUG
+ std::cout << "eWiseMulAdd (nonblocking, masked, vector<-vector<-scalar<-"
+ << "scalar) dispatches to foldl\n";
+#endif
+ return foldl< descr >( z, m, gamma, ring.getAdditiveMonoid() );
+ }
+
+ const internal::Wrapper< false, InputType1, Coords > a_wrapper( a );
+ const internal::Wrapper< true, InputType2, Coords > x_wrapper( beta );
+ const internal::Wrapper< true, InputType3, Coords > y_wrapper( gamma );
+
+ // sparse or dense case
+ return internal::eWiseMulAdd_dispatch<
+ descr, true, false, true, true, y_zero,
+ bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords
+ >( z, &m, a_wrapper, x_wrapper, y_wrapper, n, ring );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ bool y_zero = false,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename OutputType,
+ typename MaskType,
+ typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const InputType1 alpha,
+ const Vector< InputType2, nonblocking, Coords > &x,
+ const InputType3 gamma,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value &&
+ !grb::is_object< MaskType >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a left-hand scalar alpha of an element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a right-hand vector x with an element type that does not "
+ "match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, InputType3 >::value ),
+ "grb::eWiseMulAdd",
+ "called with an additive vector y with an element type that does not "
+ "match the fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, OutputType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a result vector z with an element type that does not match "
+ "the fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a mask vector m with a non-bool element type" );
+
+ // catch empty mask
+ if( size( m ) == 0 ) {
+ return eWiseMulAdd< descr, y_zero >( z, alpha, x, gamma, ring, phase );
+ }
+
+ // dynamic sanity checks
+ const size_t n = size( z );
+ if( size( x ) != n || size( m ) != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ // catch trivial dispatch
+ const InputType1 zeroIT1 = ring.template getZero< InputType1 >();
+ if( alpha == zeroIT1 ) {
+#ifdef _DEBUG
+ std::cout << "eWiseMulAdd (nonblocking, masked, vector<-scalar<-scalar<-"
+ << "scalar) dispatches to foldl\n";
+#endif
+ return foldl< descr >( z, m, gamma, ring.getAdditiveMonoid() );
+ }
+
+ const internal::Wrapper< true, InputType1, Coords > a_wrapper( alpha );
+ const internal::Wrapper< false, InputType2, Coords > x_wrapper( x );
+ const internal::Wrapper< true, InputType3, Coords > y_wrapper( gamma );
+
+ // sparse or dense case
+ return internal::eWiseMulAdd_dispatch<
+ descr, true, true, false, true, y_zero,
+ bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords
+ >( z, &m, a_wrapper, x_wrapper, y_wrapper, n, ring );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename OutputType,
+ typename MaskType,
+ typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const Vector< InputType1, nonblocking, Coords > &a,
+ const Vector< InputType2, nonblocking, Coords > &x,
+ const Vector< InputType3, nonblocking, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value &&
+ !grb::is_object< MaskType >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a left-hand vector a with an element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a right-hand vector x with an element type that does not "
+ "match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, InputType3 >::value ),
+ "grb::eWiseMulAdd",
+ "called with an additive vector y with an element type that does not "
+ "match the fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, OutputType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a result vector z with an element type that does not match "
+ "the fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a mask vector m with a non-bool element type" );
+
+ // catch empty mask
+ if( size( m ) == 0 ) {
+ return eWiseMulAdd< descr >( z, a, x, y, ring, phase );
+ }
+
+ // dynamic sanity checks
+ const size_t n = size( z );
+ if( size( x ) != n || size( y ) != n || size( a ) != n || size( m ) != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ const internal::Wrapper< false, InputType1, Coords > a_wrapper( a );
+ const internal::Wrapper< false, InputType2, Coords > x_wrapper( x );
+ const internal::Wrapper< false, InputType3, Coords > y_wrapper( y );
+
+ // sparse or dense case
+ return internal::eWiseMulAdd_dispatch<
+ descr, true, false, false, false, false,
+ bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords
+ >( z, &m, a_wrapper, x_wrapper, y_wrapper, n, ring );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Vector< InputType3, nonblocking, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value &&
+ !grb::is_object< MaskType >::value, void
+ >::type * const = nullptr
+ ) {
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMulAdd(vector,scalar,scalar,scalar)",
+ "First domain of semiring does not match first input type" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMulAdd(vector,scalar,scalar,scalar)",
+ "Second domain of semiring does not match second input type" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, InputType3 >::value ),
+ "grb::eWiseMulAdd(vector,scalar,scalar,scalar)",
+ "Fourth domain of semiring does not match third input type" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, OutputType >::value ),
+ "grb::eWiseMulAdd(vector,scalar,scalar,scalar)",
+ "Fourth domain of semiring does not match output type" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a mask vector with a non-bool element type" );
+#ifdef _DEBUG
+ std::cout << "eWiseMulAdd (nonblocking, vector <- scalar x scalar + vector, "
+ << "masked) precomputes scalar multiply and dispatches to eWiseAdd "
+ << "(nonblocking, vector <- scalar + vector, masked)\n";
+#endif
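+ // note (illustrative): since both alpha and beta are scalars, the
+ // multiplicative part reduces to a single application of the ring's
+ // multiplicative operator, after which this primitive dispatches to exactly
+ // eWiseAdd( z, m, alpha * beta, y ); e.g., under a plus-times semiring over
+ // doubles with alpha = 2 and beta = 3, the constant 6 takes the place of
+ // the left-hand input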
+ // dynamic checks
+ const size_t n = size( z );
+ if( size( m ) != n || size( y ) != n ) {
+ return MISMATCH;
+ }
+
+ typename Ring::D3 mul_result;
+ RC rc = grb::apply( mul_result, alpha, beta,
+ ring.getMultiplicativeOperator() );
+#ifdef NDEBUG
+ (void) rc;
+#else
+ assert( rc == SUCCESS );
+#endif
+ return grb::eWiseAdd< descr >( z, m, mul_result, y, ring, phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const InputType3 gamma,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMulAdd(vector,scalar,scalar,scalar)",
+ "First domain of semiring does not match first input type" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMulAdd(vector,scalar,scalar,scalar)",
+ "Second domain of semiring does not match second input type" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, InputType3 >::value ),
+ "grb::eWiseMulAdd(vector,scalar,scalar,scalar)",
+ "Fourth domain of semiring does not match third input type" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, OutputType >::value ),
+ "grb::eWiseMulAdd(vector,scalar,scalar,scalar)",
+ "Fourth domain of semiring does not match output type" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a mask vector with a non-bool element type" );
+#ifdef _DEBUG
+ std::cout << "eWiseMulAdd (nonblocking, vector <- scalar x scalar + scalar, "
+ << "masked) precomputes scalar operations and dispatches to foldl "
+ << "(nonblocking, masked)\n";
+#endif
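+ // note: all three inputs are scalars, so both semiring applications happen
+ // once below, and the only vector work left is a single masked foldl of
+ // the constant (alpha * beta) + gamma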
+ // dynamic checks
+ const size_t n = size( z );
+ if( size( m ) != n ) {
+ return MISMATCH;
+ }
+
+ typename Ring::D3 mul_result;
+ RC rc = grb::apply( mul_result, alpha, beta,
+ ring.getMultiplicativeOperator() );
+ assert( rc == SUCCESS );
+#ifdef NDEBUG
+ (void) rc;
+#endif
+ typename Ring::D4 add_result;
+ rc = grb::apply( add_result, mul_result, gamma, ring.getAdditiveOperator() );
+ assert( rc == SUCCESS );
+#ifdef NDEBUG
+ (void) rc;
+#endif
+ return grb::foldl( z, m, add_result, ring.getAdditiveMonoid(), phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename OutputType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const Ring & ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMul",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMul",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ),
+ "grb::eWiseMul",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+
+ // dynamic checks
+ const size_t n = size( z );
+ if( size( x ) != n || size( y ) != n ) {
+ return MISMATCH;
+ }
+
+ // check trivial phase
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+
+#ifdef _DEBUG
+ std::cout << "eWiseMul (nonblocking, vector <- vector x vector) dispatches "
+ << "to eWiseMulAdd (vector <- vector x vector + 0)\n";
+#endif
+ return eWiseMulAdd< descr, true >(
+ z, x, y, ring.template getZero< typename Ring::D4 >(), ring, phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename OutputType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const InputType1 alpha,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMul",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMul",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ),
+ "grb::eWiseMul",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+
+ // dynamic checks
+ const size_t n = size( z );
+ if( size( y ) != n ) { return MISMATCH; }
+
+ // check for trivial phase
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+
+ // check trivial
+ if( alpha == ring.template getZero< typename Ring::D1 >() ) {
+ return SUCCESS;
+ }
+
+#ifdef _DEBUG
+ std::cout << "eWiseMul (nonblocking, vector <- scalar x vector) dispatches "
+ << "to eWiseMulAdd (vector <- scalar x vector + 0)\n";
+#endif
+ return eWiseMulAdd< descr, true >(
+ z, alpha, y, ring.template getZero< typename Ring::D4 >(), ring, phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename OutputType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMul",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMul",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ),
+ "grb::eWiseMul",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+
+ // dynamic checks
+ const size_t n = size( z );
+ if( size( x ) != n ) {
+ return MISMATCH;
+ }
+
+ // catch trivial phase
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+
+ // check trivial
+ if( beta == ring.template getZero< typename Ring::D2 >() ) {
+ return SUCCESS;
+ }
+
+#ifdef _DEBUG
+ std::cout << "eWiseMul (nonblocking) dispatches to eWiseMulAdd with 0.0 as "
+ << "additive scalar\n";
+#endif
+
+ return eWiseMulAdd< descr, true >(
+ z, x, beta, ring.template getZero< typename Ring::D4 >(), ring, phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename OutputType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMul",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMul",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ),
+ "grb::eWiseMul",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+
+ // check for trivial phase
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+
+ // check trivial
+ if( alpha == ring.template getZero< typename Ring::D1 >() ) {
+ return SUCCESS;
+ }
+ if( beta == ring.template getZero< typename Ring::D2 >() ) {
+ return SUCCESS;
+ }
+
+#ifdef _DEBUG
+ std::cout << "eWiseMul (nonblocking) dispatches to scalar apply and foldl\n";
+#endif
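+ // e.g. (illustrative): under a plus-times semiring over doubles with
+ // alpha = 2 and beta = 3, the code below folds the constant 6 into z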
+ typename Ring::D3 temp;
+ RC always_success = apply( temp, alpha, beta,
+ ring.getMultiplicativeOperator() );
+ assert( always_success == SUCCESS );
+#ifdef NDEBUG
+ (void) always_success;
+#endif
+ return foldl< descr >( z, temp, ring.getAdditiveMonoid(), phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename OutputType,
+ typename MaskType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< MaskType >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMul",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMul",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ),
+ "grb::eWiseMul",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a mask vector with a non-bool element type" );
+
+ // check for empty mask
+ if( size( m ) == 0 ) {
+ return eWiseMul< descr >( z, x, y, ring, phase );
+ }
+
+ // dynamic checks
+ const size_t n = size( z );
+ if( size( m ) != n || size( x ) != n || size( y ) != n ) {
+ return MISMATCH;
+ }
+
+ // check trivial phase
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+
+#ifdef _DEBUG
+ std::cout << "eWiseMul (nonblocking, vector <- vector x vector, masked) "
+ << "dispatches to eWiseMulAdd (vector <- vector x vector + 0, masked)\n";
+#endif
+ return eWiseMulAdd< descr, true >(
+ z, m, x, y, ring.template getZero< typename Ring::D4 >(), ring, phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename OutputType,
+ typename MaskType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const InputType1 alpha,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< MaskType >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMul",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMul",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ),
+ "grb::eWiseMul",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a mask vector _m with a non-bool element type" );
+
+ // check for empty mask
+ if( size( m ) == 0 ) {
+ return eWiseMul< descr >( z, alpha, y, ring, phase );
+ }
+
+ // dynamic checks
+ const size_t n = size( z );
+ if( size( m ) != n || size( y ) != n ) { return MISMATCH; }
+
+ // check for trivial phase
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+
+ // check trivial
+ if( alpha == ring.template getZero< typename Ring::D1 >() ) {
+ return SUCCESS;
+ }
+
+#ifdef _DEBUG
+ std::cout << "eWiseMul (nonblocking, vector <- scalar x vector, masked) "
+ << "dispatches to eWiseMulAdd (vector <- scalar x vector + 0, masked)\n";
+#endif
+ return eWiseMulAdd< descr, true >(
+ z, m, alpha, y, ring.template getZero< typename Ring::D4 >(), ring, phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename OutputType,
+ typename MaskType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< MaskType >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMul",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMul",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ),
+ "grb::eWiseMul",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a mask vector _m with a non-bool element type" );
+
+ // check for empty mask
+ if( size( m ) == 0 ) {
+ return eWiseMul< descr >( z, x, beta, ring, phase );
+ }
+
+ // dynamic checks
+ const size_t n = size( z );
+ if( size( m ) != n || size( x ) != n ) { return MISMATCH; }
+
+ // check for trivial phase
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+
+ // check trivial
+ if( beta == ring.template getZero< typename Ring::D2 >() ) {
+ return SUCCESS;
+ }
+
+#ifdef _DEBUG
+ std::cout << "eWiseMul (nonblocking, masked) dispatches to masked "
+ << "eWiseMulAdd with 0.0 as additive scalar\n";
+#endif
+ return eWiseMulAdd< descr, true >(
+ z, m, x, beta, ring.template getZero< typename Ring::D4 >(), ring, phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename OutputType,
+ typename MaskType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< MaskType >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMul",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMul",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ),
+ "grb::eWiseMul",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a mask vector _m with a non-bool element type" );
+
+ // check for empty mask
+ if( size( m ) == 0 ) {
+ return eWiseMul< descr >( z, alpha, beta, ring, phase );
+ }
+
+ // dynamic checks
+ const size_t n = size( z );
+ if( size( m ) != n ) { return MISMATCH; }
+
+ // check for trivial phase
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+
+ // check trivial
+ if( alpha == ring.template getZero< typename Ring::D1 >() ) {
+ return SUCCESS;
+ }
+ if( beta == ring.template getZero< typename Ring::D2 >() ) {
+ return SUCCESS;
+ }
+
+#ifdef _DEBUG
+ std::cout << "eWiseMul (nonblocking, masked) dispatches to masked foldl\n";
+#endif
+ typename Ring::D3 temp;
+ const RC always_success = apply( temp, alpha, beta,
+ ring.getMultiplicativeOperator() );
+ assert( always_success == SUCCESS );
+#ifdef NDEBUG
+ (void) always_success;
+#endif
+ return foldl< descr >( z, m, temp, ring.getAdditiveMonoid(), EXECUTE );
+ }
+
+ // internal namespace for implementation of grb::dot
+ namespace internal {
+
+ template<
+ Descriptor descr,
+#ifdef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_input_x,
+ bool already_dense_input_y,
+#endif
+ class AddMonoid,
+ class AnyOp,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC sparse_dot_generic(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_input_x,
+ bool already_dense_input_y,
+#endif
+ typename AddMonoid::D3 &thread_local_output,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ const Coords &local_x,
+ const Coords &local_y,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const size_t local_nz,
+ const AddMonoid &addMonoid,
+ const AnyOp &anyOp
+ ) {
+#ifdef _DEBUG
+ std::cout << "\t\t in sparse variant, nonzero range " << lower_bound << "--"
+ << upper_bound << ", blocksize " << AnyOp::blocksize << "\n";
+#else
+ (void) upper_bound;
+#endif
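+ // the hot loop below processes AnyOp::blocksize elements at a time: it
+ // first computes a boolean mask recording which enumerated nonzeroes also
+ // have an assigned counterpart in the other vector, then performs masked
+ // loads and masked multiplications, and finally reduces the block into the
+ // thread-local accumulator; a scalar loop handles any remainder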
+
+ // get raw alias
+ const InputType1 * __restrict__ a = internal::getRaw( x );
+ const InputType2 * __restrict__ b = internal::getRaw( y );
+
+ size_t i = 0;
+ if( local_nz > 0 ) {
+ while( i + AnyOp::blocksize < local_nz ) {
+ // declare buffers
+ static_assert( AnyOp::blocksize > 0,
+ "Configuration error: vectorisation blocksize set to 0!" );
+ typename AnyOp::D1 xx[ AnyOp::blocksize ];
+ typename AnyOp::D2 yy[ AnyOp::blocksize ];
+ typename AnyOp::D3 zz[ AnyOp::blocksize ];
+ bool mask[ AnyOp::blocksize ];
+
+ // prepare registers
+ for( size_t k = 0; k < AnyOp::blocksize; ++k, ++i ) {
+ mask[ k ] = already_dense_input_x ||
+ local_x.assigned( already_dense_input_y ? i : local_y.index( i ) );
+ }
+
+ // rewind
+ i -= AnyOp::blocksize;
+
+ // do masked load
+ for( size_t k = 0; k < AnyOp::blocksize; ++k, ++i ) {
+ if( mask[ k ] ) {
+ xx[ k ] = static_cast< typename AnyOp::D1 >(
+ a[ ( already_dense_input_y ? i : local_y.index( i ) ) + lower_bound ] );
+ yy[ k ] = static_cast< typename AnyOp::D2 >(
+ b[ ( already_dense_input_y ? i : local_y.index( i ) ) + lower_bound ] );
+ }
+ }
+
+ // perform element-wise multiplication
+ if( internal::maybe_noop< AnyOp >::value ) {
+ // we are forced to first initialise zz before doing masked apply
+ for( size_t k = 0; k < AnyOp::blocksize; ++k ) {
+ zz[ k ] = addMonoid.template getIdentity< typename AnyOp::D3 >();
+ }
+ for( size_t k = 0; k < AnyOp::blocksize; ++k ) {
+ if( mask[ k ] ) {
+ // xx and yy cannot be used uninitialised here: had they been, the mask
+ // would have been false; zz was initialised just above
+ GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED
+ apply( zz[ k ], xx[ k ], yy[ k ], anyOp );
+ GRB_UTIL_RESTORE_WARNINGS
+ }
+ }
+ } else {
+ // if apply surely initialises zz, we could use a blend-like op
+ for( size_t k = 0; k < AnyOp::blocksize; ++k ) {
+ if( mask[ k ] ) {
+ apply( zz[ k ], xx[ k ], yy[ k ], anyOp );
+ } else {
+ zz[ k ] = addMonoid.template getIdentity< typename AnyOp::D3 >();
+ }
+ }
+ }
+
+ // perform reduction into output element
+ addMonoid.getOperator().foldlArray( thread_local_output, zz,
+ AnyOp::blocksize );
+ //^--> note that this foldl operates on raw arrays,
+ // and thus should not be mistaken with a foldl
+ // on a grb::Vector.
+ }
+
+ // perform element-by-element updates for remainder (if any)
+ for( ; i < local_nz; ++i ) {
+ typename AddMonoid::D3 temp =
+ addMonoid.template getIdentity< typename AddMonoid::D3 >();
+ const size_t index = ( already_dense_input_y ? i : local_y.index( i ) ) +
+ lower_bound;
+ if( already_dense_input_x || local_x.assigned( index - lower_bound ) ) {
+ apply( temp, a[ index ], b[ index ], anyOp );
+ foldr( temp, thread_local_output, addMonoid.getOperator() );
+ }
+ }
+ }
+
+ return SUCCESS;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class AddMonoid,
+ class AnyOp,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC dot_generic(
+ OutputType &z,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const AddMonoid &addMonoid,
+ const AnyOp &anyOp,
+ const Phase &phase
+ ) {
+ const size_t n = internal::getCoordinates( x ).size();
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ RC ret = SUCCESS;
+
+ const size_t start = 0;
+ const size_t end = n;
+
+ if( end > start ) {
+
+ typename AddMonoid::D3 reduced =
+ addMonoid.template getIdentity< typename AddMonoid::D3 >();
+
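+ // one partial result per hardware thread, strided by the cache line size
+ // so that concurrent updates do not incur false sharing; note that the
+ // runtime-sized array below relies on the compiler supporting VLAs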
+ size_t reduced_size = sysconf( _SC_NPROCESSORS_ONLN ) *
+ config::CACHE_LINE_SIZE::value();
+ typename AddMonoid::D3 array_reduced[ reduced_size ];
+
+ for(
+ size_t i = 0;
+ i < reduced_size;
+ i += config::CACHE_LINE_SIZE::value()
+ ) {
+ array_reduced[ i ] =
+ addMonoid.template getIdentity< typename AddMonoid::D3 >();
+ }
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func =
+ [&x, &y, &addMonoid, &anyOp, &array_reduced] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage dot-generic in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_x, local_y;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_x_nz = local_n;
+ size_t local_y_nz = local_n;
+ bool sparse = false;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+ bool already_dense_input_x = true;
+ bool already_dense_input_y = true;
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_input_x = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_input_x ) {
+#else
+ already_dense_input_x = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset(
+ lower_bound, upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( local_x_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input_y = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_input_y ) {
+#else
+ already_dense_input_y = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+ if( local_y_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ unsigned int thread_id =
+ omp_get_thread_num() * config::CACHE_LINE_SIZE::value();
+
+ if( sparse ) {
+ if( local_x_nz < local_y_nz ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_sparse_dot_generic<
+#else
+ rc = internal::sparse_dot_generic<
+#endif
+ descr, AddMonoid, AnyOp, InputType1, InputType2, Coords
+ >(
+ already_dense_input_y, already_dense_input_x,
+ array_reduced[ thread_id ],
+ lower_bound, upper_bound,
+ local_y, local_x,
+ x, y,
+ local_x_nz,
+ addMonoid, anyOp
+ );
+ } else {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_sparse_dot_generic<
+#else
+ rc = internal::sparse_dot_generic<
+#endif
+ descr, AddMonoid, AnyOp, InputType1, InputType2, Coords
+ >(
+ already_dense_input_x, already_dense_input_y,
+ array_reduced[ thread_id ],
+ lower_bound, upper_bound,
+ local_x, local_y, x, y, local_y_nz,
+ addMonoid, anyOp
+ );
+ }
+ } else {
+ // get raw alias
+ const InputType1 * __restrict__ a = internal::getRaw( x );
+ const InputType2 * __restrict__ b = internal::getRaw( y );
+
+ size_t i = lower_bound;
+ if( upper_bound > lower_bound ) {
+ while( i + AnyOp::blocksize < upper_bound ) {
+ // declare buffers
+ static_assert( AnyOp::blocksize > 0,
+ "Configuration error: vectorisation blocksize set to 0!" );
+
+ typename AnyOp::D1 xx[ AnyOp::blocksize ];
+ typename AnyOp::D2 yy[ AnyOp::blocksize ];
+ typename AnyOp::D3 zz[ AnyOp::blocksize ];
+
+ // prepare registers
+ for( size_t k = 0; k < AnyOp::blocksize; ++k ) {
+ xx[ k ] = static_cast< typename AnyOp::D1 >( a[ i ] );
+ yy[ k ] = static_cast< typename AnyOp::D2 >( b[ i++ ] );
+ }
+
+ // perform element-wise multiplication
+ if( internal::maybe_noop< AnyOp >::value ) {
+ for( size_t k = 0; k < AnyOp::blocksize; ++k ) {
+ zz[ k ] = addMonoid.template getIdentity< typename AnyOp::D3 >();
+ }
+ }
+ for( size_t k = 0; k < AnyOp::blocksize; ++k ) {
+ apply( zz[ k ], xx[ k ], yy[ k ], anyOp );
+ }
+
+ // perform reduction into output element
+ addMonoid.getOperator().foldlArray( array_reduced[ thread_id ], zz,
+ AnyOp::blocksize );
+ //^--> note that this foldl operates on raw arrays,
+ // and thus should not be mistaken with a foldl
+ // on a grb::Vector.
+#ifdef _DEBUG
+ std::cout << "\t\t " << ( i - AnyOp::blocksize ) << "--" << i << ": "
+ << "running reduction = " << array_reduced[ thread_id ] << "\n";
+#endif
+ }
+
+ // perform element-by-element updates for remainder (if any)
+ for( ; i < upper_bound; ++i ) {
+ OutputType temp = addMonoid.template getIdentity< OutputType >();
+ apply( temp, a[ i ], b[ i ], anyOp );
+ foldr( temp, array_reduced[ thread_id ], addMonoid.getOperator() );
+ }
+ }
+ }
+
+ // the local coordinates for the input vectors have not been updated as
+ // they are read-only therefore, we don't need to invoke asyncJoinSubset;
+ // the output is a scalar
+ return rc;
+ };
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: dot-generic" << std::endl;
+#endif
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_DOT_GENERIC,
+ end, sizeof( OutputType ), dense_descr, true,
+ nullptr, nullptr, nullptr, nullptr,
+ &x, &y, nullptr, nullptr,
+ &internal::getCoordinates( x ), &internal::getCoordinates( y ),
+ nullptr, nullptr,
+ nullptr
+ );
+
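+ // the thread-local partial results are combined below; this presumably
+ // relies on a stage with scalar output (such as BLAS1_DOT_GENERIC)
+ // triggering execution of the pipeline at the point it is added, an
+ // assumption inferred from array_reduced being consumed immediately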
+ for(
+ size_t i = 0;
+ i < reduced_size;
+ i += config::CACHE_LINE_SIZE::value()
+ ) {
+ foldl( reduced, array_reduced[ i ], addMonoid.getOperator() );
+ }
+
+ // write back result
+ z = static_cast< OutputType >( reduced );
+ } else {
+ // n == 0: there is nothing to reduce, and the output scalar retains its
+ // initial value (this case is covered by the unit tests)
+ }
+
+#ifdef _DEBUG
+ std::cout << "\t returning " << z << "\n";
+#endif
+ // done!
+ return ret;
+ }
+
+ } // namespace internal
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class AddMonoid,
+ class AnyOp,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC dot(
+ OutputType &z,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const AddMonoid &addMonoid = AddMonoid(),
+ const AnyOp &anyOp = AnyOp(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< AddMonoid >::value &&
+ grb::is_operator< AnyOp >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType1, typename AnyOp::D1 >::value ), "grb::dot",
+ "called with a left-hand vector value type that does not match the first "
+ "domain of the given multiplicative operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType2, typename AnyOp::D2 >::value ), "grb::dot",
+ "called with a right-hand vector value type that does not match the second "
+ "domain of the given multiplicative operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename AddMonoid::D3, typename AnyOp::D1 >::value ),
+ "grb::dot",
+ "called with a multiplicative operator output domain that does not match "
+ "the first domain of the given additive operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< OutputType, typename AddMonoid::D2 >::value ), "grb::dot",
+ "called with an output vector value type that does not match the second "
+ "domain of the given additive operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename AddMonoid::D3, typename AddMonoid::D2 >::value ),
+ "grb::dot",
+ "called with an additive operator whose output domain does not match its "
+ "second input domain" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< OutputType, typename AddMonoid::D3 >::value ), "grb::dot",
+ "called with an output vector value type that does not match the third "
+ "domain of the given additive operator" );
+
+#ifdef _DEBUG
+ std::cout << "In grb::dot (nonblocking). "
+ << "I/O scalar on input reads " << z << "\n";
+#endif
+
+ // dynamic sanity check
+ const size_t n = internal::getCoordinates( y ).size();
+ if( internal::getCoordinates( x ).size() != n ) {
+ return MISMATCH;
+ }
+
+#ifdef _DEBUG
+ std::cout << "\t dynamic checks pass\n";
+#endif
+
+ // the dot product is computed out-of-place; a separate temporary is needed
+ // because the computation may be multi-threaded
+ OutputType oop = addMonoid.template getIdentity< OutputType >();
+
+ RC ret = SUCCESS;
+
+ ret = internal::dot_generic< descr >( oop, x, y, addMonoid, anyOp, phase );
+
+ // fold out-of-place dot product into existing input, and exit
+#ifdef _DEBUG
+ std::cout << "\t dot_generic returned " << oop << ", "
+ << "which will be folded into " << z << " "
+ << "using the additive monoid\n";
+#endif
+ ret = ret ? ret : foldl( z, oop, addMonoid.getOperator() );
+#ifdef _DEBUG
+ std::cout << "\t returning " << z << "\n";
+#endif
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename IOType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC dot(
+ IOType &x,
+ const Vector< InputType1, nonblocking, Coords > &left,
+ const Vector< InputType2, nonblocking, Coords > &right,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< IOType >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "In grb::dot (nonblocking, semiring version)\n"
+ << "\t dispatches to monoid-operator version\n";
+#endif
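+ // usage sketch (illustrative only), assuming a plus-times semiring over
+ // doubles named ring:
+ //   double alpha = 0.0;
+ //   grb::dot( alpha, x, y, ring );
+ // accumulates the inner product of x and y into alpha; as per the
+ // monoid-operator variant above, the result is folded into the prior value
+ // of the output scalar rather than overwriting it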
+ return grb::dot< descr >( x, left, right, ring.getAdditiveMonoid(),
+ ring.getMultiplicativeOperator(), phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename Func,
+ typename DataType,
+ typename Coords
+ >
+ RC eWiseMap( const Func f, Vector< DataType, nonblocking, Coords > &x ) {
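+ // note: f is applied in-place to the values of x only; the coordinates are
+ // never modified, so the sparsity structure of x is preserved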
+
+ RC ret = SUCCESS;
+
+ const size_t n = internal::getCoordinates( x ).size();
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func = [f, &x] (
+ internal::Pipeline &pipeline, const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage eWiseMap(f, x) in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_x;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_x_nz = local_n;
+ bool sparse = false;
+
+ bool already_dense_input_x = true;
+
+ if( !dense_descr ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_input_x = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_input_x ) {
+#else
+ already_dense_input_x = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( local_x_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( sparse ) {
+ // the sparse case is possible only when the local coordinates are already
+ // initialized
+ assert( already_dense_input_x == false );
+ for( size_t k = 0; k < local_x_nz; ++k ) {
+ DataType &xval = internal::getRaw( x )[ local_x.index( k ) + lower_bound ];
+ xval = f( xval );
+ }
+ } else {
+ for( size_t i = lower_bound; i < upper_bound; ++i ) {
+ DataType &xval = internal::getRaw( x )[ i ];
+ xval = f( xval );
+ }
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_EWISEMAP,
+ n, sizeof( DataType ), dense_descr, true,
+ &x, nullptr, &internal::getCoordinates( x ), nullptr,
+ nullptr, nullptr, nullptr, nullptr,
+ nullptr, nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: eWiseMap(f, x)" << std::endl;
+#endif
+ return ret;
+ }
+
+ namespace internal {
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename Func,
+ typename DataType1,
+ typename DataType2,
+ typename Coords,
+ typename... Args
+ >
+ RC eWiseLambda_helper(
+ std::vector< const void * > all_vectors_ptr,
+ size_t maximum_data_type_size,
+ const Func f,
+ const Vector< DataType1, nonblocking, Coords > &x,
+ const Vector< DataType2, nonblocking, Coords > &y,
+ Args const &... args
+ ) {
+ // catch mismatch
+ if( size( x ) != size( y ) ) {
+ return MISMATCH;
+ }
+
+ all_vectors_ptr.push_back( &y );
+ maximum_data_type_size = std::max( maximum_data_type_size, sizeof( DataType2 ) );
+
+ // continue
+ return eWiseLambda_helper( all_vectors_ptr, maximum_data_type_size, f, x,
+ args... );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename Func,
+ typename DataType,
+ typename Coords
+ >
+ RC eWiseLambda_helper(
+ std::vector< const void * > all_vectors_ptr,
+ size_t maximum_data_type_size,
+ const Func f,
+ const Vector< DataType, nonblocking, Coords > &x
+ ) {
+ // all pointers, except one, have been stored, and the last one will be
+ // stored by the normal eWiseLambda
+ return eWiseLambda< descr, Func, DataType, Coords >( f, x, all_vectors_ptr,
+ maximum_data_type_size );
+ }
+ } // namespace internal
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename Func,
+ typename DataType1,
+ typename DataType2,
+ typename Coords,
+ typename... Args
+ >
+ RC eWiseLambda(
+ const Func f,
+ const Vector< DataType1, nonblocking, Coords > &x,
+ const Vector< DataType2, nonblocking, Coords > &y,
+ Args const &... args
+ ) {
+
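+ // usage sketch (illustrative only): every vector the lambda accesses must
+ // be passed explicitly so that the pipeline can track it, e.g.
+ //   grb::eWiseLambda( [&]( const size_t i ) {
+ //       z[ i ] += x[ i ] * y[ i ];
+ //     }, z, x, y );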
+ // create an empty vector to store pointers for all vectors passed to
+ // eWiseLambda
+ std::vector< const void * > all_vectors_ptr;
+
+ // invoke the helper function to store the pointers
+ return internal::eWiseLambda_helper( all_vectors_ptr, 0, f, x, y, args...);
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename Func,
+ typename DataType,
+ typename Coords
+ >
+ RC eWiseLambda(
+ const Func f,
+ const Vector< DataType, nonblocking, Coords > &x,
+ std::vector< const void * > all_vectors_ptr = std::vector< const void *>(),
+ size_t maximum_data_type_size = 0
+ ) {
+#ifdef _DEBUG
+ std::cout << "Info: entering eWiseLambda function on vectors.\n";
+#endif
+
+ all_vectors_ptr.push_back( &x );
+ maximum_data_type_size =
+ std::max( maximum_data_type_size, sizeof( DataType ) );
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func = [f, &x] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage eWiseLambda in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ Coords local_x;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_x_nz = local_n;
+ bool sparse = false;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ bool already_dense_output = true;
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_output ) {
+#else
+ already_dense_output = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( local_x_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( sparse ) {
+ if( already_dense_output ) {
+ for( size_t k = 0; k < local_x_nz; ++k ) {
+ f( k + lower_bound );
+ }
+ } else {
+ for( size_t k = 0; k < local_x_nz; ++k ) {
+ const size_t i = local_x.index( k ) + lower_bound;
+ f( i );
+ }
+ }
+ } else {
+ for( size_t i = lower_bound; i < upper_bound; ++i ) {
+ f( i );
+ }
+ }
+
+ // the local coordinates of the vectors accessed by eWiseLambda cannot
+ // change; therefore, we need not invoke asyncJoinSubset for any of them
+
+ return SUCCESS;
+ };
+
+ // eWiseLambda is a special case: we do not know which of the accessed
+ // vectors are read-only, and hence assume that all of them may be written
+ // to. The sparsity structures, however, cannot change; i.e., the
+ // coordinates of each vector cannot be updated. The coordinates of x are
+ // passed only to determine the loop size.
+ ret = ret ? ret : internal::le.addeWiseLambdaStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_EWISELAMBDA,
+ internal::getCoordinates( x ).size(), maximum_data_type_size, dense_descr,
+ all_vectors_ptr, &internal::getCoordinates( x )
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: eWiseLambda" << std::endl;
+#endif
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename InputType,
+ typename IOType,
+ typename MaskType,
+ typename Coords
+ >
+ RC foldl(
+ IOType &x,
+ const Vector< InputType, nonblocking, Coords > &y,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Monoid &monoid = Monoid(),
+ const typename std::enable_if< !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ grb::is_monoid< Monoid >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "foldl: IOType <- [InputType] with a monoid called. "
+ << "Array has size " << size( y ) << " with " << nnz( y ) << " nonzeroes. "
+ << "It has a mask of size " << size( mask ) << " with " << nnz( mask )
+ << " nonzeroes.\n";
+#endif
+
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< IOType, InputType >::value ), "grb::foldl",
+ "called with a scalar IO type that does not match the input vector type" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType, typename Monoid::D1 >::value ), "grb::foldl",
+ "called with an input vector value type that does not match the first "
+ "domain of the given monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType, typename Monoid::D2 >::value ), "grb::foldl",
+ "called with an input vector type that does not match the second domain of "
+ "the given monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType, typename Monoid::D3 >::value ), "grb::foldl",
+ "called with an input vector type that does not match the third domain of "
+ "the given monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ), "grb::foldl",
+ "called with a vector mask type that is not boolean" );
+
+ if( size( mask ) > 0 ) {
+ return internal::template fold_from_vector_to_scalar_generic<
+ descr, true, true
+ >( x, y, mask, monoid );
+ } else {
+ return internal::template fold_from_vector_to_scalar_generic<
+ descr, false, true
+ >( x, y, mask, monoid );
+ }
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename IOType,
+ typename InputType,
+ typename Coords
+ >
+ RC foldl(
+ IOType &x,
+ const Vector< InputType, nonblocking, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const typename std::enable_if<
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType >::value &&
+ grb::is_monoid< Monoid >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "foldl: IOType <- [InputType] with a monoid called. "
+ << "Array has size " << size( y ) << " with " << nnz( y ) << " nonzeroes. "
+ << "It has no mask.\n";
+#endif
+
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< IOType, InputType >::value ), "grb::reduce",
+ "called with a scalar IO type that does not match the input vector type" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType, typename Monoid::D1 >::value ), "grb::reduce",
+ "called with an input vector value type that does not match the first "
+ "domain of the given monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType, typename Monoid::D2 >::value ), "grb::reduce",
+ "called with an input vector type that does not match the second domain of "
+ "the given monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType, typename Monoid::D3 >::value ), "grb::reduce",
+ "called with an input vector type that does not match the third domain of "
+ "the given monoid" );
+
+ // do reduction
+ Vector< bool, nonblocking, Coords > empty_mask( 0 );
+ return internal::template fold_from_vector_to_scalar_generic<
+ descr, false, true
+ >( x, y, empty_mask, monoid );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename T,
+ typename U,
+ typename Coords
+ >
+ RC zip(
+ Vector< std::pair< T, U >, nonblocking, Coords > &z,
+ const Vector< T, nonblocking, Coords > &x,
+ const Vector< U, nonblocking, Coords > &y,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< T >::value &&
+ !grb::is_object< U >::value,
+ void >::type * const = nullptr
+ ) {
+ const size_t n = size( z );
+ if( n != size( x ) ) {
+ return MISMATCH;
+ }
+ if( n != size( y ) ) {
+ return MISMATCH;
+ }
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ const T * const x_raw = internal::getRaw( x );
+ const U * const y_raw = internal::getRaw( y );
+ std::pair< T, U > * z_raw = internal::getRaw( z );
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func = [&z, x_raw, y_raw, z_raw] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tExecution of stage zip(z, x, y) in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_z;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ bool already_dense_output = true;
+#else
+ (void) pipeline;
+#endif
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( z ) );
+ if( !dense_descr && !already_dense_output ) {
+#else
+ if( !dense_descr ) {
+#endif
+ local_z = internal::getCoordinates( z ).asyncSubset( lower_bound,
+ upper_bound );
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !dense_descr && !already_dense_output ) {
+#else
+ if( !dense_descr ) {
+#endif
+ // the result will always be dense
+ local_z.local_assignAllNotAlreadyAssigned();
+ }
+
+ for( size_t i = lower_bound; i < upper_bound; ++i ) {
+ z_raw[ i ].first = x_raw[ i ];
+ z_raw[ i ].second = y_raw[ i ];
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !dense_descr && !already_dense_output ) {
+#else
+ if( !dense_descr ) {
+#endif
+ internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_ZIP,
+ internal::getCoordinates( x ).size(), sizeof( T ) + sizeof( U ),
+ dense_descr, true,
+ &z, nullptr, &internal::getCoordinates( z ), nullptr,
+ &x, &y, nullptr, nullptr,
+ &internal::getCoordinates( x ), &internal::getCoordinates( y ),
+ nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: zip(z, x, y)" << std::endl;
+#endif
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename T,
+ typename U,
+ typename Coords
+ >
+ RC unzip(
+ Vector< T, nonblocking, Coords > &x,
+ Vector< U, nonblocking, Coords > &y,
+ const Vector< std::pair< T, U >, nonblocking, Coords > &in,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< T >::value &&
+ !grb::is_object< U >::value,
+ void >::type * const = nullptr
+ ) {
+ const size_t n = size( in );
+ if( n != size( x ) ) {
+ return MISMATCH;
+ }
+ if( n != size( y ) ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
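+ // note: inside the pipeline stage below, every index in the range is
+ // written and all coordinates are assigned, so x and y become dense
+ // regardless of the sparsity of in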
+
+ T * const x_raw = internal::getRaw( x );
+ U * const y_raw = internal::getRaw( y );
+ const std::pair< T, U > * in_raw = internal::getRaw( in );
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func = [&x, &y, x_raw, y_raw, in_raw] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage unzip(x, y, in) in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_x, local_y;
+
+ bool already_dense_output_x = true;
+ bool already_dense_output_y = true;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output_x = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !dense_descr && !already_dense_output_x ) {
+#else
+ if( !dense_descr ) {
+ already_dense_output_x = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x.local_assignAllNotAlreadyAssigned();
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output_y = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !dense_descr && !already_dense_output_y ) {
+#else
+ if( !dense_descr ) {
+ already_dense_output_y = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y.local_assignAllNotAlreadyAssigned();
+ }
+
+ for( size_t i = lower_bound; i < upper_bound; ++i ) {
+ x_raw[ i ] = in_raw[ i ].first;
+ y_raw[ i ] = in_raw[ i ].second;
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !dense_descr && !already_dense_output_x ) {
+#else
+ if( !dense_descr ) {
+#endif
+ internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound,
+ upper_bound );
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !dense_descr && !already_dense_output_y ) {
+#else
+ if( !dense_descr ) {
+#endif
+ internal::getCoordinates( y ).asyncJoinSubset( local_y, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_UNZIP,
+ internal::getCoordinates( x ).size(), std::max( sizeof( T ), sizeof( U ) ),
+ dense_descr, true,
+ &x, &y,
+ &internal::getCoordinates( x ), &internal::getCoordinates( y ),
+ &in, nullptr, nullptr, nullptr,
+ &internal::getCoordinates( in ), nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: unzip(x, y, in)" << std::endl;
+#endif
+ return SUCCESS;
+ }
+
+/** @} */
+// ^-- ends BLAS-1 NB module
+
+} // end namespace ``grb''
+
+#undef NO_CAST_ASSERT
+#undef NO_CAST_OP_ASSERT
+
+#endif // end `_H_GRB_NONBLOCKING_BLAS1'
+
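A usage sketch of the two primitives defined above. The snippet is illustrative and not part of this patch; it assumes the public ALP/GraphBLAS API (grb::set, grb::zip, grb::unzip) with the nonblocking backend configured as the default:

    #include <utility>
    #include <graphblas.hpp>

    grb::RC zipRoundTrip( const size_t n ) {
        grb::Vector< double > x( n ), y( n );
        grb::Vector< std::pair< double, double > > z( n );
        grb::RC rc = grb::set( x, 1.5 );        // x = 1.5 everywhere
        rc = rc ? rc : grb::set( y, 2.5 );      // y = 2.5 everywhere
        // fuse x and y into one vector of pairs; under the nonblocking
        // backend this only records a pipeline stage
        rc = rc ? rc : grb::zip( z, x, y );
        // split the pairs again; this stage may execute fused with the above
        rc = rc ? rc : grb::unzip( x, y, z );
        return rc;
    }
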
diff --git a/include/graphblas/nonblocking/blas2.hpp b/include/graphblas/nonblocking/blas2.hpp
new file mode 100644
index 000000000..47501eacd
--- /dev/null
+++ b/include/graphblas/nonblocking/blas2.hpp
@@ -0,0 +1,1559 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Defines the nonblocking level-2 primitives
+ *
+ * @author Aristeidis Mastoras
+ * @date 16th of May, 2022
+ */
+
+#ifndef _H_GRB_NONBLOCKING_BLAS2
+#define _H_GRB_NONBLOCKING_BLAS2
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include "coordinates.hpp"
+#include "forward.hpp"
+#include "matrix.hpp"
+#include "vector.hpp"
+#include "lazy_evaluation.hpp"
+#include "boolean_dispatcher_blas2.hpp"
+
+#ifdef _DEBUG
+#include "spmd.hpp"
+#endif
+
+#define NO_CAST_ASSERT( x, y, z ) \
+ static_assert( x, \
+ "\n\n" \
+ "****************************************************************" \
+ "****************************************************************" \
+ "**************************************\n" \
+ "* ERROR | " y " " z ".\n" \
+ "****************************************************************" \
+ "****************************************************************" \
+ "**************************************\n" \
+ "* Possible fix 1 | Remove no_casting from the template " \
+ "parameters in this call to " y ".\n" \
+ "* Possible fix 2 | Provide objects with element types or " \
+ "domains that match the expected type.\n" \
+ "****************************************************************" \
+ "****************************************************************" \
+ "**************************************\n" );
+
+
+namespace grb {
+
+ namespace internal {
+
+ extern LazyEvaluation le;
+ }
+}
+
+namespace grb {
+
+ /**
+ * \addtogroup nonblocking
+ * @{
+ */
+
+ // put the generic mxv implementation in an internal namespace
+ namespace internal {
+
+ template<
+ bool output_dense,
+ bool left_handed,
+ class AdditiveMonoid,
+ class Multiplication,
+ template< typename > class One,
+ typename IOType,
+ typename InputType,
+ typename SourceType,
+ typename Coords
+ >
+ class addIdentityDuringMV<
+ nonblocking, true, output_dense, left_handed,
+ AdditiveMonoid, Multiplication, One,
+ IOType, InputType, SourceType, Coords
+ > {
+
+ public:
+
+ static void apply(
+ Vector< IOType, nonblocking, Coords > &destination_vector,
+ IOType * __restrict__ const &destination,
+ const size_t &destination_range,
+ const size_t &source_index,
+ const AdditiveMonoid &add,
+ const Multiplication &mul,
+ const SourceType &input_element,
+ const std::function< size_t( size_t ) > &src_local_to_global,
+ const std::function< size_t( size_t ) > &dst_global_to_local
+ ) {
+ // intentionally a no-op: this specialisation leaves the destination
+ // untouched
+ }
+ };
+
+ template<
+ Descriptor descr,
+ bool masked,
+ bool input_masked,
+ bool left_handed,
+ template< typename > class One,
+#ifdef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_destination_vector,
+ bool already_dense_mask_vector,
+#endif
+ class AdditiveMonoid,
+ class Multiplication,
+ typename IOType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename InputType4,
+ typename Coords,
+ typename RowColType,
+ typename NonzeroType
+ >
+ inline void vxm_inner_kernel_gather(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_destination_vector,
+ bool already_dense_mask_vector,
+#endif
+ RC &rc,
+ const size_t lower_bound,
+ Coords &local_destination_vector,
+ const Coords &local_mask_vector,
+ Vector< IOType, nonblocking, Coords > &destination_vector,
+ IOType &destination_element,
+ const size_t &destination_index,
+ const Vector< InputType1, nonblocking, Coords > &source_vector,
+ const InputType1 * __restrict__ const &source,
+ const size_t &source_range,
+ const internal::Compressed_Storage<
+ InputType2, RowColType, NonzeroType
+ > &matrix,
+ const Vector< InputType3, nonblocking, Coords > &mask_vector,
+ const InputType3 * __restrict__ const &mask,
+ const Vector< InputType4, nonblocking, Coords > &source_mask_vector,
+ const InputType4 * __restrict__ const &source_mask,
+ const AdditiveMonoid &add,
+ const Multiplication &mul,
+ const std::function< size_t( size_t ) > &src_local_to_global,
+ const std::function< size_t( size_t ) > &src_global_to_local,
+ const std::function< size_t( size_t ) > &dst_local_to_global
+ ) {
+#ifndef _DEBUG
+ (void) destination_vector;
+#endif
+ constexpr bool add_identity = descr & descriptors::add_identity;
+ constexpr bool dense_hint = descr & descriptors::dense;
+ constexpr bool explicit_zero = descr & descriptors::explicit_zero;
+#ifdef _DEBUG
+ constexpr bool use_index = descr & descriptors::use_index;
+#endif
+ assert( rc == SUCCESS );
+
+ // check whether we should compute output here
+ if( masked ) {
+ if( already_dense_mask_vector ) {
+ if( !internal::getCoordinates( mask_vector ).template
+ mask< descr >( destination_index, mask )
+ ) {
+#ifdef _DEBUG
+ std::cout << "Masks says to skip processing destination index " <<
+ destination_index << "\n";
+#endif
+ return;
+ }
+ } else {
+ if( !local_mask_vector.template
+ mask< descr >( destination_index - lower_bound, mask )
+ ) {
+#ifdef _DEBUG
+ std::cout << "Masks says to skip processing destination index " <<
+ destination_index << "\n";
+#endif
+ return;
+ }
+ }
+ }
+
+ // take shortcut, if possible
+ if( grb::has_immutable_nonzeroes< AdditiveMonoid >::value && (
+ already_dense_destination_vector ||
+ local_destination_vector.assigned( destination_index - lower_bound )
+ ) && destination_element != add.template getIdentity< IOType >()
+ ) {
+ return;
+ }
+
+ // start output
+ typename AdditiveMonoid::D3 output =
+ add.template getIdentity< typename AdditiveMonoid::D3 >();
+ bool set = false;
+
+ // if we need to add identity, do so first:
+ if( add_identity ) {
+ const size_t id_location = src_global_to_local( dst_local_to_global(
+ destination_index ) );
+ // the SpMV primitive may access non-local elements, and thus referring to
+ // the input vector via local coordinates would be incorrect. The input
+ // vector of an SpMV cannot be updated, i.e., written, by another primitive
+ // executed in the same pipeline as the current SpMV; therefore, in the
+ // current design, it is safe to use global coordinates for the input
+ // vector
+ if( ( !input_masked ||
+ internal::getCoordinates( source_mask_vector ).template
+ mask< descr >( id_location, source_mask )
+ ) && id_location < source_range
+ ) {
+ if( dense_hint || internal::getCoordinates( source_vector ).assigned( id_location ) ) {
+ typename AdditiveMonoid::D1 temp;
+ internal::CopyOrApplyWithIdentity<
+ !left_handed, typename AdditiveMonoid::D1, InputType1, One
+ >::set( temp, source_vector[ id_location ], mul );
+ internal::CopyOrApplyWithIdentity<
+ false, typename AdditiveMonoid::D3, typename AdditiveMonoid::D1,
+ AdditiveMonoid::template Identity
+ >::set( output, temp, add );
+ set = true;
+ }
+ }
+ }
+
+ // handle row or column at destination_index
+ // NOTE: this could be parallelised, but doing so will probably only slow
+ // things down
+#ifdef _DEBUG
+ std::cout << "vxm_gather: processing destination index " << destination_index << " / "
+ << internal::getCoordinates( destination_vector ).size()
+ << ". Input matrix has " << ( matrix.col_start[ destination_index + 1 ] -
+ matrix.col_start[ destination_index ] ) << " nonzeroes.\n";
+#endif
+ for(
+ size_t k = matrix.col_start[ destination_index ];
+ rc == SUCCESS &&
+ k < static_cast< size_t >( matrix.col_start[ destination_index + 1 ] );
+ ++k
+ ) {
+ // declare multiplication output field
+ typename Multiplication::D3 result =
+ add.template getIdentity< typename AdditiveMonoid::D3 >();
+ // get source index
+ const size_t source_index = matrix.row_index[ k ];
+ // check mask
+ if( input_masked &&
+ !internal::getCoordinates( source_mask_vector ).template
+ mask< descr >( source_index, source_mask )
+ ) {
+#ifdef _DEBUG
+ std::cout << "\t vxm_gather: skipping source index " << source_index
+ << " due to input mask\n";
+#endif
+ continue;
+ }
+ // check for sparsity at source
+ if( !dense_hint ) {
+ if( !internal::getCoordinates( source_vector ).assigned( source_index ) ) {
+#ifdef _DEBUG
+ std::cout << "\t vxm_gather: Skipping out of computation with source "
+ << "index " << source_index << " since it does not contain a nonzero\n";
+#endif
+ continue;
+ }
+ }
+ // get nonzero
+ typedef typename std::conditional<
+ left_handed,
+ typename Multiplication::D2,
+ typename Multiplication::D1
+ >::type RingNonzeroType;
+ const RingNonzeroType nonzero =
+ matrix.template getValue( k, One< RingNonzeroType >::value() );
+#ifdef _DEBUG
+ std::cout << "\t vxm_gather: interpreted nonzero is " << nonzero << ", "
+ << "which is the " << k << "-th nonzero and has source index "
+ << source_index << "\n";
+#endif
+ // check if we use source element or whether we use its index value instead
+ typedef typename std::conditional<
+ left_handed,
+ typename Multiplication::D1,
+ typename Multiplication::D2
+ >::type SourceType;
+ const SourceType apply_source = internal::ValueOrIndex<
+ descr, SourceType, InputType1
+ >::getFromArray( source, src_local_to_global, source_index );
+#ifdef _DEBUG
+ if( use_index ) {
+ std::cout << "\t vxm_gather (use_index descriptor): apply( output, matrix "
+ << "nonzero, vector nonzero, * ) = apply( ";
+ } else {
+ std::cout << "\t vxm_gather: apply( output, matrix nonzero, vector "
+ << "nonzero, * ) = apply( ";
+ }
+ std::cout << " output, " << nonzero << ", " << source << ", * )\n";
+#endif
+ //multiply
+ internal::leftOrRightHandedMul<
+ left_handed, typename Multiplication::D3,
+ SourceType, RingNonzeroType, Multiplication
+ >::mul( result, apply_source, nonzero, mul );
+#ifdef _DEBUG
+ std::cout << "\t vxm_gather: output (this nonzero) = " << result << "\n";
+#endif
+
+ // accumulate
+#ifdef _DEBUG
+ std::cout << "\t vxm_gather: foldr( " << result << ", " << output
+ << ", + );\n";
+#endif
+ rc = foldr( result, output, add.getOperator() );
+#ifdef _DEBUG
+ std::cout << "\t vxm_gather: output (sum at destination) = " << output
+ << "\n";
+#endif
+ set = true;
+
+ // sanity check (but apply cannot fail)
+ assert( rc == SUCCESS );
+ }
+
+#ifdef _DEBUG
+ if( set ) {
+ std::cout << "\t vxm_gather: local contribution to this output element at "
+ << "index " << destination_index << " will be " << output << " "
+ << "and this corresponds to an explicitly set nonzero.\n";
+ } else {
+ std::cout << "\t vxm_gather: local contribution to this output element at "
+ << "index " << destination_index << " will be " << output << " and this "
+ << "is an unset value.\n";
+ if( already_dense_destination_vector ||
+ local_destination_vector.assigned( destination_index - lower_bound )
+ ) {
+ std::cout << "\t(old value " << destination_element << " will remain "
+ << "unmodified.)\n";
+ } else {
+ std::cout << "\t(no old value existed so the output vector will remain "
+ << "unset at this index.)\n";
+ }
+ }
+#endif
+ // finally, accumulate in output
+ if( explicit_zero || set ) {
+#ifdef _DEBUG
+ std::cout << "\taccumulating " << output << " into output vector...\n";
+#endif
+ if( already_dense_destination_vector ||
+ local_destination_vector.assign( destination_index - lower_bound )
+ ) {
+#ifdef _DEBUG
+ std::cout << "\tfoldl( " << destination_element << ", " << output << ", "
+ << "add.getOperator() );, destination_element = ";
+#endif
+ rc = foldl( destination_element, output, add.getOperator() );
+#ifdef _DEBUG
+ std::cout << destination_element << "\n";
+#endif
+ } else {
+#ifdef _DEBUG
+ std::cout << "\toutput vector element was previously not set. Old "
+ << "(possibly uninitialised value) " << destination_element << " will "
+ << "now be set to " << output << ", result (after, possibly, casting): ";
+#endif
+ destination_element = static_cast< IOType >( output );
+#ifdef _DEBUG
+ std::cout << destination_element << "\n";
+#endif
+ }
+ }
+ }
+
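Stripped of masks, descriptors, and pipeline bookkeeping, the gather kernel above computes, for a single destination index, a dot product of the input vector with one compressed row or column. A minimal standalone sketch of that access pattern (hypothetical names, plain plus-times arithmetic):

    #include <cstddef>
    #include <vector>

    // plain CRS storage: col_start has nrows+1 entries; row_index and
    // values hold the nonzeroes of each row contiguously
    struct CRS {
        std::vector< size_t > col_start;
        std::vector< size_t > row_index;
        std::vector< double > values;
    };

    // gather-style SpMV for one output index i: returns sum_k A(i,k) * x[k]
    double gatherRow( const CRS &A, const std::vector< double > &x,
        const size_t i
    ) {
        double out = 0.0; // additive identity
        for( size_t k = A.col_start[ i ]; k < A.col_start[ i + 1 ]; ++k ) {
            out += A.values[ k ] * x[ A.row_index[ k ] ];
        }
        return out;
    }
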
+ template<
+ Descriptor descr,
+ bool masked,
+ bool input_masked,
+ bool left_handed,
+ bool using_semiring,
+ template< typename > class One,
+ class AdditiveMonoid,
+ class Multiplication,
+ typename IOType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename InputType4,
+ typename RIT,
+ typename CIT,
+ typename NIT,
+ typename Coords
+ >
+ RC vxm_generic(
+ Vector< IOType, nonblocking, Coords > &u,
+ const Vector< InputType3, nonblocking, Coords > &mask,
+ const Vector< InputType1, nonblocking, Coords > &v,
+ const Vector< InputType4, nonblocking, Coords > &v_mask,
+ const Matrix< InputType2, nonblocking, RIT, CIT, NIT > &A,
+ const AdditiveMonoid &add,
+ const Multiplication &mul,
+ const Phase &phase,
+ const std::function< size_t( size_t ) > row_l2g,
+ const std::function< size_t( size_t ) > row_g2l,
+ const std::function< size_t( size_t ) > col_l2g,
+ const std::function< size_t( size_t ) > col_g2l
+ ) {
+ // type sanity checking
+ NO_CAST_ASSERT( ( descr > internal::MAX_DESCRIPTOR_VALUE ||
+ !(descr & descriptors::no_casting) ||
+ std::is_same< InputType3, bool >::value
+ ), "vxm (any variant)",
+ "Mask type is not boolean" );
+ NO_CAST_ASSERT( ( descr > internal::MAX_DESCRIPTOR_VALUE ||
+ !(descr & descriptors::no_casting) ||
+ !left_handed ||
+ std::is_same< InputType1, typename Multiplication::D1 >::value
+ ), "vxm (any variant)",
+ "Input vector type does not match multiplicative operator first "
+ "input domain" );
+ NO_CAST_ASSERT( ( descr > internal::MAX_DESCRIPTOR_VALUE ||
+ !(descr & descriptors::no_casting) ||
+ left_handed ||
+ std::is_same< InputType2, typename Multiplication::D1 >::value
+ ), "vxm (any variant)",
+ "Input vector type does not match multiplicative operator second "
+ "input domain" );
+ NO_CAST_ASSERT( ( descr > internal::MAX_DESCRIPTOR_VALUE ||
+ !(descr & descriptors::no_casting) ||
+ !left_handed ||
+ std::is_same< InputType2, typename Multiplication::D2 >::value
+ ), "vxm (any variant)",
+ "Input matrix type does not match multiplicative operator second "
+ "input domain" );
+ NO_CAST_ASSERT( ( descr > internal::MAX_DESCRIPTOR_VALUE ||
+ !(descr & descriptors::no_casting) ||
+ left_handed ||
+ std::is_same< InputType1, typename Multiplication::D2 >::value
+ ), "vxm (any variant)",
+ "Input matrix type does not match multiplicative operator first "
+ "input domain" );
+
+ RC ret = SUCCESS;
+
+#ifdef _DEBUG
+ const auto s = spmd< nonblocking >::pid();
+ std::cout << s << ": nonblocking vxm called with a "
+ << descriptors::toString( descr ) << "\n";
+#endif
+
+ // get input and output vector sizes
+ const size_t m = internal::getCoordinates( u ).size();
+ const size_t n = internal::getCoordinates( v ).size();
+
+ // get whether the matrix should be transposed prior to execution of this
+ // vector-times-matrix operation
+ constexpr bool transposed = descr & descriptors::transpose_matrix;
+
+ // check for dimension mismatch
+ if( ( transposed && ( n != ncols( A ) || m != nrows( A ) ) )
+ || ( !transposed && ( n != nrows( A ) || m != ncols( A ) ) ) ) {
+#ifdef _DEBUG
+ std::cout << "Mismatch of columns ( " << n << " vs. " << ncols( A )
+ << " ) or rows ( " << m << " vs. " << nrows( A ) << " ) with "
+ << "transposed value " << ((int)transposed) << "\n";
+#endif
+ return MISMATCH;
+ }
+
+ // check density
+ if( descr & descriptors::dense ) {
+ // it's safe to check the number of nonzeroes for the input vector and its
+ // mask since both of them are read-only in the current design for
+ // nonblocking execution
+ if( nnz( v ) < size( v ) ) {
+#ifdef _DEBUG
+ std::cout << "\t Dense descriptor given but input vector was sparse\n";
+#endif
+ return ILLEGAL;
+ }
+ if( size( v_mask ) > 0 && nnz( v_mask ) < size( v_mask ) ) {
+#ifdef _DEBUG
+ std::cout << "\t Dense descriptor given but input mask has sparse "
+ << "structure\n";
+#endif
+ return ILLEGAL;
+ }
+ }
+
+ // check mask
+ if( masked ) {
+ if( (transposed && internal::getCoordinates( mask ).size() != nrows( A ) ) ||
+ ( !transposed && internal::getCoordinates( mask ).size() != ncols( A ) )
+ ) {
+#ifdef _DEBUG
+ std::cout << "Mismatch of mask size ( "
+ << internal::getCoordinates( mask ).size() << " ) versus matrix rows "
+ << "or columns ( " << nrows( A ) << " or " << ncols( A ) << " with "
+ << "transposed value " << ((int)transposed) << "\n";
+#endif
+ return MISMATCH;
+ }
+ }
+
+ // handle resize phase
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+
+ // get raw pointers
+ assert( phase == EXECUTE );
+ const InputType1 * __restrict__ const x = internal::getRaw( v );
+ const InputType3 * __restrict__ const z = internal::getRaw( mask );
+ const InputType4 * __restrict__ const vm = internal::getRaw( v_mask );
+ IOType * __restrict__ const y = internal::getRaw( u );
+
+ // check for illegal arguments
+ if( !(descr & descriptors::safe_overlap) &&
+ reinterpret_cast< const void * >( y ) ==
+ reinterpret_cast< const void * >( x )
+ ) {
+ std::cerr << "Warning: grb::internal::vxm_generic called with overlapping "
+ << "input and output vectors.\n";
+ return OVERLAP;
+ }
+ if( masked && (reinterpret_cast< const void * >( y ) ==
+ reinterpret_cast< const void * >( z ))
+ ) {
+ std::cerr << "Warning: grb::internal::vxm_generic called with overlapping "
+ << "mask and output vectors.\n";
+ return OVERLAP;
+ }
+
+#ifdef _DEBUG
+ std::cout << s << ": performing SpMV / SpMSpV using an " << nrows( A )
+ << " by " << ncols( A ) << " matrix holding " << nnz( A )
+ << " nonzeroes.\n";
+#endif
+
+ // in the current design for nonblocking execution, the input vectors of
+ // vxm_generic cannot be overwritten by another stage of the same pipeline;
+ // it is therefore safe to rely on the global coordinates of the input
+ // vectors, as they are read-only. This property is of special importance
+ // when handling matrices of size "m" x "n", since the mismatch between "m"
+ // and "n" requires special handling for the local coordinates of the input
+ // vectors. The current design relies on the size of the output vector,
+ // which should match the sizes of all other vectors in the pipeline; the
+ // size of the input vector does not have to match the size of the other
+ // vectors as long as the input vectors are read-only
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func = [
+ &u, &mask, &v, &v_mask, &A, &add, &mul,
+ row_l2g, row_g2l, col_l2g, col_g2l,
+ y, x, z, vm
+#ifdef _DEBUG
+ , s
+#endif
+ ] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage vxm_generic in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ (void) pipeline;
+
+ RC rc = SUCCESS;
+
+ Coords local_u, local_mask;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_mask_nz = local_n;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ bool already_dense_output = true;
+ bool already_dense_output_mask = true;
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( u ) );
+ if( !already_dense_output ) {
+#else
+ already_dense_output = false;
+#endif
+ local_u = internal::getCoordinates( u ).asyncSubset( lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ if( masked ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output_mask = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( mask ) );
+ if( !already_dense_output_mask ) {
+#else
+ already_dense_output_mask = false;
+#endif
+ local_mask = internal::getCoordinates( mask ).asyncSubset( lower_bound,
+ upper_bound );
+ local_mask_nz = local_mask.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+ }
+
+ // check if transpose is required
+ if( descr & descriptors::transpose_matrix ) {
+ // start compute u=vA^T
+#ifdef _DEBUG
+ std::cout << s << ": in u=vA^T=Av variant\n";
+#endif
+
+ // start u=vA^T using CRS
+ // matrix = &(A.CRS);
+ // TODO internal issue #193
+ if( !masked || (descr & descriptors::invert_mask) ) {
+ // loop over all columns of the input matrix (can be done in parallel):
+#ifdef _DEBUG
+ std::cout << s << ": in full CRS variant (gather)\n";
+#endif
+
+ for( size_t i = lower_bound; i < upper_bound; i++ ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ boolean_dispatcher_vxm_inner_kernel_gather<
+#else
+ vxm_inner_kernel_gather<
+#endif
+ descr, masked, input_masked, left_handed, One
+ >(
+ already_dense_output, already_dense_output_mask,
+ rc, lower_bound, local_u, local_mask,
+ u, y[ i ], i, v, x, nrows( A ), internal::getCRS( A ),
+ mask, z, v_mask, vm, add, mul,
+ row_l2g, col_l2g, col_g2l
+ );
+ }
+
+ } else {
+#ifdef _DEBUG
+ std::cout << s << ": in masked CRS variant (gather). Mask has "
+ << local_mask_nz << " nonzeroes and size " << local_n << ":\n";
+ for( size_t k = 0; k < local_mask_nz; ++k ) {
+ std::cout << " "
+ << ( ( already_dense_output_mask ? k : local_mask.index( k ) ) +
+ lower_bound );
+ }
+ std::cout << "\n";
+#endif
+ assert( masked );
+
+ for( size_t k = 0; k < local_mask_nz; ++k ) {
+ const size_t i =
+ ( already_dense_output_mask ? k : local_mask.index( k ) ) +
+ lower_bound;
+ assert( i < nrows(A) );
+
+#ifdef GRB_BOOLEAN_DISPATCHER
+ boolean_dispatcher_vxm_inner_kernel_gather<
+#else
+ vxm_inner_kernel_gather<
+#endif
+ descr, false, input_masked, left_handed, One
+ >(
+ already_dense_output, already_dense_output_mask,
+ rc, lower_bound, local_u, local_mask,
+ u, y[ i ], i, v, x, nrows( A ), internal::getCRS( A ),
+ mask, z, v_mask, vm, add, mul,
+ row_l2g, col_l2g, col_g2l
+ );
+ }
+ }
+ // end compute u=vA^T
+ } else {
+#ifdef _DEBUG
+ std::cout << s << ": in u=vA=A^Tv variant\n";
+#endif
+ // start u=vA using CCS
+#ifdef _DEBUG
+ std::cout << s << ": in column-major vector times matrix variant (u=vA)\n"
+ << "\t(this variant relies on the gathering inner kernel)\n";
+#endif
+
+ // if not transposed, then CCS is the data structure to go:
+ // TODO internal issue #193
+ if( !masked || (descr & descriptors::invert_mask) ) {
+#ifdef _DEBUG
+ std::cout << s << ": loop over all input matrix columns\n";
+#endif
+
+ for( size_t j = lower_bound; j < upper_bound; j++ ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ boolean_dispatcher_vxm_inner_kernel_gather<
+#else
+ vxm_inner_kernel_gather<
+#endif
+ descr, masked, input_masked, left_handed, One
+ >(
+ already_dense_output, already_dense_output_mask,
+ rc, lower_bound, local_u, local_mask,
+ u, y[ j ], j, v, x, nrows( A ), internal::getCCS( A ),
+ mask, z, v_mask, vm, add, mul,
+ row_l2g, row_g2l, col_l2g
+ );
+ }
+ } else {
+ // loop only over the nonzero masks (can still be done in parallel!)
+#ifdef _DEBUG
+ std::cout << s << ": loop over mask indices\n";
+#endif
+ assert( masked );
+
+ for( size_t k = 0; k < local_mask_nz; ++k ) {
+ const size_t j =
+ ( already_dense_output_mask ? k : local_mask.index( k ) ) + lower_bound;
+#ifdef GRB_BOOLEAN_DISPATCHER
+ boolean_dispatcher_vxm_inner_kernel_gather<
+#else
+ vxm_inner_kernel_gather<
+#endif
+ descr, masked, input_masked, left_handed, One
+ >(
+ already_dense_output, already_dense_output_mask,
+ rc, lower_bound, local_u, local_mask,
+ u, y[ j ], j, v, x, nrows( A ), internal::getCCS( A ),
+ mask, z, v_mask, vm, add, mul,
+ row_l2g, row_g2l, col_l2g
+ );
+ }
+ }
+ // end computing u=vA
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#else
+ if( !already_dense_vectors ) {
+#endif
+ internal::getCoordinates( u ).asyncJoinSubset( local_u, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ // since the local coordinates are never used for the input vector and the
+ // input mask, they are registered only to verify legal usage of the dense
+ // descriptor
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS2_VXM_GENERIC,
+ size( u ), sizeof( IOType ), dense_descr, true,
+ &u, nullptr, &internal::getCoordinates( u ), nullptr,
+ &v,
+ masked ? &mask : nullptr,
+ input_masked ? &v_mask : nullptr,
+ nullptr,
+ &internal::getCoordinates( v ),
+ masked ? &internal::getCoordinates( mask ) : nullptr,
+ input_masked ? &internal::getCoordinates( v_mask ) : nullptr,
+ nullptr,
+ &A
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: vxm_generic" << std::endl;
+#endif
+
+#ifdef _DEBUG
+ std::cout << s << ": exiting SpMV / SpMSpV.\n" << std::flush;
+#endif
+ return ret;
+ }
+
+ } // namespace internal
+
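Note that the stage constructed above does not execute immediately: the nonblocking backend records it and later runs all stages of a pipeline tile by tile, so that consecutive operations reuse cached data. A highly simplified sketch of this execution model follows; it is not the actual internal::Pipeline or LazyEvaluation API, only an illustration of the idea:

    #include <algorithm>
    #include <cstddef>
    #include <functional>
    #include <vector>

    // a stage processes one tile, i.e., the index range [lower, upper)
    using Stage = std::function< void( size_t lower, size_t upper ) >;

    struct ToyPipeline {
        std::vector< Stage > stages;
        // run all recorded stages tile by tile rather than stage by stage
        // over the full vectors, keeping each tile in cache across stages
        void execute( const size_t n, const size_t tile ) {
            for( size_t lo = 0; lo < n; lo += tile ) {
                const size_t hi = std::min( lo + tile, n );
                for( auto &stage : stages ) { stage( lo, hi ); }
            }
            stages.clear();
        }
    };
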
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename IOType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename RIT,
+ typename CIT,
+ typename NIT,
+ typename Coords
+ >
+ RC vxm(
+ Vector< IOType, nonblocking, Coords > &u,
+ const Vector< InputType3, nonblocking, Coords > &mask,
+ const Vector< InputType1, nonblocking, Coords > &v,
+ const Matrix< InputType2, nonblocking, RIT, CIT, NIT > &A,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+ const Vector< bool, nonblocking, Coords > empty_mask( 0 );
+ return vxm< descr, true, false >( u, mask, v, empty_mask, A, ring, phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class AdditiveMonoid,
+ class MultiplicativeOperator,
+ typename IOType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename RIT,
+ typename CIT,
+ typename NIT,
+ typename Coords
+ >
+ RC vxm(
+ Vector< IOType, nonblocking, Coords > &u,
+ const Vector< InputType3, nonblocking, Coords > &mask,
+ const Vector< InputType1, nonblocking, Coords > &v,
+ const Matrix< InputType2, nonblocking, RIT, CIT, NIT > &A,
+ const AdditiveMonoid &add = AdditiveMonoid(),
+ const MultiplicativeOperator &mul = MultiplicativeOperator(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_monoid< AdditiveMonoid >::value &&
+ grb::is_operator< MultiplicativeOperator >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ !std::is_same< InputType2, void >::value,
+ void >::type * const = nullptr
+ ) {
+ const grb::Vector< bool, nonblocking, Coords > empty_mask( 0 );
+ return vxm< descr, true, false >( u, mask, v, empty_mask, A, add, mul,
+ phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ bool output_may_be_masked = true,
+ bool input_may_be_masked = true,
+ class Ring,
+ typename IOType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename InputType4,
+ typename RIT,
+ typename CIT,
+ typename NIT,
+ typename Coords
+ >
+ RC vxm(
+ Vector< IOType, nonblocking, Coords > &u,
+ const Vector< InputType3, nonblocking, Coords > &mask,
+ const Vector< InputType1, nonblocking, Coords > &v,
+ const Vector< InputType4, nonblocking, Coords > &v_mask,
+ const Matrix< InputType2, nonblocking, RIT, CIT, NIT > &A,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ constexpr bool left_sided = true;
+ if( output_may_be_masked && size( v_mask ) == 0 && size( mask ) > 0 ) {
+
+ return internal::vxm_generic<
+ descr, true, false, left_sided, true, Ring::template One
+ >(
+ u, mask, v, v_mask, A,
+ ring.getAdditiveMonoid(), ring.getMultiplicativeOperator(),
+ phase,
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ }
+ );
+ } else if( input_may_be_masked && size( mask ) == 0 && size( v_mask ) > 0 ) {
+ return internal::vxm_generic<
+ descr, false, true, left_sided, true, Ring::template One
+ >(
+ u, mask, v, v_mask, A,
+ ring.getAdditiveMonoid(), ring.getMultiplicativeOperator(),
+ phase,
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ }
+ );
+ } else if( output_may_be_masked && input_may_be_masked && size( mask ) > 0 &&
+ size( v_mask ) > 0
+ ) {
+ return internal::vxm_generic<
+ descr, true, true, left_sided, true, Ring::template One
+ >(
+ u, mask, v, v_mask, A,
+ ring.getAdditiveMonoid(), ring.getMultiplicativeOperator(),
+ phase,
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ }
+ );
+ } else {
+ assert( size( mask ) == 0 );
+ assert( size( v_mask ) == 0 );
+ return internal::vxm_generic<
+ descr, false, false, left_sided, true, Ring::template One
+ >(
+ u, mask, v, v_mask, A,
+ ring.getAdditiveMonoid(), ring.getMultiplicativeOperator(),
+ phase,
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ }
+ );
+ }
+ }
+
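An illustrative call of the masked variant above using the standard plus-times semiring. The semiring and container types follow the public ALP API; the snippet itself is not part of this patch:

    #include <graphblas.hpp>

    grb::RC maskedVxm(
        grb::Vector< double > &u, const grb::Vector< bool > &mask,
        const grb::Vector< double > &v, const grb::Matrix< double > &A
    ) {
        grb::Semiring<
            grb::operators::add< double >, grb::operators::mul< double >,
            grb::identities::zero, grb::identities::one
        > plusTimes;
        // u = vA, writing only where the mask evaluates true; under the
        // nonblocking backend this records a stage for later execution
        return grb::vxm( u, mask, v, A, plusTimes );
    }
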
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename Coords,
+ typename RIT,
+ typename CIT,
+ typename NIT,
+ typename IOType = typename Ring::D4,
+ typename InputType1 = typename Ring::D1,
+ typename InputType2 = typename Ring::D2
+ >
+ RC vxm(
+ Vector< IOType, nonblocking, Coords > &u,
+ const Vector< InputType1, nonblocking, Coords > &v,
+ const Matrix< InputType2, nonblocking, RIT, CIT, NIT > &A,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+ const Vector< bool, nonblocking, Coords > empty_mask( 0 );
+ return vxm< descr, false, false >( u, empty_mask, v, empty_mask, A, ring,
+ phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class AdditiveMonoid,
+ class MultiplicativeOperator,
+ typename IOType,
+ typename InputType1,
+ typename InputType2,
+ typename RIT,
+ typename CIT,
+ typename NIT,
+ typename Coords
+ >
+ RC vxm(
+ Vector< IOType, nonblocking, Coords > &u,
+ const Vector< InputType1, nonblocking, Coords > &v,
+ const Matrix< InputType2, nonblocking, RIT, CIT, NIT > &A,
+ const AdditiveMonoid &add = AdditiveMonoid(),
+ const MultiplicativeOperator &mul = MultiplicativeOperator(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_monoid< AdditiveMonoid >::value &&
+ grb::is_operator< MultiplicativeOperator >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !std::is_same< InputType2, void >::value,
+ void >::type * const = nullptr
+ ) {
+ const Vector< bool, nonblocking, Coords > empty_mask( 0 );
+ return vxm< descr, false, false >( u, empty_mask, v, empty_mask, A, add, mul,
+ phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename Coords,
+ typename RIT,
+ typename CIT,
+ typename NIT,
+ typename IOType = typename Ring::D4,
+ typename InputType1 = typename Ring::D1,
+ typename InputType2 = typename Ring::D2,
+ typename InputType3 = bool
+ >
+ RC mxv(
+ Vector< IOType, nonblocking, Coords > &u,
+ const Vector< InputType3, nonblocking, Coords > &mask,
+ const Matrix< InputType2, nonblocking, RIT, CIT, NIT > &A,
+ const Vector< InputType1, nonblocking, Coords > &v,
+ const Ring &ring,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+ const Vector< bool, nonblocking, Coords > empty_mask( 0 );
+ return mxv< descr, true, false >( u, mask, A, v, empty_mask, ring, phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ bool output_may_be_masked = true,
+ bool input_may_be_masked = true,
+ class Ring,
+ typename IOType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename InputType4,
+ typename RIT,
+ typename CIT,
+ typename NIT,
+ typename Coords
+ >
+ RC mxv(
+ Vector< IOType, nonblocking, Coords > &u,
+ const Vector< InputType3, nonblocking, Coords > &mask,
+ const Matrix< InputType2, nonblocking, RIT, CIT, NIT > &A,
+ const Vector< InputType1, nonblocking, Coords > &v,
+ const Vector< InputType4, nonblocking, Coords > &v_mask,
+ const Ring &ring,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ constexpr Descriptor new_descr = descr ^ descriptors::transpose_matrix;
+ constexpr bool left_sided = false;
+ if( output_may_be_masked && ( size( v_mask ) == 0 && size( mask ) > 0 ) ) {
+
+ return internal::vxm_generic<
+ new_descr, true, false, left_sided, true, Ring::template One
+ >(
+ u, mask, v, v_mask, A,
+ ring.getAdditiveMonoid(), ring.getMultiplicativeOperator(),
+ phase,
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ }
+ );
+ } else if( input_may_be_masked && ( size( mask ) == 0 &&
+ size( v_mask ) > 0 )
+ ) {
+ return internal::vxm_generic<
+ new_descr, false, true, left_sided, true, Ring::template One
+ >(
+ u, mask, v, v_mask, A,
+ ring.getAdditiveMonoid(), ring.getMultiplicativeOperator(),
+ phase,
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ }
+ );
+ } else if( output_may_be_masked && input_may_be_masked && size( mask ) > 0 &&
+ size( v_mask ) > 0
+ ) {
+ return internal::vxm_generic<
+ new_descr, true, true, left_sided, true, Ring::template One
+ >(
+ u, mask, v, v_mask, A,
+ ring.getAdditiveMonoid(), ring.getMultiplicativeOperator(),
+ phase,
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ }
+ );
+ } else {
+ assert( size( mask ) == 0 );
+ assert( size( v_mask ) == 0 );
+ return internal::vxm_generic<
+ new_descr, false, false, left_sided, true, Ring::template One
+ >(
+ u, mask, v, v_mask, A,
+ ring.getAdditiveMonoid(), ring.getMultiplicativeOperator(),
+ phase,
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ }
+ );
+ }
+ }
+
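The XOR with descriptors::transpose_matrix above is what maps mxv onto the vxm kernel: an mxv over A equals a vxm over the transpose of A, so the kernel toggles the transpose bit rather than forcing it on; toggling also honours a user-requested transpose by cancelling it out. A small illustration, assuming (as in ALP) that grb::Descriptor is an integral bitmask type:

    #include <graphblas.hpp>

    constexpr grb::Descriptor d0 = grb::descriptors::no_operation;
    // toggling once sets the bit: mxv( u, A, v ) delegates to a vxm over A^T
    constexpr grb::Descriptor d1 = d0 ^ grb::descriptors::transpose_matrix;
    // toggling twice restores the original descriptor
    static_assert( (d1 ^ grb::descriptors::transpose_matrix) == d0,
        "transpose bit must round-trip" );
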
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename Coords,
+ typename RIT,
+ typename CIT,
+ typename NIT,
+ typename IOType = typename Ring::D4,
+ typename InputType1 = typename Ring::D1,
+ typename InputType2 = typename Ring::D2
+ >
+ RC mxv(
+ Vector< IOType, nonblocking, Coords > &u,
+ const Matrix< InputType2, nonblocking, RIT, CIT, NIT > &A,
+ const Vector< InputType1, nonblocking, Coords > &v,
+ const Ring &ring,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ const Vector< bool, nonblocking, Coords > empty_mask( 0 );
+ return mxv< descr, false, false >( u, empty_mask, A, v, empty_mask, ring,
+ phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class AdditiveMonoid,
+ class MultiplicativeOperator,
+ typename IOType,
+ typename InputType1,
+ typename InputType2,
+ typename RIT,
+ typename CIT,
+ typename NIT,
+ typename Coords
+ >
+ RC mxv(
+ Vector< IOType, nonblocking, Coords > &u,
+ const Matrix< InputType2, nonblocking, RIT, CIT, NIT > &A,
+ const Vector< InputType1, nonblocking, Coords > &v,
+ const AdditiveMonoid &add = AdditiveMonoid(),
+ const MultiplicativeOperator &mul = MultiplicativeOperator(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_monoid< AdditiveMonoid >::value &&
+ grb::is_operator< MultiplicativeOperator >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !std::is_same< InputType2, void >::value,
+ void >::type * const = nullptr
+ ) {
+ const Vector< bool, nonblocking, Coords > empty_mask( 0 );
+ return mxv< descr, false, false >( u, empty_mask, A, v, empty_mask, add, mul,
+ phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ bool output_may_be_masked = true,
+ bool input_may_be_masked = true,
+ class AdditiveMonoid,
+ class MultiplicativeOperator,
+ typename IOType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename InputType4,
+ typename RIT,
+ typename CIT,
+ typename NIT,
+ typename Coords
+ >
+ RC vxm(
+ Vector< IOType, nonblocking, Coords > &u,
+ const Vector< InputType3, nonblocking, Coords > &mask,
+ const Vector< InputType1, nonblocking, Coords > &v,
+ const Vector< InputType4, nonblocking, Coords > &v_mask,
+ const Matrix< InputType2, nonblocking, RIT, CIT, NIT > &A,
+ const AdditiveMonoid &add = AdditiveMonoid(),
+ const MultiplicativeOperator &mul = MultiplicativeOperator(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_monoid< AdditiveMonoid >::value &&
+ grb::is_operator< MultiplicativeOperator >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ !grb::is_object< InputType4 >::value &&
+ !std::is_same< InputType2, void >::value,
+ void >::type * const = nullptr
+ ) {
+ static_assert( !(descr & descriptors::add_identity), "Cannot add an "
+ "identity if no concept of `one' is known. Suggested fix: use a semiring "
+ "instead." );
+ constexpr bool left_sided = true;
+ if( output_may_be_masked && size( v_mask ) == 0 && size( mask ) > 0 ) {
+ return internal::vxm_generic<
+ descr, true, false, left_sided, false, AdditiveMonoid::template Identity
+ >(
+ u, mask, v, v_mask, A, add, mul, phase,
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ }
+ );
+ } else if( input_may_be_masked && size( v_mask ) > 0 && size( mask ) == 0 ) {
+ return internal::vxm_generic<
+ descr, false, true, left_sided, false, AdditiveMonoid::template Identity
+ >(
+ u, mask, v, v_mask, A, add, mul, phase,
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ }
+ );
+ } else if( output_may_be_masked && input_may_be_masked && size( mask ) > 0 &&
+ size( v_mask ) > 0
+ ) {
+ return internal::vxm_generic<
+ descr, true, true, left_sided, false, AdditiveMonoid::template Identity
+ >(
+ u, mask, v, v_mask, A, add, mul, phase,
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ }
+ );
+ } else {
+ assert( size( mask ) == 0 );
+ assert( size( v_mask ) == 0 );
+ return internal::vxm_generic<
+ descr, false, false, left_sided, false, AdditiveMonoid::template Identity
+ >(
+ u, mask, v, v_mask, A, add, mul, phase,
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ }
+ );
+ }
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ bool output_may_be_masked = true,
+ bool input_may_be_masked = true,
+ class AdditiveMonoid,
+ class MultiplicativeOperator,
+ typename IOType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename InputType4,
+ typename RIT,
+ typename CIT,
+ typename NIT,
+ typename Coords
+ >
+ RC mxv(
+ Vector< IOType, nonblocking, Coords > &u,
+ const Vector< InputType3, nonblocking, Coords > &mask,
+ const Matrix< InputType2, nonblocking, RIT, CIT, NIT > &A,
+ const Vector< InputType1, nonblocking, Coords > &v,
+ const Vector< InputType4, nonblocking, Coords > &v_mask,
+ const AdditiveMonoid &add = AdditiveMonoid(),
+ const MultiplicativeOperator &mul = MultiplicativeOperator(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_monoid< AdditiveMonoid >::value &&
+ grb::is_operator< MultiplicativeOperator >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ !grb::is_object< InputType4 >::value &&
+ !std::is_same< InputType2, void >::value,
+ void >::type * const = nullptr
+ ) {
+ static_assert( !(descr & descriptors::add_identity), "Cannot add an identity "
+ "if no concept of `1' is known. Suggested fix: use a semiring "
+ "instead." );
+ constexpr Descriptor new_descr = descr ^ descriptors::transpose_matrix;
+ constexpr bool left_sided = false;
+ if( output_may_be_masked && size( v_mask ) == 0 && size( mask ) > 0 ) {
+ return internal::vxm_generic<
+ new_descr, true, false, left_sided, false, AdditiveMonoid::template Identity
+ >(
+ u, mask, v, v_mask, A, add, mul, phase,
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ }
+ );
+ } else if( input_may_be_masked && size( mask ) == 0 &&
+ size( v_mask ) > 0
+ ) {
+ return internal::vxm_generic<
+ new_descr, false, true, left_sided, false, AdditiveMonoid::template Identity
+ >(
+ u, mask, v, v_mask, A, add, mul, phase,
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ }
+ );
+ } else if( output_may_be_masked && input_may_be_masked && size( mask ) > 0 &&
+ size( v_mask ) > 0
+ ) {
+ return internal::vxm_generic<
+ new_descr, true, true, left_sided, false, AdditiveMonoid::template Identity
+ >(
+ u, mask, v, v_mask, A, add, mul, phase,
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ }
+ );
+ } else {
+ assert( size( mask ) == 0 );
+ assert( size( v_mask ) == 0 );
+ return internal::vxm_generic<
+ new_descr, false, false, left_sided, false, AdditiveMonoid::template Identity
+ >(
+ u, mask, v, v_mask, A, add, mul, phase,
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ }
+ );
+ }
+ }
+
+ template<
+ class ActiveDistribution,
+ typename Func,
+ typename DataType,
+ typename RIT,
+ typename CIT,
+ typename NIT
+ >
+ RC eWiseLambda(
+ const Func f,
+ const Matrix< DataType, nonblocking, RIT, CIT, NIT > &A,
+ const size_t s,
+ const size_t P
+ ) {
+ if( internal::NONBLOCKING::warn_if_not_native &&
+ config::PIPELINE::warn_if_not_native
+ ) {
+ std::cerr << "Warning: eWiseLambda (nonblocking, matrix variant) currently "
+ << "delegates to a blocking implementation.\n"
+ << " Further similar such warnings will be suppressed.\n";
+ internal::NONBLOCKING::warn_if_not_native = false;
+ }
+
+ // nonblocking execution is not supported
+ // first, execute any computation that is not completed
+ internal::le.execution();
+
+ // second, delegate to the reference backend
+ return eWiseLambda< ActiveDistribution, Func, DataType, RIT, CIT, NIT >(
+ f, internal::getRefMatrix( A ), s, P );
+ }
+
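A sketch of a typical call to the matrix eWiseLambda above, scaling every stored nonzero in place. Illustrative only; as the implementation shows, on the nonblocking backend this first flushes any pending pipeline and then delegates to the blocking variant:

    #include <graphblas.hpp>

    grb::RC scaleNonzeroes( grb::Matrix< double > &A, const double alpha ) {
        // the lambda receives the row index, column index, and a mutable
        // reference to the nonzero value
        return grb::eWiseLambda(
            [alpha]( const size_t i, const size_t j, double &v ) {
                (void) i; (void) j;
                v *= alpha;
            },
            A
        );
    }
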
+ template<
+ typename Func,
+ typename DataType1,
+ typename RIT,
+ typename CIT,
+ typename NIT,
+ typename DataType2,
+ typename Coords,
+ typename... Args
+ >
+ RC eWiseLambda(
+ const Func f,
+ const Matrix< DataType1, nonblocking, RIT, CIT, NIT > &A,
+ const Vector< DataType2, nonblocking, Coords > &x,
+ Args... args
+ ) {
+ // do size checking
+ if( !( size( x ) == nrows( A ) || size( x ) == ncols( A ) ) ) {
+ std::cerr << "Mismatching dimensions: given vector of size " << size( x )
+ << " has nothing to do with either matrix dimension (" << nrows( A )
+ << " nor " << ncols( A ) << ").\n";
+ return MISMATCH;
+ }
+
+ return eWiseLambda( f, A, args... );
+ }
+
+ /** @} */
+
+} // namespace grb
+
+#undef NO_CAST_ASSERT
+
+#endif // end _H_GRB_NONBLOCKING_BLAS2
+
diff --git a/include/graphblas/nonblocking/blas3.hpp b/include/graphblas/nonblocking/blas3.hpp
new file mode 100644
index 000000000..02afce1d6
--- /dev/null
+++ b/include/graphblas/nonblocking/blas3.hpp
@@ -0,0 +1,595 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Implements the level-3 primitives for the nonblocking backend
+ *
+ * @author Aristeidis Mastoras
+ * @date 16th of May, 2022
+ */
+
+#ifndef _H_GRB_NONBLOCKING_BLAS3
+#define _H_GRB_NONBLOCKING_BLAS3
+
+#include <type_traits> // for std::enable_if
+
+#include
+#include
+
+#include "io.hpp"
+#include "matrix.hpp"
+
+#include
+
+#define NO_CAST_ASSERT( x, y, z ) \
+ static_assert( x, \
+ "\n\n" \
+ "********************************************************************" \
+ "********************************************************************" \
+ "******************************\n" \
+ "* ERROR | " y " " z ".\n" \
+ "********************************************************************" \
+ "********************************************************************" \
+ "******************************\n" \
+ "* Possible fix 1 | Remove no_casting from the template parameters " \
+ "in this call to " y ".\n" \
+ "* Possible fix 2 | For all mismatches in the domains of input " \
+ "parameters and the semiring domains, as specified in the " \
+ "documentation of the function " y ", supply a container argument of " \
+ "the expected type instead.\n" \
+ "* Possible fix 3 | Provide a compatible semiring where all domains " \
+ "match those of the container arguments, as specified in the " \
+ "documentation of the function " y ".\n" \
+ "********************************************************************" \
+ "********************************************************************" \
+ "******************************\n" );
+
+
+namespace grb {
+
+ namespace internal {
+
+ extern LazyEvaluation le;
+
+ }
+
+}
+
+namespace grb {
+
+ namespace internal {
+
+ template<
+ bool allow_void,
+ Descriptor descr,
+ class MulMonoid,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename RIT,
+ typename CIT,
+ typename NIT,
+ class Operator,
+ class Monoid
+ >
+ RC mxm_generic(
+ Matrix< OutputType, nonblocking, RIT, CIT, NIT > &C,
+ const Matrix< InputType1, nonblocking, RIT, CIT, NIT > &A,
+ const Matrix< InputType2, nonblocking, RIT, CIT, NIT > &B,
+ const Operator &oper,
+ const Monoid &monoid,
+ const MulMonoid &mulMonoid,
+ const Phase &phase,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< Operator >::value &&
+ grb::is_monoid< Monoid >::value,
+ void >::type * const = nullptr
+ ) {
+ // nonblocking execution is not supported
+ // first, execute any computation that is not completed
+ le.execution();
+
+ // second, delegate to the reference backend
+ return mxm_generic<
+ allow_void, descr, MulMonoid, OutputType,
+ InputType1, InputType2, RIT, CIT, NIT, Operator, Monoid
+ >(
+ getRefMatrix( C ), getRefMatrix( A ), getRefMatrix( B ),
+ oper, monoid, mulMonoid, phase
+ );
+ }
+
+ } // end namespace grb::internal
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename RIT,
+ typename CIT,
+ typename NIT,
+ class Semiring
+ >
+ RC mxm(
+ Matrix< OutputType, nonblocking, RIT, CIT, NIT > &C,
+ const Matrix< InputType1, nonblocking, RIT, CIT, NIT > &A,
+ const Matrix< InputType2, nonblocking, RIT, CIT, NIT > &B,
+ const Semiring &ring = Semiring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Semiring >::value,
+ void >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Semiring::D1, InputType1 >::value
+ ), "grb::mxm",
+ "called with a prefactor input matrix A that does not match the first "
+ "domain of the given operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Semiring::D2, InputType2 >::value ), "grb::mxm",
+ "called with a postfactor input matrix B that does not match the "
+ "second domain of the given operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Semiring::D4, OutputType >::value
+ ), "grb::mxm",
+ "called with an output matrix C that does not match the output domain "
+ "of the given operator" );
+
+#ifdef _DEBUG
+ std::cout << "In grb::mxm (nonblocking, unmasked, semiring)\n";
+#endif
+
+ if( internal::NONBLOCKING::warn_if_not_native &&
+ config::PIPELINE::warn_if_not_native
+ ) {
+ std::cerr << "Warning: mxm (nonblocking, unmasked, semiring) currently "
+ << "delegates to a blocking implementation\n"
+ << " Further similar such warnings will be suppressed.\n";
+ internal::NONBLOCKING::warn_if_not_native = false;
+ }
+
+ return internal::mxm_generic< true, descr >(
+ C, A, B,
+ ring.getMultiplicativeOperator(),
+ ring.getAdditiveMonoid(),
+ ring.getMultiplicativeMonoid(),
+ phase
+ );
+ }
+
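An illustrative use of the semiring mxm above, computing C = AA. It also shows the two-phase idiom in which a RESIZE call reserves output capacity before the EXECUTE call computes the product; the snippet is not part of this patch:

    #include <graphblas.hpp>

    grb::RC squareMatrix(
        grb::Matrix< double > &C, const grb::Matrix< double > &A
    ) {
        grb::Semiring<
            grb::operators::add< double >, grb::operators::mul< double >,
            grb::identities::zero, grb::identities::one
        > plusTimes;
        // phase 1: compute the required capacity of C
        grb::RC rc = grb::mxm( C, A, A, plusTimes, grb::RESIZE );
        // phase 2: perform the actual multiplication
        return rc ? rc : grb::mxm( C, A, A, plusTimes, grb::EXECUTE );
    }
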
+ template<
+ Descriptor descr = grb::descriptors::no_operation,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename RIT,
+ typename CIT,
+ typename NIT,
+ class Operator,
+ class Monoid
+ >
+ RC mxm(
+ Matrix< OutputType, nonblocking, RIT, CIT, NIT > &C,
+ const Matrix< InputType1, nonblocking, RIT, CIT, NIT > &A,
+ const Matrix< InputType2, nonblocking, RIT, CIT, NIT > &B,
+ const Monoid &addM,
+ const Operator &mulOp,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< Operator >::value &&
+ grb::is_monoid< Monoid >::value,
+ void >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Operator::D1, InputType1 >::value
+ ), "grb::mxm",
+ "called with a prefactor input matrix A that does not match the first "
+ "domain of the given multiplication operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Operator::D2, InputType2 >::value
+ ), "grb::mxm",
+ "called with a postfactor input matrix B that does not match the first "
+ "domain of the given multiplication operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Operator::D3, OutputType >::value ),
+ "grb::mxm",
+ "called with an output matrix C that does not match the output domain "
+ "of the given multiplication operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D1, typename Operator::D3 >::value
+ ), "grb::mxm",
+ "the output domain of the multiplication operator does not match the "
+ "first domain of the given addition monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D2, OutputType >::value
+ ), "grb::mxm",
+ "the second domain of the given addition monoid does not match the "
+ "type of the output matrix C" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D3, OutputType >::value
+ ), "grb::mxm",
+ "the output type of the given addition monoid does not match the type "
+ "of the output matrix C" );
+ static_assert( ( !(
+ std::is_same< InputType1, void >::value ||
+ std::is_same< InputType2, void >::value
+ ) ),
+ "grb::mxm: the operator-monoid version of mxm cannot be used if either "
+ "of the input matrices is a pattern matrix (of type void)" );
+
+ if( internal::NONBLOCKING::warn_if_not_native &&
+ config::PIPELINE::warn_if_not_native
+ ) {
+ std::cerr << "Warning: mxm (nonblocking, unmasked, monoid-op) currently "
+ << "delegates to a blocking implementation\n"
+ << " Further similar such warnings will be suppressed.\n";
+ internal::NONBLOCKING::warn_if_not_native = false;
+ }
+
+ return internal::mxm_generic< false, descr >(
+ C, A, B, mulOp, addM, Monoid(), phase
+ );
+ }
+
+ namespace internal {
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ bool matrix_is_void,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename Coords
+ >
+ RC matrix_zip_generic(
+ Matrix< OutputType, nonblocking > &A,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const Vector< InputType3, nonblocking, Coords > &z,
+ const Phase &phase
+ ) {
+ if( internal::NONBLOCKING::warn_if_not_native &&
+ config::PIPELINE::warn_if_not_native
+ ) {
+ std::cerr << "Warning: zip (matrix<-vector<-vector<-vector, nonblocking) "
+ << "currently delegates to a blocking implementation.\n"
+ << " Further similar such warnings will be suppressed.\n";
+ internal::NONBLOCKING::warn_if_not_native = false;
+ }
+
+ // nonblocking execution is not supported
+ // first, execute any computation that is not completed
+ le.execution();
+
+ // second, delegate to the reference backend
+ return matrix_zip_generic<
+ descr, matrix_is_void,
+ OutputType, InputType1, InputType2, InputType3,
+ Coords
+ >(
+ getRefMatrix( A ), getRefVector( x ), getRefVector( y ), getRefVector( z ),
+ phase
+ );
+ }
+
+ } // namespace internal
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename Coords
+ >
+ RC zip(
+ Matrix< OutputType, nonblocking > &A,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const Vector< InputType3, nonblocking, Coords > &z,
+ const Phase &phase = EXECUTE
+ ) {
+ static_assert( !(descr & descriptors::no_casting) ||
+ std::is_integral< InputType1 >::value,
+ "grb::zip (two vectors to matrix) called "
+ "using non-integral left-hand vector elements" );
+ static_assert( !(descr & descriptors::no_casting) ||
+ std::is_integral< InputType2 >::value,
+ "grb::zip (two vectors to matrix) called "
+ "using non-integral right-hand vector elements" );
+ static_assert( !(descr & descriptors::no_casting) ||
+ std::is_same< OutputType, InputType3 >::value,
+ "grb::zip (two vectors to matrix) called "
+ "with differing vector nonzero and output matrix domains" );
+
+ const size_t n = grb::size( x );
+ const size_t nz = grb::nnz( x );
+ const RC ret = grb::clear( A );
+ if( ret != SUCCESS ) {
+ return ret;
+ }
+ if( n != grb::size( y ) ) {
+ return MISMATCH;
+ }
+ if( n != grb::size( z ) ) {
+ return MISMATCH;
+ }
+ if( nz != grb::nnz( y ) ) {
+ return ILLEGAL;
+ }
+ if( nz != grb::nnz( z ) ) {
+ return ILLEGAL;
+ }
+
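+ // the output matrix stores values, hence matrix_is_void is set to false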
+ return internal::matrix_zip_generic< descr, false >( A, x, y, z, phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC zip(
+ Matrix< void, nonblocking > &A,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const Phase &phase = EXECUTE
+ ) {
+ static_assert( !(descr & descriptors::no_casting) ||
+ std::is_integral< InputType1 >::value,
+ "grb::zip (two vectors to void matrix) called using non-integral "
+ "left-hand vector elements" );
+ static_assert( !(descr & descriptors::no_casting) ||
+ std::is_integral< InputType2 >::value,
+ "grb::zip (two vectors to void matrix) called using non-integral "
+ "right-hand vector elements" );
+
+ const size_t n = grb::size( x );
+ const size_t nz = grb::nnz( x );
+ const RC ret = grb::clear( A );
+ if( ret != SUCCESS ) {
+ return ret;
+ }
+ if( n != grb::size( y ) ) {
+ return MISMATCH;
+ }
+ if( nz != grb::nnz( y ) ) {
+ return ILLEGAL;
+ }
+
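+ // the output is a pattern matrix, so there is no value vector to zip;
+ // x is passed a second time only to satisfy the generic signature and
+ // is ignored since matrix_is_void is set to true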
+ return internal::matrix_zip_generic< descr, true >( A, x, y, x, phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType1,
+ typename InputType2,
+ typename OutputType,
+ typename Coords,
+ class Operator
+ >
+ RC outer(
+ Matrix< OutputType, nonblocking > &A,
+ const Vector< InputType1, nonblocking, Coords > &u,
+ const Vector< InputType2, nonblocking, Coords > &v,
+ const Operator &mul = Operator(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_operator< Operator >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< OutputType >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ if( internal::NONBLOCKING::warn_if_not_native &&
+ config::PIPELINE::warn_if_not_native
+ ) {
+ std::cerr << "Warning: outer (nonblocking) currently delegates to a "
+ << "blocking implementation.\n"
+ << " Further similar such warnings will be suppressed.\n";
+ internal::NONBLOCKING::warn_if_not_native = false;
+ }
+
+ // nonblocking execution is not supported
+ // first, execute any computation that is not completed
+ internal::le.execution();
+
+ // second, delegate to the reference backend
+ return outer<
+ descr, InputType1, InputType2, OutputType, Coords, Operator
+ >(
+ internal::getRefMatrix( A ),
+ internal::getRefVector( u ), internal::getRefVector( v ),
+ mul, phase
+ );
+ }
+
+ namespace internal {
+
+ template<
+ bool allow_void,
+ Descriptor descr,
+ class MulMonoid,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ class Operator
+ >
+ RC eWiseApply_matrix_generic(
+ Matrix< OutputType, nonblocking > &C,
+ const Matrix< InputType1, nonblocking > &A,
+ const Matrix< InputType2, nonblocking > &B,
+ const Operator &oper,
+ const MulMonoid &mulMonoid,
+ const Phase &phase,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< Operator >::value,
+ void >::type * const = nullptr
+ ) {
+ if( internal::NONBLOCKING::warn_if_not_native &&
+ config::PIPELINE::warn_if_not_native
+ ) {
+ std::cerr << "Warning: eWiseApply (nonblocking) currently delegates to a "
+ << "blocking implementation.\n"
+ << " Further similar such warnings will be suppressed.\n";
+ internal::NONBLOCKING::warn_if_not_native = false;
+ }
+
+ // nonblocking execution is not supported
+ // first, execute any computation that is not completed
+ le.execution();
+
+ // second, delegate to the reference backend
+ return eWiseApply_matrix_generic<
+ allow_void, descr, MulMonoid, OutputType, InputType1, InputType2, Operator
+ >(
+ getRefMatrix( C ), getRefMatrix( A ), getRefMatrix( B ),
+ oper, mulMonoid, phase
+ );
+ }
+
+ } // namespace internal
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ class MulMonoid
+ >
+ RC eWiseApply(
+ Matrix< OutputType, nonblocking > &C,
+ const Matrix< InputType1, nonblocking > &A,
+ const Matrix< InputType2, nonblocking > &B,
+ const MulMonoid &mulmono,
+ const Phase phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< MulMonoid >::value,
+ void >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) ||
+ std::is_same< typename MulMonoid::D1, InputType1 >::value ),
+ "grb::eWiseApply (nonblocking, matrix <- matrix x matrix, monoid)",
+ "called with a prefactor input matrix A that does not match the first "
+ "domain of the monoid operator"
+ );
+ NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) ||
+ std::is_same< typename MulMonoid::D2, InputType2 >::value ),
+ "grb::eWiseApply (nonblocking, matrix <- matrix x matrix, monoid)",
+ "called with a postfactor input matrix B that does not match the "
+ "second domain of the monoid operator"
+ );
+ NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) ||
+ std::is_same< typename MulMonoid::D3, OutputType >::value ),
+ "grb::eWiseApply (nonblocking, matrix <- matrix x matrix, monoid)",
+ "called with an output matrix C that does not match the output domain "
+ "of the monoid operator"
+ );
+
+#ifdef _DEBUG
+ std::cout << "In grb::eWiseApply_matrix_generic (nonblocking, monoid)\n";
+#endif
+
+ return internal::eWiseApply_matrix_generic< true, descr >(
+ C, A, B, mulmono.getOperator(), mulmono, phase
+ );
+ }
+
+ template<
+ Descriptor descr = grb::descriptors::no_operation,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ class Operator
+ >
+ RC eWiseApply(
+ Matrix< OutputType, nonblocking > &C,
+ const Matrix< InputType1, nonblocking > &A,
+ const Matrix< InputType2, nonblocking > &B,
+ const Operator &mulOp,
+ const Phase phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< Operator >::value,
+ void >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) ||
+ std::is_same< typename Operator::D1, InputType1 >::value ),
+ "grb::eWiseApply (nonblocking, matrix <- matrix x matrix, operator)",
+ "called with a prefactor input matrix A that does not match the first "
+ "domain of the given multiplication operator"
+ );
+ NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) ||
+ std::is_same< typename Operator::D2, InputType2 >::value ),
+ "grb::eWiseApply (nonblocking, matrix <- matrix x matrix, operator)",
+ "called with a postfactor input matrix B that does not match the first "
+ "domain of the given multiplication operator"
+ );
+ NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) ||
+ std::is_same< typename Operator::D3, OutputType >::value ),
+ "grb::eWiseApply (nonblocking, matrix <- matrix x matrix, operator)",
+ "called with an output matrix C that does not match the output domain "
+ "of the given multiplication operator"
+ );
+ static_assert( ( !(
+ std::is_same< InputType1, void >::value ||
+ std::is_same< InputType2, void >::value )
+ ), "grb::eWiseApply (nonblocking, matrix <- matrix x matrix, operator): "
+ "the operator version of eWiseApply cannot be used if either of the "
+ "input matrices is a pattern matrix (of type void)"
+ );
+
+ typename grb::Monoid<
+ grb::operators::mul< double >,
+ grb::identities::one
+ > dummyMonoid;
+ return internal::eWiseApply_matrix_generic< false, descr >(
+ C, A, B, mulOp, dummyMonoid, phase
+ );
+ }
+
+} // namespace grb
+
+#undef NO_CAST_ASSERT
+
+#endif // ``_H_GRB_NONBLOCKING_BLAS3''
+
diff --git a/include/graphblas/nonblocking/boolean_dispatcher_blas1.hpp b/include/graphblas/nonblocking/boolean_dispatcher_blas1.hpp
new file mode 100644
index 000000000..16fc60a8d
--- /dev/null
+++ b/include/graphblas/nonblocking/boolean_dispatcher_blas1.hpp
@@ -0,0 +1,1744 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Dispatcher functions for the level-1 primitives.
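+ *
+ * Each boolean_dispatcher_* function translates the run-time density
+ * flags of its operands into the compile-time boolean template
+ * parameters of the corresponding level-1 kernel by branching over all
+ * combinations. The kernels' hot loops thus remain free of run-time
+ * density checks, at the cost of one kernel instantiation per
+ * combination of flags.
+ *
+ * As a minimal sketch of the pattern, for a hypothetical kernel \a foo
+ * that is not part of this header:
+ *
+ * \code
+ * template< bool dense >
+ * RC foo();
+ *
+ * RC boolean_dispatcher_foo( const bool dense ) {
+ *     return dense ? foo< true >() : foo< false >();
+ * }
+ * \endcode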
+ *
+ * @author Aristeidis Mastoras
+ * @date 24th of October, 2022
+ */
+
+#ifndef _H_GRB_NONBLOCKING_BOOLEAN_DISPATCHER_BLAS1
+#define _H_GRB_NONBLOCKING_BOOLEAN_DISPATCHER_BLAS1
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "coordinates.hpp"
+#include "vector.hpp"
+#include "lazy_evaluation.hpp"
+#include "vector_wrapper.hpp"
+
+
+namespace grb {
+
+ namespace internal {
+
+ template<
+ Descriptor descr,
+ bool masked,
+ bool left,
+ bool already_dense_input_to_fold,
+ bool already_dense_mask,
+ class Monoid,
+ typename InputType,
+ typename MaskType,
+ class Coords
+ >
+ RC fold_from_vector_to_scalar_vectorDriven(
+ typename Monoid::D3 &thread_local_output,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ const Coords &local_to_fold,
+ const Coords &local_mask,
+ const Vector< InputType, nonblocking, Coords > &to_fold,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Monoid &monoid
+ );
+
+ template<
+ Descriptor descr,
+ bool masked,
+ bool left,
+ class Monoid,
+ typename InputType,
+ typename MaskType,
+ class Coords
+ >
+ RC boolean_dispatcher_fold_from_vector_to_scalar_vectorDriven(
+ const bool already_dense_input_to_fold,
+ const bool already_dense_mask,
+ typename Monoid::D3 &thread_local_output,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ const Coords &local_to_fold,
+ const Coords &local_mask,
+ const Vector< InputType, nonblocking, Coords > &to_fold,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Monoid &monoid
+ ) {
+ if( already_dense_input_to_fold ) {
+ if( already_dense_mask ) {
+ return internal::fold_from_vector_to_scalar_vectorDriven<
+ descr, masked, left, true, true
+ >(
+ thread_local_output, lower_bound, upper_bound,
+ local_to_fold, local_mask, to_fold, mask, monoid
+ );
+ } else {
+ return internal::fold_from_vector_to_scalar_vectorDriven<
+ descr, masked, left, true, false
+ >(
+ thread_local_output, lower_bound, upper_bound,
+ local_to_fold, local_mask, to_fold, mask, monoid
+ );
+ }
+ } else {
+ if( already_dense_mask ) {
+ return internal::fold_from_vector_to_scalar_vectorDriven<
+ descr, masked, left, false, true
+ >(
+ thread_local_output, lower_bound, upper_bound,
+ local_to_fold, local_mask, to_fold, mask, monoid
+ );
+ } else {
+ return internal::fold_from_vector_to_scalar_vectorDriven<
+ descr, masked, left, false, false
+ >(
+ thread_local_output, lower_bound, upper_bound,
+ local_to_fold, local_mask, to_fold, mask, monoid
+ );
+ }
+ }
+ }
+
+ template<
+ Descriptor descr,
+ bool left,
+ bool already_dense_input_to_fold,
+ bool already_dense_mask,
+ class Monoid,
+ typename InputType,
+ typename MaskType,
+ class Coords
+ >
+ RC fold_from_vector_to_scalar_maskDriven(
+ typename Monoid::D3 &thread_local_output,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ const Coords &local_to_fold,
+ const Coords &local_mask,
+ const Vector< InputType, nonblocking, Coords > &to_fold,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Monoid &monoid
+ );
+
+ template<
+ Descriptor descr,
+ bool left,
+ class Monoid,
+ typename InputType,
+ typename MaskType,
+ class Coords
+ >
+ RC boolean_dispatcher_fold_from_vector_to_scalar_maskDriven(
+ const bool already_dense_input_to_fold,
+ const bool already_dense_mask,
+ typename Monoid::D3 &thread_local_output,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ const Coords &local_to_fold,
+ const Coords &local_mask,
+ const Vector< InputType, nonblocking, Coords > &to_fold,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Monoid &monoid
+ ) {
+ if( already_dense_input_to_fold ) {
+ if( already_dense_mask ) {
+ return internal::fold_from_vector_to_scalar_maskDriven<
+ descr, left, true, true
+ >(
+ thread_local_output, lower_bound, upper_bound,
+ local_to_fold, local_mask, to_fold, mask, monoid
+ );
+ } else {
+ return internal::fold_from_vector_to_scalar_maskDriven<
+ descr, left, true, false
+ >(
+ thread_local_output, lower_bound, upper_bound,
+ local_to_fold, local_mask, to_fold, mask, monoid
+ );
+ }
+ } else {
+ if( already_dense_mask ) {
+ return internal::fold_from_vector_to_scalar_maskDriven<
+ descr, left, false, true
+ >(
+ thread_local_output, lower_bound, upper_bound,
+ local_to_fold, local_mask, to_fold, mask, monoid
+ );
+ } else {
+ return internal::fold_from_vector_to_scalar_maskDriven<
+ descr, left, false, false
+ >(
+ thread_local_output, lower_bound, upper_bound,
+ local_to_fold, local_mask, to_fold, mask, monoid
+ );
+ }
+ }
+ }
+
+ template<
+ Descriptor descr,
+ bool masked,
+ bool left,
+ bool already_dense_input_to_fold,
+ bool already_dense_mask,
+ class Monoid,
+ typename InputType,
+ typename MaskType,
+ class Coords
+ >
+ RC fold_from_vector_to_scalar_fullLoopSparse(
+ typename Monoid::D3 &thread_local_output,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ const Coords &local_to_fold,
+ const Coords &local_mask,
+ const Vector< InputType, nonblocking, Coords > &to_fold,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Monoid &monoid
+ );
+
+ template<
+ Descriptor descr,
+ bool masked,
+ bool left,
+ class Monoid,
+ typename InputType,
+ typename MaskType,
+ class Coords
+ >
+ RC boolean_dispatcher_fold_from_vector_to_scalar_fullLoopSparse(
+ const bool already_dense_input_to_fold,
+ const bool already_dense_mask,
+ typename Monoid::D3 &thread_local_output,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ const Coords &local_to_fold,
+ const Coords &local_mask,
+ const Vector< InputType, nonblocking, Coords > &to_fold,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Monoid &monoid
+ ) {
+ if( already_dense_input_to_fold ) {
+ if( already_dense_mask ) {
+ return internal::fold_from_vector_to_scalar_fullLoopSparse<
+ descr, masked, left, true, true
+ >(
+ thread_local_output, lower_bound, upper_bound,
+ local_to_fold, local_mask, to_fold, mask, monoid
+ );
+ } else {
+ return internal::fold_from_vector_to_scalar_fullLoopSparse<
+ descr, masked, left, true, false
+ >(
+ thread_local_output, lower_bound, upper_bound,
+ local_to_fold, local_mask, to_fold, mask, monoid
+ );
+ }
+ } else {
+ if( already_dense_mask ) {
+ return internal::fold_from_vector_to_scalar_fullLoopSparse<
+ descr, masked, left, false, true
+ >(
+ thread_local_output, lower_bound, upper_bound,
+ local_to_fold, local_mask, to_fold, mask, monoid
+ );
+ } else {
+ return internal::fold_from_vector_to_scalar_fullLoopSparse<
+ descr, masked, left, false, false
+ >(
+ thread_local_output, lower_bound, upper_bound,
+ local_to_fold, local_mask, to_fold, mask, monoid
+ );
+ }
+ }
+ }
+
+ template<
+ Descriptor descr,
+ bool left,
+ bool sparse,
+ bool masked,
+ bool monoid,
+ bool already_dense_output,
+ bool already_dense_mask,
+ typename MaskType,
+ typename IOType,
+ typename InputType,
+ typename Coords,
+ class OP
+ >
+ RC fold_from_scalar_to_vector_generic(
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_vector,
+ const Coords * const local_mask_ptr,
+ Vector< IOType, nonblocking, Coords > &vector,
+ const Vector< MaskType, nonblocking, Coords > * const mask,
+ const InputType &scalar,
+ const OP &op,
+ const Phase &phase
+ );
+
+ template<
+ Descriptor descr,
+ bool left,
+ bool sparse,
+ bool masked,
+ bool monoid,
+ typename MaskType,
+ typename IOType,
+ typename InputType,
+ typename Coords,
+ class OP
+ >
+ RC boolean_dispatcher_fold_from_scalar_to_vector_generic(
+ const bool already_dense_output,
+ const bool already_dense_mask,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_vector,
+ const Coords * const local_mask_ptr,
+ Vector< IOType, nonblocking, Coords > &vector,
+ const Vector< MaskType, nonblocking, Coords > * const mask,
+ const InputType &scalar,
+ const OP &op,
+ const Phase &phase
+ ) {
+ if( already_dense_output ) {
+ if( already_dense_mask ) {
+ return internal::fold_from_scalar_to_vector_generic<
+ descr, left, sparse, masked, monoid,
+ true, true
+ >(
+ lower_bound, upper_bound, local_vector, local_mask_ptr,
+ vector, mask, scalar, op, phase
+ );
+ } else {
+ return internal::fold_from_scalar_to_vector_generic<
+ descr, left, sparse, masked, monoid,
+ true, false
+ >(
+ lower_bound, upper_bound, local_vector, local_mask_ptr,
+ vector, mask, scalar, op, phase
+ );
+ }
+ } else {
+ if( already_dense_mask ) {
+ return internal::fold_from_scalar_to_vector_generic<
+ descr, left, sparse, masked, monoid,
+ false, true
+ >(
+ lower_bound, upper_bound, local_vector, local_mask_ptr,
+ vector, mask, scalar, op, phase
+ );
+ } else {
+ return internal::fold_from_scalar_to_vector_generic<
+ descr, left, sparse, masked, monoid,
+ false, false
+ >(
+ lower_bound, upper_bound, local_vector, local_mask_ptr,
+ vector, mask, scalar, op, phase
+ );
+ }
+ }
+ }
+
+ template<
+ Descriptor descr,
+ bool left,
+ bool sparse,
+ bool masked,
+ bool monoid,
+ bool already_dense_output,
+ bool already_dense_input_to_fold,
+ bool already_dense_mask,
+ typename MaskType,
+ typename IOType,
+ typename IType,
+ typename Coords,
+ class OP
+ >
+ RC fold_from_vector_to_vector_generic(
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_fold_into,
+ const Coords * const local_m_ptr,
+ const Coords &local_to_fold,
+ Vector< IOType, nonblocking, Coords > &fold_into,
+ const Vector< MaskType, nonblocking, Coords > * const m,
+ const Vector< IType, nonblocking, Coords > &to_fold,
+ const OP &op,
+ const Phase phase
+ );
+
+ template<
+ Descriptor descr,
+ bool left,
+ bool sparse,
+ bool masked,
+ bool monoid,
+ typename MaskType,
+ typename IOType,
+ typename IType,
+ typename Coords,
+ class OP
+ >
+ RC boolean_dispatcher_fold_from_vector_to_vector_generic(
+ const bool already_dense_output,
+ const bool already_dense_input_to_fold,
+ const bool already_dense_mask,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_fold_into,
+ const Coords * const local_m_ptr,
+ const Coords &local_to_fold,
+ Vector< IOType, nonblocking, Coords > &fold_into,
+ const Vector< MaskType, nonblocking, Coords > * const m,
+ const Vector< IType, nonblocking, Coords > &to_fold,
+ const OP &op,
+ const Phase phase
+ ) {
+ if( already_dense_output ) {
+ if( already_dense_input_to_fold ) {
+ if( already_dense_mask ) {
+ return internal::fold_from_vector_to_vector_generic<
+ descr, left, sparse, masked, monoid,
+ true, true, true
+ >(
+ lower_bound, upper_bound, local_fold_into, local_m_ptr,
+ local_to_fold, fold_into, m, to_fold, op, phase
+ );
+ } else {
+ return internal::fold_from_vector_to_vector_generic<
+ descr, left, sparse, masked, monoid,
+ true, true, false
+ >(
+ lower_bound, upper_bound, local_fold_into, local_m_ptr,
+ local_to_fold, fold_into, m, to_fold, op, phase
+ );
+ }
+ } else {
+ if( already_dense_mask ) {
+ return internal::fold_from_vector_to_vector_generic<
+ descr, left, sparse, masked, monoid,
+ true, false, true
+ >(
+ lower_bound, upper_bound, local_fold_into, local_m_ptr,
+ local_to_fold, fold_into, m, to_fold, op, phase
+ );
+ } else {
+ return internal::fold_from_vector_to_vector_generic<
+ descr, left, sparse, masked, monoid,
+ true, false, false
+ >(
+ lower_bound, upper_bound, local_fold_into, local_m_ptr,
+ local_to_fold, fold_into, m, to_fold, op, phase
+ );
+ }
+ }
+ } else {
+ if( already_dense_input_to_fold ) {
+ if( already_dense_mask ) {
+ return internal::fold_from_vector_to_vector_generic<
+ descr, left, sparse, masked, monoid,
+ false, true, true
+ >(
+ lower_bound, upper_bound, local_fold_into, local_m_ptr,
+ local_to_fold, fold_into, m, to_fold, op, phase
+ );
+ } else {
+ return internal::fold_from_vector_to_vector_generic<
+ descr, left, sparse, masked, monoid,
+ false, true, false
+ >(
+ lower_bound, upper_bound, local_fold_into, local_m_ptr,
+ local_to_fold, fold_into, m, to_fold, op, phase
+ );
+ }
+ } else {
+ if( already_dense_mask ) {
+ return internal::fold_from_vector_to_vector_generic<
+ descr, left, sparse, masked, monoid,
+ false, false, true
+ >(
+ lower_bound, upper_bound, local_fold_into, local_m_ptr,
+ local_to_fold, fold_into, m, to_fold, op, phase
+ );
+ } else {
+ return internal::fold_from_vector_to_vector_generic<
+ descr, left, sparse, masked, monoid,
+ false, false, false
+ >(
+ lower_bound, upper_bound, local_fold_into, local_m_ptr,
+ local_to_fold, fold_into, m, to_fold, op, phase
+ );
+ }
+ }
+ }
+ }
+
+ template<
+ bool left_scalar,
+ bool right_scalar,
+ bool left_sparse,
+ bool right_sparse,
+ Descriptor descr,
+ class OP,
+ bool already_dense_input_x,
+ bool already_dense_input_y,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC dense_apply_generic(
+ const size_t lower_bound,
+ const size_t upper_bound,
+ const Coords &local_x,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &z_vector,
+ const internal::Wrapper< left_scalar, InputType1, Coords > x_wrapper,
+ const internal::Wrapper< right_scalar, InputType2, Coords > y_wrapper,
+ const OP &op
+ );
+
+ template<
+ bool left_scalar,
+ bool right_scalar,
+ bool left_sparse,
+ bool right_sparse,
+ Descriptor descr,
+ class OP,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC boolean_dispatcher_dense_apply_generic(
+ const bool already_dense_input_x,
+ const bool already_dense_input_y,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ const Coords &local_x,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &z_vector,
+ const internal::Wrapper< left_scalar, InputType1, Coords > x_wrapper,
+ const internal::Wrapper< right_scalar, InputType2, Coords > y_wrapper,
+ const OP &op
+ ) {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::dense_apply_generic<
+ left_scalar, right_scalar, left_sparse, right_sparse, descr, OP,
+ true, true
+ >(
+ lower_bound, upper_bound,
+ local_x, local_y, z_vector, x_wrapper, y_wrapper, op
+ );
+ } else {
+ return internal::dense_apply_generic<
+ left_scalar, right_scalar, left_sparse, right_sparse, descr, OP,
+ true, false
+ >(
+ lower_bound, upper_bound,
+ local_x, local_y, z_vector, x_wrapper, y_wrapper, op
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::dense_apply_generic<
+ left_scalar, right_scalar, left_sparse, right_sparse, descr, OP,
+ false, true
+ >(
+ lower_bound, upper_bound,
+ local_x, local_y, z_vector, x_wrapper, y_wrapper, op
+ );
+ } else {
+ return internal::dense_apply_generic<
+ left_scalar, right_scalar, left_sparse, right_sparse, descr, OP,
+ false, false
+ >(
+ lower_bound, upper_bound,
+ local_x, local_y, z_vector, x_wrapper, y_wrapper, op
+ );
+ }
+ }
+ }
+
+ template<
+ bool masked,
+ bool monoid,
+ bool x_scalar,
+ bool y_scalar,
+ Descriptor descr,
+ class OP,
+ bool already_dense_mask,
+ bool already_dense_input_x,
+ bool already_dense_input_y,
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC sparse_apply_generic(
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_z,
+ const Coords * const local_mask_ptr,
+ const Coords &local_x,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &z_vector,
+ const Vector< MaskType, nonblocking, Coords > * const mask_vector,
+ const internal::Wrapper< x_scalar, InputType1, Coords > x_wrapper,
+ const internal::Wrapper< y_scalar, InputType2, Coords > y_wrapper,
+ const OP &op
+ );
+
+ template<
+ bool masked,
+ bool monoid,
+ bool x_scalar,
+ bool y_scalar,
+ Descriptor descr,
+ class OP,
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC boolean_dispatcher_sparse_apply_generic(
+ const bool already_dense_mask,
+ const bool already_dense_input_x,
+ const bool already_dense_input_y,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_z,
+ const Coords * const local_mask_ptr,
+ const Coords &local_x,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &z_vector,
+ const Vector< MaskType, nonblocking, Coords > * const mask_vector,
+ const internal::Wrapper< x_scalar, InputType1, Coords > x_wrapper,
+ const internal::Wrapper< y_scalar, InputType2, Coords > y_wrapper,
+ const OP &op
+ ) {
+ if( already_dense_mask ) {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::sparse_apply_generic<
+ masked, monoid, x_scalar, y_scalar, descr, OP,
+ true, true, true
+ > (
+ lower_bound, upper_bound, local_z, local_mask_ptr, local_x, local_y,
+ z_vector, mask_vector, x_wrapper, y_wrapper, op
+ );
+ } else {
+ return internal::sparse_apply_generic<
+ masked, monoid, x_scalar, y_scalar, descr, OP,
+ true, true, false
+ > (
+ lower_bound, upper_bound, local_z, local_mask_ptr, local_x, local_y,
+ z_vector, mask_vector, x_wrapper, y_wrapper, op
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::sparse_apply_generic<
+ masked, monoid, x_scalar, y_scalar, descr, OP,
+ true, false, true
+ > (
+ lower_bound, upper_bound,
+ local_z, local_mask_ptr, local_x, local_y,
+ z_vector, mask_vector, x_wrapper, y_wrapper, op
+ );
+ } else {
+ return internal::sparse_apply_generic<
+ masked, monoid, x_scalar, y_scalar, descr, OP,
+ true, false, false
+ > (
+ lower_bound, upper_bound, local_z, local_mask_ptr, local_x, local_y,
+ z_vector, mask_vector, x_wrapper, y_wrapper, op
+ );
+ }
+ }
+ } else {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::sparse_apply_generic<
+ masked, monoid, x_scalar, y_scalar, descr, OP,
+ false, true, true
+ > (
+ lower_bound, upper_bound,
+ local_z, local_mask_ptr, local_x, local_y,
+ z_vector, mask_vector, x_wrapper, y_wrapper, op
+ );
+ } else {
+ return internal::sparse_apply_generic<
+ masked, monoid, x_scalar, y_scalar, descr, OP,
+ false, true, false
+ > (
+ lower_bound, upper_bound, local_z, local_mask_ptr, local_x, local_y,
+ z_vector, mask_vector, x_wrapper, y_wrapper, op
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::sparse_apply_generic<
+ masked, monoid, x_scalar, y_scalar, descr, OP,
+ false, false, true
+ > (
+ lower_bound, upper_bound, local_z, local_mask_ptr, local_x, local_y,
+ z_vector, mask_vector, x_wrapper, y_wrapper, op
+ );
+ } else {
+ return internal::sparse_apply_generic<
+ masked, monoid, x_scalar, y_scalar, descr, OP,
+ false, false, false
+ > (
+ lower_bound, upper_bound, local_z, local_mask_ptr, local_x, local_y,
+ z_vector, mask_vector, x_wrapper, y_wrapper, op
+ );
+ }
+ }
+ }
+ }
+
+ template<
+ bool left_scalar,
+ bool right_scalar,
+ bool left_sparse,
+ bool right_sparse,
+ Descriptor descr,
+ class OP,
+ bool already_dense_mask,
+ bool already_dense_input_x,
+ bool already_dense_input_y,
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC masked_apply_generic(
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_z,
+ const Coords &local_mask,
+ const Coords &local_x,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &z_vector,
+ const Vector< MaskType, nonblocking, Coords > &mask_vector,
+ const internal::Wrapper< left_scalar, InputType1, Coords > x_wrapper,
+ const internal::Wrapper< right_scalar, InputType2, Coords > y_wrapper,
+ const OP &op,
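+ // when compiled for the boolean dispatcher, the identities are always
+ // supplied explicitly by the dispatcher, hence no default arguments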
+#ifdef GRB_BOOLEAN_DISPATCHER
+ const InputType1 * const left_identity,
+ const InputType2 * const right_identity
+#else
+ const InputType1 * const left_identity = nullptr,
+ const InputType2 * const right_identity = nullptr
+#endif
+ );
+
+ template<
+ bool left_scalar,
+ bool right_scalar,
+ bool left_sparse,
+ bool right_sparse,
+ Descriptor descr,
+ class OP,
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC boolean_dispatcher_masked_apply_generic(
+ const bool already_dense_mask,
+ const bool already_dense_input_x,
+ const bool already_dense_input_y,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_z,
+ const Coords &local_mask,
+ const Coords &local_x,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &z_vector,
+ const Vector< MaskType, nonblocking, Coords > &mask_vector,
+ const internal::Wrapper< left_scalar, InputType1, Coords > x_wrapper,
+ const internal::Wrapper< right_scalar, InputType2, Coords > y_wrapper,
+ const OP &op,
+ const InputType1 * const left_identity = nullptr,
+ const InputType2 * const right_identity = nullptr
+ ) {
+ if( already_dense_mask ) {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::masked_apply_generic<
+ left_scalar, right_scalar, left_sparse, right_sparse, descr, OP,
+ true, true, true
+ >(
+ lower_bound, upper_bound, local_z, local_mask, local_x, local_y,
+ z_vector, mask_vector, x_wrapper, y_wrapper,
+ op, left_identity, right_identity
+ );
+ } else {
+ return internal::masked_apply_generic<
+ left_scalar, right_scalar, left_sparse, right_sparse, descr, OP,
+ true, true, false
+ >(
+ lower_bound, upper_bound, local_z, local_mask, local_x, local_y,
+ z_vector, mask_vector, x_wrapper, y_wrapper,
+ op, left_identity, right_identity
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::masked_apply_generic<
+ left_scalar, right_scalar, left_sparse, right_sparse, descr, OP,
+ true, false, true
+ >(
+ lower_bound, upper_bound, local_z, local_mask, local_x, local_y,
+ z_vector, mask_vector, x_wrapper, y_wrapper,
+ op, left_identity, right_identity
+ );
+ } else {
+ return internal::masked_apply_generic<
+ left_scalar, right_scalar, left_sparse, right_sparse, descr, OP,
+ true, false, false
+ >(
+ lower_bound, upper_bound, local_z, local_mask, local_x, local_y,
+ z_vector, mask_vector, x_wrapper, y_wrapper,
+ op, left_identity, right_identity
+ );
+ }
+ }
+ } else {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::masked_apply_generic<
+ left_scalar, right_scalar, left_sparse, right_sparse, descr, OP,
+ false, true, true
+ >(
+ lower_bound, upper_bound, local_z, local_mask, local_x, local_y,
+ z_vector, mask_vector, x_wrapper, y_wrapper,
+ op, left_identity, right_identity
+ );
+ } else {
+ return internal::masked_apply_generic<
+ left_scalar, right_scalar, left_sparse, right_sparse, descr, OP,
+ false, true, false
+ >(
+ lower_bound, upper_bound, local_z, local_mask, local_x, local_y,
+ z_vector, mask_vector, x_wrapper, y_wrapper,
+ op, left_identity, right_identity
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::masked_apply_generic<
+ left_scalar, right_scalar, left_sparse, right_sparse, descr, OP,
+ false, false, true
+ >(
+ lower_bound, upper_bound, local_z, local_mask, local_x, local_y,
+ z_vector, mask_vector, x_wrapper, y_wrapper,
+ op, left_identity, right_identity
+ );
+ } else {
+ return internal::masked_apply_generic<
+ left_scalar, right_scalar, left_sparse, right_sparse, descr, OP,
+ false, false, false
+ >(
+ lower_bound, upper_bound, local_z, local_mask, local_x, local_y,
+ z_vector, mask_vector, x_wrapper, y_wrapper,
+ op, left_identity, right_identity
+ );
+ }
+ }
+ }
+ }
+
+ template<
+ Descriptor descr,
+ bool a_scalar,
+ bool x_scalar,
+ bool y_scalar,
+ bool y_zero,
+ bool already_dense_output,
+ bool already_dense_mask,
+ bool already_dense_input_a,
+ bool already_dense_input_x,
+ bool already_dense_input_y,
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename Coords,
+ class Ring
+ >
+ RC sparse_eWiseMulAdd_maskDriven(
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_z,
+ const Coords &local_m,
+ const Coords &local_a,
+ const Coords &local_x,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &z_vector,
+ const Vector< MaskType, nonblocking, Coords > &m_vector,
+ const internal::Wrapper< a_scalar, InputType1, Coords > &a_wrapper,
+ const internal::Wrapper< x_scalar, InputType2, Coords > &x_wrapper,
+ const internal::Wrapper< y_scalar, InputType3, Coords > &y_wrapper,
+ const Ring &ring
+ );
+
+ template<
+ Descriptor descr,
+ bool a_scalar,
+ bool x_scalar,
+ bool y_scalar,
+ bool y_zero,
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename Coords,
+ class Ring
+ >
+ RC boolean_dispatcher_sparse_eWiseMulAdd_maskDriven(
+ const bool already_dense_output,
+ const bool already_dense_mask,
+ const bool already_dense_input_a,
+ const bool already_dense_input_x,
+ const bool already_dense_input_y,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_z,
+ const Coords &local_m,
+ const Coords &local_a,
+ const Coords &local_x,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &z_vector,
+ const Vector< MaskType, nonblocking, Coords > &m_vector,
+ const internal::Wrapper< a_scalar, InputType1, Coords > &a_wrapper,
+ const internal::Wrapper< x_scalar, InputType2, Coords > &x_wrapper,
+ const internal::Wrapper< y_scalar, InputType3, Coords > &y_wrapper,
+ const Ring &ring
+ ) {
+ if( already_dense_output ) {
+ if( already_dense_mask ) {
+ if( already_dense_input_a ) {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ true, true, true, true, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ true, true, true, true, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ true, true, true, false, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ true, true, true, false, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ }
+ }
+ } else {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ true, true, false, true, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ true, true, false, true, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ true, true, false, false, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ true, true, false, false, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ }
+ }
+ }
+ } else {
+ if( already_dense_input_a ) {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ true, false, true, true, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ true, false, true, true, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ true, false, true, false, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ true, false, true, false, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ }
+ }
+ } else {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ true, false, false, true, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ true, false, false, true, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ true, false, false, false, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ true, false, false, false, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ }
+ }
+ }
+ }
+ } else {
+ if( already_dense_mask ) {
+ if( already_dense_input_a ) {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ false, true, true, true, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ false, true, true, true, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ false, true, true, false, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ false, true, true, false, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ }
+ }
+ } else {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ false, true, false, true, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ false, true, false, true, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ false, true, false, false, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ false, true, false, false, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ }
+ }
+ }
+ } else {
+ if( already_dense_input_a ) {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ false, false, true, true, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ false, false, true, true, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ false, false, true, false, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ false, false, true, false, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ }
+ }
+ } else {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ false, false, false, true, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ false, false, false, true, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ false, false, false, false, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ false, false, false, false, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ }
+ }
+ }
+ }
+ }
+ }
+
+ template<
+ Descriptor descr,
+ bool masked,
+ bool x_scalar,
+ bool y_scalar,
+ bool y_zero,
+ bool mulSwitched,
+ bool already_dense_output,
+ bool already_dense_mask,
+ bool already_dense_input_a,
+ bool already_dense_input_x,
+ bool already_dense_input_y,
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename Coords,
+ class Ring
+ >
+ RC twoPhase_sparse_eWiseMulAdd_mulDriven(
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_z,
+ const Coords * const local_m,
+ const Coords &local_a,
+ const Coords &local_x,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &z_vector,
+ const Vector< MaskType, nonblocking, Coords > * const m_vector,
+ const Vector< InputType1, nonblocking, Coords > &a_vector,
+ const internal::Wrapper< x_scalar, InputType2, Coords > &x_wrapper,
+ const internal::Wrapper< y_scalar, InputType3, Coords > &y_wrapper,
+ const Ring &ring
+ );
+
+ template<
+ Descriptor descr,
+ bool masked,
+ bool x_scalar,
+ bool y_scalar,
+ bool y_zero,
+ bool mulSwitched,
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename Coords,
+ class Ring
+ >
+ RC boolean_dispatcher_twoPhase_sparse_eWiseMulAdd_mulDriven(
+ const bool already_dense_output,
+ const bool already_dense_mask,
+ const bool already_dense_input_a,
+ const bool already_dense_input_x,
+ const bool already_dense_input_y,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_z,
+ const Coords * const local_m,
+ const Coords &local_a,
+ const Coords &local_x,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &z_vector,
+ const Vector< MaskType, nonblocking, Coords > * const m_vector,
+ const Vector< InputType1, nonblocking, Coords > &a_vector,
+ const internal::Wrapper< x_scalar, InputType2, Coords > &x_wrapper,
+ const internal::Wrapper< y_scalar, InputType3, Coords > &y_wrapper,
+ const Ring &ring = Ring()
+ ) {
+ if( already_dense_output ) {
+ if( already_dense_mask ) {
+ if( already_dense_input_a ) {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ true, true, true, true, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ true, true, true, true, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ true, true, true, false, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ true, true, true, false, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ }
+ }
+ } else {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ true, true, false, true, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ true, true, false, true, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ true, true, false, false, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ true, true, false, false, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ }
+ }
+ }
+ } else {
+ if( already_dense_input_a ) {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ true, false, true, true, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ true, false, true, true, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ true, false, true, false, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ true, false, true, false, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ }
+ }
+ } else {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ true, false, false, true, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ true, false, false, true, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ true, false, false, false, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ true, false, false, false, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ }
+ }
+ }
+ }
+ } else {
+ if( already_dense_mask ) {
+ if( already_dense_input_a ) {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ false, true, true, true, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ false, true, true, true, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ false, true, true, false, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ false, true, true, false, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ }
+ }
+ } else {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ false, true, false, true, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ false, true, false, true, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ false, true, false, false, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ false, true, false, false, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ }
+ }
+ }
+ } else {
+ if( already_dense_input_a ) {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ false, false, true, true, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ false, false, true, true, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ false, false, true, false, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ false, false, true, false, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ }
+ }
+ } else {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ false, false, false, true, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ false, false, false, true, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ false, false, false, false, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ false, false, false, false, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ }
+ }
+ }
+ }
+ }
+ }
+
+ template<
+ Descriptor descr,
+ bool already_dense_input_x,
+ bool already_dense_input_y,
+ class AddMonoid,
+ class AnyOp,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC sparse_dot_generic(
+ typename AddMonoid::D3 &thread_local_output,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ const Coords &local_x,
+ const Coords &local_y,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const size_t local_nz,
+ const AddMonoid &addMonoid,
+ const AnyOp &anyOp
+ );
+
+ template<
+ Descriptor descr,
+ class AddMonoid,
+ class AnyOp,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC boolean_dispatcher_sparse_dot_generic(
+ const bool already_dense_input_x,
+ const bool already_dense_input_y,
+ typename AddMonoid::D3 &thread_local_output,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ const Coords &local_x,
+ const Coords &local_y,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const size_t local_nz,
+ const AddMonoid &addMonoid,
+ const AnyOp &anyOp
+ ) {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::sparse_dot_generic<
+ descr, true, true
+ >(
+ thread_local_output, lower_bound, upper_bound, local_x, local_y,
+ x, y, local_nz, addMonoid, anyOp
+ );
+ } else {
+ return internal::sparse_dot_generic<
+ descr, true, false
+ >(
+ thread_local_output, lower_bound, upper_bound, local_x, local_y,
+ x, y, local_nz, addMonoid, anyOp
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::sparse_dot_generic<
+ descr, false, true
+ >(
+ thread_local_output, lower_bound, upper_bound, local_x, local_y,
+ x, y, local_nz, addMonoid, anyOp
+ );
+ } else {
+ return internal::sparse_dot_generic<
+ descr, false, false
+ >(
+ thread_local_output, lower_bound, upper_bound, local_x, local_y,
+ x, y, local_nz, addMonoid, anyOp
+ );
+ }
+ }
+ }
+
+ } // end namespace ``internal''
+
+} // end namespace ``grb''
+
+#endif
+
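The boolean_dispatcher_* routines above all follow one pattern: run-time booleans (for instance, whether a given coordinates structure is already dense) are translated into compile-time template arguments by exhaustive branching, so that the underlying kernels can specialise on each flag combination. Below is a minimal, self-contained sketch of the pattern, assuming a hypothetical one-flag kernel; none of these names are part of the ALP sources.

```cpp
#include <iostream>

// hypothetical kernel templated on one compile-time flag; the dispatchers in
// this file forward several such flags per primitive
template< bool already_dense >
int kernel( const int x ) {
	// this branch is resolved at compile time within each instantiation
	return already_dense ? x : 2 * x;
}

// run-time boolean to compile-time template argument, mirroring the
// structure of boolean_dispatcher_sparse_dot_generic above
int boolean_dispatcher_kernel( const bool already_dense, const int x ) {
	if( already_dense ) {
		return kernel< true >( x );
	} else {
		return kernel< false >( x );
	}
}

int main() {
	std::cout << boolean_dispatcher_kernel( true, 3 ) << " "
		<< boolean_dispatcher_kernel( false, 3 ) << std::endl;
	return 0;
}
```

Each added flag doubles the number of instantiations, which is why a dispatcher over five density flags needs thirty-two leaf calls in total across its nesting.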
diff --git a/include/graphblas/nonblocking/boolean_dispatcher_blas2.hpp b/include/graphblas/nonblocking/boolean_dispatcher_blas2.hpp
new file mode 100644
index 000000000..9897a2b0d
--- /dev/null
+++ b/include/graphblas/nonblocking/boolean_dispatcher_blas2.hpp
@@ -0,0 +1,190 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Dispatchers for the level-2 primitives
+ *
+ * @author Aristeidis Mastoras
+ * @date 24th of October, 2022
+ */
+
+#ifndef _H_GRB_NONBLOCKING_BOOLEAN_DISPATCHER_BLAS2
+#define _H_GRB_NONBLOCKING_BOOLEAN_DISPATCHER_BLAS2
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "coordinates.hpp"
+#include "vector.hpp"
+#include "lazy_evaluation.hpp"
+
+
+namespace grb {
+
+ namespace internal {
+
+ template<
+ Descriptor descr,
+ bool masked,
+ bool input_masked,
+ bool left_handed,
+ template< typename > class One,
+ bool already_dense_destination_vector,
+ bool already_dense_mask_vector,
+ class AdditiveMonoid,
+ class Multiplication,
+ typename IOType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename InputType4,
+ typename Coords,
+ typename RowColType,
+ typename NonzeroType
+ >
+ inline void vxm_inner_kernel_gather(
+ RC &rc,
+ const size_t lower_bound,
+ Coords &local_destination_vector,
+ const Coords &local_mask_vector,
+ Vector< IOType, nonblocking, Coords > &destination_vector,
+ IOType &destination_element,
+ const size_t &destination_index,
+ const Vector< InputType1, nonblocking, Coords > &source_vector,
+ const InputType1 * __restrict__ const &source,
+ const size_t &source_range,
+ const internal::Compressed_Storage<
+ InputType2, RowColType, NonzeroType
+ > &matrix,
+ const Vector< InputType3, nonblocking, Coords > &mask_vector,
+ const InputType3 * __restrict__ const &mask,
+ const Vector< InputType4, nonblocking, Coords > &source_mask_vector,
+ const InputType4 * __restrict__ const &source_mask,
+ const AdditiveMonoid &add,
+ const Multiplication &mul,
+ const std::function< size_t( size_t ) > &src_local_to_global,
+ const std::function< size_t( size_t ) > &src_global_to_local,
+ const std::function< size_t( size_t ) > &dst_local_to_global
+ );
+
+ template<
+ Descriptor descr,
+ bool masked,
+ bool input_masked,
+ bool left_handed,
+ template< typename > class One,
+ class AdditiveMonoid,
+ class Multiplication,
+ typename IOType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename InputType4,
+ typename Coords,
+ typename RowColType,
+ typename NonzeroType
+ >
+ inline void boolean_dispatcher_vxm_inner_kernel_gather(
+ const bool already_dense_destination_vector,
+ const bool already_dense_mask_vector,
+ RC &rc,
+ const size_t lower_bound,
+ Coords &local_destination_vector,
+ const Coords &local_mask_vector,
+ Vector< IOType, nonblocking, Coords > &destination_vector,
+ IOType &destination_element,
+ const size_t &destination_index,
+ const Vector< InputType1, nonblocking, Coords > &source_vector,
+ const InputType1 * __restrict__ const &source,
+ const size_t &source_range,
+ const internal::Compressed_Storage<
+ InputType2, RowColType, NonzeroType
+ > &matrix,
+ const Vector< InputType3, nonblocking, Coords > &mask_vector,
+ const InputType3 * __restrict__ const &mask,
+ const Vector< InputType4, nonblocking, Coords > &source_mask_vector,
+ const InputType4 * __restrict__ const &source_mask,
+ const AdditiveMonoid &add,
+ const Multiplication &mul,
+ const std::function< size_t( size_t ) > &src_local_to_global,
+ const std::function< size_t( size_t ) > &src_global_to_local,
+ const std::function< size_t( size_t ) > &dst_local_to_global
+ ) {
+ if( already_dense_destination_vector ) {
+ if( already_dense_mask_vector ) {
+ return internal::vxm_inner_kernel_gather<
+ descr, masked, input_masked, left_handed, One,
+ true, true
+ >(
+ rc, lower_bound, local_destination_vector, local_mask_vector,
+ destination_vector, destination_element, destination_index,
+ source_vector, source, source_range, matrix, mask_vector, mask,
+ source_mask_vector, source_mask, add, mul,
+ src_local_to_global, src_global_to_local, dst_local_to_global
+ );
+ } else {
+ return internal::vxm_inner_kernel_gather<
+ descr, masked, input_masked, left_handed, One,
+ true, false
+ >(
+ rc, lower_bound, local_destination_vector, local_mask_vector,
+ destination_vector, destination_element, destination_index,
+ source_vector, source, source_range, matrix, mask_vector, mask,
+ source_mask_vector, source_mask, add, mul,
+ src_local_to_global, src_global_to_local, dst_local_to_global
+ );
+ }
+ } else {
+ if( already_dense_mask_vector ) {
+ return internal::vxm_inner_kernel_gather<
+ descr, masked, input_masked, left_handed, One,
+ false, true
+ >(
+ rc, lower_bound, local_destination_vector, local_mask_vector,
+ destination_vector, destination_element, destination_index,
+ source_vector, source, source_range, matrix, mask_vector, mask,
+ source_mask_vector, source_mask, add, mul,
+ src_local_to_global, src_global_to_local, dst_local_to_global
+ );
+ } else {
+ return internal::vxm_inner_kernel_gather<
+ descr, masked, input_masked, left_handed, One,
+ false, false
+ >(
+ rc, lower_bound, local_destination_vector, local_mask_vector,
+ destination_vector, destination_element, destination_index,
+ source_vector, source, source_range, matrix, mask_vector, mask,
+ source_mask_vector, source_mask, add, mul,
+ src_local_to_global, src_global_to_local, dst_local_to_global
+ );
+ }
+ }
+ }
+
+ } // end namespace ``internal''
+
+} // end namespace ``grb''
+
+#endif
+
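The gather kernel above also threads three std::function index maps (src_local_to_global, src_global_to_local, dst_local_to_global) through every call, so one kernel body can serve different index spaces. A hedged sketch of such translation functors, with illustrative values only; in the single-process nonblocking backend these maps are effectively identities:

```cpp
#include <cstddef>
#include <functional>
#include <iostream>

int main() {
	// single user process: no re-mapping, so the offset is zero; distributed
	// backends would substitute real distribution maps here
	const size_t offset = 0;
	const std::function< size_t( size_t ) > local_to_global =
		[ offset ]( const size_t i ) { return i + offset; };
	const std::function< size_t( size_t ) > global_to_local =
		[ offset ]( const size_t i ) { return i - offset; };

	// round-trip: local index 7 maps to global and back to local 7
	std::cout << local_to_global( 7 ) << " "
		<< global_to_local( local_to_global( 7 ) ) << std::endl;
	return 0;
}
```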
diff --git a/include/graphblas/nonblocking/boolean_dispatcher_io.hpp b/include/graphblas/nonblocking/boolean_dispatcher_io.hpp
new file mode 100644
index 000000000..528d2cf4c
--- /dev/null
+++ b/include/graphblas/nonblocking/boolean_dispatcher_io.hpp
@@ -0,0 +1,361 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Dispatchers for the nonblocking i/o primitives.
+ *
+ * @author Aristeidis Mastoras
+ * @date 24th of October, 2022
+ */
+
+#ifndef _H_GRB_NONBLOCKING_BOOLEAN_DISPATCHER_IO
+#define _H_GRB_NONBLOCKING_BOOLEAN_DISPATCHER_IO
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "coordinates.hpp"
+#include "vector.hpp"
+#include "lazy_evaluation.hpp"
+
+
+namespace grb {
+
+ namespace internal {
+
+ template<
+ Descriptor descr,
+ bool loop_over_vector_length,
+ bool already_dense_mask,
+ bool mask_is_dense,
+ typename DataType,
+ typename MaskType,
+ typename T,
+ typename Coords
+ >
+ RC masked_set(
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_x,
+ const Coords &local_mask,
+ Vector< DataType, nonblocking, Coords > &x,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const T val
+ );
+
+ template<
+ Descriptor descr,
+ typename DataType,
+ typename MaskType,
+ typename T,
+ typename Coords
+ >
+ RC boolean_dispatcher_masked_set(
+ const bool loop_over_vector_length,
+ const bool already_dense_mask,
+ const bool mask_is_dense,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_x,
+ const Coords &local_mask,
+ Vector< DataType, nonblocking, Coords > &x,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const T val
+ ) {
+ if( loop_over_vector_length ) {
+ if( already_dense_mask ) {
+ if( mask_is_dense ) {
+ return internal::masked_set<
+ descr, true, true, true
+ >( lower_bound, upper_bound, local_x, local_mask, x, m, val );
+ } else {
+ return internal::masked_set<
+ descr, true, true, false
+ >( lower_bound, upper_bound, local_x, local_mask, x, m, val );
+ }
+ } else {
+ if( mask_is_dense ) {
+ return internal::masked_set<
+ descr, true, false, true
+ >( lower_bound, upper_bound, local_x, local_mask, x, m, val );
+ } else {
+ return internal::masked_set<
+ descr, true, false, false
+ >( lower_bound, upper_bound, local_x, local_mask, x, m, val );
+ }
+ }
+ } else {
+ if( already_dense_mask ) {
+ if( mask_is_dense ) {
+ return internal::masked_set<
+ descr, false, true, true
+ >( lower_bound, upper_bound, local_x, local_mask, x, m, val );
+ } else {
+ return internal::masked_set<
+ descr, false, true, false
+ >( lower_bound, upper_bound, local_x, local_mask, x, m, val );
+ }
+ } else {
+ if( mask_is_dense ) {
+ return internal::masked_set<
+ descr, false, false, true
+ >( lower_bound, upper_bound, local_x, local_mask, x, m, val );
+ } else {
+ return internal::masked_set<
+ descr, false, false, false
+ >( lower_bound, upper_bound, local_x, local_mask, x, m, val );
+ }
+ }
+ }
+ }
+
+ template<
+ Descriptor descr,
+ bool out_is_void,
+ bool in_is_void,
+ bool sparse,
+ bool already_dense_vectors,
+ bool already_dense_input,
+ typename OutputType,
+ typename InputType,
+ typename Coords
+ >
+ RC set_generic(
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_x,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &x,
+ const Vector< InputType, nonblocking, Coords > &y
+ );
+
+ template< Descriptor descr,
+ bool out_is_void,
+ bool in_is_void,
+ bool sparse,
+ typename OutputType,
+ typename InputType,
+ typename Coords
+ >
+ RC boolean_dispatcher_set_generic(
+ const bool already_dense_vectors,
+ const bool already_dense_input,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_x,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &x,
+ const Vector< InputType, nonblocking, Coords > &y
+ ) {
+ if( already_dense_vectors ) {
+ if( already_dense_input ) {
+ return internal::set_generic<
+ descr, out_is_void, in_is_void, sparse,
+ true, true
+ >( lower_bound, upper_bound, local_x, local_y, x, y );
+ } else {
+ return internal::set_generic<
+ descr, out_is_void, in_is_void, sparse,
+ true, false
+ >( lower_bound, upper_bound, local_x, local_y, x, y );
+ }
+ } else {
+ if( already_dense_input ) {
+ return internal::set_generic<
+ descr, out_is_void, in_is_void, sparse,
+ false, true
+ >( lower_bound, upper_bound, local_x, local_y, x, y );
+ } else {
+ return internal::set_generic<
+ descr, out_is_void, in_is_void, sparse,
+ false, false
+ >( lower_bound, upper_bound, local_x, local_y, x, y );
+ }
+ }
+ }
+
+ template<
+ Descriptor descr,
+ bool out_is_void,
+ bool in_is_void,
+ bool loop_over_y,
+ bool already_dense_input_y,
+ bool already_dense_mask,
+ bool mask_is_dense,
+ typename OutputType,
+ typename MaskType,
+ typename InputType,
+ typename Coords
+ >
+ RC masked_set(
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_x,
+ const Coords &local_mask,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &x,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Vector< InputType, nonblocking, Coords > &y
+ );
+
+ template<
+ Descriptor descr,
+ bool out_is_void,
+ bool in_is_void,
+ typename OutputType,
+ typename MaskType,
+ typename InputType,
+ typename Coords
+ >
+ RC boolean_dispatcher_masked_set(
+ const bool loop_over_y,
+ const bool already_dense_input_y,
+ const bool already_dense_mask,
+ const bool mask_is_dense,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_x,
+ const Coords &local_mask,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &x,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Vector< InputType, nonblocking, Coords > &y
+ ) {
+ if( loop_over_y ) {
+ if( already_dense_input_y ) {
+ if( already_dense_mask ) {
+ if( mask_is_dense ) {
+ return internal::masked_set<
+ descr, out_is_void, in_is_void,
+ true, true, true, true
+ >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y );
+ } else {
+ return internal::masked_set<
+ descr, out_is_void, in_is_void,
+ true, true, true, false
+ >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y );
+ }
+ } else {
+ if( mask_is_dense ) {
+ return internal::masked_set<
+ descr, out_is_void, in_is_void,
+ true, true, false, true
+ >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y );
+ } else {
+ return internal::masked_set<
+ descr, out_is_void, in_is_void,
+ true, true, false, false
+ >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y );
+ }
+ }
+ } else {
+ if( already_dense_mask ) {
+ if( mask_is_dense ) {
+ return internal::masked_set<
+ descr, out_is_void, in_is_void,
+ true, false, true, true
+ >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y );
+ } else {
+ return internal::masked_set<
+ descr, out_is_void, in_is_void,
+ true, false, true, false
+ >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y );
+ }
+ } else {
+ if( mask_is_dense ) {
+ return internal::masked_set<
+ descr, out_is_void, in_is_void,
+ true, false, false, true
+ >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y );
+ } else {
+ return internal::masked_set<
+ descr, out_is_void, in_is_void,
+ true, false, false, false
+ >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y );
+ }
+ }
+ }
+ } else {
+ if( already_dense_input_y ) {
+ if( already_dense_mask ) {
+ if( mask_is_dense ) {
+ return internal::masked_set<
+ descr, out_is_void, in_is_void,
+ false, true, true, true
+ >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y );
+ } else {
+ return internal::masked_set<
+ descr, out_is_void, in_is_void,
+ false, true, true, false
+ >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y );
+ }
+ } else {
+ if( mask_is_dense ) {
+ return internal::masked_set<
+ descr, out_is_void, in_is_void,
+ false, true, false, true
+ >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y );
+ } else {
+ return internal::masked_set<
+ descr, out_is_void, in_is_void,
+ false, true, false, false
+ >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y );
+ }
+ }
+ } else {
+ if( already_dense_mask ) {
+ if( mask_is_dense ) {
+ return internal::masked_set<
+ descr, out_is_void, in_is_void,
+ false, false, true, true
+ >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y );
+ } else {
+ return internal::masked_set<
+ descr, out_is_void, in_is_void,
+ false, false, true, false
+ >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y );
+ }
+ } else {
+ if( mask_is_dense ) {
+ return internal::masked_set<
+ descr, out_is_void, in_is_void,
+ false, false, false, true
+ >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y );
+ } else {
+ return internal::masked_set<
+ descr, out_is_void, in_is_void,
+ false, false, false, false
+ >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y );
+ }
+ }
+ }
+ }
+ }
+
+ } // end namespace ``internal''
+
+} // end namespace ``grb''
+
+#endif
+
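Why go through this much branching at all? The dispatch happens once per pipeline stage rather than once per element: with the boolean lifted to a template parameter, the compiler can fold the flag check out of the hot loop in every instantiation. A small stand-alone illustration of the effect, using hypothetical names:

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

// with use_mask as a template parameter, the condition below folds to a
// constant inside each instantiation and the per-element branch disappears
template< bool use_mask >
size_t count_entries( const std::vector< bool > &mask ) {
	size_t count = 0;
	for( size_t i = 0; i < mask.size(); ++i ) {
		if( !use_mask || mask[ i ] ) {
			++count;
		}
	}
	return count;
}

int main() {
	const std::vector< bool > mask = { true, false, true, true };
	const bool use_mask = true; // run-time value, e.g., derived from a descriptor
	const size_t out = use_mask
		? count_entries< true >( mask )
		: count_entries< false >( mask );
	std::cout << out << std::endl; // prints 3
	return 0;
}
```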
diff --git a/include/graphblas/nonblocking/collectives.hpp b/include/graphblas/nonblocking/collectives.hpp
new file mode 100644
index 000000000..b78c6a6e9
--- /dev/null
+++ b/include/graphblas/nonblocking/collectives.hpp
@@ -0,0 +1,91 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Collectives implementation for the nonblocking backend.
+ *
+ * @author Aristeidis Mastoras
+ * @date 16th of May, 2022
+ */
+
+#ifndef _H_GRB_NONBLOCKING_COLL
+#define _H_GRB_NONBLOCKING_COLL
+
+#include
+
+#include
+#include
+#include
+#include
+
+
+namespace grb {
+
+ /** The collectives class is based on that of the reference backend */
+ template<>
+ class collectives< nonblocking > {
+
+ private:
+
+ /** Disallow instantiation of this class. */
+ collectives() {}
+
+
+ public:
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Operator, typename IOType
+ >
+ static RC allreduce( IOType &inout, const Operator op = Operator() ) {
+ return collectives< reference >::allreduce< descr, Operator, IOType >(
+ inout, op );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Operator, typename IOType
+ >
+ static RC reduce(
+ IOType &inout, const size_t root = 0, const Operator op = Operator()
+ ) {
+ return collectives< reference >::reduce< descr, Operator, IOType >( inout,
+ root, op );
+ }
+
+ template< typename IOType >
+ static RC broadcast( IOType &inout, const size_t root = 0 ) {
+ return collectives< reference >::broadcast< IOType >( inout, root );
+ }
+
+ template< Descriptor descr = descriptors::no_operation, typename IOType >
+ static RC broadcast(
+ IOType * inout, const size_t size,
+ const size_t root = 0
+ ) {
+ return collectives< reference >::broadcast< descr, IOType >( inout, size,
+ root );
+ }
+
+ }; // end class ``collectives< nonblocking >''
+
+} // namespace grb
+
+#endif // end ``_H_GRB_NONBLOCKING_COLL''
+
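Since every method of collectives< nonblocking > simply forwards to the reference backend, user code is unchanged with respect to the reference case. A usage sketch, assuming an ALP installation that provides graphblas.hpp; the operator choice is illustrative:

```cpp
#include <graphblas.hpp>

// sum a scalar over all user processes; the nonblocking backend supports
// exactly one process, so this is a no-op in effect, but the call still
// forwards to the reference implementation
grb::RC sum_over_processes( double &inout ) {
	return grb::collectives< grb::nonblocking >::allreduce<
		grb::descriptors::no_operation,
		grb::operators::add< double >
	>( inout );
}
```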
diff --git a/include/graphblas/nonblocking/config.hpp b/include/graphblas/nonblocking/config.hpp
new file mode 100644
index 000000000..1ea6e4ab3
--- /dev/null
+++ b/include/graphblas/nonblocking/config.hpp
@@ -0,0 +1,205 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Configuration settings for the nonblocking backend.
+ *
+ * @author Aristeidis Mastoras
+ * @date 16th of May, 2022
+ */
+
+#ifndef _H_GRB_NONBLOCKING_CONFIG
+#define _H_GRB_NONBLOCKING_CONFIG
+
+#include
+#include
+
+
+namespace grb {
+
+ /**
+ * \defgroup nonblockingConfig Nonblocking backend configuration
+ *
+ * \ingroup config
+ *
+ * All configuration parameters for the #grb::nonblocking backend.
+ *
+ * @{
+ */
+
+ namespace config {
+
+ /**
+ * Configuration parameters relating to the pipeline data structure.
+ *
+ * \ingroup nonblockingConfig
+ */
+ class PIPELINE {
+
+ public:
+
+ /**
+ * How many independent pipelines any ALP algorithm may concurrently expose.
+ *
+ * The number of pipelines could exceed this maximum number. If this
+ * happens, and if #grb::config::PIPELINE::warn_if_exceeded is configured
+ * true, a warning will be output to the standard error stream.
+ */
+ static constexpr const size_t max_pipelines = 4;
+
+ /**
+ * Pipelines are constructed with default space for this many containers.
+ *
+ * The default is such that each underlying set used by the pipeline
+ * representation takes less than one kB space.
+ *
+ * Pipelines could exceed this maximum number of containers. If this
+ * happens, and if #grb::config::PIPELINE::warn_if_exceeded is configured
+ * true, a warning will be output to the standard error stream.
+ */
+ static constexpr const size_t max_containers = 16;
+
+ /**
+ * Pipelines are constructed with default space for this many stages.
+ *
+ * Pipelines could exceed this number of stages. If this happens, and if
+ * #grb::config::PIPELINE::warn_if_exceeded is configured true, a
+ * warning will be output to the standard error stream.
+ */
+ static constexpr const size_t max_depth = 16;
+
+ /**
+ * Pipelines are constructed with default space for this many tiles.
+ *
+ * Pipelines could exceed this number of tiles. If this happens, and if
+ * #grb::config::PIPELINE::warn_if_exceeded is configured true, a
+ * warning will be output to the standard error stream.
+ */
+ static constexpr const size_t max_tiles = 1 << 16;
+
+ /**
+ * Emit a warning to the standard error stream if the default pipeline
+ * capacities are exceeded.
+ */
+ static constexpr const bool warn_if_exceeded = true;
+
+ /**
+ * When true, calling a fake nonblocking primitive for the first time
+ * will emit a warning to the standard error stream.
+ */
+ static constexpr const bool warn_if_not_native = true;
+
+ };
+
+ /**
+ * Configuration parameters relating to the analytic model employed by the
+ * nonblocking backend.
+ *
+ * \ingroup nonblockingConfig
+ */
+ class ANALYTIC_MODEL {
+
+ public:
+
+ /**
+ * The minimum tile size that may be automatically selected by the analytic
+ * model.
+ *
+ * A tile size that is set manually may be smaller than MIN_TILE_SIZE.
+ */
+ static constexpr const size_t MIN_TILE_SIZE = 512;
+
+ /**
+ * The L1 cache size is assumed to be a bit smaller than the actual size to
+ * account for any data that may be stored in the cache but are not
+ * considered by the analytic model, e.g., matrices in the current design.
+ */
+ static constexpr const double L1_CACHE_USAGE_PERCENTAGE = 0.98;
+
+ };
+
+ /**
+ * Implementation-dependent configuration parameters for the \a nonblocking
+ * backend.
+ *
+ * \note The user documentation only specifies the fields that under some
+ * circumstances may benefit from a user adapting it. For viewing all
+ * fields, please see the developer documentation.
+ *
+ * \note Adapting the fields should be done with care and may require
+ * re-compilation and re-installation of the ALP framework.
+ *
+ * \ingroup nonblockingConfig
+ *
+ * @see grb::config::IMPLEMENTATION
+ */
+ template<>
+ class IMPLEMENTATION< nonblocking > {
+
+ public:
+
+ /**
+ * A private memory segment shall never be accessed by threads other than
+ * the thread that allocates it. Therefore we choose aligned mode here.
+ */
+ static constexpr ALLOC_MODE defaultAllocMode() {
+ return ALLOC_MODE::ALIGNED;
+ }
+
+ /**
+ * For the nonblocking backend, a shared memory segment should use
+ * interleaved alloc so that any thread has uniform access on average.
+ */
+ static constexpr ALLOC_MODE sharedAllocMode() {
+ return ALLOC_MODE::INTERLEAVED;
+ }
+
+ /**
+ * \internal
+ * By default, use the coordinates of the selected backend.
+ *
+ * \note This is an extension that may, at some later stage, be used for
+ * composability with the #grb::bsp1d and #grb::hybrid backends.
+ * \endinternal
+ */
+ static constexpr Backend coordinatesBackend() {
+ return nonblocking;
+ }
+
+ /**
+ * \internal
+ * Whether the backend has vector capacities always fixed to their
+ * defaults.
+ * \endinternal
+ */
+ static constexpr bool fixedVectorCapacities() {
+ return true;
+ }
+
+ };
+
+ } // namespace config
+
+ /** @} */
+
+} // namespace grb
+
+#endif // end ``_H_GRB_NONBLOCKING_CONFIG''
+
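For illustration, a sketch of how an automatic tile-size selection might honour MIN_TILE_SIZE; note the constant only bounds automatic selection since, as documented above, manually set tile sizes may be smaller. The demo struct mirrors the constants above and is not the ALP class itself:

```cpp
#include <cstddef>
#include <iostream>

// demo constants mirroring grb::config::ANALYTIC_MODEL above
struct AnalyticModelDemo {
	static constexpr size_t MIN_TILE_SIZE = 512;
	static constexpr double L1_CACHE_USAGE_PERCENTAGE = 0.98;
};

// clamp an automatically selected tile size against the configured minimum;
// manually selected tile sizes bypass this clamp, per the documentation above
constexpr size_t effectiveTileSize( const size_t automatic_choice ) {
	return automatic_choice < AnalyticModelDemo::MIN_TILE_SIZE
		? AnalyticModelDemo::MIN_TILE_SIZE
		: automatic_choice;
}

int main() {
	static_assert( effectiveTileSize( 100 ) == 512, "clamped to the minimum" );
	// usable L1 budget under the configured percentage, for a 32 KiB cache
	const double budget = 32768 * AnalyticModelDemo::L1_CACHE_USAGE_PERCENTAGE;
	std::cout << effectiveTileSize( 100000 ) << " " << budget << std::endl;
	return 0;
}
```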
diff --git a/include/graphblas/nonblocking/coordinates.hpp b/include/graphblas/nonblocking/coordinates.hpp
new file mode 100644
index 000000000..bcb4cf42a
--- /dev/null
+++ b/include/graphblas/nonblocking/coordinates.hpp
@@ -0,0 +1,701 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Coordinates for the nonblocking backend
+ *
+ * @author Aristeidis Mastoras
+ * @date 16th of May, 2022
+ */
+
+#ifndef _H_GRB_NONBLOCKING_COORDINATES
+#define _H_GRB_NONBLOCKING_COORDINATES
+
+#include <stdexcept> //std::runtime_error
+#include
+#if defined _DEBUG && ! defined NDEBUG
+ #include
+#endif
+
+#include <cstddef> //size_t
+#include
+
+#include
+#include
+#include
+
+#include
+
+#include
+
+#include
+
+#include
+#include
+
+
+namespace grb {
+
+ namespace internal {
+
+ /**
+ * The Coordinates class is based on that of the reference backend.
+ * A set of new methods is added to handle local coordinates used
+ * by the nonblocking backend. The bufferSize method used by the
+ * Matrix class relies on parbufSize and prefixbufSize that have
+ * their own implementation for the nonblocking backend.
+ */
+ template<>
+ class Coordinates< nonblocking > {
+
+ public:
+
+ typedef typename config::VectorIndexType StackType;
+
+ typedef bool ArrayType;
+
+
+ private:
+
+ bool * __restrict__ _assigned;
+
+ StackType * __restrict__ _stack;
+
+ StackType * __restrict__ _buffer;
+
+ size_t _n;
+
+ size_t _cap;
+
+ size_t _buf;
+
+ // pointers to the data of the local coordinates mechanism
+ std::vector< config::VectorIndexType * > local_buffer;
+ config::VectorIndexType * __restrict__ local_new_nnzs;
+ config::VectorIndexType * __restrict__ pref_sum;
+
+ // the analytic model used during the execution of a pipeline
+ AnalyticModel analytic_model;
+
+
+ public:
+
+ static inline size_t arraySize( const size_t dim ) noexcept {
+ if( dim == 0 ) {
+ return 0;
+ }
+ return ( dim + 1 ) * sizeof( ArrayType );
+ }
+
+ static inline size_t stackSize( const size_t dim ) noexcept {
+ if( dim == 0 ) {
+ return 0;
+ }
+ return ( dim + 1 ) * sizeof( StackType );
+ }
+
+ static inline size_t prefixbufSize() noexcept {
+ int P = 1;
+ return ( P + 1 ) * sizeof( StackType );
+ }
+
+ static inline size_t parbufSize( const size_t n ) noexcept {
+ return internal::NONBLOCKING::vectorBufferSize( n ) * sizeof( StackType );
+ }
+
+ static inline size_t bufferSize( const size_t dim ) noexcept {
+ size_t ret = stackSize( dim );
+ ret += parbufSize( dim );
+ ret += prefixbufSize();
+ return ret;
+ }
+
+ inline Coordinates() noexcept :
+ _assigned( nullptr ), _stack( nullptr ), _buffer( nullptr ),
+ _n( 0 ), _cap( 0 ), _buf( 0 )
+ {}
+
+ inline Coordinates( Coordinates< nonblocking > &&x ) noexcept :
+ _assigned( x._assigned ), _stack( x._stack ), _buffer( x._buffer ),
+ _n( x._n ), _cap( x._cap ), _buf( x._buf )
+ {
+ x._assigned = nullptr;
+ x._stack = nullptr;
+ x._buffer = nullptr;
+ x._n = x._cap = x._buf = 0;
+ }
+
+ inline Coordinates( const Coordinates< nonblocking > &x ) noexcept :
+ _assigned( x._assigned ), _stack( x._stack ), _buffer( x._buffer ),
+ _n( x._n ), _cap( x._cap ), _buf( x._buf )
+ {
+ assert( this != &x );
+ }
+
+ inline Coordinates< nonblocking > & operator=(
+ const Coordinates< nonblocking > &other
+ ) {
+ Coordinates replace( other );
+ *this = std::move( replace );
+ return *this;
+ }
+
+ inline Coordinates< nonblocking > & operator=(
+ Coordinates< nonblocking > &&x
+ ) noexcept {
+ assert( this != &x );
+ _assigned = x._assigned;
+ _stack = x._stack;
+ _buffer = x._buffer;
+ _n = x._n;
+ _cap = x._cap;
+ _buf = x._buf;
+ x._assigned = nullptr;
+ x._stack = x._buffer = nullptr;
+ x._n = x._cap = x._buf = 0;
+ return *this;
+ }
+
+ inline ~Coordinates() noexcept {
+ // done (the #_assigned and #_stack memory
+ // blocks are not managed by this class)
+ }
+
+ void set(
+ void * const arr, bool arr_initialized,
+ void * const buf, const size_t dim, bool parallel = true
+ ) noexcept {
+ // catch trivial case
+ if( arr == nullptr || buf == nullptr ) {
+ assert( arr == nullptr );
+ assert( buf == nullptr );
+ assert( dim == 0 );
+ _assigned = nullptr;
+ _stack = nullptr;
+ _buffer = nullptr;
+ _n = 0;
+ _cap = 0;
+ _buf = 0;
+ return;
+ }
+
+ // _assigned has no alignment issues, take directly from input buffer
+ assert( reinterpret_cast< uintptr_t >( arr ) % sizeof( bool ) == 0 );
+ _assigned = static_cast< bool * >( arr );
+ // ...but _stack does have potential alignment issues:
+ char * buf_raw = static_cast< char * >( buf );
+ constexpr const size_t size = sizeof( StackType );
+ const size_t mod = reinterpret_cast< uintptr_t >( buf_raw ) % size;
+ if( mod != 0 ) {
+ buf_raw += size - mod;
+ }
+ _stack = reinterpret_cast< StackType * >( buf_raw );
+ // no alignment issues between stack and buffer, so just shift by dim:
+ _buffer = _stack + dim;
+ // initialise
+ _n = 0;
+ _cap = dim;
+ _buf = internal::NONBLOCKING::vectorBufferSize( _cap );
+
+ // and initialise _assigned (but only if necessary)
+ if( dim > 0 && !arr_initialized ) {
+ if( parallel ) {
+ #pragma omp parallel
+ {
+ size_t start, end;
+ config::OMP::localRange( start, end, 0, dim );
+ for( size_t i = start; i < end; ++i ) {
+ _assigned[ i ] = false;
+ }
+ }
+ } else {
+ for( size_t i = 0; i < dim; ++i ) {
+ _assigned[ i ] = false;
+ }
+ }
+ }
+ }
+
+ inline bool assign( const size_t i ) noexcept {
+ if( _n == _cap ) {
+ return true;
+ }
+ if( !_assigned[ i ] ) {
+ _assigned[ i ] = true;
+ const size_t newSize = _n + 1;
+ assert( _n <= _cap );
+ assert( newSize <= _cap );
+ _stack[ _n ] = i;
+ _n = newSize;
+ return false;
+ } else {
+ return true;
+ }
+ }
+
+ template< bool maybe_invalid = false >
+ inline void local_assignAll( ) noexcept {
+ if( maybe_invalid || _n != _cap ) {
+ if( _assigned != nullptr ) {
+ assert( _stack != nullptr );
+ assert( maybe_invalid || _n < _cap );
+ assert( !maybe_invalid || _n <= _cap );
+ _n = _cap;
+
+ for( size_t i = 0; i < _n; ++i ) {
+ _assigned[ i ] = true;
+ _stack[ i ] = i;
+ }
+ }
+ }
+
+ // the counter of initial nonzeroes in the local stack is stored in the
+ // buffer immediately before the local stack
+ StackType * __restrict__ local_nnzs = _stack - 1;
+
+ // the counter for the local stack must be set to zero such that the number
+ // of new nonzeroes is set to _n by asyncJoinSubset, and joinSubset then
+ // updates the global stack based on the local_new_nnzs counter; the
+ // global stack has become empty and _assigned = false, so the local
+ // coordinates of this tile must be added to the global stack from scratch,
+ // regardless of whether this tile was already dense, as it is hard to
+ // know which part of the global stack contains the coordinates of this
+ // tile
+ *local_nnzs = 0;
+ }
+
+ template< bool maybe_invalid = false >
+ inline void local_assignAllNotAlreadyAssigned( ) noexcept {
+ if( maybe_invalid || _n != _cap ) {
+ if( _assigned != nullptr ) {
+ assert( _stack != nullptr );
+ assert( maybe_invalid || _n < _cap );
+ assert( !maybe_invalid || _n <= _cap );
+
+ // search for the elements that are not already assigned and add them to
+ // the local stack, such that joinSubset adds to the global stack only
+ // those elements that were not already assigned
+ for( size_t i = 0; i < _cap; ++i ) {
+ if( !_assigned[ i ] ) {
+ _assigned[ i ] = true;
+ _stack[ _n++ ] = i;
+ }
+ }
+
+ assert( _n == _cap );
+ }
+ }
+ }
+
+ inline void clear() noexcept {
+
+ if( _n == _cap ) {
+#ifndef NDEBUG
+ if( _assigned == nullptr && _cap > 0 ) {
+ const bool dense_coordinates_may_not_call_clear = false;
+ assert( dense_coordinates_may_not_call_clear );
+ }
+#endif
+
+ #pragma omp parallel for schedule( dynamic, config::CACHE_LINE_SIZE::value() )
+ for( size_t i = 0; i < _cap; ++i ) {
+ _assigned[ i ] = false;
+ }
+ } else {
+ if( _n < config::OMP::minLoopSize() ) {
+ for( size_t k = 0; k < _n; ++k ) {
+ _assigned[ _stack[ k ] ] = false;
+ }
+ } else {
+ #pragma omp parallel for schedule( dynamic, config::CACHE_LINE_SIZE::value() )
+ for( size_t k = 0; k < _n; ++k ) {
+ _assigned[ _stack[ k ] ] = false;
+ }
+ }
+ }
+ _n = 0;
+ }
+
+ inline void local_clear() noexcept {
+
+ if( _n == _cap ) {
+#ifndef NDEBUG
+ if( _assigned == nullptr && _cap > 0 ) {
+ const bool dense_coordinates_may_not_call_clear = false;
+ assert( dense_coordinates_may_not_call_clear );
+ }
+#endif
+
+ for( size_t i = 0; i < _cap; ++i ) {
+ _assigned[ i ] = false;
+ }
+ } else {
+ for( size_t k = 0; k < _n; ++k ) {
+ _assigned[ _stack[ k ] ] = false;
+ }
+ }
+ _n = 0;
+
+ // the counter of initial nonzeroes in the local stack is stored in the
+ // buffer immediately before the local stack
+ StackType * __restrict__ local_nnzs = _stack - 1;
+
+ // the counter for the local stack must be set to zero such that any newly
+ // assigned element will be written to the global stack
+ *local_nnzs = 0;
+ }
+
+ inline void reset_global_nnz_counter() noexcept {
+ _n = 0;
+ }
+
+ inline bool isEmpty() const noexcept {
+ return _n == 0;
+ }
+
+ inline bool isDense() const noexcept {
+ return _n == _cap;
+ }
+
+ inline size_t size() const noexcept {
+ return _cap;
+ }
+
+ inline bool assigned( const size_t i ) const noexcept {
+ assert( i < _cap );
+ return _n == _cap || _assigned[ i ];
+ }
+
+ template< Descriptor descr, typename T >
+ inline bool mask( const size_t i, const T * const val ) const noexcept {
+ assert( i < _cap );
+ return utils::interpretMask< descr >( assigned( i ), val, i );
+ }
+
+ inline size_t nonzeroes() const noexcept {
+ assert( _n <= _cap );
+ return _n;
+ }
+
+ inline size_t index( const size_t k ) const noexcept {
+ assert( k < _n );
+ return isDense() ? k : _stack[ k ];
+ }
+
+ void localCoordinatesInit( const AnalyticModel &am ) {
+
+ analytic_model = am;
+
+ const size_t nthreads = analytic_model.getNumThreads();
+ const size_t tile_size = analytic_model.getTileSize();
+ const size_t num_tiles = analytic_model.getNumTiles();
+
+ assert( num_tiles > 0 );
+ assert( num_tiles <= internal::NONBLOCKING::maxBufferTiles( _cap ) );
+ assert( _buf >= 4 * num_tiles );
+
+ local_buffer.resize( analytic_model.getNumTiles() );
+
+ #pragma omp parallel for schedule(dynamic) num_threads(nthreads)
+ for( size_t tile_id = 0; tile_id < num_tiles; ++tile_id ) {
+ local_buffer[ tile_id ] = _buffer + tile_id * ( tile_size + 1 );
+ }
+
+ local_new_nnzs = _buffer + num_tiles * ( tile_size + 1 );
+ pref_sum = _buffer + num_tiles * ( tile_size + 2 );
+ }
+
+ /**
+ * Initialises a Coordinates instance that refers to a subset of this
+ * coordinates instance. Multiple disjoint subsets may be retrieved
+ * and concurrently updated, up to the maximum number of tiles given by
+ * #internal::NONBLOCKING::maxBufferTiles().
+ *
+ * Subsets must be contiguous. If one thread calls this function, all
+ * other threads must make a matching call.
+ *
+ * @param[in] lower_bound The start index of the contiguous subset
+ * (inclusive).
+ * @param[in] upper_bound The end index of the contiguous subset
+ * (exclusive).
+ */
+ void asyncSubsetInit(
+ const size_t lower_bound,
+ const size_t upper_bound
+ ) noexcept {
+ if( _cap == 0 ) {
+ return;
+ }
+
+ const size_t tile_id = lower_bound / analytic_model.getTileSize();
+
+ config::VectorIndexType *local_nnzs = local_buffer[ tile_id ];
+ config::VectorIndexType *local_stack = local_buffer[ tile_id ] + 1;
+
+ *local_nnzs = 0;
+ if( upper_bound - lower_bound < _n ) {
+ for( size_t i = lower_bound; i < upper_bound; ++i ) {
+ if( _assigned[ i ] ) {
+ local_stack[ (*local_nnzs)++ ] = i - lower_bound;
+ }
+ }
+ } else {
+ for( size_t i = 0; i < _n; ++i ) {
+ const size_t k = _stack[ i ];
+ if( lower_bound <= k && k < upper_bound ) {
+ assert( _assigned[ k ] );
+ local_stack[ (*local_nnzs)++ ] = k - lower_bound;
+ }
+ }
+ }
+
+ // the number of new nonzeroes is initialized here
+ local_new_nnzs[ tile_id ] = 0;
+ }
+
+ /**
+ * Retrieves a subset coordinate instance that was previously initialised
+ * using a call to #asyncSubsetInit.
+ *
+ * @returns A Coordinates instance that only supports sequential
+ * (synchronous) updates as well as all queries.
+ */
+ Coordinates< nonblocking > asyncSubset(
+ const size_t lower_bound, const size_t upper_bound
+ ) const noexcept {
+ assert( _cap > 0 );
+
+ const size_t tile_id = lower_bound / analytic_model.getTileSize();
+
+ config::VectorIndexType *local_nnzs = local_buffer[ tile_id ];
+ config::VectorIndexType *local_stack = local_buffer[ tile_id ] + 1;
+
+ Coordinates< nonblocking > ret;
+ assert( upper_bound - lower_bound <= analytic_model.getTileSize() );
+
+ ret.set( _assigned + lower_bound, true, local_stack,
+ upper_bound - lower_bound, false );
+
+ // the number of new nonzeroes is used to determine the total number
+ // of nonzeroes for the given local coordinates, since some of the
+ // nonzeroes are already written to the local stack
+ ret._n = (*local_nnzs) + local_new_nnzs[ tile_id ];
+ assert( ret._n <= ret._cap );
+
+ ret._buf = 0;
+
+ return ret;
+ }
+
+ /**
+ * Saves the state of a subset Coordinates instance. Can be retrieved later
+ * once again via a call to #asyncSubset. New nonzeroes will be committed
+ * to the global coordinate structure via a call to #joinSubset, which will
+ * furthermore set the related tile to inactive.
+ */
+ void asyncJoinSubset(
+ const Coordinates< nonblocking > &subset,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+ assert( _cap > 0 );
+
+ (void) upper_bound;
+
+ const size_t tile_id = lower_bound / analytic_model.getTileSize();
+
+ config::VectorIndexType *local_nnzs = local_buffer[ tile_id ];
+
+ assert( subset._n <= subset._cap );
+ assert( (*local_nnzs) <= subset._cap );
+
+ local_new_nnzs[ tile_id ] = subset._n - (*local_nnzs);
+ }
+
+ bool newNonZeroes() const {
+
+ if( _cap == 0 ) {
+ return false;
+ }
+
+ const size_t num_tiles = analytic_model.getNumTiles();
+
+ for( size_t i = 0; i < num_tiles; i++ ) {
+ if( local_new_nnzs[ i ] > 0 ) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ void prefixSumComputation() {
+
+ const size_t num_tiles = analytic_model.getNumTiles();
+
+ // takes into account the size of the data accessed in each iteration of
+ // the prefix-sum computation, which is used to determine the number of
+ // parallel tasks such that the data of each parallel task fit in the L1
+ // cache
+ constexpr size_t size_of_data = sizeof( pref_sum[0] ) +
+ sizeof( local_new_nnzs[0] );
+
+ // make use of the analytic model to estimate a proper number of threads
+ // and a tile size
+ AnalyticModel am( size_of_data, num_tiles, 1 );
+
+ const size_t nthreads = am.getNumThreads();
+ const size_t prefix_sum_tile_size = am.getTileSize();
+ const size_t prefix_sum_num_tiles = am.getNumTiles();
+
+ // make a run-time decision between the sequential and the parallel
+ // prefix-sum implementations; the sequential implementation is more
+ // efficient for a small number of tiles
+ if( num_tiles < prefix_sum_tile_size ) {
+ // sequential computation of the prefix sum
+ pref_sum[ 0 ] = _n + local_new_nnzs[ 0 ];
+ for( size_t i = 1; i < num_tiles; i++ ) {
+ pref_sum[ i ] = pref_sum[ i - 1 ] + local_new_nnzs[ i ];
+ }
+ } else {
+ // parallel computation of the prefix sum
+ size_t local_prefix_sum[ prefix_sum_num_tiles ];
+
+ #pragma omp parallel num_threads(nthreads)
+ {
+ #pragma omp for
+ for( size_t id = 0; id < prefix_sum_num_tiles; id++ ) {
+
+ size_t lower, upper;
+ config::OMP::localRange( lower, upper, 0, num_tiles,
+ prefix_sum_tile_size, id, prefix_sum_num_tiles );
+
+ // the number of threads used for parallel computation must not exceed
+ // num_tiles, otherwise the code below results in data races
+ assert( id <= num_tiles );
+ assert( id < prefix_sum_num_tiles - 1 || upper == num_tiles );
+ assert( lower <= upper );
+ assert( upper <= num_tiles );
+
+ pref_sum[ lower ] = local_new_nnzs[ lower ];
+ for( size_t i = lower + 1; i < upper; i++ ) {
+ pref_sum[ i ] = pref_sum[ i - 1 ] + local_new_nnzs[ i ];
+ }
+
+ // each thread stores the prefix sum of its last element in
+ // local_prefix_sum; the memory location is indexed by the thread's
+ // identifier to avoid data races
+ local_prefix_sum[ id ] = pref_sum[ upper - 1 ];
+ }
+
+ // here, there is an implicit barrier that ensures all threads have
+ // already written the local prefix sum for each parallel task
+
+ // a single thread computes the prefix sum for the last element of each
+ // thread
+ #pragma omp single
+ {
+ for( size_t i = 1; i < prefix_sum_num_tiles; i++ ) {
+ local_prefix_sum[ i ] += local_prefix_sum[ i - 1 ];
+ }
+ }
+
+ #pragma omp for
+ for( size_t id = 0; id < prefix_sum_num_tiles; id++ ) {
+
+ size_t lower, upper;
+ config::OMP::localRange( lower, upper, 0, num_tiles,
+ prefix_sum_tile_size, id, prefix_sum_num_tiles );
+
+ // the first thread (id=0) needs to add only the number of nonzeroes (_n)
+ const size_t acc = _n + ( ( id > 0 ) ? local_prefix_sum[ id - 1 ] : 0 );
+ for( size_t i = lower; i < upper; i++ ) {
+ pref_sum[ i ] += acc;
+ }
+ }
+ }
+
+#ifdef _DEBUG
+ // ensures that the parallel implementation computes the same result
+ // as the following sequential implementation
+ size_t seq_offsets[ num_tiles ];
+ seq_offsets[ 0 ] = _n + local_new_nnzs[ 0 ];
+ for( size_t i = 1; i < num_tiles; i++ ) {
+ seq_offsets[ i ] = seq_offsets[ i - 1 ] + local_new_nnzs[ i ];
+ }
+
+ for( size_t i = 0; i < num_tiles; i++ ) {
+ assert( seq_offsets[i] == pref_sum[i] );
+ }
+#endif
+ }
+
+ // a single thread updates the number of nonzeroes;
+ // the last element of pref_sum already includes
+ // the current number of nonzeroes _n, which was added earlier
+ _n = pref_sum[ num_tiles - 1 ];
+ }
+
+ /**
+ * Takes a currently active subset and commits it to the global storage.
+ * After completion the given active tile will be marked inactive.
+ */
+ void joinSubset( const size_t lower_bound, const size_t upper_bound ) {
+ if( _cap == 0 ) {
+ return;
+ }
+#ifdef NDEBUG
+ ( void )upper_bound;
+#endif
+ const size_t tile_id = lower_bound / analytic_model.getTileSize();
+
+ config::VectorIndexType *local_nnzs = local_buffer[ tile_id ];
+ config::VectorIndexType *local_stack = local_buffer[ tile_id ] + 1;
+
+ const size_t local_stack_start = *local_nnzs;
+ const size_t local_stack_end = *local_nnzs + local_new_nnzs[ tile_id ];
+ assert( local_stack_start <= local_stack_end );
+
+ size_t pos = pref_sum[ tile_id ] - local_new_nnzs[ tile_id ];
+
+ for( size_t k = local_stack_start; k < local_stack_end; ++k ) {
+ const size_t local_index = local_stack[ k ];
+ const size_t global_index = local_index + lower_bound;
+
+ assert( global_index >= lower_bound );
+ assert( global_index < upper_bound );
+ assert( _assigned[ global_index ] );
+ assert( pos < _cap );
+
+ _stack[ pos++ ] = global_index;
+ }
+
+ local_new_nnzs[ tile_id ] = 0;
+ }
+ };
+
+ } // namespace internal
+
+} // namespace grb
+
+#endif // end ``_H_GRB_NONBLOCKING_COORDINATES''
+
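The interplay between prefixSumComputation and joinSubset is easiest to see on a small example: the prefix sum over local_new_nnzs, seeded with the current global nonzero count _n, hands each tile a private, contention-free write window into the global stack. A self-contained re-enactment of that offset arithmetic, with made-up counts:

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
	const size_t n = 5; // current global nonzero count (_n)
	const std::vector< size_t > local_new_nnzs = { 2, 0, 3, 1 };

	// sequential variant of the prefix sum in prefixSumComputation above
	std::vector< size_t > pref_sum( local_new_nnzs.size() );
	pref_sum[ 0 ] = n + local_new_nnzs[ 0 ];
	for( size_t i = 1; i < pref_sum.size(); ++i ) {
		pref_sum[ i ] = pref_sum[ i - 1 ] + local_new_nnzs[ i ];
	}

	// each tile writes its new entries starting at
	// pref_sum[ tile ] - local_new_nnzs[ tile ], exactly as joinSubset does
	for( size_t tile = 0; tile < pref_sum.size(); ++tile ) {
		const size_t start = pref_sum[ tile ] - local_new_nnzs[ tile ];
		std::cout << "tile " << tile << " writes " << local_new_nnzs[ tile ]
			<< " entries from global stack position " << start << "\n";
	}

	// the new global nonzero count equals the last prefix-sum entry
	std::cout << "new _n = " << pref_sum.back() << std::endl;
	return 0;
}
```

With these counts the write windows are [5,7), [7,7), [7,10), and [10,11): disjoint by construction, so all tiles may commit concurrently.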
diff --git a/include/graphblas/nonblocking/exec.hpp b/include/graphblas/nonblocking/exec.hpp
new file mode 100644
index 000000000..09f679526
--- /dev/null
+++ b/include/graphblas/nonblocking/exec.hpp
@@ -0,0 +1,104 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Implements the launcher for the nonblocking backend.
+ *
+ * @author Aristeidis Mastoras
+ * @date 16th of May, 2022
+ */
+
+#ifndef _H_GRB_NONBLOCKING_EXEC
+#define _H_GRB_NONBLOCKING_EXEC
+
+#include
+#include
+
+#include "init.hpp"
+
+
+namespace grb {
+
+ /** The Launcher class is based on that of the reference backend */
+ template< EXEC_MODE mode >
+ class Launcher< mode, nonblocking > {
+
+ private:
+
+ Launcher< mode, reference > ref;
+
+ public:
+
+ /**
+ * This implementation only accepts a single user process. It ignores
+ * \a hostname and \a port.
+ */
+ Launcher(
+ const size_t process_id = 0,
+ const size_t nprocs = 1,
+ const std::string hostname = "localhost",
+ const std::string port = "0"
+ ) {
+ // ignore hostname and port
+ (void) hostname;
+ (void) port;
+ // sanity checks
+ if( nprocs != 1 ) {
+ throw std::invalid_argument( "Total number of user processes must be "
+ "exactly one when using the nonblocking implementation."
+ );
+ }
+ if( process_id != 0 ) {
+ throw std::invalid_argument( "Process ID must always be zero in the "
+ "nonblocking implementation."
+ );
+ }
+ }
+
+ /** No implementation notes. */
+ ~Launcher() {}
+
+ /** exec is based on that of the reference backend */
+ template< typename U >
+ RC exec(
+ void ( *grb_program )( const void *, const size_t, U & ),
+ const void * data_in, const size_t in_size,
+ U &data_out, const bool broadcast = false
+ ) const {
+ return ref.exec( grb_program, data_in, in_size, data_out, broadcast );
+ }
+
+ /** exec is based on that of the reference backend */
+ template< typename T, typename U >
+ RC exec(
+ void ( *grb_program )( const T &, U & ),
+ const T &data_in, U &data_out,
+ const bool broadcast = false
+ ) {
+ return ref.exec( grb_program, data_in, data_out, broadcast );
+ }
+
+ /** finalize is based on that of the reference backend */
+ grb::RC finalize() { return ref.finalize(); }
+ };
+
+} // namespace grb
+
+#endif // end ``_H_GRB_NONBLOCKING_EXEC''
+
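A hedged usage sketch for the Launcher specialisation above, following the reference-backend launch protocol; it assumes an ALP installation and uses the AUTOMATIC execution mode:

```cpp
#include <graphblas.hpp>
#include <iostream>

// an ALP program: the input is passed to the (single) user process and the
// output carries the program's return code
void my_grb_program( const size_t &in, grb::RC &out ) {
	(void) in;
	out = grb::SUCCESS;
}

int main() {
	grb::Launcher< grb::AUTOMATIC, grb::nonblocking > launcher;
	grb::RC out = grb::PANIC;
	const size_t in = 0;
	if( launcher.exec( &my_grb_program, in, out, true ) != grb::SUCCESS ) {
		std::cerr << "launch failed" << std::endl;
		return 1;
	}
	return out == grb::SUCCESS ? 0 : 2;
}
```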
diff --git a/include/graphblas/nonblocking/forward.hpp b/include/graphblas/nonblocking/forward.hpp
new file mode 100644
index 000000000..0baeee5be
--- /dev/null
+++ b/include/graphblas/nonblocking/forward.hpp
@@ -0,0 +1,51 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Forward declarations required by the nonblocking backend.
+ *
+ * @author Aristeidis Mastoras
+ * @date 16th of May, 2022
+ */
+
+#ifndef _H_GRB_NONBLOCKING_FORWARD
+#define _H_GRB_NONBLOCKING_FORWARD
+
+
+namespace grb {
+
+ // The eWiseLambda is a friend of Matrix but is defined in blas2. It is
+ // therefore forward-declared here, and this forward-declaration file is
+ // included from both matrix.hpp and blas2.hpp.
+ template<
+ class ActiveDistribution = internal::Distribution< nonblocking >,
+ typename Func, typename DataType,
+ typename RIT, typename CIT, typename NIT
+ >
+ RC eWiseLambda(
+ const Func f,
+ const Matrix< DataType, nonblocking, RIT, CIT, NIT > &A,
+ const size_t s = 0, const size_t P = 1
+ );
+ // end eWiseLambda declarations
+
+} // namespace grb
+
+#endif // end ``_H_GRB_NONBLOCKING_FORWARD''
+
diff --git a/include/graphblas/nonblocking/init.hpp b/include/graphblas/nonblocking/init.hpp
new file mode 100644
index 000000000..e01b17e70
--- /dev/null
+++ b/include/graphblas/nonblocking/init.hpp
@@ -0,0 +1,147 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Provides the initialisation and finalisation routines for the nonblocking
+ * backend.
+ *
+ * @author Aristeidis Mastoras
+ * @date 16th of May, 2022
+ */
+
+#ifndef _H_GRB_NONBLOCKING_INIT
+#define _H_GRB_NONBLOCKING_INIT
+
+#include
+#include
+
+
+namespace grb {
+
+ template<>
+ RC init< nonblocking >( const size_t, const size_t, void * const );
+
+ template<>
+ RC finalize< nonblocking >();
+
+ namespace internal {
+
+ /** Internal state of the nonblocking backend. */
+ class NONBLOCKING {
+
+ friend RC init< nonblocking >( const size_t, const size_t, void * const );
+
+ private:
+
+ /**
+ * Determines whether the tile size is automatically selected by the
+ * analytic model or whether it is manually selected by the user with the
+ * environment variable GRB_NONBLOCKING_TILE_SIZE.
+ */
+ static bool manual_tile_size;
+
+ /**
+ * The tile size that is manually selected by the user and is initialized in
+ * init.cpp. This variable is only set when the GRB_NONBLOCKING_TILE_SIZE
+ * environment variable is defined, and if so, this variable equals its
+ * content.
+ */
+ static size_t manual_fixed_tile_size;
+
+ /**
+ * The maximum number of threads available in the system that may be set
+ * with the environment variable OMP_NUM_THREADS.
+ */
+ static size_t num_threads;
+
+
+ public:
+
+ /**
+ * When true, calling a fake nonblocking primitive for the first time
+ * will emit a warning to the standard error stream.
+ */
+ static bool warn_if_not_native;
+
+ /**
+ * The number of individual buffers that a vector should be able to
+ * concurrently maintain.
+ *
+ * @param[in] n The vector size.
+ *
+ * @returns The number of individual buffers that should be supported.
+ */
+ static inline size_t maxBufferTiles( const size_t n ) {
+ return n;
+ }
+
+ /**
+ * Helper function that computes the effective buffer size for a vector
+ * of \a n elements by taking into account the space required for storing
+ * the local stack size, the number of new nonzeroes, and the offset used
+ * for the prefix-sum algorithm.
+ *
+ * @param[in] n The size of the vector.
+ *
+ * The maximum number of tiles that need be supported is derived from \a n
+ * via #maxBufferTiles.
+ *
+ * @returns The buffer size given the vector size, maximum number of
+ * tiles, and the requested configuration.
+ */
+ static inline size_t vectorBufferSize( const size_t n ) {
+ const size_t T = maxBufferTiles( n );
+ size_t ret = n;
+
+ // +1 for storing the local stack size
+ // +1 for storing the number of new nonzeroes
+ // +1 for storing the offset used for the prefix-sum algorithm
+ ret += 3 * T;
+ ret = std::max( 4 * T, ret );
+
+ return ret;
+ }
+
+ /**
+ * Whether the tile size is manually set by the user or not.
+ */
+ static bool isManualTileSize() {
+ return manual_tile_size;
+ }
+
+ /**
+ * The tile size that is manually selected by the user.
+ */
+ static size_t manualFixedTileSize() {
+ return manual_fixed_tile_size;
+ }
+
+ /**
+ * The maximum number of threads available in the system.
+ */
+ static size_t numThreads() {
+ return num_threads;
+ }
+
+ };
+
+ } // namespace internal
+
+} // namespace grb
+
+#endif // end ``_H_GRB_NONBLOCKING_INIT''
+
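Concretely, since maxBufferTiles( n ) returns n, vectorBufferSize reserves n + 3n = 4n entries; the std::max guard is redundant for this choice of maxBufferTiles, though it protects a future variant that returns fewer tiles. A stand-alone re-derivation; the demo function mirrors, but is not, the ALP one:

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>

// mirrors vectorBufferSize above, with maxBufferTiles( n ) == n inlined
static size_t vectorBufferSizeDemo( const size_t n ) {
	const size_t T = n; // maxBufferTiles( n )
	size_t ret = n;
	// +3T: local stack size, new-nonzero count, prefix-sum offset per tile
	ret += 3 * T;
	ret = std::max( 4 * T, ret );
	return ret;
}

int main() {
	for( const size_t n : { 0, 1, 1000 } ) {
		std::cout << n << " -> " << vectorBufferSizeDemo( n ) << "\n";
	}
	return 0;
}
```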
diff --git a/include/graphblas/nonblocking/io.hpp b/include/graphblas/nonblocking/io.hpp
new file mode 100644
index 000000000..44b7f3a4d
--- /dev/null
+++ b/include/graphblas/nonblocking/io.hpp
@@ -0,0 +1,1350 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Provides the I/O primitives for the nonblocking backend.
+ *
+ * @author Aristeidis Mastoras
+ * @date 16th of May, 2022
+ */
+
+#ifndef _H_GRB_NONBLOCKING_IO
+#define _H_GRB_NONBLOCKING_IO
+
+#include
+#include
+#include
+
+#include "lazy_evaluation.hpp"
+#include "boolean_dispatcher_io.hpp"
+
+#define NO_CAST_ASSERT( x, y, z ) \
+ static_assert( x, \
+ "\n\n" \
+ "********************************************************************" \
+ "********************************************************************" \
+ "******************************\n" \
+ "* ERROR | " y " " z ".\n" \
+ "********************************************************************" \
+ "********************************************************************" \
+ "******************************\n" \
+ "* Possible fix 1 | Remove no_casting from the template parameters " \
+ "in this call to " y ".\n" \
+ "* Possible fix 2 | Provide a value input iterator with element " \
+ "types that match the output vector element type.\n" \
+ "* Possible fix 3 | If applicable, provide an index input iterator " \
+ "with element types that are integral.\n" \
+ "********************************************************************" \
+ "********************************************************************" \
+ "******************************\n" );
+
+
+namespace grb {
+
+ namespace internal {
+
+ extern LazyEvaluation le;
+
+ }
+
+}
+
+namespace grb {
+
+ /**
+ * \defgroup IO Data Ingestion -- nonblocking backend
+ * @{
+ */
+
+ template< typename InputType, typename RIT, typename CIT, typename NIT >
+ uintptr_t getID( const Matrix< InputType, nonblocking, RIT, CIT, NIT > &A ) {
+ return getID( internal::getRefMatrix( A ) );
+ }
+
+ template< typename DataType, typename Coords >
+ size_t size( const Vector< DataType, nonblocking, Coords > &x ) noexcept {
+ return internal::getCoordinates( x ).size();
+ }
+
+ template< typename InputType, typename RIT, typename CIT, typename NIT >
+ size_t nrows(
+ const Matrix< InputType, nonblocking, RIT, CIT, NIT > &A
+ ) noexcept {
+ return nrows( internal::getRefMatrix( A ) );
+ }
+
+ template< typename InputType, typename RIT, typename CIT, typename NIT >
+ size_t ncols(
+ const Matrix< InputType, nonblocking, RIT, CIT, NIT > &A
+ ) noexcept {
+ return ncols( internal::getRefMatrix( A ) );
+ }
+
+ template< typename DataType, typename Coords >
+ size_t nnz( const Vector< DataType, nonblocking, Coords > &x ) noexcept {
+ internal::le.execution( &x );
+ return internal::getCoordinates( x ).nonzeroes();
+ }
+
+ template< typename InputType, typename RIT, typename CIT, typename NIT >
+ size_t nnz(
+ const Matrix< InputType, nonblocking, RIT, CIT, NIT > &A
+ ) noexcept {
+ return nnz( internal::getRefMatrix( A ) );
+ }
+
+ template< typename DataType, typename Coords >
+ size_t capacity( const Vector< DataType, nonblocking, Coords > &x ) noexcept {
+ return internal::getCoordinates( x ).size();
+ }
+
+ template< typename DataType, typename RIT, typename CIT, typename NIT >
+ size_t capacity(
+ const Matrix< DataType, nonblocking, RIT, CIT, NIT > &A
+ ) noexcept {
+ return capacity( internal::getRefMatrix( A ) );
+ }
+
+ template< typename DataType, typename Coords >
+ RC clear( Vector< DataType, nonblocking, Coords > &x ) noexcept {
+ internal::le.execution( &x );
+ internal::getCoordinates( x ).clear();
+ return SUCCESS;
+ }
+
+ template< typename InputType, typename RIT, typename CIT, typename NIT >
+ RC clear(
+ Matrix< InputType, nonblocking, RIT, CIT, NIT > &A
+ ) noexcept {
+ return clear( internal::getRefMatrix( A ) );
+ }
+
+ template<
+ typename InputType,
+ typename Coords
+ >
+ RC resize(
+ Vector< InputType, nonblocking, Coords > &x,
+ const size_t new_nz
+ ) noexcept {
+ internal::le.execution( &x );
+#ifdef _DEBUG
+ std::cerr << "In grb::resize (vector, nonblocking)\n";
+#endif
+		// this check cannot wait until after the mismatch check below, as the
+		// specification defines that a zero-capacity request is always legal
+ if( new_nz == 0 ) {
+ return grb::clear( x );
+ }
+
+ // check if we have a mismatch
+ if( new_nz > grb::size( x ) ) {
+#ifdef _DEBUG
+ std::cerr << "\t requested capacity of " << new_nz << ", "
+ << "expected a value smaller than or equal to "
+ << size( x ) << "\n";
+#endif
+ return ILLEGAL;
+ }
+
+ // in the nonblocking implementation, vectors are of static size
+ // so this function immediately succeeds. However, all existing contents
+ // must be removed
+ return grb::clear( x );
+ }
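+
+	// A sketch of the resulting semantics (illustrative only):
+	//
+	//   grb::Vector< double, grb::nonblocking > x( 100 );
+	//   grb::resize( x, 150 ); // returns ILLEGAL: exceeds size( x )
+	//   grb::resize( x, 50 );  // returns SUCCESS and clears x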
+
+ template<
+ typename InputType,
+ typename RIT,
+ typename CIT,
+ typename NIT
+ >
+ RC resize(
+ Matrix< InputType, nonblocking, RIT, CIT, NIT > &A,
+ const size_t new_nz
+ ) noexcept {
+ return resize( internal::getRefMatrix( A ), new_nz );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename DataType,
+ typename T,
+ typename Coords
+ >
+ RC set(
+ Vector< DataType, nonblocking, Coords > &x,
+ const T val,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< DataType >::value &&
+ !grb::is_object< T >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< DataType, T >::value
+ ), "grb::set (Vector, unmasked)",
+ "called with a value type that does not match that of the given vector"
+ );
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ RC ret = SUCCESS;
+
+ // pre-cast value to be copied
+ const DataType toCopy = static_cast< DataType >( val );
+ DataType * const raw = internal::getRaw( x );
+ const size_t n = internal::getCoordinates( x ).size();
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func = [&x, toCopy, raw] (
+ internal::Pipeline &pipeline,
+ size_t lower_bound, size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage set(x, val) in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ bool already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_output ) {
+#endif
+ Coords local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+
+ local_x.local_assignAllNotAlreadyAssigned();
+ assert( local_x.nonzeroes() == local_x.size() );
+
+ internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ for( size_t i = lower_bound; i < upper_bound; i++ ) {
+ raw[ i ] = internal::template ValueOrIndex<
+ descr, DataType, DataType
+ >::getFromScalar( toCopy, i );
+ }
+
+ return SUCCESS;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::IO_SET_SCALAR,
+ n, sizeof( DataType ), dense_descr, true,
+ &x, nullptr,
+ &internal::getCoordinates( x ), nullptr,
+ nullptr, nullptr, nullptr, nullptr,
+ nullptr, nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: SET(x, val)" << std::endl;
+#endif
+ return ret;
+ }
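+
+	// Usage sketch (illustrative, with an assumed vector x of size n): the
+	// call below only registers a pipeline stage; the actual writes happen
+	// once the pipeline is forced, e.g., by grb::nnz( x ) or grb::wait( x ).
+	//
+	//   grb::Vector< double, grb::nonblocking > x( n );
+	//   grb::RC rc = grb::set( x, 3.14 ); // stage added, not yet executed
+	//   const size_t k = grb::nnz( x );   // forces execution; k == n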
+
+ namespace internal {
+
+ template<
+ Descriptor descr,
+#ifdef GRB_BOOLEAN_DISPATCHER
+ bool loop_over_vector_length,
+ bool already_dense_mask,
+ bool mask_is_dense,
+#endif
+ typename DataType,
+ typename MaskType,
+ typename T,
+ typename Coords
+ >
+ RC masked_set(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ bool loop_over_vector_length,
+ bool already_dense_mask,
+ bool mask_is_dense,
+#endif
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_x,
+ const Coords &local_mask,
+ Vector< DataType, nonblocking, Coords > &x,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const T val
+ ) {
+ // pre-cast value to be copied
+ const DataType toCopy = static_cast< DataType >( val );
+
+ DataType * const raw = internal::getRaw( x );
+ const MaskType * const m_p = internal::getRaw( m );
+
+#ifdef _DEBUG
+ if( loop_over_vector_length ) {
+ std::cout << "\t using loop of size n (the vector length)\n";
+ } else {
+ std::cout << "\t using loop of size nz (the number of nonzeroes in the "
+ << "vector)\n";
+ }
+#endif
+
+ const size_t local_n = upper_bound - lower_bound;
+ const size_t local_mask_nz = already_dense_mask
+ ? local_n
+ : local_mask.nonzeroes();
+
+ const size_t local_size_n = loop_over_vector_length
+ ? local_x.size()
+ : local_mask_nz;
+
+ for( size_t k = 0; k < local_size_n; ++k ) {
+
+ const size_t index = ( ( loop_over_vector_length || already_dense_mask )
+ ? k
+ : local_mask.index( k ) ) + lower_bound;
+ assert( index < internal::getCoordinates( x ).size() );
+ if( already_dense_mask ) {
+ if( !internal::getCoordinates( m ).template mask< descr >( index, m_p ) ) {
+ continue;
+ }
+ } else {
+ if( !local_mask.template mask< descr >(
+ index - lower_bound, m_p + lower_bound
+ ) ) {
+ continue;
+ }
+ }
+ if( !mask_is_dense ) {
+ (void) local_x.assign( index - lower_bound );
+ }
+ raw[ index ] = internal::ValueOrIndex<
+ descr, DataType, DataType
+ >::getFromScalar( toCopy, index );
+ }
+
+ return SUCCESS;
+ }
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename DataType,
+ typename MaskType,
+ typename T,
+ typename Coords
+ >
+ RC set(
+ Vector< DataType, nonblocking, Coords > &x,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const T val,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< DataType >::value && !grb::is_object< T >::value,
+ void >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "In grb::set (vector-to-value, masked)\n";
+#endif
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< DataType, T >::value ), "grb::set (Vector to scalar, masked)",
+ "called with a value type that does not match that of the given "
+ "vector"
+ );
+
+ // catch empty mask
+ if( size( m ) == 0 ) {
+ return set< descr >( x, val, phase );
+ }
+
+ // dynamic sanity checks
+ const size_t sizex = size( x );
+ if( sizex != size( m ) ) {
+ return MISMATCH;
+ }
+
+ // handle trivial resize
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+ constexpr const bool dense_mask = dense_descr &&
+ (descr & descriptors::structural) &&
+ !(descr & descriptors::invert_mask);
+
+		// capture the masked set as a lazily-executed pipeline stage
+ internal::Pipeline::stage_type func = [&x, &m, val] (
+ internal::Pipeline &pipeline,
+ size_t lower_bound, size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage set(x, m, val) in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ (void) pipeline;
+
+ Coords local_mask, local_x;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_x_nz = local_n;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ bool already_dense_mask = true;
+
+ const bool mask_is_dense = (descr & descriptors::structural) &&
+ !(descr & descriptors::invert_mask) && already_dense_vectors;
+
+ // for out-of-place operations with a mask and a scalar input, whether the
+ // output is dense or not depends on the mask
+ if( !mask_is_dense ) {
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( dense_descr && local_x_nz < local_n ) {
+ return ILLEGAL;
+ }
+ }
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_mask = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( m ) );
+ if( !already_dense_mask ) {
+#else
+ already_dense_mask = false;
+#endif
+ local_mask = internal::getCoordinates( m ).asyncSubset( lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( !mask_is_dense ) {
+ local_x.local_clear();
+ if( lower_bound == 0 ) {
+ internal::getCoordinates( x ).reset_global_nnz_counter();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ pipeline.markMaybeSparseVector( &internal::getCoordinates( x ) );
+#endif
+ if( dense_descr ) {
+ pipeline.markMaybeSparseDenseDescriptorVerification(
+ &internal::getCoordinates( x ) );
+ }
+ }
+ }
+
+ const bool loop_over_vector_length = ( descr & descriptors::invert_mask ) ||
+ ( 4 * local_mask.nonzeroes() > 3 * local_mask.size() );
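+			// (assumed rationale: iterate over the full local range whenever the
+			// mask is inverted or more than three-quarters dense, since a direct
+			// loop is then cheaper than indirecting through the mask's index list)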
+
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_masked_set<
+#else
+ rc = internal::masked_set<
+#endif
+ descr, DataType, MaskType, T, Coords
+ >(
+ loop_over_vector_length,
+ already_dense_mask, mask_is_dense,
+ lower_bound, upper_bound,
+ local_x, local_mask, x, m, val
+ );
+
+ if( !mask_is_dense ) {
+ internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::IO_SET_MASKED_SCALAR,
+ sizex, sizeof( DataType ),
+ dense_descr, dense_mask,
+ &x, nullptr, &internal::getCoordinates( x ), nullptr,
+ &m, nullptr, nullptr, nullptr,
+ &internal::getCoordinates( m ), nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: set(x, m, val)" << std::endl;
+#endif
+ return ret;
+ }
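+
+	// Usage sketch (illustrative only) of the masked variant:
+	//
+	//   grb::Vector< double, grb::nonblocking > x( n );
+	//   grb::Vector< bool, grb::nonblocking > m( n );
+	//   grb::setElement( m, true, 0 );
+	//   grb::set( x, m, 2.0 ); // lazily writes x[ 0 ] only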
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename DataType,
+ typename T,
+ typename Coords
+ >
+ RC setElement(
+ Vector< DataType, nonblocking, Coords > &x,
+ const T val,
+ const size_t i,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< DataType >::value &&
+ !grb::is_object< T >::value, void
+ >::type * const = nullptr
+ ) {
+ internal::le.execution( &x );
+
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< DataType, T >::value ),
+ "grb::set (Vector, at index)",
+ "called with a value type that does not match that of the given "
+ "vector"
+ );
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ // dynamic sanity checks
+ if( i >= size( x ) ) {
+ return MISMATCH;
+ }
+ if( (descr & descriptors::dense) && nnz( x ) < size( x ) ) {
+ return ILLEGAL;
+ }
+
+ // do set
+ (void)internal::getCoordinates( x ).assign( i );
+ internal::getRaw( x )[ i ] = static_cast< DataType >( val );
+
+#ifdef _DEBUG
+ std::cout << "setElement (nonblocking) set index " << i << " to value "
+ << internal::getRaw( x )[ i ] << "\n";
+#endif
+ return SUCCESS;
+ }
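+
+	// Usage sketch (illustrative only): unlike the lazily-executed set,
+	// setElement first forces any pending pipeline on x and then writes
+	// eagerly:
+	//
+	//   grb::Vector< double, grb::nonblocking > x( 10 );
+	//   grb::RC rc = grb::setElement( x, 3.14, 7 ); // x[ 7 ] = 3.14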
+
+ namespace internal {
+
+ template<
+ Descriptor descr,
+ bool out_is_void,
+ bool in_is_void,
+ bool sparse,
+#ifdef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_vectors,
+ bool already_dense_input,
+#endif
+ typename OutputType,
+ typename InputType,
+ typename Coords
+ >
+ RC set_generic(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_vectors,
+ bool already_dense_input,
+#endif
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_x,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &x,
+ const Vector< InputType, nonblocking, Coords > &y
+ ) {
+ const size_t local_n = upper_bound - lower_bound;
+ const size_t local_y_nz = already_dense_input
+ ? local_n
+ : local_y.nonzeroes();
+
+ OutputType * __restrict__ const dst = internal::getRaw( x );
+ const InputType * __restrict__ const src = internal::getRaw( y );
+
+ if( sparse ) {
+ if( src == nullptr && dst == nullptr ) {
+ for( size_t i = 0; i < local_y_nz; ++i ) {
+ const size_t index = ( already_dense_input ) ? i : local_y.index( i );
+ if( !already_dense_vectors ) {
+ (void) local_x.assign( index );
+ }
+ }
+ } else {
+#ifndef NDEBUG
+ if( src == nullptr ) {
+ assert( dst == nullptr );
+ }
+#endif
+ for( size_t i = 0; i < local_y_nz; ++i ) {
+ const size_t index = ( ( already_dense_input )
+ ? i
+ : local_y.index( i ) ) + lower_bound;
+ if( !already_dense_vectors ) {
+ (void) local_x.assign( index - lower_bound );
+ }
+ if( !out_is_void && !in_is_void ) {
+ dst[ index ] = internal::setIndexOrValue< descr, OutputType >( index,
+ src[ index ] );
+ }
+ }
+ }
+ } else {
+ if( !( src == nullptr && dst == nullptr ) ) {
+#ifndef NDEBUG
+ if( src == nullptr ) {
+ assert( dst == nullptr );
+ }
+#endif
+ for( size_t i = lower_bound; i < upper_bound; ++i ) {
+ if( !out_is_void && !in_is_void ) {
+ dst[ i ] = src[ i ];
+ }
+ }
+ }
+ }
+
+ return SUCCESS;
+ }
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType,
+ typename InputType,
+ typename Coords
+ >
+ RC set(
+ Vector< OutputType, nonblocking, Coords > &x,
+ const Vector< InputType, nonblocking, Coords > &y,
+ const Phase &phase = EXECUTE
+ ) {
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< OutputType, InputType >::value ),
+ "grb::copy (Vector)",
+ "called with vector parameters whose element data types do not match"
+ );
+		constexpr bool out_is_void = std::is_void< OutputType >::value;
+		constexpr bool in_is_void = std::is_void< InputType >::value;
+		static_assert( !in_is_void || out_is_void,
+			"grb::set (nonblocking, vector <- vector): "
+			"if input is void, then the output must be also" );
+		static_assert( !(descr & descriptors::use_index) || !out_is_void,
+			"grb::set (nonblocking, vector <- vector): "
+			"use_index descriptor cannot be set if output vector is void" );
+
+		// get length
+ const size_t n = internal::getCoordinates( y ).size();
+ // check contract
+ if( n != size( x ) ) {
+ return MISMATCH;
+ }
+ if( n == 0 ) {
+ return SUCCESS;
+ }
+ if( getID( x ) == getID( y ) ) {
+ return ILLEGAL;
+ }
+
+ // on resize
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+
+ // on execute
+ assert( phase == EXECUTE );
+
+ RC ret = SUCCESS;
+
+ constexpr bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func = [&x, &y] (
+ internal::Pipeline &pipeline,
+ size_t lower_bound, size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage set(x, y) in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_x, local_y;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_y_nz = local_n;
+ bool sparse = false;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ bool already_dense_input = true;
+
+ if( !already_dense_vectors ) {
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_input = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_input ) {
+#else
+ already_dense_input = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+ if( local_y_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( !already_dense_vectors ) {
+ if( lower_bound == 0 ) {
+ internal::getCoordinates( x ).reset_global_nnz_counter();
+ }
+ }
+
+ if( sparse ) {
+ // this primitive is out-of-place, thus make the output empty
+ if( !already_dense_vectors ) {
+ local_x.local_clear();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ pipeline.markMaybeSparseVector( &internal::getCoordinates( x ) );
+#endif
+ }
+
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_set_generic<
+#else
+ rc = internal::set_generic<
+#endif
+ descr, out_is_void, in_is_void, true
+ >(
+ already_dense_vectors, already_dense_input,
+ lower_bound, upper_bound,
+ local_x, local_y, x, y
+ );
+ } else {
+ if( !already_dense_vectors ) {
+ local_x.local_assignAll();
+ }
+
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_set_generic<
+#else
+ rc = internal::set_generic<
+#endif
+ descr, out_is_void, in_is_void, false
+ >(
+ already_dense_vectors, already_dense_input,
+ lower_bound, upper_bound,
+ local_x, local_y, x, y
+ );
+ }
+
+ if( !already_dense_vectors ) {
+ internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::IO_SET_VECTOR,
+ n, sizeof( OutputType ), dense_descr, true,
+ &x, nullptr, &internal::getCoordinates( x ), nullptr,
+ &y, nullptr, nullptr, nullptr,
+ &internal::getCoordinates( y ), nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: set(x, y)" << std::endl;
+#endif
+ return ret;
+ }
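+
+	// Usage sketch (illustrative only) of this out-of-place lazy copy:
+	//
+	//   grb::Vector< double, grb::nonblocking > x( n ), y( n );
+	//   grb::set( y, 1.0 ); // stage: y <- 1.0
+	//   grb::set( x, y );   // stage: x <- y; may fuse into the same pipeline
+	//   grb::set( x, x );   // returns ILLEGAL: input aliases output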
+
+ namespace internal {
+
+ template<
+ Descriptor descr,
+ bool out_is_void,
+ bool in_is_void,
+#ifdef GRB_BOOLEAN_DISPATCHER
+ bool loop_over_y,
+ bool already_dense_input_y,
+ bool already_dense_mask,
+ bool mask_is_dense,
+#endif
+ typename OutputType,
+ typename MaskType,
+ typename InputType,
+ typename Coords
+ >
+ RC masked_set(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ bool loop_over_y,
+ bool already_dense_input_y,
+ bool already_dense_mask,
+ bool mask_is_dense,
+#endif
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_x,
+ const Coords &local_mask,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &x,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Vector< InputType, nonblocking, Coords > &y
+ ) {
+ const size_t local_n = upper_bound - lower_bound;
+ const size_t local_y_nz = already_dense_input_y
+ ? local_n
+ : local_y.nonzeroes();
+ const size_t local_mask_nz = already_dense_mask
+ ? local_n
+ : local_mask.nonzeroes();
+
+ const size_t n = loop_over_y ? local_y_nz : local_mask_nz;
+
+ for( size_t k = 0; k < n; ++k ) {
+ const size_t i = ( loop_over_y
+ ? ( already_dense_input_y ? k : local_y.index( k ) )
+ : ( already_dense_mask ? k : local_mask.index( k ) )
+ ) + lower_bound;
+ if( already_dense_mask ) {
+ if( !internal::getCoordinates( mask ).template mask< descr >(
+ i, internal::getRaw( mask )
+ ) ) {
+ continue;
+ }
+ } else {
+ if( !local_mask.template mask< descr >(
+ i - lower_bound, internal::getRaw( mask ) + lower_bound
+ ) ) {
+ continue;
+ }
+ }
+ if( loop_over_y || already_dense_input_y ||
+ local_y.assigned( i - lower_bound )
+ ) {
+ if( !out_is_void && !in_is_void ) {
+ if( !mask_is_dense ) {
+ (void) local_x.assign( i - lower_bound );
+ }
+ internal::getRaw( x )[ i ] = internal::ValueOrIndex<
+ descr, OutputType, InputType
+ >::getFromArray(
+ internal::getRaw( y ),
+ [] (const size_t i) {return i;},
+ i
+ );
+ }
+ }
+ }
+
+ return SUCCESS;
+ }
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType,
+ typename MaskType,
+ typename InputType,
+ typename Coords
+ >
+ RC set(
+ Vector< OutputType, nonblocking, Coords > &x,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Vector< InputType, nonblocking, Coords > &y,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< OutputType, InputType >::value ),
+ "grb::set (Vector)",
+ "called with vector parameters whose element data types do not match" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< MaskType, bool >::value ),
+ "grb::set (Vector)",
+ "called with non-bool mask element types" );
+ constexpr bool out_is_void = std::is_void< OutputType >::value;
+		constexpr bool in_is_void = std::is_void< InputType >::value;
+ static_assert( !in_is_void || out_is_void,
+ "grb::set (nonblocking, vector <- vector, masked): "
+ "if input is void, then the output must be also" );
+ static_assert( !(descr & descriptors::use_index) || !out_is_void,
+ "grb::set (nonblocking, vector <- vector, masked): "
+ "use_index descriptor cannot be set if output vector is void" );
+
+ // catch contract violations
+ const size_t size = grb::size( y );
+ if( size != grb::size( x ) ) {
+ return MISMATCH;
+ }
+ if( size == 0 ) {
+ return SUCCESS;
+ }
+ if( getID( x ) == getID( y ) ) {
+ return ILLEGAL;
+ }
+
+ // delegate if possible
+ if( grb::size( mask ) == 0 ) {
+			return set< descr >( x, y, phase );
+ }
+
+ // additional contract check
+ if( size != grb::size( mask ) ) {
+ return MISMATCH;
+ }
+
+ // on resize
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+
+ // on execute
+ assert( phase == EXECUTE );
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+ constexpr const bool dense_mask = dense_descr &&
+ (descr & descriptors::structural) &&
+ !(descr & descriptors::invert_mask);
+
+ internal::Pipeline::stage_type func = [&x, &mask, &y] (
+ internal::Pipeline &pipeline,
+ size_t lower_bound, size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage set(x, mask, y) in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_mask, local_x, local_y;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_mask_nz = local_n;
+ size_t local_x_nz = local_n;
+ size_t local_y_nz = local_n;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ bool already_dense_mask = true;
+ bool already_dense_input_y = true;
+
+ // make the vector empty unless the dense descriptor is provided
+ constexpr const bool mask_is_dense = (descr & descriptors::structural) &&
+ !(descr & descriptors::invert_mask) && already_dense_vectors;
+
+ if( !mask_is_dense ) {
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( dense_descr && local_x_nz < local_n ) {
+ return ILLEGAL;
+ }
+ }
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_mask = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( mask ) );
+ if( !already_dense_mask ) {
+#else
+ already_dense_mask = false;
+#endif
+ local_mask = internal::getCoordinates( mask ).asyncSubset( lower_bound,
+ upper_bound );
+ local_mask_nz = local_mask.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input_y = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_input_y ) {
+#else
+ already_dense_input_y = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( !mask_is_dense ) {
+ local_x.local_clear();
+ if( lower_bound == 0 ) {
+ internal::getCoordinates( x ).reset_global_nnz_counter();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ pipeline.markMaybeSparseVector( &internal::getCoordinates( x ) );
+#endif
+ if( dense_descr ) {
+ pipeline.markMaybeSparseDenseDescriptorVerification(
+ &internal::getCoordinates( x ) );
+ }
+ }
+ }
+
+ // choose optimal loop size
+ const bool loop_over_y = (descr & descriptors::invert_mask) ||
+ ( local_y_nz < local_mask_nz );
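+			// (assumed rationale: enumerate whichever sparsity structure is
+			// smaller; under an inverted mask the mask's nonzero list cannot
+			// enumerate the surviving outputs, so loop over y instead)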
+
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_masked_set<
+#else
+ rc = internal::masked_set<
+#endif
+ descr, out_is_void, in_is_void
+ >(
+ loop_over_y,
+ already_dense_input_y, already_dense_mask, mask_is_dense,
+ lower_bound, upper_bound,
+ local_x, local_mask, local_y,
+ x, mask, y
+ );
+
+ if( !mask_is_dense ) {
+ internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::IO_SET_MASKED_VECTOR,
+ size, sizeof( OutputType ), dense_descr, dense_mask,
+ &x, nullptr, &internal::getCoordinates( x ), nullptr,
+ &mask, &y, nullptr, nullptr,
+ &internal::getCoordinates( mask ), &internal::getCoordinates( y ),
+ nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: set(x, mask, y)" << std::endl;
+#endif
+ return ret;
+ }
+
+ namespace internal {
+
+ template<
+ bool A_is_mask,
+ Descriptor descr,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2 = const OutputType
+ >
+ RC set(
+ Matrix< OutputType, nonblocking > &C,
+ const Matrix< InputType1, nonblocking > &A,
+ const InputType2 * __restrict__ id = nullptr
+ ) noexcept {
+ if( internal::NONBLOCKING::warn_if_not_native &&
+ config::PIPELINE::warn_if_not_native
+ ) {
+ std::cerr << "Warning: set (matrix copy, nonblocking) currently delegates "
+ << "to a blocking implementation.\n"
+ << " Further similar such warnings will be suppressed.\n";
+ internal::NONBLOCKING::warn_if_not_native = false;
+ }
+
+ // nonblocking execution is not supported
+ // first, execute any computation that is not completed
+ grb::internal::le.execution();
+
+ // second, delegate to the reference backend
+ return set< A_is_mask, descr, OutputType, InputType1, InputType2 >(
+ internal::getRefMatrix( C ), internal::getRefMatrix( A ), id );
+ }
+
+	} // end namespace grb::internal
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType,
+ typename InputType
+ >
+ RC set(
+ Matrix< OutputType, nonblocking > &C,
+ const Matrix< InputType, nonblocking > &A,
+ const Phase &phase = EXECUTE
+ ) noexcept {
+ static_assert( std::is_same< OutputType, void >::value ||
+ !std::is_same< InputType, void >::value,
+ "grb::set cannot interpret an input pattern matrix without a "
+ "semiring or a monoid. This interpretation is needed for "
+ "writing the non-pattern matrix output. Possible solutions: 1) "
+ "use a (monoid-based) foldl / foldr, 2) use a masked set, or "
+ "3) change the output of grb::set to a pattern matrix also." );
+#ifdef _DEBUG
+ std::cout << "Called grb::set (matrix-to-matrix, nonblocking)" << std::endl;
+#endif
+ // static checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType, OutputType >::value
+ ), "grb::set",
+ "called with non-matching value types" );
+
+ // dynamic checks
+ assert( phase != TRY );
+
+ // delegate
+ if( phase == RESIZE ) {
+ return resize( C, nnz( A ) );
+ } else {
+ assert( phase == EXECUTE );
+ return internal::set< false, descr >( C, A );
+ }
+ }
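+
+	// Usage sketch (illustrative only) of the two-phase protocol:
+	//
+	//   grb::Matrix< double, grb::nonblocking > A( n, n ), C( n, n );
+	//   grb::RC rc = grb::set( C, A, grb::RESIZE ); // ensure capacity
+	//   rc = rc ? rc : grb::set( C, A );            // EXECUTE performs the copy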
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2
+ >
+ RC set(
+ Matrix< OutputType, nonblocking > &C,
+ const Matrix< InputType1, nonblocking > &A,
+ const InputType2 &val,
+ const Phase &phase = EXECUTE
+ ) noexcept {
+ static_assert( !std::is_same< OutputType, void >::value,
+ "internal::grb::set (masked set to value): cannot have a pattern "
+ "matrix as output" );
+#ifdef _DEBUG
+ std::cout << "Called grb::set (matrix-to-value-masked, nonblocking)\n";
+#endif
+ // static checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType2, OutputType >::value
+ ), "grb::set",
+ "called with non-matching value types"
+ );
+
+ // dynamic checks
+ assert( phase != TRY );
+
+ // delegate
+ if( phase == RESIZE ) {
+ return resize( C, nnz( A ) );
+ } else {
+ assert( phase == EXECUTE );
+ if( std::is_same< OutputType, void >::value ) {
+ return internal::set< false, descr >( C, A );
+ } else {
+ return internal::set< true, descr >( C, A, &val );
+ }
+ }
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType,
+ typename fwd_iterator,
+ typename Coords,
+ class Dup = operators::right_assign< InputType >
+ >
+ RC buildVector(
+ Vector< InputType, nonblocking, Coords > &x,
+ fwd_iterator start,
+ const fwd_iterator end,
+ const IOMode mode,
+ const Dup &dup = Dup()
+ ) {
+ return buildVector< descr, InputType, fwd_iterator, Coords, Dup >(
+ internal::getRefVector( x ), start, end, mode, dup );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType,
+ typename fwd_iterator1,
+ typename fwd_iterator2,
+ typename Coords,
+ class Dup = operators::right_assign< InputType >
+ >
+ RC buildVector(
+ Vector< InputType, nonblocking, Coords > &x,
+ fwd_iterator1 ind_start,
+ const fwd_iterator1 ind_end,
+ fwd_iterator2 val_start,
+ const fwd_iterator2 val_end,
+ const IOMode mode,
+ const Dup &dup = Dup()
+ ) {
+ internal::le.execution( &x );
+ return buildVector<
+ descr, InputType, fwd_iterator1, fwd_iterator2, Coords, Dup
+ >(
+ internal::getRefVector( x ), ind_start, ind_end, val_start, val_end,
+ mode, dup
+ );
+ }
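+
+	// Usage sketch (illustrative only): ingesting index/value pairs. This
+	// overload first forces any pending pipeline on x, then delegates to
+	// the reference backend:
+	//
+	//   std::vector< size_t > is { 0, 2, 4 };
+	//   std::vector< double > vs { 1.0, 2.0, 3.0 };
+	//   grb::buildVector( x, is.begin(), is.end(),
+	//     vs.begin(), vs.end(), grb::SEQUENTIAL );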
+
+ /** buildMatrixUnique is based on that of the reference backend */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType,
+ typename RIT,
+ typename CIT,
+ typename NIT,
+ typename fwd_iterator
+ >
+ RC buildMatrixUnique(
+ Matrix< InputType, nonblocking, RIT, CIT, NIT > &A,
+ fwd_iterator start,
+ const fwd_iterator end,
+ const IOMode mode
+ ) {
+ return buildMatrixUnique<
+ descr, InputType, RIT, CIT, NIT, fwd_iterator
+ >( internal::getRefMatrix(A), start, end, mode );
+ }
+
+ template<
+ typename InputType,
+ typename Coords
+ >
+ uintptr_t getID( const Vector< InputType, nonblocking, Coords > &x ) {
+ return getID( internal::getRefVector( x ) );
+ }
+
+ template<>
+ RC wait< nonblocking >();
+
+ /** \internal Dispatch to base wait implementation */
+ template<
+ typename InputType,
+ typename Coords,
+ typename ... Args
+ >
+ RC wait(
+ const Vector< InputType, nonblocking, Coords > &x,
+ const Args &... args
+ ) {
+ RC ret = internal::le.execution( &x );
+ if( ret != SUCCESS ) {
+ return ret;
+ }
+ return wait( args... );
+ }
+
+ template<
+ typename InputType,
+ typename Coords
+ >
+ RC wait( const Vector< InputType, nonblocking, Coords > &x ) {
+ return internal::le.execution( &x );
+ }
+
+ /** \internal Dispatch to base wait implementation */
+ template<
+ typename InputType,
+ typename... Args
+ >
+ RC wait(
+ const Matrix< InputType, nonblocking > &A,
+ const Args &... args
+ ) {
+ (void) A;
+		// TODO: matrices are currently read-only, so no action is required;
+		//       once the level-3 primitives are implemented, the pipeline
+		//       should be executed here, as is done for vectors
+ return wait( args... );
+ }
+
+ template< typename InputType >
+ RC wait( const Matrix< InputType, nonblocking > &A ) {
+ (void) A;
+		// TODO: matrices are currently read-only, so no action is required;
+		//       once the level-3 primitives are implemented, the pipeline
+		//       should be executed here, as is done for vectors
+		return SUCCESS;
+ }
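+
+	// Usage sketch (illustrative only, assuming nonblocking is the selected
+	// default backend):
+	//
+	//   grb::set( x, 1.5 );           // lazily staged
+	//   grb::RC rc = grb::wait( x );  // runs the pipeline that writes x
+	//   rc = rc ? rc : grb::wait();   // runs all remaining pipelines via
+	//                                 // the wait< nonblocking > specialisation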
+
+ /** @} */
+
+} // namespace grb
+
+#undef NO_CAST_ASSERT
+
+#endif // end ``_H_GRB_NONBLOCKING_IO''
+
diff --git a/include/graphblas/nonblocking/lazy_evaluation.hpp b/include/graphblas/nonblocking/lazy_evaluation.hpp
new file mode 100644
index 000000000..426f530fb
--- /dev/null
+++ b/include/graphblas/nonblocking/lazy_evaluation.hpp
@@ -0,0 +1,178 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Supporting constructs for lazy evaluation.
+ *
+ * @author Aristeidis Mastoras
+ * @date 16th of May, 2022
+ */
+
+#ifndef _H_GRB_NONBLOCKING_LAZY_EVALUATION
+#define _H_GRB_NONBLOCKING_LAZY_EVALUATION
+
+#include
+
+#include "coordinates.hpp"
+#include "pipeline.hpp"
+
+
+namespace grb {
+
+ namespace internal {
+
+ /**
+ * Stores ALP primitives as stages in a set of pipelines maintained by this
+ * class.
+ */
+ class LazyEvaluation {
+
+ private:
+
+ /** Multiple pipelines may be maintained at any time. */
+ std::vector< Pipeline > pipelines;
+
+ /** Stores the pipelines that share data with the new stage. */
+ std::vector< std::vector< Pipeline >::iterator > shared_data_pipelines;
+
+ /**
+ * Makes sure any warnings related to exceeding the initial number of
+ * pipelines are printed only once.
+ */
+ bool warn_if_exceeded;
+
+ /**
+			 * Checks whether the number of pipelines in use has exceeded the
+			 * initial number of pipelines.
+ *
+ * The initial number is configurable via the following configuration
+ * field: #grb::config::PIPELINE::max_pipelines.
+ */
+ void checkIfExceeded() noexcept;
+
+
+ public:
+
+ /** Default constructor. */
+ LazyEvaluation();
+
+ /**
+ * Adds a stage to an automatically determined pipeline.
+ *
+ * The following parameters are mandatory:
+ *
+ * @param[in] func The function to be added.
+ * @param[in] opcode The corresponding opcode.
+ * @param[in] n The pipeline size.
+ * @param[in] data_type_size The output byte size.
+ * @param[in] dense_descr Whether the op is dense.
+ * @param[in] dense_mask Whether the mask is dense.
+ *
+			 * The following parameters are optional and may be nullptr when
+ * not required:
+ *
+ * @param[out] output_container_ptr Pointer to the output container.
+ * @param[out] output_aux_container_ptr Pointer to another output.
+ * @param[out] coor_output_ptr Pointer to the coordinates that
+ * correspond to
+ * \a output_container_ptr
+ * @param[out] coor_output_aux_ptr Pointer to the coordinates that
+ * correspond to
+ * \a output_aux_container_ptr
+ * @param[in] input_a_ptr Pointer to a first input container.
+ * @param[in] input_b_ptr Pointer to a second such container.
+ * @param[in] input_c_ptr Pointer to a third such container.
+ * @param[in] input_d_ptr Pointer to a fourth such container.
+ * @param[in] coor_a_ptr Pointer to coordinates that
+ * correspond to \a input_a_ptr.
+ * @param[in] coor_b_ptr Pointer to coordinates that
+ * correspond to \a input_b_ptr.
+ * @param[in] coor_c_ptr Pointer to coordinates that
+ * correspond to \a input_c_ptr.
+ * @param[in] coor_d_ptr Pointer to coordinates that
+ * correspond to \a input_d_ptr.
+ * @param[in] input_matrix Pointer to an input matrix.
+ */
+ RC addStage(
+ const Pipeline::stage_type &&func,
+ const Opcode opcode,
+ const size_t n,
+ const size_t data_type_size,
+ const bool dense_descr,
+ const bool dense_mask,
+ void * const output_container_ptr,
+ void * const output_aux_container_ptr,
+ Coordinates< nonblocking > * const coor_output_ptr,
+ Coordinates< nonblocking > * const coor_output_aux_ptr,
+ const void * const input_a_ptr,
+ const void * const input_b_ptr,
+ const void * const input_c_ptr,
+ const void * const input_d_ptr,
+ const Coordinates< nonblocking > * const coor_a_ptr,
+ const Coordinates< nonblocking > * const coor_b_ptr,
+ const Coordinates< nonblocking > * const coor_c_ptr,
+ const Coordinates< nonblocking > * const coor_d_ptr,
+ const void * const input_matrix
+ );
+
+ /**
+ * Adds an eWiseLambda stage to an automatically-determined pipeline.
+ *
+ * The following parameters are mandatory:
+ *
+ * @param[in] func The function to be added.
+ * @param[in] opcode The corresponding opcode.
+ * @param[in] n The pipeline size.
+ * @param[in] data_type_size The output byte size.
+ * @param[in] dense_descr Whether the op is dense.
+ * @param[in] all_containers_ptr A container of all ALP containers that the
+ * \a func reads \em or writes
+ * @param[in] coor_a_ptr A container of all coordinates that
+ * correspond to those in
+ * \a all_containers_ptr
+ */
+ RC addeWiseLambdaStage(
+ const Pipeline::stage_type &&func,
+ const Opcode opcode,
+ const size_t n,
+ const size_t data_type_size,
+ const bool dense_descr,
+ std::vector< const void * > all_containers_ptr,
+ const Coordinates< nonblocking > * const coor_a_ptr
+ );
+
+ /**
+ * Executes the pipeline necessary to generate the output of the given
+ * \a container.
+ */
+ RC execution( const void *container );
+
+ /**
+ * Executes all pipelines.
+ */
+ RC execution();
+
+ }; // end class LazyEvaluation
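+
+		// An assumed sketch of the intended control flow around this class:
+		// each nonblocking primitive calls addStage() with its lambda and the
+		// containers plus coordinates it touches, allowing the stage to be
+		// routed into a pipeline that shares data with it; execution( &c )
+		// later runs exactly the pipeline that produces container c, while
+		// execution() drains all pipelines.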
+
+ } // end namespace internal
+
+} // end namespace grb
+
+#endif // end ``_H_GRB_NONBLOCKING_LAZY_EVALUATION''
+
diff --git a/include/graphblas/nonblocking/matrix.hpp b/include/graphblas/nonblocking/matrix.hpp
new file mode 100644
index 000000000..b13a8c2be
--- /dev/null
+++ b/include/graphblas/nonblocking/matrix.hpp
@@ -0,0 +1,595 @@
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Provides the nonblocking matrix container.
+ *
+ * @author Aristeidis Mastoras
+ * @date 16th of May, 2022
+ */
+
+#ifndef _H_GRB_NONBLOCKING_MATRIX
+#define _H_GRB_NONBLOCKING_MATRIX
+
+#include <sstream> // std::stringstream
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include