diff --git a/.github/workflows/dbg_smoke.yml b/.github/workflows/dbg_smoke.yml
new file mode 100644
index 000000000..8fb2795ce
--- /dev/null
+++ b/.github/workflows/dbg_smoke.yml
@@ -0,0 +1,42 @@
+
+name: debug-smoke-tests
+
+on: [push]
+
+env:
+ BUILD_TYPE: Debug
+
+jobs:
+ build:
+ runs-on: ubuntu-latest
+
+ steps:
+ - uses: actions/checkout@v3
+
+ - name: Install required packages
+ run: sudo apt-get install -y libnuma-dev
+
+ - name: Configure
+ run: mkdir build && cd build && ../bootstrap.sh --prefix=../install --debug-build
+
+ - name: Build
+ working-directory: ${{github.workspace}}/build
+ run: make -j4
+
+ - name: Install
+ working-directory: ${{github.workspace}}/build
+ run: make -j4 install
+
+ - name: Test
+ working-directory: ${{github.workspace}}/build
+ run: make -j4 smoketests &> smoketests.log
+
+ - name: Check
+ working-directory: ${{github.workspace}}/build
+ run: ../tests/summarise.sh smoketests.log
+
+ - name: DumpLogOnFailure
+ if: failure()
+ working-directory: ${{github.workspace}}/build
+ run: cat smoketests.log
+
diff --git a/.github/workflows/smoke.yml b/.github/workflows/smoke.yml
index c302ebdb2..884b2f74f 100644
--- a/.github/workflows/smoke.yml
+++ b/.github/workflows/smoke.yml
@@ -1,5 +1,5 @@
-name: smoke-tests
+name: release-smoke-tests
on: [push]
@@ -35,3 +35,8 @@ jobs:
working-directory: ${{github.workspace}}/build
run: ../tests/summarise.sh smoketests.log
+ - name: DumpLogOnFailure
+ if: failure()
+ working-directory: ${{github.workspace}}/build
+ run: cat smoketests.log
+
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index c6a81c040..60b3410d6 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -30,7 +30,7 @@ stages:
# exclude:
# - build/**/*.o
# - build/**/*.o.d
-# expire_in: 30 minutes
+# expire_in: 80 minutes
#build_debug_centos_8:
@@ -122,13 +122,16 @@ build_test:
- apt update && apt -y install make cmake libnuma-dev coreutils
script:
- mkdir -p install build && cd ./build && ../bootstrap.sh --prefix=../install && make -j$(nproc) build_tests_all
+ - strip -s $(find tests/unit/ -type f -executable -print) $(find tests/smoke/ -type f -executable -print) $(find tests/performance/ -type f -executable -print)
artifacts:
paths:
- build/
exclude:
- build/**/*.o
- build/**/*.o.d
- expire_in: 30 minutes
+ - build/**/CMakeFiles
+ - build/**/*.dir
+ expire_in: 80 minutes
build_debug2_tests:
@@ -222,7 +225,7 @@ build_debug:
exclude:
- build/**/*.o
- build/**/*.o.d
- expire_in: 30 minutes
+ expire_in: 43 minutes
test_smoke_debug:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a7dc72dd2..344216e50 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,7 +27,7 @@
cmake_minimum_required( VERSION 3.13 )
set( MAJORVERSION 0 )
-set( MINORVERSION 6 )
+set( MINORVERSION 7 )
set( BUGVERSION 0 )
set( VERSION "${MAJORVERSION}.${MINORVERSION}.${BUGVERSION}" )
@@ -51,6 +51,8 @@ endif()
# to choose backends and dependencies
option( WITH_REFERENCE_BACKEND "With Reference backend" ON )
option( WITH_OMP_BACKEND "With OMP backend" ON )
+option( WITH_HYPERDAGS_BACKEND "With Hyperdags backend" ON )
+option( WITH_NONBLOCKING_BACKEND "With Nonblocking backend" ON )
option( WITH_NUMA "With NUMA support" ON )
option( LPF_INSTALL_PATH "Path to the LPF tools for the BSP1D and Hybrid backends" OFF )
# the following options depend on LPF_INSTALL_PATH being set
@@ -61,6 +63,9 @@ LPF_INSTALL_PATH set)" ON LPF_INSTALL_PATH OFF
cmake_dependent_option( WITH_HYBRID_BACKEND "Also build the Hybrid backend \
(needs LPF_INSTALL_PATH set)" ON LPF_INSTALL_PATH OFF
)
+# other dependent options
+cmake_dependent_option( WITH_HYPERDAGS_BACKEND "Building the Hyperdags backend needs \
+ WITH_HYPERDAGS_USING set" ON WITH_HYPERDAGS_USING OFF )
# to customize build flags for either backends or tests
option( COMMON_COMPILE_DEFINITIONS
"Compilation definitions for BOTH backends and tests; they override the defaults"
@@ -117,6 +122,7 @@ endif()
if( NOT WITH_REFERENCE_BACKEND AND
NOT WITH_OMP_BACKEND AND
+ NOT WITH_NONBLOCKING_BACKEND AND
NOT WITH_BSP1D_BACKEND AND
NOT WITH_HYBRID_BACKEND )
message( FATAL_ERROR "At least one backend should be enabled")
@@ -188,13 +194,18 @@ endif()
# by default no headers are built
set( WITH_REFERENCE_BACKEND_HEADERS OFF )
set( WITH_OMP_BACKEND_HEADERS OFF )
+set( WITH_HYPERDAGS_BACKEND_HEADERS OFF )
# activate headers based on requested backends
-if( WITH_REFERENCE_BACKEND OR WITH_BSP1D_BACKEND )
- # both reference and bsp1d backends need reference headers
+if( WITH_REFERENCE_BACKEND OR WITH_BSP1D_BACKEND OR WITH_NONBLOCKING_BACKEND )
+ # reference, bsp1d and nonblocking backends need reference headers
set( WITH_REFERENCE_BACKEND_HEADERS ON )
endif()
+if( WITH_HYPERDAGS_BACKEND )
+ set( WITH_HYPERDAGS_BACKEND_HEADERS ON )
+endif()
+
if( WITH_OMP_BACKEND OR WITH_HYBRID_BACKEND )
	# both reference_omp and hybrid backends need reference headers
set( WITH_OMP_BACKEND_HEADERS ON )
@@ -218,13 +229,28 @@ add_subdirectory( examples )
### DOXYGEN DOCUMENTATION GENERATION
-set( DOCS_DIR "${PROJECT_SOURCE_DIR}/docs/code" )
+set( DOCS_DIR "${PROJECT_SOURCE_DIR}/docs/developer" )
add_custom_command( OUTPUT "${DOCS_DIR}"
- COMMAND bash -c "if [[ ! -d docs/code ]]; then doxygen docs/doxy.conf &> doxygen.log; fi"
+ COMMAND bash -c "doxygen docs/doxy.conf &> doxygen-developer.log;"
WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}"
DEPENDS "${PROJECT_SOURCE_DIR}/docs/doxy.conf"
COMMENT "producing code documentation in ${DOCS_DIR}"
VERBATIM
#USES_TERMINAL
)
-add_custom_target( docs DEPENDS "${DOCS_DIR}" )
+add_custom_target( devdocs DEPENDS "${DOCS_DIR}" )
+
+set( PUBLIC_DOCS_DIR "${PROJECT_SOURCE_DIR}/docs/user" )
+add_custom_command( OUTPUT "${PUBLIC_DOCS_DIR}"
+ COMMAND bash -c "doxygen docs/user.conf &> doxygen-user.log;"
+ WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}"
+ DEPENDS "${PROJECT_SOURCE_DIR}/docs/user.conf"
+ COMMENT "producing public code documentation in ${PUBLIC_DOCS_DIR}"
+ VERBATIM
+)
+add_custom_target( userdocs DEPENDS "${PUBLIC_DOCS_DIR}" )
+add_custom_target( docs )
+add_dependencies( docs userdocs devdocs )
+
+message( "Compiling with the following backends: ${AVAILABLE_BACKENDS}\n" )
+
diff --git a/NOTICE b/NOTICE
index 3f1bf625d..3c370eca4 100644
--- a/NOTICE
+++ b/NOTICE
@@ -29,6 +29,8 @@ to Huawei Technologies Co., Ltd. or one of its subsidiaries:
- Auke Booij, Huawei Technologies Switzerland AG; 2021.
+ - Anders Hansson, Huawei Technologies Switzerland AG; 2022-2023.
+
The experimental banshee backend has been developed in collaboration with
Prof. Luca Benini at ETH Zuerich and his group. In particular this backend
is with great thanks due to Dan, Paul Scheffler, Fabian Schuiki, and Samuel
diff --git a/README.md b/README.md
index ae65c9547..ff0b89d1e 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,10 @@
- _____ .____ __________ /\ ________ .__ __________.____ _____ _________
- / _ \ | | \______ \ / / / _____/___________ ______ | |__\______ \ | / _ \ / _____/
- / /_\ \| | | ___/ / / / \ __\_ __ \__ \ \____ \| | \| | _/ | / /_\ \ \_____ \
-/ | \ |___| | / / \ \_\ \ | \// __ \| |_> > Y \ | \ |___/ | \/ \
-\____|__ /_______ \____| / / \______ /__| (____ / __/|___| /______ /_______ \____|__ /_______ /
- \/ \/ \/ \/ \/|__| \/ \/ \/ \/ \/
+ _____ .____ __________
+ / _ \ | | \______ \
+ / /_\ \| | | ___/
+/ | \ |___| |
+\____|__ /_______ \____|
+ \/ \/
Copyright 2021 Huawei Technologies Co., Ltd.
@@ -22,6 +22,31 @@ limitations under the License.
+This distribution contains the C++ Algebraic Programming (ALP) framework, and
+provides the ALP/GraphBLAS, ALP/Pregel, and Sparse BLAS programming interfaces.
+Only a subset of Sparse BLAS functionality is supported, at present.
+
+This distribution contains ALP backends that generate:
+ - sequential programs,
+ - shared-memory auto-parallelised programs,
+ - nonblocking shared-memory auto-parallelised programs, and
+ - sequential programs that generate HyperDAG representations of the executed
+ ALP program.
+
+Additional backends may optionally be enabled by providing their dependences.
+Those backends generate:
+ - distributed-memory auto-parallelised programs,
+ - hybrid shared- and distributed-memory auto-parallelised programs, and
+ - sequential programs for the Banshee RISC-V Snitch Core simulator
+ (experimental).
+
+All backends automatically generate vectorised programs, amongst other
+automatically-applied optimisations.
+
+The ALP/GraphBLAS and ALP/Pregel interfaces are enabled for all backends, while
+the standard Sparse BLAS APIs only allow for the efficient support of the
+sequential and shared-memory parallel backends.
+
# Minimal requirements
@@ -31,7 +56,7 @@ libraries and programs, using its `reference` and `reference_omp` backends.
## Compilation
-To compile ALP/GraphBLAS, you need the following tools:
+To compile ALP, you need the following tools:
1. A C++11-capable compiler such as GCC 4.8.2 or higher, with OpenMP support
2. LibNUMA development headers
@@ -40,7 +65,7 @@ To compile ALP/GraphBLAS, you need the following tools:
(CMake's default build tool on UNIX systems) or any other supported build tool.
## Linking and run-time
-The ALP/GraphBLAS libraries link against the following libraries:
+The ALP libraries link against the following libraries:
1. LibNUMA: `-lnuma`
2. Standard math library: `-lm`
@@ -60,15 +85,15 @@ of the LPF core library and its collectives library. The LPF library has its
further dependences, which are all summarised on the LPF project page:
* [Gitee](https://gitee.com/CSL-ALP/lpf);
-* [Github](https://github.com/Algebraic-Programming/LPF).
+* [GitHub](https://github.com/Algebraic-Programming/LPF).
-The dependence on LPF applies to compilation, linking, and run-time. Fulfulling
+The dependence on LPF applies to compilation, linking, and run-time. Fulfilling
the dependence enables the `bsp1d` and `hybrid` ALP/GraphBLAS backends.
## Code documentation
For generating the code documentations:
-* `doyxgen` reads code comments and generates the documentation;
+* `doxygen` reads code comments and generates the documentation;
* `graphviz` generates various diagrams for inheritance, call paths, etc.;
* `pdflatex` is required to build the PDF file out of the Latex generated
documentation.
@@ -76,12 +101,12 @@ For generating the code documentations:
# Very quick start
-Here are example steps to compile and install ALP/GraphBLAS for shared-memory
-machines, without distributed-memory support. The last three commands show-case
-the compilation and execution of the `sp.cpp` example program.
+Here are example steps to compile and install ALP for shared-memory machines
+without distributed-memory support. The last three commands show-case the
+compilation and execution of the `sp.cpp` example program.
```bash
-cd
+cd
mkdir build
cd build
../bootstrap.sh --prefix=../install
@@ -101,35 +126,37 @@ In more detail, the steps to follow are:
that `config::SIMD_SIZE::bytes` defined in that file is set correctly with
respect to the target architecture.
-2. Create an empty directory for building ALP/GraphBLAS and move into it:
+2. Create an empty directory for building ALP and move into it:
`mkdir build && cd build`.
-3. Invoke the `bootstrap.sh` script located inside the ALP/GraphBLAS root directory
- `` to generate the build infrastructure via CMake inside the
- current directory:
+3. Invoke the `bootstrap.sh` script located inside the ALP root directory
+ `` to generate the build infrastructure via CMake inside the
+ current directory:
- `/bootstrap.sh --prefix=`
+ `/bootstrap.sh --prefix=`
- note: add `--with-lpf=/path/to/lpf/install/dir` if you have LPF installed
and would like to use it.
-4. Issue `make -j` to compile the C++11 ALP/GraphBLAS library for the configured
- backends.
+4. Issue `make -j` to compile the C++11 ALP library for the configured backends.
5. (*Optional*) To later run all unit tests, several datasets must be made
- available. Please run the `/tools/downloadDatasets.sh`
+ available. Please run the `/tools/downloadDatasets.sh`
script for
a. an overview of datasets required for the basic tests, as well as
b. the option to automatically download them.
-6. (*Optional*) To make the ALP/GraphBLAS documentation, issue `make docs`. This
+6. (*Optional*) To make the ALP documentation, issue `make userdocs`. This
generates both
- a. a PDF in `/docs/code/latex/refman.pdf`, and
+ a. LaTeX in `/docs/user/latex/refman.tex`, and
+
+ b. HTML in `/docs/user/html/index.html`.
- b. HTML in `/docs/code/html/index.html`.
+ To build a PDF from the LaTeX sources, cd into the directory mentioned, and
+ issue `make`.
7. (*Optional*) Issue `make -j smoketests` to run a quick set of functional
tests. Please scan the output for any failed tests.
@@ -138,20 +165,20 @@ In more detail, the steps to follow are:
the default command lines the tests script uses are likely wrong. In this
case, please edit `tests/parse_env.sh` by searching for the MPI
implementation you used, and uncomment the lines directly below each
- occurance.
+ occurrence.
8. (*Optional*) Issue `make -j unittests` to run an exhaustive set of unit
tests. Please scan the output for any failed tests.
If you do this with LPF enabled, please edit `tests/parse_env.sh` if required
as described in step 5.
-9. Issue `make -j install` to install ALP/GraphBLAS into your
-install directory configured during step 1.
+9. Issue `make -j install` to install ALP into the install directory configured
+ during step 1.
-10. (*Optional*) Issue `source /bin/setenv` to make available the
-`grbcxx` and `grbrun` compiler wrapper and runner.
+10. (*Optional*) Issue `source /bin/setenv` to make
+ available the `grbcxx` and `grbrun` compiler wrapper and runner.
-Congratulations, you are now ready for developing and integrating ALP/GraphBLAS
+Congratulations, you are now ready for developing and integrating ALP
algorithms! Any feedback, questions, or problem reports are most welcome at
@@ -161,10 +188,12 @@ algorithms! Any feedback, question, problem reports are most welcome at
# Additional Contents
-The remainder of this file summarises other build system targets, how to
-integrate ALP algorithms into applications, debugging, development, and,
-finally, acknowledges contributors and lists technical papers.
+The remainder of this file summarises configuration options, additional build
+system targets, how to integrate ALP programs into applications, debugging, and
+contribute to ALP development. Finally, this README acknowledges contributors
+and lists technical papers.
+- [Configuration](#configuration)
- [Overview of the main Makefile targets](#overview-of-the-main-makefile-targets)
- [Automated performance testing](#automated-performance-testing)
- [Integrating ALP with applications](#integrating-alp-with-applications)
@@ -181,7 +210,97 @@ finally, acknowledges contributors and lists technical papers.
- [Debugging](#debugging)
- [Development in ALP](#development-in-alp)
- [Acknowledgements](#acknowledgements)
-- [Citing ALP and ALP/GraphBLAS](#citing-alp-and-alpgraphblas)
+- [Citing ALP, ALP/GraphBLAS, and ALP/Pregel](#citing-alp-alpgraphblas-and-alppregel)
+
+
+# Configuration
+
+ALP employs configuration headers that contain `constexpr` settings that take
+effect every time ALP programs are compiled. Multiple object files that were
+compiled using ALP must all have been compiled using the same configuration
+settings-- linking objects that have been compiled with a mixture of
+configurations is likely to incur undefined behaviour. The recommendation is
+to set a configuration before building and installing ALP, and to keep the
+installation directories read-only so that configurations remain static.
+
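+A minimal sketch of this recommendation, assuming the build flow shown
+elsewhere in this README and using a plain `chmod` to render the installation
+read-only:
+
+```bash
+# 1. fix the configuration by editing the header(s) described below
+# 2. build and install ALP as usual
+mkdir build && cd build
+../bootstrap.sh --prefix=../install
+make -j && make -j install
+# 3. lock the installation so the installed configuration remains static
+chmod -R a-w ../install
+```
+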
+There exists one main configuration file that affects all ALP backends, while
+other configuration files only affect a specific backend or only affect specific
+classes of backends. The main configuration file is found in
+`/include/graphblas/base/config.hpp`, which allows one to set the
+
+1. cache line size, in bytes, within the `CACHE_LINE_SIZE` class;
+2. SIMD width, in bytes, within the `SIMD_SIZE` class;
+3. default number of experiment repetitions during benchmarking, within the
+ `BENCHMARKING` class;
+4. L1 data cache size, in bytes, within `MEMORY::big_memory` class;
+5. from which size onwards memory allocations will be reported, in log-2
+ bytes, within `MEMORY::big_memory`;
+6. index type used for row coordinates, as the `RowIndexType` typedef;
+7. index type used for column coordinates, as the `ColIndexType` typedef;
+8. type used for indexing nonzeroes, as the `NonzeroIndexType` typedef;
+9. index type used for vector coordinates, as the `VectorIndexType` typedef.
+
+Other configuration values in this file are automatically inferred, are fixed
+non-configurable settings, or are presently not used by any ALP backend.
+
+## Reference and reference_omp backends
+
+The file `include/graphblas/reference/config.hpp` contains defaults that pertain
+to the auto-vectorising and sequential `reference` backend, but also to the
+shared-memory auto-parallelising `reference_omp` backend. It allows one to set
+
+1. whether prefetching is enabled in `PREFETCHING::enabled`;
+2. the prefetch distance in `PREFETCHING::distance`;
+3. the default memory allocation strategy for thread-local data in
+ `IMPLEMENTATION::defaultAllocMode()`;
+4. same, but for shared data amongst threads in
+ `IMPLEMENTATION::sharedAllocMode()`.
+
+Modifying any of the above should be done with utmost care as it typically
+affects the defaults across an ALP installation, and *all* programs compiled
+using it. Configuration elements not mentioned here should not be touched by
+users, and rather should concern ALP developers only.
+
+## OpenMP backends
+
+The file `include/graphblas/omp/config.hpp` contains some basic configuration
+parameters that affect any OpenMP-based backend. However, the configuration
+file does not contain any other user-modifiable settings, but rather contains
+a) some utilities that OpenMP-based backends may rely on, and b) defaults
+that are derived from other settings described above. These settings
+should only be overridden with compelling and expert knowledge.
+
+## LPF backends
+
+The file `include/graphblas/bsp/config.hpp` contains some basic configuration
+parameters that affect any LPF-based backend. It includes:
+
+1. an initial maximum of LPF memory slot registrations in `LPF::regs()`;
+2. an initial maximum of LPF messages in `LPF::maxh()`.
+
+If these defaults prove insufficient, the corresponding buffers are resized
+automatically during execution. Setting these defaults large enough will
+therefore chiefly prevent buffer resizes at run-time. Modifying them should
+normally not lead to significant performance differences.
+
+## Utilities
+
+The file `include/graphblas/utils/config.hpp` details configurations of various
+utility functions, including:
+
+1. a buffer size used during reading input files, in `PARSER::bsize()`;
+2. the block size of individual reads in `PARSER::read_bsize()`.
+
+These defaults are usually fine except when reading from SSDs, which would
+benefit from a larger `read_bsize`.
+
+## Others
+
+While there are various other configuration files (find `config.hpp`), the above
+should list all user-modifiable configuration settings of interest. The
+remainder pertain to configurations that are automatically deduced from the
+aforementioned settings, or pertain to settings that describe how to safely
+compose backends, and thus are of interest only to ALP developers.
# Overview of the main Makefile targets
@@ -190,7 +309,8 @@ The following table lists the main build targets of interest:
| Target | Explanation |
|----------------------:|---------------------------------------------------|
-| \[*default*\] | builds the ALP/GraphBLAS libraries and examples |
+| \[*default*\] | builds the ALP libraries and examples, including |
+| | Sparse BLAS libraries generated by ALP |
| `install` | install libraries, headers and some convenience |
| | scripts into the path set via `--prefix=` |
| `unittests` | builds and runs all available unit tests |
@@ -198,7 +318,12 @@ The following table lists the main build targets of interest:
| `perftests` | builds and runs all available performance tests |
| `tests` | builds and runs all available unit, smoke, and |
| | performance tests |
-| `docs` | builds HTML and LaTeX code and API documentation |
+| `userdocs` | builds HTML and LaTeX documentation corresponding |
+| | to the public ALP API |
+| `devdocs` | builds HTML and LaTeX code documentation for |
+| | developers of the ALP internals |
+| `docs` | builds both the user and developer code |
+| | documentation |
For more information about the testing harness, please refer to the
[related documentation](tests/Tests.md).
@@ -209,21 +334,20 @@ refer to the [the related documentation](docs/Build_and_test_infra.md).
# Automated performance testing
-To check in-depth performance of this ALP/GraphBLAS implementation, issue
-`make -j perftests`. This will run several algorithms in several ALP/GraphBLAS
+To check in-depth performance of this ALP implementation, issue
+`make -j perftests`. This will run several algorithms in several ALP
configurations. This generates three main output files:
-1. `/tests/performance/output`, which summarises the
- whole run;
+1. `/tests/performance/output`, which summarises the whole run;
-2. `/tests/performance/output/benchmarks`, which
- summarises the performance of individual algorithms; and
+2. `/tests/performance/output/benchmarks`, which summarises the
+ performance of individual algorithms; and
-3. `/tests/performance/output/scaling`, which
- summarises operator scaling results.
+3. `/tests/performance/output/scaling`, which summarises operator
+ scaling results.
-To ensure that all tests run, please ensure all related datasets are available
-as also described at step 5 of the quick start.
+To ensure that all tests run, please ensure that all related datasets are
+available, as also described at step 5 of the quick start.
With LPF enabled, please note the remark described at steps 3 and 7 of the quick
start guide. If LPF was not configured using MPICH, please review and apply any
@@ -232,24 +356,28 @@ necessary changes to `tests/performance/performancetests.sh`.
# Integrating ALP with applications
-There are several use cases in which ALP can be deployed and utilized, listed
-in the following. These assume that the user has installed ALP/GraphBLAS in a
-dedicated directory via `make install`.
+There are several use cases in which ALP can be deployed and utilised, listed
+in the following. These assume that the user has installed ALP in a dedicated
+directory via `make install`.
## Running ALP programs as standalone executables
### Implementation
The `grb::Launcher< AUTOMATIC >` class abstracts a group of user processes that
-should collaboratively execute any single ALP/GraphBLAS program. The
-ALP/GraphBLAS program of interest must have the following signature:
-`void grb_program( const T& input_data, U& output_data )`.
+should collaboratively execute any single ALP program. The ALP program of
+interest must have the following signature:
+
+```
+void grb_program( const T& input_data, U& output_data )
+```
+
The types `T` and `U` can be any plain-old-data (POD) type, including structs --
these can be used to broadcast input data from the master process to all user
processes (`input_data`) -- and for data to be sent back on exit of the parallel
-ALP/GraphBLAS program.
+ALP program.
-The above sending-and-receiving across processes applies only to ALP/GraphBLAS
+The above sending-and-receiving across processes applies only to ALP
implementations and backends that support or require multiple user processes;
both the sequential `reference` and the shared-memory parallel `reference_omp`
backends, for example, support only one user process.
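+
+As an illustration, a minimal hypothetical program following this signature is
+sketched below; the exact `grb::Launcher::exec` signature, in particular the
+trailing broadcast flag, is an assumption that should be checked against the
+installed headers:
+
+```c++
+#include <graphblas.hpp>
+
+// hypothetical ALP program: reads the broadcast input, writes one output
+void grb_program( const size_t &input, double &output ) {
+	output = static_cast< double >( input ) + 1.0;
+}
+
+int main() {
+	grb::Launcher< grb::AUTOMATIC > launcher;
+	size_t in = 41;
+	double out = 0.0;
+	// the input is broadcast to all user processes, while the output is
+	// retrieved from the user process with ID 0 (see below)
+	if( launcher.exec( &grb_program, in, out, true ) != grb::SUCCESS ) {
+		return 1;
+	}
+	return 0;
+}
+```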
@@ -258,11 +386,11 @@ In case of multiple user processes, the overhead of the broadcasting of input
data is linear in the number of user processes, as well as linear in the byte-
size of `T` which hence should be kept to a minimum. A recommended use of this
mechanism is, e.g., to broadcast input data locations; any additional I/O
-should use the parallel I/O mechanisms that ALP/GraphBLAS exposes to the ALP
-program itself.
+should use the parallel I/O mechanisms that ALP exposes to the ALP program
+itself.
Output data is retrieved only from the user process with ID `0`, even if
-multiple user processes exist. Some implemenations or systems may require
+multiple user processes exist. Some implementations or systems may require
sending back the output data to a calling process, even if there is only
one user process. The data movement cost incurred should hence be considered
linear in the byte size of `U`, and, similar to the input data broadcasting,
@@ -287,60 +415,67 @@ your programs using the ALP installation, the following flags are recommended:
Omitting these flags for brevity, some compilation examples follow.
-When using the LPF-enabled hybrid shared- and distributed-memory backend of
-ALP/GraphBLAS, simply use
+When using the LPF-enabled hybrid shared- and distributed-memory ALP backends,
+use
```bash
grbcxx -b hybrid
```
-as the compiler command. To show all flags that the wrapper passes on, please use
+
+as the compiler command. To show all flags that the wrapper passes on, please
+use
```bash
grbcxx -b hybrid --show
```
+
and append your regular compilation arguments.
-The `hybrid` backend is capable of spawning multiple ALP/GraphBLAS user
-processes. In contrast, compilation using
+The `hybrid` backend is capable of spawning multiple ALP user processes. In
+contrast, compilation using
```bash
grbcxx -b reference
```
+
produces a sequential binary, while
```bash
grbcxx -b reference_omp
```
+
produces a shared-memory parallel binary.
-Note that the ALP/GraphBLAS source code never requires change while switching
-backends.
+Note that the ALP source code never requires change while switching backends.
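+
+For instance, the `sp.cpp` example program from the quick start may be
+compiled once per backend; only the `-b` flag changes (the output names below
+are arbitrary):
+
+```bash
+grbcxx -b reference sp.cpp -o sp_sequential
+grbcxx -b reference_omp sp.cpp -o sp_shared_memory
+grbcxx -b nonblocking sp.cpp -o sp_nonblocking
+```
+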
### Linking
-The executable must be statically linked against an ALP/GraphBLAS library that
-is different depending on the selected backend.
+The executable must be statically linked against an ALP library that is
+different depending on the selected backend.
The compiler wrapper `grbcxx` takes care of all link-time dependencies
automatically.
-When using the LPF-enabled BSP1D backend to ALP/GraphBLAS, for example, simply
-use `grbcxx -b bsp1d` as the compiler/linker command.
+When using the LPF-enabled BSP1D backend to ALP, for example, simply use
+`grbcxx -b bsp1d` as the compiler/linker command.
+
Use
```bash
grbcxx -b bsp1d --show
```
+
to show all flags that the wrapper passes on.
### Running
The resulting program has run-time dependencies that are taken care of by the
-LPF runner `lpfrun` or by the ALP/GraphBLAS runner `grbrun`.
+LPF runner `lpfrun` or by the ALP runner `grbrun`.
+
We recommend using the latter:
```bash
grbrun -b hybrid -np
```
-Here, `P` is the number of requested ALP/GraphBLAS user processes.
+
+Here, `P` is the number of requested ALP user processes.
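+
+For example, a hypothetical launch of four user processes, each executing the
+binary `./sp_hybrid`, would read:
+
+```bash
+grbrun -b hybrid -np 4 ./sp_hybrid
+```
+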
### Threading
@@ -350,18 +485,18 @@ on a single node, the `reference_omp` backend may be selected instead.
In both cases, make sure that during execution the `OMP_NUM_THREADS` and
`OMP_PROC_BIND` environment variables are set appropriately on each node that
-executes ALP/GraphBLAS user process(es).
+executes ALP user process(es).
## Running parallel ALP programs from existing parallel contexts
This, instead of automatically spawning a requested number of user processes,
assumes a number of processes already exist and that we wish those processes to
-jointly execute a single parallel ALP/GraphBLAS program.
+jointly execute a single parallel ALP program.
### Implementation
-The binary that contains the ALP/GraphBLAS program to be executed must define
-the following global symbol with the given value:
+The binary that contains the ALP program to be executed must define the
+following global symbol with the given value:
```c++
const int LPF_MPI_AUTO_INITIALIZE = 0
@@ -377,19 +512,19 @@ grb::Launcher< MANUAL > launcher( s, P, hostname, portname )
```
Here, `P` is the total number of processes that should jointly execute a
-parallel ALP/GraphBLAS program, while `0 <= s < P` is a unique ID of this
-process amongst its `P`-1 siblings.
-The types of `s` and `P` are `size_t`, i.e., unsigned integers.
+parallel ALP program, while `0 <= s < P` is a unique ID of this process amongst
+its `P`-1 siblings. The types of `s` and `P` are `size_t`, i.e., unsigned
+integers.
One of these processes must be selected as a connection broker prior to forming
-a group of ALP/GraphBLAS user processes. The remainder `P-1` processes must
-first connect to the chosen broker using TCP/IP connections. This choice must
-be made outside of ALP/GraphBLAS, prior to setting up the launcher, and
-materialises as the `hostname` and `portname` Launcher constructor arguments.
-The host and port name are strings, and must be equal across all processes.
+a group of ALP user processes. The remainder `P-1` processes must first connect
+to the chosen broker using TCP/IP connections. This choice must be made outside
+of ALP, prior to setting up the launcher, and materialises as the `hostname` and
+`portname` Launcher constructor arguments. The host and port name are strings,
+and must be equal across all processes.
As before, and after the successful construction of a manual launcher instance,
-a parallel ALP/GraphBLAS program is launched via
+a parallel ALP program is launched via
```c++
grb::Launcher< MANUAL >::exec( &grb_program, input, output )
@@ -398,25 +533,24 @@ grb::Launcher< MANUAL >::exec( &grb_program, input, output )
in exactly the same way as described earlier, though with the input and output
arguments now being passed in a one-to-one fashion:
1. The input data is passed on from the original process to exactly one
- corresponding ALP/GraphBLAS user process; i.e., no broadcast occurs. The
- original process and the ALP/GraphBLAS user process are, from an operating
- system point of view, the same process. Therefore, and additionally, input
- no longer needs to be a plain-old-data (POD) type. Pointers, for example,
- are now perfectly valid to pass along, and enable sharing data between the
- original process and the ALP/GraphBLAS algorithm.
- 2. The output data is passed from each ALP/GraphBLAS user process to the
- original process that called `Launcher< MANUAL >::exec`. To share
- ALP/GraphBLAS vector data, it is, for example, legal to return a
- `grb::PinnedVector< T >` as the `exec` output argument type. Doing so is
- akin to returning a pointer to output data, and does not explicitly pack
- nor transmit vector data.
+ corresponding ALP user process; i.e., no broadcast occurs. The original
+ process and the ALP user process are, from an operating system point of
+ view, the same process. Therefore, and additionally, input no longer needs
+ to be a plain-old-data (POD) type. Pointers, for example, are now perfectly
+ valid to pass along, and enable sharing data between the original process
+ and the ALP algorithm.
+ 2. The output data is passed from each ALP user process to the original
+ process that called `Launcher< MANUAL >::exec`. To share ALP vector data,
+ it is, for example, legal to return a `grb::PinnedVector< T >` as the
+ `exec` output argument type. Doing so is akin to returning a pointer to
+ output data, and does not explicitly pack nor transmit vector data.
### Running
The pre-existing process must have been started using an external mechanism.
This mechanism must include run-time dependence information that is normally
-passed by the ALP/GraphBLAS runner whenever a distributed-memory parallel
-backend is selected.
+passed by the ALP runner whenever a distributed-memory parallel backend is
+selected.
If the external mechanism by which the original processes are started allows it,
this is most easily effected by using the standard `grbcxx` launcher while
@@ -444,14 +578,14 @@ to add ALP and ALP/GraphBLAS as a dependence to your project.
# Debugging
-To debug an ALP/GraphBLAS program, please compile it using the sequential
-reference backend and use standard debugging tools such as `valgrind` and `gdb`.
+To debug an ALP program, please compile it using the sequential reference
+backend and use standard debugging tools such as `valgrind` and `gdb`.
Additionally, please ensure to *not* pass the `-DNDEBUG` flag during
compilation.
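+
+A hypothetical debugging flow under these recommendations follows; flags beyond
+`-b reference` are regular compiler arguments that `grbcxx` passes through:
+
+```bash
+grbcxx -b reference -g -O0 my_program.cpp -o my_program
+valgrind ./my_program
+gdb ./my_program
+```
+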
If bugs appear in one backend but not another, it is likely you have found a bug
-in the former backend implementation. Please send a minimum working example that
-demonstrates the bug to the maintainers, either as an issue on or an email to:
+in the former backend. Please send a minimum working example that demonstrates
+the bug to the maintainers, either as an issue on or an email to:
1. [GitHub](https://github.com/Algebraic-Programming/ALP/issues);
2. [Gitee](https://gitee.com/CSL-ALP/graphblas/issues);
3. [Albert-Jan](mailto:albertjan.yzelman@huawei.com).
@@ -459,8 +593,8 @@ demonstrates the bug to the maintainers, either as an issue on or an email to:
# Development in ALP
-Your contributions to ALP/GraphBLAS would be most welcome. Merge or Pull Requests
-(MRs/PRs) can be contributed via Gitee and GitHub. See above for the links.
+Your contributions to ALP would be most welcome. Merge Requests (MRs) can be
+contributed via Gitee and GitHub; see above for the links.
For the complete development documentation, you should start from the
[docs/README file](docs/README.md) and the related
@@ -470,10 +604,10 @@ For the complete development documentation, you should start from the
# Acknowledgements
The LPF communications layer was primarily authored by Wijnand Suijlen, without
-whom the current ALP/GraphBLAS would not be what it is now.
+whom the current ALP would not be what it is now.
-The collectives library and its interface to the ALP/GraphBLAS was primarily
-authored by Jonathan M. Nash.
+The collectives library and its interface to ALP was primarily authored by
+Jonathan M. Nash.
The testing infrastructure that performs smoke, unit, and performance testing of
sequential, shared-memory parallel, and distributed-memory parallel backends was
@@ -485,17 +619,30 @@ Computing Systems Laboratory in Zürich in particular. See the [NOTICE](NOTICE)
file for individual contributors.
-# Citing ALP and ALP/GraphBLAS
+# Citing ALP, ALP/GraphBLAS, and ALP/Pregel
+
+If you use ALP in your work, please consider citing one or more of the following
+papers, as appropriate.
-If you use ALP/GraphBLAS in your work, please consider citing one or more of the
-following papers, as appropriate:
+## ALP and ALP/GraphBLAS
- [A C++ GraphBLAS: specification, implementation, parallelisation, and evaluation](http://albert-jan.yzelman.net/PDFs/yzelman20.pdf)
by A. N. Yzelman, D. Di Nardo, J. M. Nash, and W. J. Suijlen (2020).
Pre-print.
[Bibtex](http://albert-jan.yzelman.net/BIBs/yzelman20.bib).
- - [Nonblocking execution in GraphBLAS](http://albert-jan.yzelman.net/PDFs/mastoras22-pp.pdf)
- by Aristeidis Mastoras, Sotiris Anagnostidis, and A. N. Yzelman (2022).
- Pre-print.
+ - [Nonblocking execution in GraphBLAS](https://ieeexplore.ieee.org/document/9835271)
+ by Aristeidis Mastoras, Sotiris Anagnostidis, and A. N. Yzelman
+ in IEEE International Parallel and Distributed Processing Symposium
+ Workshops, 2022.
[Bibtex](http://albert-jan.yzelman.net/BIBs/mastoras22.bib).
+ - [Design and implementation for nonblocking execution in GraphBLAS: tradeoffs and performance](https://dl.acm.org/doi/10.1145/3561652)
+ by Aristeidis Mastoras, Sotiris Anagnostidis, and A. N. Yzelman
+ in ACM Transactions on Architecture and Code Optimization 20(1), 2023.
+ [Bibtex](http://albert-jan.yzelman.net/BIBs/mastoras22a.bib).
+
+## ALP/Pregel
+
+ - [Humble Heroes](http://albert-jan.yzelman.net/PDFs/yzelman22-pp.pdf)
+ by A. N. Yzelman (2022). Pre-print.
+ [Bibtex](http://albert-jan.yzelman.net/BIBs/yzelman22.bib).
diff --git a/bootstrap.sh b/bootstrap.sh
index 89b865a15..8acfdfa58 100755
--- a/bootstrap.sh
+++ b/bootstrap.sh
@@ -63,7 +63,7 @@ validate_command_result() {
print_help() {
echo "Usage: $0 --prefix= [--with-lpf[=]]\
- [--with-banshee=] [--with-snitch=] [--no-reference] [--debug-build] [--generator=] [--show] [--delete-files]"
+ [--with-banshee=] [--with-snitch=] [--no-reference] [--no-nonblocking] [--debug-build] [--generator=] [--show] [--delete-files]"
echo " "
echo "Required arguments:"
echo " --prefix= "
@@ -74,6 +74,11 @@ the location where LPF is installed"
echo " --with-banshee= - path to the the tools to compile the banshee backend"
echo " --with-snitch= - path to the tools for Snitch support within the banshee backend"
echo " --no-reference - disables the reference and reference_omp backends"
+ echo " --no-hyperdags - disables the hyperdags backend"
+ echo " --with-hyperdags-using= - uses the given backend reference for HyperDAG generation"
+ echo " optional; default value is reference"
+ echo " clashes with --no-hyperdags"
+ echo " --no-nonblocking - disables the nonblocking backend"
echo " --debug-build - build the project with debug options (tests will run much slower!)"
echo " --generator= - set the generator for CMake (otherwise use CMake's default)"
echo " --show - show generation commands instead of running them"
@@ -90,6 +95,9 @@ the location where LPF is installed"
}
reference=yes
+hyperdags=yes
+hyperdags_using=reference
+nonblocking=yes
banshee=no
lpf=no
show=no
@@ -146,6 +154,16 @@ or assume default paths (--with-lpf)"
--no-reference)
reference=no
;;
+ --no-hyperdags)
+ hyperdags=no
+ ;;
+ --with-hyperdags-using=*)
+ hyperdags=yes
+ hyperdags_using="${arg#--with-hyperdags-using=}"
+ ;;
+ --no-nonblocking)
+ nonblocking=no
+ ;;
--debug-build)
debug_build=yes
;;
@@ -202,6 +220,19 @@ if [[ "${reference}" == "yes" || "${lpf}" == "yes" ]]; then
check_cc_cpp_comp
fi
+if [[ "${hyperdags}" == "yes" ]]; then
+ if [[ "${hyperdags_using}" != "reference" ]]; then
+ printf "Hyperdags backend requested using the ${hyperdags_using} backend, "
+ printf "but only the reference backend is supported currently."
+ exit 255
+ fi
+ if [[ "${hyperdags_using}" == "reference" && "${reference}" == "no" ]]; then
+ printf "Hyperdags backend is selected using the reference backend, "
+ printf "but the reference backend was not selected."
+ exit 255
+ fi
+fi
+
if [[ "${lpf}" == "yes" ]]; then
if [[ -z "${LPF_INSTALL_PATH}" ]]; then
check_lpf
@@ -228,7 +259,7 @@ CURRENT_DIR="$(pwd)"
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
# CONFIGURE CMAKE BUILDING INFRASTRUCTURE
-if [[ "${reference}" == "yes" || "${lpf}" == "yes" ]]; then
+if [[ "${reference}" == "yes" || "${lpf}" == "yes" || "${nonblocking}" == "yes" ]]; then
BUILD_DIR="${CURRENT_DIR}"
printf "Checking for cmake..."
@@ -287,6 +318,15 @@ the current directory before invocation or confirm the deletion of its content w
if [[ "${reference}" == "no" ]]; then
CMAKE_OPTS+=" -DWITH_REFERENCE_BACKEND=OFF -DWITH_OMP_BACKEND=OFF"
fi
+ if [[ "${hyperdags}" == "no" ]]; then
+ CMAKE_OPTS+=" -DWITH_HYPERDAGS_BACKEND=OFF"
+ fi
+ if [[ "${hyperdags}" == "yes" ]]; then
+ CMAKE_OPTS+=" -DWITH_HYPERDAGS_USING=${hyperdags_using}"
+ fi
+ if [[ "${nonblocking}" == "no" ]]; then
+ CMAKE_OPTS+=" -DWITH_NONBLOCKING_BACKEND=OFF"
+ fi
if [[ "${lpf}" == "yes" ]]; then
CMAKE_OPTS+=" -DLPF_INSTALL_PATH='${ABSOLUTE_LPF_INSTALL_PATH}'"
fi
diff --git a/changelog.md b/changelog.md
index 72aa3e4d6..3a77e6b5e 100644
--- a/changelog.md
+++ b/changelog.md
@@ -1,4 +1,128 @@
+Version 0.7.0
+=============
+
+This is a summary of changes. For full details, see the publicly available Git
+history prior to the v0.7 tag.
+
+Highlights:
+
+ 1. This release re-implements the nonblocking ALP/GraphBLAS backend by Mastoras
+ et al. (GrAPL/IPDPSW '22, TACO '23) on the latest ALP code base. The use of
+ the nonblocking backend for some algorithms results in multiple-factor
+ speedups versus standard blocking execution as well as versus external
+ industry-standard frameworks. This includes Eigen, which, like nonblocking
+ ALP/GraphBLAS, perform cross-operation fusion. Simply compile your ALP
+ programs using `grbcxx -b nonblocking`, and enjoy the speedups!
+
+ 2. We also introduce a new programming interface to the ALP software stack that
+ allows vertex-centric programming in addition to programming using
+ generalised sparse linear algebra. This new interface, ALP/Pregel,
+ translates vertex-centric programs to standard ALP/GraphBLAS primitives
+ during compilation, and thus benefits from all automatic optimisations
+ included with the ALP software stack.
+
+ 3. Support for software prefetching during `vxm` and `mxv` has been added to
+ the `reference` and `reference_omp` backends. Since optimal prefetch
+ settings and their overall effectiveness rely strongly on 1) the structure
+ of the sparse matrices and graphs considered as well as on 2) the algorithms
+ used on those data, this new feature is turned off by default. To use it,
+ please enable it via `include/graphblas/reference/config.hpp` and tune the
+ there-defined prefetch distances.
+
+ 4. Finally, this release includes another new backend, the `hyperdags` backend.
+ A program compiled with this backend will, after execution, dump a HyperDAG
+ representation of the ALP computations that the program executed.
+
+Changes to the specification:
+
+ 1. Any ALP primitive with ALP container output now takes a Phase argument.
+
+ 2. Clarify that the use of the `dense` descriptor also implies that the output
+ containers on entry must be dense. This applies also for out-of-place
+ primitives.
+
+Algorithms:
+ - [new] a vertex-centric PageRank-like algorithm implemented on top of the new
+ ALP/Pregel has been added;
+ - [new] a vertex-centric algorithm for strongly connected components on
+ undirected graphs implemented on top of ALP/Pregel has been added;
+ - [new] the algebraic k-core decomposition algorithm by Li et al. (HPEC '21)
+ has been added;
+ - [bug] the mpv algorithm performed one too many iterations, while all
+ associated tests used an ALP/GraphBLAS baseline-- v0.7 now instead verifies
+ against external ground truths;
+ - [bug] the label propagation algorithm relied on a bugged implementation of
+ `grb::set`, now fixed, while it now and when possible relies on `std::swap`
+ instead of performing explicit and expensive copies;
+ - [bug] the CG algorithm returned `SUCCESS` even if it failed to converge
+ within the given maximum number of iterations.
+
+Operators:
+ - [new] v0.7 (re-)introduces the four less-than(-or-equal) and
+ greater-than(-or-equal) operators;
+
+All backends:
+ - [bug] fixed the behaviour of ALP containers under copy-assignment and
+ copy-construction;
+ - [bug] all variants of `foldl` and `foldr` previously could erroneously return
+ `ILLEGAL` in the presence of sparse vectors and/or masks;
+ - [bug] several primitives would not return `ILLEGAL` in the presence of the
+ `dense` descriptor when faced with sparse containers;
+ - [bug] all backends missed the implementation of at least one `eWiseMul`
+ variant;
+ - [bug] all backends missed the implementation of at least two `eWiseApply`
+ variants where both inputs are scalar;
+ - [feature] improved `_DEBUG` tracing and code style throughout.
+
+Reference and reference_omp backends:
+ - [bug] overlap detection of the output and output mask was erroneously
+ disabled for the `vxm` and `mxv` primitives, herewith fixed;
+ - [bug] `foldl` and `foldr` previously employed unexpected casting
+ behaviour;
+ - [bug] multiple copy-assignment of the same vector could fail;
+ - [bug] the vector<-scalar<-vector `eWiseApply` using operators was in-place;
+ - [bug] the `eWiseApply` using sparse vector inputs and/or masks could in some
+ rare cases depending on structure and vector lengths generate incorrect
+ output;
+ - [bug] the implementation of the vector `grb::set` where the output container
+ was not already dense was in-place, while out-of-place semantics were
+ defined;
+ - [bug] the output-masked `eWiseMul` was bugged in the case where one of the
+ inputs was scalar;
+ - [bug] matrix containers with initial requested capacity zero could attempt
+ to access uninitialised memory, including even after a successful subsequent
+ `resize`;
+ - [performance] `foldl` and `foldr` using sparse vectors and/or masks were
+ previously not always following asymptotically optimal behaviour;
+ - [performance] `set` previously did not exploit information such as whether
+ the `dense` descriptor was present, whether vectors need only touch
+ coordinate data to generate correct output, or whether it never needs to
+ touch coordinate data;
+ - [performance] `eWiseApply` detects more cases of trivial operations on empty
+ vectors, and completes those faster;
+ - [performance] optimised `eWiseMul` with scalar inputs.
+
+BSP1D and hybrid backends:
+ - [bug] the output-masked `vxm` and various `foldl` and `foldr` were missing;
+ - [bug] copy-assignment operator for vectors was missing.
+
+Testing, development, and documentation:
+ - the unit test suite has been hardened to detect all aforementioned bugs;
+ - outdated documentation was revised-- in particular, all user-facing
+ documentation has been checked and can now be generated via the new make
+ target `make userdocs`;
+ - developer documentation is now built via `make devdocs`, while the older
+ `make docs` target now builds both the user and developer documentation;
+ - new developers can now enjoy an updated developer guide;
+ - the test suite now prints an error when the automatic detection of the number
+ of sockets fails, and then auto-selects one instead of zero (which caused the
+ test scripts to fail);
+ - added performance tests for the sparse matrix--vector, sparse matrix--sparse
+ vector, and sparse matrix--sparse matrix multiplication kernels;
+ - improved both the GitHub and internal CI scripts.
+
+
Version 0.6.0
=============
@@ -8,7 +132,7 @@ history prior to the v0.6 tag.
Highlights and changes to the specification:
- Deprecated `grb::init` and `grb::finalize` in favour of grb::Launcher.
Existing code should migrate to using the Launcher as any later release may
- remove the now-deprecated primtives.
+ remove the now-deprecated primitives.
- If you wish to rely on ALP/GraphBLAS for more standard sparse linear
algebra but if you cannot, or do not wish to, adapt your existing sources
to the C++ ALP/GraphBLAS API, then v0.6 onwards generates libraries that
@@ -70,7 +194,7 @@ Reference and reference_omp backends:
properly updated.
- Bugfix: the OpenMP `schedule( static, chunk_size )` has a dynamic (run-time)
component that was not intended.
- - Bugifx: some OpenMP `schedule( dynamic, chunk_size )` operate on regular
+ - Bugfix: some OpenMP `schedule( dynamic, chunk_size )` operate on regular
loops and should employ a static schedule instead.
BSP1D backend:
@@ -198,7 +322,7 @@ BSP1D and hybrid backends:
declared as part of BSP1D friend declarations. Curiously, many compilers
accepted the previous erroneous code.
- Bugfix: empty BSP1D containers could previously leave process-local matrices
- unitialised.
+ uninitialised.
Reference and reference_omp backends:
- Bugfix: matrix construction did not use the `alloc.hpp` mechanisms. This
@@ -207,7 +331,7 @@ Reference and reference_omp backends:
All backends:
- Bugfix: `grb::Launcher` (as well as the benchmarker) did not always properly
- finalize the ALP/GraphBLAS context after exec completed. This caused some
+ finalise the ALP/GraphBLAS context after exec completed. This caused some
memory to not be properly freed on program exits.
- Bugfix: the out-of-place versions of `grb::operators::{argmin,argmax}` were
incorrect. All code within the repository was unaffected by this bug. The
@@ -224,7 +348,7 @@ Version 0.4.1
- The CG algorithm assumed out-of-place behaviour of grb::dot, while the
specification since v0.1 defines it to be in-place. Implementations of
grb::dot were erroneously out-of-place until v0.4, but the CG algorithm
- was errouneously not updated. This hotfix rectifies this.
+ was erroneously not updated. This hotfix rectifies this.
Version 0.4.0
@@ -276,36 +400,46 @@ Version 0.3.0
=============
Reference and reference_omp backends:
- - Fixed issue where grb::set, grb::vxm, and grb::mxv could fail for more exotic data types.
- - Fixed issue that prevented std::move on matrices, both from assignment and construction.
+ - Fixed issue where grb::set, grb::vxm, and grb::mxv could fail for more
+ exotic data types.
+ - Fixed issue that prevented std::move on matrices, both from assignment and
+ construction.
- Optimised masked grb::set to now reach optimal complexity in all cases.
- Optimised grb::eWiseLambda over matrices to avoid atomics.
BSP1D backend:
- - Fixed issue where iterating over empty matrices could fail in the BSP1D backend.
- - Fixed issue in BSP1D backend that caused dynamic allocations where they were not allowed.
- - Fixed issue where the automatic-mode launcher and benchmarker could, in rare cases, fail.
+ - Fixed issue where iterating over empty matrices could fail in the BSP1D
+ backend.
+ - Fixed issue in BSP1D backend that caused dynamic allocations where they were
+ not allowed.
+ - Fixed issue where the automatic-mode launcher and benchmarker could, in rare
+ cases, fail.
- Fixed issue where, under rare conditions, the stack-based combine could fail.
- - Fixed performance bug in the BSP1D backend causing spurious calls to lpf_sync.
+ - Fixed performance bug in the BSP1D backend causing spurious calls to
+ lpf_sync.
Level-3 functionality, all backends:
- Fixed issue where a masked set-to-value on matrices would fail.
- - Fixed issue where mxm could work with unitialised values when more exotic semirings are used.
- - Fixed issue that prevented std::move on matrices, both from assignment and construction.
+ - Fixed issue where mxm could work with uninitialised values when more exotic
+ semirings are used.
+ - Fixed issue that prevented std::move on matrices, both from assignment and
+ construction.
- New level-3 function: eWiseApply.
(Note that the interface of level-3 functionality remains experimental.)
Algorithms and utilities:
- - Fixed issue where MatrixFileReader would store unitialised values when reading pattern matrices.
+ - Fixed issue where MatrixFileReader would store uninitialised values when
+ reading pattern matrices.
- Updated the sparse neural network inference algorithm.
- New algorithm added: spy.
Others:
- Fixed issue where a `make clean` would miss some object files.
- - Added new unit and performance tests, including those for detecting the above-described bug
- fixes and added functionality.
- - Documentation update in line with the upcoming revision of the C++ GraphBLAS paper.
+ - Added new unit and performance tests, including those for detecting the
+ above-described bug fixes and added functionality.
+ - Documentation update in line with the upcoming revision of the C++ GraphBLAS
+ paper.
- Added some missing documentation.
- Code style fixes and some dead code removal.
@@ -313,7 +447,8 @@ Others:
Version 0.2.0
=============
-Fix some issues in the Banshee backend that appeared after refactoring for the 0.1.0 release.
+Fix some issues in the Banshee backend that appeared after refactoring for the
+0.1.0 release.
Removes --deps option from ./configure as it was no longer used.
diff --git a/cmake/AddGRBInstall.cmake b/cmake/AddGRBInstall.cmake
index f4b254b8f..94bd58f31 100644
--- a/cmake/AddGRBInstall.cmake
+++ b/cmake/AddGRBInstall.cmake
@@ -18,8 +18,8 @@
# defines variables for the creation of wrapper scripts and the installation
#
-assert_defined_variables( WITH_REFERENCE_BACKEND WITH_OMP_BACKEND WITH_BSP1D_BACKEND
- WITH_HYBRID_BACKEND WITH_NUMA
+assert_defined_variables( WITH_REFERENCE_BACKEND WITH_OMP_BACKEND WITH_NONBLOCKING_BACKEND
+ WITH_BSP1D_BACKEND WITH_HYBRID_BACKEND WITH_NUMA
)
assert_valid_variables( CMAKE_INSTALL_PREFIX AVAILABLE_BACKENDS CMAKE_CXX_COMPILER )
@@ -44,6 +44,7 @@ install( EXPORT GraphBLASTargets
# paths where to install the binaries of the various backends
set( ALP_UTILS_INSTALL_DIR "${BINARY_LIBRARIES_INSTALL_DIR}" )
set( SHMEM_BACKEND_INSTALL_DIR "${BINARY_LIBRARIES_INSTALL_DIR}/sequential" )
+set( HYPERDAGS_BACKEND_INSTALL_DIR "${BINARY_LIBRARIES_INSTALL_DIR}/hyperdags" )
set( BSP1D_BACKEND_INSTALL_DIR "${BINARY_LIBRARIES_INSTALL_DIR}/spmd" )
set( HYBRID_BACKEND_INSTALL_DIR "${BINARY_LIBRARIES_INSTALL_DIR}/hybrid" )
@@ -112,7 +113,7 @@ endif()
# paths may have spaces, hence wrap them inside single quotes ''
# shared memory backends
-if ( WITH_REFERENCE_BACKEND )
+if( WITH_REFERENCE_BACKEND )
addBackendWrapperGenOptions( "reference"
COMPILE_DEFINITIONS "${REFERENCE_SELECTION_DEFS}"
LINK_FLAGS "'${SHMEM_BACKEND_INSTALL_DIR}/lib${BACKEND_LIBRARY_OUTPUT_NAME}.a'"
@@ -128,6 +129,23 @@ if( WITH_OMP_BACKEND )
)
endif()
+# dependent backends
+if( WITH_HYPERDAGS_BACKEND )
+ addBackendWrapperGenOptions( "hyperdags"
+ COMPILE_DEFINITIONS "${HYPERDAGS_SELECTION_DEFS};${HYPERDAGS_INCLUDE_DEFS}"
+ LINK_FLAGS "'${HYPERDAGS_BACKEND_INSTALL_DIR}/lib${BACKEND_LIBRARY_OUTPUT_NAME}.a'"
+ "'${ALP_UTILS_INSTALL_DIR}/lib${ALP_UTILS_LIBRARY_OUTPUT_NAME}.a'" "${NUMA_LFLAG}"
+ )
+endif()
+
+if( WITH_NONBLOCKING_BACKEND )
+ addBackendWrapperGenOptions( "nonblocking"
+ COMPILE_DEFINITIONS "${NONBLOCKING_SELECTION_DEFS};${NONBLOCKING_INCLUDE_DEFS}"
+ LINK_FLAGS "'${SHMEM_BACKEND_INSTALL_DIR}/lib${BACKEND_LIBRARY_OUTPUT_NAME}.a'"
+ "'${ALP_UTILS_INSTALL_DIR}/lib${ALP_UTILS_LIBRARY_OUTPUT_NAME}.a'" "${NUMA_LFLAG}"
+ )
+endif()
+
# distributed memory backends
if( WITH_BSP1D_BACKEND OR WITH_HYBRID_BACKEND )
assert_valid_variables( LPFRUN LPFCPP )
diff --git a/cmake/AddGRBTests.cmake b/cmake/AddGRBTests.cmake
index d05be44c8..cec04eb68 100644
--- a/cmake/AddGRBTests.cmake
+++ b/cmake/AddGRBTests.cmake
@@ -31,9 +31,6 @@ assert_valid_variables( ALL_BACKENDS AVAILABLE_BACKENDS TEST_CATEGORIES
# create variables to store tests against each backend
foreach( b ${AVAILABLE_BACKENDS} )
- if( NOT TARGET "backend_${b}" )
- message( FATAL_ERROR "Needed target backend_${b} does not exist!" )
- endif()
define_property( GLOBAL PROPERTY tests_backend_${b} BRIEF_DOCS "${b} tests" FULL_DOCS "tests for backend ${b}" )
endforeach()
diff --git a/cmake/AddGRBVars.cmake b/cmake/AddGRBVars.cmake
index 2b1bc012b..fab0f9ac9 100644
--- a/cmake/AddGRBVars.cmake
+++ b/cmake/AddGRBVars.cmake
@@ -21,8 +21,8 @@
# to add a new backend, add your own to each ### SECTION
#
-assert_defined_variables( WITH_REFERENCE_BACKEND WITH_OMP_BACKEND WITH_BSP1D_BACKEND
- WITH_HYBRID_BACKEND WITH_NUMA
+assert_defined_variables( WITH_REFERENCE_BACKEND WITH_OMP_BACKEND WITH_NONBLOCKING_BACKEND
+ WITH_BSP1D_BACKEND WITH_HYBRID_BACKEND
)
### STANDARD TARGET NAMES
@@ -31,18 +31,26 @@ set( REFERENCE_BACKEND_DEFAULT_NAME "backend_reference" )
set( REFERENCE_OMP_BACKEND_DEFAULT_NAME "backend_reference_omp" )
set( BSP1D_BACKEND_DEFAULT_NAME "backend_bsp1d" )
set( HYBRID_BACKEND_DEFAULT_NAME "backend_hybrid" )
-
+set( HYPERDAGS_BACKEND_DEFAULT_NAME "backend_hyperdags" )
+set( NONBLOCKING_BACKEND_DEFAULT_NAME "backend_nonblocking" )
### COMPILER DEFINITIONS FOR HEADERS INCLUSION AND FOR BACKEND SELECTION
# compiler definitions to include backend headers
set( REFERENCE_INCLUDE_DEFS "_GRB_WITH_REFERENCE" )
set( REFERENCE_OMP_INCLUDE_DEFS "_GRB_WITH_OMP" )
+set( HYPERDAGS_INCLUDE_DEFS "_GRB_WITH_HYPERDAGS" )
+set( NONBLOCKING_INCLUDE_DEFS "_GRB_WITH_NONBLOCKING" )
set( LPF_INCLUDE_DEFS "_GRB_WITH_LPF" )
# compiler definitions to select a backend
set( REFERENCE_SELECTION_DEFS "_GRB_BACKEND=reference" )
set( REFERENCE_OMP_SELECTION_DEFS "_GRB_BACKEND=reference_omp" )
+set( HYPERDAGS_SELECTION_DEFS
+ "_GRB_BACKEND=hyperdags"
+ "_GRB_WITH_HYPERDAGS_USING=${WITH_HYPERDAGS_USING}"
+)
+set( NONBLOCKING_SELECTION_DEFS "_GRB_BACKEND=nonblocking" )
set( BSP1D_SELECTION_DEFS
"_GRB_BACKEND=BSP1D"
"_GRB_BSP1D_BACKEND=reference"
@@ -56,8 +64,7 @@ set( HYBRID_SELECTION_DEFS
set( NO_NUMA_DEF "_GRB_NO_LIBNUMA" )
### **ALL** BACKENDS, EVEN IF NOT ENABLED BY USER
-set( ALL_BACKENDS "reference" "reference_omp" "bsp1d" "hybrid" )
-
+set( ALL_BACKENDS "reference" "reference_omp" "hyperdags" "nonblocking" "bsp1d" "hybrid" )
# list of user-enabled backends, for tests and wrapper scripts (do not change!)
set( AVAILABLE_BACKENDS "" )
@@ -66,7 +73,7 @@ set( AVAILABLE_BACKENDS "" )
# backends that are enabled by the user: append as in the following
# shared memory backends
-if ( WITH_REFERENCE_BACKEND )
+if( WITH_REFERENCE_BACKEND )
list( APPEND AVAILABLE_BACKENDS "reference" )
endif()
@@ -74,6 +81,15 @@ if( WITH_OMP_BACKEND )
list( APPEND AVAILABLE_BACKENDS "reference_omp" )
endif()
+# dependent backends
+if( WITH_HYPERDAGS_BACKEND )
+ list( APPEND AVAILABLE_BACKENDS "hyperdags" )
+endif()
+
+if( WITH_NONBLOCKING_BACKEND )
+ list( APPEND AVAILABLE_BACKENDS "nonblocking" )
+endif()
+
# distributed memory backends
if( WITH_BSP1D_BACKEND )
list( APPEND AVAILABLE_BACKENDS "bsp1d" )
diff --git a/docs/Build_and_test_infra.md b/docs/Build_and_test_infra.md
index 98b144fc1..e751cb0bd 100644
--- a/docs/Build_and_test_infra.md
+++ b/docs/Build_and_test_infra.md
@@ -534,7 +534,9 @@ which may be set via a variable like
set( EXAMPLE_BACKEND_INSTALL_DIR "${BINARY_LIBRARIES_INSTALL_DIR}/example" )
```
-used in the following steps.
+used in the following steps. The same binary file may implement multiple
+backends. For example, both the reference and the OMP backend share
+the same binary file, i.e., the one generated for shared memory backends.
For convenience, the macro `addBackendWrapperGenOptions` is provided to
automatically generate the necessary variables according to the internal naming
diff --git a/docs/Development.md b/docs/Development.md
index 5bdb5af28..cfe72d5a1 100644
--- a/docs/Development.md
+++ b/docs/Development.md
@@ -15,30 +15,221 @@ See the License for the specific language governing permissions and
limitations under the License.
-# Development of ALP/GraphBLAS
-This document introduces the reader to the development of ALP/GraphBLAS.
+# ALP Development Style Guide
-ALP/GraphBLAS is written in C++11 and is mainly composed of header files with
-largely templated data structures and operations. This allows both
+This document introduces the reader to the development style of ALP.
+
+ALP is written in C++11 and is mainly composed of header files with largely
+templated data structures and operations. This allows both
1. strict compile-time checking of the data types and of the algebraic
abstractions (typically encoded as template parameters: see the
-[Semiring class](include/graphblas/semiring.hpp) for an example)
-2. specialized code generation, increasing performance
-
-## Code style tools and guidelines
-ALP/GraphBLAS follows certain code style rules in order to ensure readability
-and uniformity.
-
-To apply these rules, the directory `tools` contains the script
-`clang-format-linter.sh` to format (*lint*, in Unix jargon) the code
-accordingly, based on the `clang-format` tool.
-Version 11 or higher is requested for the settings to be applied; if you want to
-use a different version, you can alias it in Bash before invoking
-`tools/clang-format-linter.sh`, which directly calls the command
+[Semiring class](../include/graphblas/semiring.hpp) for an example);
+
+2. specialised code generation, increasing performance.
+
+Common patterns include [SFINAE](https://en.wikipedia.org/wiki/Substitution_failure_is_not_an_error)
+and in particular its combination with (algebraic) type traits, as well as
+copious use of `static_assert` and `constexpr`. The choice of ANSI C++11 is to
+balance the benefits of these more modern C++ constructs with the typical
+reluctance of applying the latest and greatest in software development tooling
+within production codes.
+
+Given that this is a template library, there are both rigid code styles and
+more rigid coding patterns that ensure the overall quality of the template
+library-- these are detailed in their respective sections. This document also
+includes a brief description of the code style tools included with the
+repository, as well as a section on the use of the available build and test
+infrastructure.
+
+First, however, this section concludes with some brief comments on the overall
+code structure.
+
+## Encapsulation
+
+Template code that should not be exposed to ALP programmers (i.e., users of the
+ALP programming interface) should be encapsulated in an internal namespace such
+as, e.g., `grb::internal`. Non-templated code that should not be exposed to ALP
+programmers should be defined within `.cpp` files. Only functionality that is
+called by templated code should be exported during compilation of the ALP
+libraries that ALP programmers would link against. All code that may be used by
+ALP programmers should be documented thoroughly.
+
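+As a minimal sketch of this pattern-- using hypothetical names that do not
+appear in the actual code base:
+
+```c++
+namespace grb {
+
+	namespace internal {
+
+		// a templated helper: it must live in a header, yet the internal
+		// namespace hides it from ALP programmers
+		template< typename T >
+		void zeroise( T &x ) { x = static_cast< T >( 0 ); }
+
+	} // end namespace grb::internal
+
+	/** A public, thoroughly documented primitive that calls internal helpers. */
+	template< typename T >
+	void clearScalar( T &x ) {
+		internal::zeroise( x );
+	}
+
+} // end namespace grb
+```
+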
+## Utilities
+
+Utility functions that could be useful to ALP programmers, and not just to ALP
+developers, should unambiguously be housed in the `include/graphblas/utils`
+directory, with the interfaces made available through the corresponding
+`grb::utils` namespace. These functionalities should therefore ideally *not*
+be placed in an internal namespace.
+
+## Test utilities
+
+Utility functions that are *only* useful for ALP unit, smoke, and/or performance
+tests should unambiguously be housed in the `tests/utils` directory. Such
+functionality should never ship with the template library: neither as a header
+that could be included by ALP programmers, nor within an internal namespace,
+nor within an internal `.cpp` file.
+
+
+# Code style guidelines
+
+ALP follows certain code style rules in order to ensure readability and
+uniformity. An informal summary of the main points follows:
+
+1. alignment uses **spaces** while indentation uses **tabs**;
+
+2. indentation is increased after a line break that does not end with `;` or
+   after a line break with an unterminated `<`, `(`, or `{`, and is decreased
+   after the matching `;`, `>`, `)`, or `}`. Opening and closing delimiters
+   are the last, resp. first, characters on a line-- i.e., the commonly
+   accepted indentation pattern;
+
+3. none of `;`, `<`, `(`, `{` should appear alone on a single line-- and if an
+   opening delimiter like `<` follows a keyword, it should do so immediately,
+   without intermediate spaces;
+
+4. when a closing delimiter is far (in a vertical space sense) from its opening
+ pair, it should be followed by a comment that documents what it closes;
+
+5. keywords that induce indentation include `private:`, `protected:`, and
+ `public:`, which furthermore do not induce intermediate spaces between the
+ keyword and the `:`;
+
+6. indentation of pre-processor code (macros) uses spaces, not tabs, and ignores
+ tab-based indentation;
+
+7. a single line has maximum length of about 80 characters, not including
+ indentation, and never ends with white spaces (space characters or tab
+ characters);
+
+8. use spaces and parentheses liberally to increase code readability and to
+   limit ambiguity, including for if-else blocks or for-loop blocks that
+   consist of only one line (or an otherwise limited number of lines);
+
+9. files always end with an empty line, and include two empty lines before
+   implementation starts (i.e., two empty lines after any comments, macro
+   guards, and includes, before the first line of code);
+
+10. Classes and types use the CamelCase naming format, variables of any kind
+ (static, constexpr, global, or members) use camelCase, while constants of
+ any kind (static const, global const, constexpr const, etc.) use CAMELCASE.
+ Names shall furthermore be both self-descriptive and short. Namespaces are
+ camelcase.
+
+As the saying goes, exceptions prove the rules. For example, rule #3 could be
+viewed as a specific exception to rule #8. Exceptions that are not
+self-contained in the above set include:
+
+1. one long program line under rule #7 may be arbitrarily spread over two lines
+   even if this runs counter to rule #3-- but not if it would spread over more
+   than two lines;
+
+2. OpenMP pragmas and compiler warning suppressions may ignore rule #6-- they
+ may follow regular tab-based indentation instead;
+
+3. the 80-character limit is not strictly enforced. For example, an OpenMP
+   macro of 83 characters on a single line is more readable than one split
+   over two lines;
+
+4. brackets in code bodies that limit the scope of some of the declarations
+   within the body may, contrary to rule #3, appear alone on a single line.
+
+
+## Code style by examples
+
+- `if( ... ) {`, not `if (...) {` or any other variant;
+
+- lines should never end with white space (tab or space characters);
+
+- `if( x == 5 ) {` instead of `if( x==5 ) {`;
+
+- only write `<<` or `>>` when doing bit shifts, never for nested templates;
+
+- the following is correct. It would *not* be correct to put the whole block on
+ a single line, nor would it be correct to write it without any curly brackets;
+
+```c++
+if( ... ) {
+ return SUCCESS;
+}
+```
+
+- the following is correct w.r.t. vertical spacing;
+
+```c++
+/*
+ * copyright info
+ */
+
+/**
+ * @file
+ *
+ * File documentation
+ *
+ * @author Author information
+ * @date Date of initial creation
+ */
+
+#ifndef MACRO_GUARD
+#define MACRO_GUARD
+
+// note that two empty lines follow:
+
+
+namespace alp {
+
+ // ...
+
+}
+
+#endif
+
+// note that one empty line follows:
+
+```
+
+- encapsulation using curly bracket delimiters that each appear alone on a
+  single line:
+
+```c++
+void f( ... ) {
+ // some code block dubbed "A"
+ // ...
+ // end code block A
+ size_t ret;
+ {
+ // some code block with fields and containers that are used *solely*
+ // for computing ret
+ // ...
+ ret = ...;
+ }
+ // some code that uses ret as well as fields, containers, and anything else
+ // that was defined in code block A
+}
+```
+
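+- naming as per rule #10, in a minimal and hypothetical sketch:
+
+```c++
+#include <cstddef>
+
+constexpr const size_t MAX_CHUNK_SIZE = 1024; // constant: all-caps
+
+class SparseAccumulator {                     // class and type names: CamelCase
+
+	public:
+
+		size_t nonzeroCount;          // variables: camelCase
+
+};
+```
+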
+
+# Code style tools
+
+There currently exist two tools to help check developers' code style: the Clang
+linter script `clang-format-linter.sh`, and the `detectSuspiciousSpacing.sh`
+script.
+
+## Clang linter
+
+To automatically-- if only approximately; see the warning below-- check whether
+the code style rules are followed, the directory `tools` contains the script
+`clang-format-linter.sh` that formats (*lints*, in Unix jargon) the source code,
+based on the `clang-format` tool.
+
+Version 11 or higher of the tool is required. If you want to use a different
+version, you can alias it in Bash before invoking
+`tools/clang-format-linter.sh`, which otherwise directly calls the command
`clang-format-11`.
-This tools is available in the standard repositories of the main Linux
+
+This tool is available in the standard repositories of all main Linux
distributions: for example, in Ubuntu you can install it with
`apt-get install clang-format-11`.
@@ -47,7 +238,8 @@ To list the script parameters, simply type
```bash
tools/clang-format-linter.sh -h
```
-For example, to lint the file `tests/add15d.cpp` and see the lint'ed code on the
+
+For example, to lint the file `tests/add15d.cpp` and see the linted code on the
standard output, type
```bash
@@ -66,55 +258,127 @@ Instead, to lint the whole ALP/GraphBLAS code-base in-place, type
tools/clang-format-linter.sh -i --lint-whole-grb
```
-The style rules enforced by the tool are
-
-- [x] lines are max 200 characters long, which means the line size is pretty
-liberal to avoid weird re-flows
-- [x] indents should be *tabs*, not spaces
-- [x] alignment should be done using spaces, not tabs
-- [x] essentially any line that ends in `{`, `(`, or whatever increases the
-current number of indents by one and vice versa
-- [x] argument lists (including template arguments) longer than 80 chars should
-be broken over multiple lines
-- [x] `if( `, not `if (` (also for `for`, etc.)
-- [x] no lines with indents and curly brackets only: put curly brackets on the
-same line as what starts that code block instead (only exception: code blocks
-that are not started by standard C++ key words, but e.g. required pragmas
-instead)
-- [x] no lines ending with spaces
-- [x] `#ifdef`, `#else`, `#endif` etc are never indented.
-- [x] comment blocks are capped at 80 chars per line
-- [x] include lines primarily ordered by
- 1. standard includes
- 2. external libraries
- 3. internal headers/files
-
-The following rules are also mandated, but cannot currently be applied via
-`clang-format`; however, developers should abide by the following guidelines as
-well:
-
-* files should end with an empty line
-* no `if`, `for`, `while`, or any other control structure without curly
-* brackets, even if what follows is a single statement
-* OpenMP pragmas (or any pragma) are indented as regular code
-* nested `ifdef`s etc. in close proximity of one another are indented by spaces
-
-The following guidelines are not strictly requested nor enforced, but are
-suggested to ensure readability and uniformity:
-
-* be gratuitous with spaces and parenthesis: anything that could possibly be
-construed as confusing or ambiguous should be clarified with spaces and
-parentheses if that removes (some of the) possible confusion or ambiguity
-* in particular, whenever it is legal to put one or more spaces, put one
-(e.g., `if( x == 5 )` instead of `if( x==5 )`)
-* in particular, only write `<<` or `>>` when doing bit shifts, not when
-performing template magic
-* when closing a block (either `#endif` or `}`) and the block was long (whatever
-long may be), add a comment on what it is that is being closed
-* all functions should have `doxygen`-friendly documentation
-* minimise the use of pre-processor macros (use C++11 `constexpr` instead)
-
-## Building and Testing infrastructure
+### Warning
+
+This tool is only approximately correct in terms of the code style described
+above(!)
+
+
+## Automated detection of suspicious spacing
+
+Many code reviews have exposed erroneous use of spaces, primarily due to editors
+attempting to be helpful in automatically replicating code styles like
+indentations. Before committing code, a careful submitter may opt to execute
+something like the following:
+
+```bash
+# go into a source directory where you have committed changes
+$ cd include/graphblas/nonblocking
+# **from within that directory** execute the helper script:
+$ ../../../tools/detectSuspiciousSpacing.sh
+```
+
+If all is OK, the above prints the following to the standard output stream
+(which also immediately documents which patterns the script is tailored to
+detect):
+
+```
+Detecting suspicious spacing errors in the current directory, /path/to/source/include/graphblas/nonblocking
+ spaces, followed by end-of-line...
+ tabs, followed by end-of-line...
+ spaces followed by a tab...
+$
+```
+
+Seeing no `grep` output between the noted patterns (or between the last noted
+pattern and the prompt) means that no such patterns have been found within any
+source file in the current directory, including source files in subdirectories
+of the current path.
+
+
+# Coding patterns for general code quality
+
+Some major coding rules for maintaining high code quality include:
+
+1. files always display the copyright and license header, and document the
+   initial author information and the date of file creation;
+
+2. limit the use of macros and, in particular, never leak macro definitions to
+   user code;
+
+3. do not use `using` in a way that leaks to user code-- in particular,
+   never use it in headers;
+
+4. separate includes by their source -- e.g., a group of STL includes followed
+ by a group of internal utility header includes, and so on;
+
+5. code documentation uses [doxygen](https://www.doxygen.nl/) format, and in
+ particular the [Javadoc](https://www.doxygen.nl/manual/docblocks.html#cppblock)
+ style;
+
+6. use `constexpr` fields or functions in favour of any pre-processor macros,
+   and avoid global constants, especially those that leak to user code (see
+   the sketch after this list);
+
+7. performance parameters are never hardcoded but instead embedded (and
+ documented!) into the applicable `config.hpp` file.
+
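+As a minimal sketch of rules #6 and #7-- the names below are hypothetical and
+merely illustrate the pattern of the applicable `config.hpp` files:
+
+```c++
+#include <cstddef>
+
+namespace grb {
+
+	namespace config {
+
+		/**
+		 * A hypothetical, documented performance parameter: the default
+		 * tile size. It is embedded here rather than hardcoded at its
+		 * point of use.
+		 */
+		class TileSize {
+
+			public:
+
+				// a constexpr function instead of a pre-processor macro (rule #6)
+				static constexpr size_t value() { return 512; }
+
+		};
+
+	} // end namespace grb::config
+
+} // end namespace grb
+```
+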
+
+# Building and Testing infrastructure
+
+To use the build and test infrastructure, see the [main README](../README.md).
To modify it, you should refer to the
[dedicated documentation](Build_and_test_infra.md).
+
+
+## Testing before committing
+
+A careful committer may wish to run smoke or unit tests before committing to the
+main repository. Such developers should take note of the script contained
+in the tests directory, `tests/summarise.sh`, which may be used to quickly
+analyse a test log file: it summarises how many tests have passed, how many
+have been skipped, and how many have failed.
+
+Additionally, if at least one test has failed, or if none of the tests have
+succeeded (indicating perhaps a build error), then the entire log will be
+`cat`-ted.
+
+A common use is to, in one terminal, execute:
+
+```bash
+$ cd build
+$ make -j88 smoketests &> smoketests.log
+```
+
+While the above command runs, in another terminal, to execute:
+
+```bash
+$ cd build
+$ watch ../tests/summarise.sh smoketests.log
+```
+
+The second terminal then gives "live" feedback on the progress of the tests.
+
+## Continuous integration
+
+GitHub Actions have been deployed to run smoke tests using both performance and
+debug flags. These tests are run on standard images that do not include the
+datasets that some smoke tests require-- those tests are hence skipped.
+
+A CI internal to the Computing Systems Lab at the Huawei Zurich Research Center
+exists, but can only be triggered by its employees. This CI also performs unit
+tests, in addition to smoke tests. At present, however, it too does *not* employ
+images that have the required datasets embedded or accessible.
+
+The `develop` and `master` branches are tested by the internal CI on a regular
+schedule, in addition to being triggered on every push, and run a more
+comprehensive combination of test suites and compilation (debug/release) flags.
+Release candidate branches (i.e., branches with names that match the wild-card
+expression `*-rc*`) are also subject to the same, more extensive, test suite.
+
+All CI tests at present skip tests that require data sets; developers are
+therefore advised to run local tests manually, at least once, before flagging a
+merge request as ready and requesting a review. Even if at some point the CI
+does provide data sets, the practice of developers self-checking MRs remains
+recommended, as it naturally also induces greater robustness across compilers
+and distributions.
+
diff --git a/docs/Nonblocking_backend.md b/docs/Nonblocking_backend.md
new file mode 100644
index 000000000..f791b36d0
--- /dev/null
+++ b/docs/Nonblocking_backend.md
@@ -0,0 +1,921 @@
+
+
+ Copyright 2021 Huawei Technologies Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+
+
+# Design and implementation of the nonblocking backend
+
+The [C API specification](https://graphblas.org/docs/GraphBLAS_API_C_v1.3.0.pdf) of [GraphBLAS](https://graphblas.org) defines two execution modes: blocking execution and nonblocking execution. In the blocking mode, the invocation of an operation implies that the computation is completed and the result is written to memory when the function returns. The nonblocking execution allows an operation to return although the result has not been computed yet. Therefore, the nonblocking execution may delay the execution of some operations to perform optimisations. Lazy evaluation is the key idea in nonblocking execution, and computations are performed only when they are required for the sound execution of a program.
+
+For the description of the full design and experimental results for nonblocking execution in ALP/GraphBLAS, please read the following publications.
+
+* A. Mastoras, S. Anagnostidis, and A. N. Yzelman, "Design and Implementation for Nonblocking Execution in GraphBLAS: Tradeoffs and Performance," ACM Trans. Archit. Code Optim. 20, 1, Article 6 (March 2023), 23 pages, [https://doi.org/10.1145/3561652](https://doi.org/10.1145/3561652)
+* A. Mastoras, S. Anagnostidis, and A. N. Yzelman, "Nonblocking execution in GraphBLAS," 2022 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW), 2022, pp. 230-233, doi: [10.1109/IPDPSW55747.2022.00051](https://doi.org/10.1109/IPDPSW55747.2022.00051).
+
+ALP/GraphBLAS provides the `nonblocking` backend that performs multi-threaded nonblocking execution on shared-memory systems. The implementation of the `nonblocking` backend relies on that of the `reference` and `reference_omp` backends that perform sequential and multi-threaded blocking execution, respectively.
+
+
+## Overview of the source files
+
+The source files for the `nonblocking` backend are maintained under the `src/graphblas/nonblocking` directory, and the header files are maintained under `include/graphblas/nonblocking`. Most of these files also exist for the `reference` backend, while the `nonblocking` backend uses some additional files. In particular, the full list of the source files for the `nonblocking` backend is the following:
+
+* `analytic_model.cpp`
+* `init.cpp` (relies on `reference/init.cpp`)
+* `io.cpp`
+* `lazy_evaluation.cpp`
+* `pipeline.cpp`
+
+of which `analytic_model.cpp`, `lazy_evaluation.cpp`, and `pipeline.cpp` exist only for the `nonblocking` backend; they are the main source files for the implementation of nonblocking execution. The `init.cpp` file invokes the corresponding functions of the `reference` backend. The header files of the `nonblocking` backend include:
+
+* `alloc.hpp` (delegates to `reference/alloc.hpp`)
+* `analytic_model.hpp`
+* `benchmark.hpp` (delegates to `reference/benchmark.hpp`)
+* `blas1.hpp`
+* `blas2.hpp`
+* `blas3.hpp`
+* `boolean_dispathcer_blas1.hpp`
+* `boolean_dispathcer_blas2.hpp`
+* `boolean_dispathcer_io.hpp`
+* `collectives.hpp` (delegates to `reference/collectives.hpp`)
+* `config.hpp`
+* `coordinates.hpp`
+* `exec.hpp` (delegates to `reference/exec.hpp`)
+* `forward.hpp`
+* `init.hpp`
+* `io.hpp`
+* `lazy_evaluation.hpp`
+* `matrix.hpp`
+* `pinnedVector.hpp`
+* `pipeline.hpp`
+* `properties.hpp`
+* `spmd.hpp` (delegates to `reference/spmd.hpp`)
+* `vector.hpp` (relies on `reference/vector.hpp`)
+* `vector_wrapper.hpp`
+
+of which `analytic_model.hpp`, `boolean_dispathcer_blas1.hpp`, `boolean_dispathcer_blas2.hpp`, `boolean_dispathcer_io.hpp`, `lazy_evaluation.hpp`, `pipeline.hpp`, and `vector_wrapper.hpp` are used only by the `nonblocking` backend.
+The current implementation supports nonblocking execution only for level-1 and level-2 operations defined in the following files:
+
+* `nonblocking/io.hpp`
+* `nonblocking/blas1.hpp`
+* `nonblocking/blas2.hpp`
+
+and thus most of the code for the nonblocking execution is found in these three files. The level-3 operations defined in `blas3.hpp` and some defined in `blas2.hpp` incur blocking behaviour. If a program invokes these primitives while compiled using the nonblocking backend, a warning will be emitted to the standard error stream. Please check regularly for future releases that enable native nonblocking execution for these remaining primitives.
+
+
+## Lazy evaluation
+
+Lazy evaluation enables the loop fusion and loop tiling optimisations in a pure library implementation such as is required by ALP/GraphBLAS. Dynamic data dependence analysis identifies operations that share data, and these operations are added as stages of the same pipeline. Operations grouped into the same pipeline may be executed in parallel and reuse data in cache. The design for nonblocking execution is fully dynamic, since the optimisations are performed at run-time and pipelines may include operations under arbitrary control flow. The nonblocking execution is also fully automatic, since the performance parameters, i.e., the number of threads and the tile size, are selected based on an analytic model (defined in `analytic_model.cpp`).
+
+To illustrate lazy evaluation for the nonblocking backend, we use the `grb::set` operation that initialises all the elements of the output vector `x` with the value of an input scalar `val`. The code below shows the implementation of `grb::set` for the `reference` and `reference_omp` backends found in `reference/io.hpp`.
+
+```cpp
+template<
+ Descriptor descr = descriptors::no_operation,
+ typename DataType, typename T,
+ typename Coords
+>
+RC set(
+ Vector< DataType, reference, Coords > &x,
+ const T val,
+ ...
+) {
+ ...
+
+ const size_t n = size( x );
+ if( (descr & descriptors::dense) && nnz( x ) < n ) {
+ return ILLEGAL;
+ }
+
+ const DataType toCopy = static_cast< DataType >( val );
+
+ if( !(descr & descriptors::dense) ) {
+ internal::getCoordinates( x ).assignAll();
+ }
+ DataType * const raw = internal::getRaw( x );
+
+#ifdef _H_GRB_REFERENCE_OMP_IO
+ #pragma omp parallel
+ {
+ size_t start, end;
+ config::OMP::localRange( start, end, 0, n );
+#else
+ const size_t start = 0;
+ const size_t end = n;
+#endif
+ for( size_t i = start; i < end; ++ i ) {
+ raw[ i ] = internal::template ValueOrIndex< descr, DataType, DataType >::getFromScalar( toCopy, i );
+ }
+#ifdef _H_GRB_REFERENCE_OMP_IO
+ }
+#endif
+
+ assert( internal::getCoordinates( x ).nonzeroes() ==
+ internal::getCoordinates( x ).size() );
+
+ return SUCCESS;
+}
+```
+
+A typical operation of ALP/GraphBLAS includes a main for loop that iterates over all the elements (or only the nonzeroes) of the containers to perform the required computation. One additional step is to check that the `dense` descriptor is correctly used, i.e., that none of the input and output vectors are sparse; otherwise, the error code `grb::ILLEGAL` is returned. It is also necessary to properly assign the coordinates of the output vector. In the case of the `grb::set` operation, the raw data of the output vector are initialised with the value of the input scalar within the body of the main loop. The check for the correct usage of the `dense` descriptor is performed before the main loop, and all the coordinates of the output vector are assigned by invoking `assignAll`. That is, the initialisation of the coordinates is performed in one step, since the output vector will be dense after the completion of this operation. If the `dense` descriptor is given by the user, the vector is assumed to be already dense, and thus the invocation of `assignAll` is omitted.
+
+To implement lazy evaluation in the ALP/GraphBLAS library implementation, the code of an operation is not necessarily executed when the corresponding function is invoked. Instead, the loop is added into a lambda function that corresponds to a stage of a pipeline, and the lambda function is stored and executed later. Lambda functions are an implementation decision that meshes well with template-based programming in ALP/GraphBLAS. The code below shows the implementation of the `grb::set` operation discussed above for the corresponding nonblocking implementation defined in `nonblocking/io.hpp`.
+
+```cpp
+template<
+ Descriptor descr = descriptors::no_operation,
+ typename DataType, typename T,
+ typename Coords
+>
+RC set(
+ Vector< DataType, nonblocking, Coords > &x, const T val,
+ ...
+) {
+ ...
+
+ RC ret = SUCCESS;
+
+ const DataType toCopy = static_cast< DataType >( val );
+ DataType * const raw = internal::getRaw( x );
+ const size_t n = internal::getCoordinates( x ).size();
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func = [&x, toCopy, raw] (
+ internal::Pipeline &pipeline, size_t active_chunk_id, size_t max_num_chunks, size_t lower_bound, size_t upper_bound
+ ) {
+ (void) active_chunk_id;
+ (void) max_num_chunks;
+
+ const bool already_dense_vectors = dense_descr || pipeline.allAlreadyDenseVectors();
+
+ if( !already_dense_vectors ) {
+ bool already_dense_output = pipeline.containsAlreadyDenseContainer( &internal::getCoordinates( x ) );
+ if( !already_dense_output ) {
+ Coords local_x = internal::getCoordinates( x ).asyncSubset( active_chunk_id, max_num_chunks, lower_bound, upper_bound );
+
+ local_x.local_assignAllNotAlreadyAssigned();
+ assert( local_x.nonzeroes() == local_x.size() );
+
+ internal::getCoordinates( x ).asyncJoinSubset( local_x, active_chunk_id, max_num_chunks );
+ }
+ }
+
+ for( size_t i = lower_bound; i < upper_bound; i++ ) {
+ raw[ i ] = internal::template ValueOrIndex< descr, DataType, DataType >::getFromScalar( toCopy, i );
+ }
+
+ return SUCCESS;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ), internal::Opcode::IO_SET_SCALAR,
+ n, sizeof( DataType ), dense_descr, true,
+ &x, nullptr,
+ &internal::getCoordinates( x ), nullptr,
+ nullptr, nullptr, nullptr, nullptr,
+ nullptr, nullptr, nullptr, nullptr
+ );
+
+ return ret;
+}
+```
+
+The implementation of `grb::set` for the `nonblocking` backend is very similar to that of the `reference` and `reference_omp` backends. In particular, a lambda function is defined for the execution of a subset of consecutive iterations of the initial loop determined by the `lower_bound` and `upper_bound` parameters. Therefore, the main loop iterates from `lower_bound` to `upper_bound` to initialise the raw data of the output vector. The main difference between the `nonblocking` backend and the `reference` backend is the way the coordinates are handled. First, it is impossible to check if the `dense` descriptor is correctly given in the beginning of an operation, because the computation may not be completed yet due to lazy evaluation and the number of nonzeroes of a vector may not be up to date. Therefore, the check for the `dense` descriptor must be moved into the lambda function. However, the coordinates used by the `nonblocking` backend require a different mechanism than that used by the `reference` backend. The design of the coordinates mechanism for the `nonblocking` backend is presented in the next section.
+
+
+## Handling sparse vectors
+
+Vectors in ALP/GraphBLAS may be either sparse or dense. In the case of dense vectors, each operation accesses all the elements as shown above with the example of `grb::set`. However, to efficiently handle sparsity, it is necessary to maintain the coordinates of the nonzeroes, such that ALP/GraphBLAS operations access only the nonzeroes. Hence, each vector includes a so-called Sparse Accumulator (SPA), consisting of the following data to handle sparsity:
+
+* an unsigned integer `_cap` that stores the size of the vector;
+* an unsigned integer `_n` that stores the number of nonzeroes in the vector;
+* a boolean array, `_assigned`, of size `_cap`, that indicates whether the element at a given coordinate is a nonzero; and
+* an unsigned integer array, `_stack`, that represents a stack and stores the coordinates of the assigned elements.
+
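+Schematically-- this is an illustration, not the actual class layout:
+
+```cpp
+#include <cstddef>
+#include <vector>
+
+// a sketch of the SPA fields described above
+struct SPA {
+	size_t _cap;                   // size of the vector
+	size_t _n;                     // number of nonzeroes
+	std::vector< bool > _assigned; // _assigned[ i ]: is element i a nonzero?
+	std::vector< size_t > _stack;  // coordinates of the assigned elements
+};
+```
+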
+A vector is dense when the number of nonzeroes is equal to the size of the vector, i.e., `_n = _cap`.
+The stack and the `_assigned` array are used only when accessing a sparse vector.
+For an empty vector, `_n = 0`, all the elements of `_assigned` are initialised to `false`, and the stack is empty.
+The assignment of the i-th element of a vector implies that:
+```cpp
+_stack[_n] = i;
+_assigned[i] = true;
+_n++;
+```
+Therefore, the coordinates of the nonzeroes are not sorted; they are pushed to the stack in an arbitrary order. Iterating over the nonzeroes of a sparse vector is done via the stack, and thus access to the elements may happen in any order.
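+
+For illustration, a hypothetical fragment that visits every nonzero via the
+stack (assuming the fields above and a raw value array `raw`):
+
+```cpp
+for( size_t k = 0; k < _n; ++k ) {
+	const size_t i = _stack[ k ]; // coordinates appear in arbitrary order
+	process( raw[ i ] );          // hypothetical per-nonzero computation
+}
+```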
+
+The internal representation of a vector is sufficient to correctly and efficiently handle sparse vectors for sequential execution. However, this is not the case for multi-threaded execution, since simultaneous assignments of vector elements may cause data races. Protecting the stack and the counter of nonzeroes with a global lock is a trivial solution that leads to significant performance degradation. Therefore, it is necessary to design a different mechanism that is tailored to the needs of the nonblocking execution and exploits any information about accesses of elements by different threads.
+
+
+## Local coordinates mechanism
+
+The local coordinates mechanism is used for efficient handling of sparse vectors in parallel nonblocking execution and is implemented in `coordinates.hpp`. The local coordinates mechanism consists of a set of local views of the coordinates stored in the global stack. Each local view includes the coordinates of the nonzeroes for a tile of iterations; each thread accesses its own local coordinates, and any update to the sparsity structure of a vector is performed in the local view. The local coordinates mechanism requires initialisation of the local views before the execution of the pipeline, and an update of the global stack with the new nonzeroes after the execution of the pipeline.
+
+The local coordinates mechanism requires some additional data for each tile of a vector:
+
+* an unsigned integer array that stores the number of nonzeroes for each local view, which are read from the global stack during initialisation;
+* an unsigned integer array that stores the number of nonzeroes that were assigned to each local view during the execution of a pipeline;
+* a set of unsigned integer arrays that represent local stacks and store the local coordinates, i.e., each array corresponds to a different local view.
+
+The local coordinates mechanism relies on five main functions defined in `nonblocking/coordinates.hpp`. The local views are initialised via `asyncSubsetInit`. Each operation reads the state of the local view with `asyncSubset`, and it updates the state with `asyncJoinSubset` once the computation is completed. The invocation of `joinSubset` pushes the local coordinates to the global stack. None of these functions uses locks; to avoid data races, `joinSubset` updates the global stack based on a prefix sum over the per-tile counts of new nonzeroes, as computed by `prefixSumComputation`.
+
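+The following self-contained sketch-- deliberately simplified, and not the
+actual ALP code-- illustrates why an exclusive prefix sum over the per-tile
+counts of new nonzeroes lets each tile write a disjoint slice of the global
+stack without locks:
+
+```cpp
+#include <algorithm>
+#include <cstddef>
+#include <vector>
+
+// merge the per-tile local stacks into the global stack without locks
+void joinAllTiles(
+	std::vector< size_t > &globalStack, size_t &n,
+	const std::vector< std::vector< size_t > > &localStacks
+) {
+	// exclusive prefix sum over the per-tile counts of new nonzeroes
+	std::vector< size_t > offset( localStacks.size() + 1, 0 );
+	for( size_t t = 0; t < localStacks.size(); ++t ) {
+		offset[ t + 1 ] = offset[ t ] + localStacks[ t ].size();
+	}
+	globalStack.resize( n + offset.back() );
+	// each tile owns a disjoint slice, so these copies are free of data
+	// races and may execute in parallel
+	for( size_t t = 0; t < localStacks.size(); ++t ) {
+		std::copy( localStacks[ t ].begin(), localStacks[ t ].end(),
+			globalStack.begin() + n + offset[ t ] );
+	}
+	n += offset.back();
+}
+```
+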
+To illustrate the usage of the local coordinates mechanism in the `nonblocking` backend, we use the in-place `grb::foldl` operation shown below, which receives one output vector, one input vector and an operator.
+
+```cpp
+template<
+ Descriptor descr = descriptors::no_operation, class OP,
+ typename IOType, typename InputType, typename Coords
+>
+RC foldl(
+ Vector< IOType, nonblocking, Coords > &x,
+ const Vector< InputType, nonblocking, Coords > &y,
+ const OP &op = OP(),
+ ...
+) {
+ const size_t n = size( x );
+
+ ...
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func = [&x, &y, &op, phase] (
+ internal::Pipeline &pipeline,
+ const size_t active_chunk_id, const size_t max_num_chunks,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+ RC rc = SUCCESS;
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+ const Coords * const local_null_mask = nullptr;
+
+ Coords local_x, local_y;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_x_nz, local_y_nz;
+ bool sparse = false;
+
+ const bool already_dense_vectors = dense_descr || pipeline.allAlreadyDenseVectors();
+
+ bool already_dense_output = true;
+ bool already_dense_input = true;
+
+ if( !already_dense_vectors ) {
+ already_dense_output = pipeline.containsAlreadyDenseContainer( &internal::getCoordinates( x ) );
+ if( !already_dense_output ) {
+ local_x = internal::getCoordinates( x ).asyncSubset( active_chunk_id, max_num_chunks, lower_bound, upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( local_x_nz < local_n ) {
+ sparse = true;
+ }
+ }
+
+ already_dense_input = pipeline.containsAlreadyDenseContainer( &internal::getCoordinates( y ) );
+ if( !already_dense_input ) {
+ local_y = internal::getCoordinates( y ).asyncSubset( active_chunk_id, max_num_chunks, lower_bound, upper_bound );
+ local_y_nz = local_y.nonzeroes();
+ if( local_y_nz < local_n ) {
+ sparse = true;
+ }
+ }
+ }
+
+ if( sparse ) {
+ // performs the computation for the sparse case
+ ...
+ } else {
+ // performs the computation for the dense case
+ ...
+ }
+
+ if( !already_dense_output ) {
+ internal::getCoordinates( x ).asyncJoinSubset( local_x, active_chunk_id, max_num_chunks );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ), internal::Opcode::BLAS1_FOLD_VECTOR_VECTOR_GENERIC,
+ n, sizeof( IOType ), dense_descr, true,
+ &x, nullptr,
+ &internal::getCoordinates( x ), nullptr,
+ &y, nullptr, nullptr, nullptr,
+ &internal::getCoordinates( y ), nullptr, nullptr, nullptr
+ );
+
+ return ret;
+}
+```
+
+The state of the local view is read for each vector accessed in an operation by invoking `asyncSubset`. The sparsity structure may be updated only for the output vector, and thus `asyncJoinSubset` is invoked only for the output vector to update the number of new nonzeroes. Operations consider the dense and the sparse case, and the executed path is determined at run-time based on the sparsity structure of the local coordinates. To avoid the overhead of initialising the local views, the `nonblocking` backend performs compile-time and runtime optimisations discussed in the next section. Therefore, `asyncSubset` and `asyncJoinSubset` are conditionally invoked depending on whether the corresponding vectors are already dense.
+
+
+## Optimisations for dense vectors
+
+To improve the performance of nonblocking execution, it is crucial to avoid the usage of the local views when the vectors are dense. It is possible to determine whether a vector is dense based on compile-time information from descriptors and on runtime analysis. The former implies zero runtime overhead, but requires that the descriptors be provided by the user.
+
+There exist two main differences between the compile-time information from descriptors and the runtime analysis.
+First, descriptors may apply to all vectors of an operation, whereas the runtime analysis applies to each individual vector of an operation. Second, descriptors refer to the vectors of a specific operation, whereas the runtime analysis refers to the state of a vector before the execution of a pipeline.
+
+### Compile-time descriptors
+
+The ALP/GraphBLAS implementation provides a set of descriptors defined in `include/graphblas/descriptors.hpp`, and they may be combined using bit-wise operators.
+A descriptor is passed to an operation and indicates some information about some or all of the output and input containers, e.g., vectors and matrices.
+Three of these descriptors are the following:
+
+* `dense` to indicate that all input and output vectors are structurally dense before the invocation;
+* `structural` that ignores the values of the mask and uses only its structure, i.e., the i-th element evaluates to true if any value is assigned to it; and
+* `invert_mask` that inverts the mask.
+
+The `dense` and `structural` descriptors may affect both correctness and performance, while `invert_mask` affects only the correctness of an operation. These three descriptors may be used to perform optimisations for the local coordinates mechanism. In particular, if the `dense` descriptor is provided, it implies that all the vectors accessed in an operation are dense before the invocation. Therefore, an operation can safely iterate over all the elements of the vectors without using either the global or the local coordinates.
+
+One exception is an out-of-place operation that receives a mask, since the `dense` descriptor itself does not guarantee that all the elements of a dense mask evaluate to true. Therefore, a dense output vector may become sparse once the computation is completed. That is, the output vector becomes empty at the beginning of the operation, and then each of its coordinates may be assigned depending on whether the corresponding element of the mask evaluates to true or not. Reading the elements of a mask does not require usage of the local coordinates when the `dense` descriptor is given. However, to avoid the usage of the local coordinates for the output vector of an out-of-place operation that receives a mask, the `structural` descriptor should be given, and the `invert_mask` descriptor should *not* be given, in addition to the `dense` descriptor (cf. the `dense_mask` condition in the masked `grb::eWiseApply` example below).
+
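+To make the above concrete, descriptors combine via bit-wise OR at compile
+time. A hypothetical fragment-- assuming `z`, `mask`, `alpha`, `y`, and
+`monoid` are set up elsewhere-- that promises dense operands and a structural,
+non-inverted mask could read:
+
+```cpp
+constexpr grb::Descriptor desc =
+	grb::descriptors::dense | grb::descriptors::structural;
+
+grb::RC rc = grb::eWiseApply< desc >( z, mask, alpha, y, monoid );
+```
+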
+### Runtime analysis
+
+The runtime analysis for dense vectors relies on a simple property of ALP/GraphBLAS. A vector that is already dense before the execution of a pipeline cannot become sparse during the execution of the pipeline unless the pipeline contains an out-of-place operation, i.e., `grb::set`, `grb::eWiseApply`, or `grb::clear` that makes the vector empty. The current design for nonblocking execution in ALP/GraphBLAS allows pipelines that include an out-of-place operation but does not allow pipelines that include the `grb::clear` operation.
+
+The nonblocking execution relies on the runtime analysis to determine whether a vector is already dense before the execution of a pipeline, only when the `dense` descriptor is not given by the user. For each already dense vector of a pipeline, neither the global nor the local coordinates are used unless the vector is the output of an out-of-place operation. Therefore, the overhead of the local coordinates mechanism is completely avoided.
+
+### Implementation of the optimisation
+
+To illustrate the implementation of the compile-time and runtime optimisations for dense vectors, we use one example of an in-place and one example of an out-of-place operation.
+The runtime analysis relies on the `allAlreadyDenseVectors` function that returns `true` when all the vectors accessed in a pipeline are already dense, and on `containsAlreadyDenseContainer` that returns `true` when a specific vector accessed in a pipeline is already dense.
+
+#### In-place operations
+
+In the case of an in-place operation, we use the example of the `grb::foldl` operation discussed earlier.
+The code below is included in the lambda function of `grb::foldl`.
+
+```cpp
+const bool already_dense_vectors = dense_descr || pipeline.allAlreadyDenseVectors();
+
+bool already_dense_output = true;
+bool already_dense_input = true;
+
+if( !already_dense_vectors ) {
+ already_dense_output = pipeline.containsAlreadyDenseContainer( &internal::getCoordinates( x ) );
+ if( !already_dense_output ) {
+ local_x = internal::getCoordinates( x ).asyncSubset( active_chunk_id, max_num_chunks, lower_bound, upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( local_x_nz < local_n ) {
+ sparse = true;
+ }
+ }
+
+ already_dense_input = pipeline.containsAlreadyDenseContainer( &internal::getCoordinates( y ) );
+ if( !already_dense_input ) {
+ local_y = internal::getCoordinates( y ).asyncSubset( active_chunk_id, max_num_chunks, lower_bound, upper_bound );
+ local_y_nz = local_y.nonzeroes();
+ if( local_y_nz < local_n ) {
+ sparse = true;
+ }
+ }
+}
+
+...
+
+if( !already_dense_output ) {
+ internal::getCoordinates( x ).asyncJoinSubset( local_x, active_chunk_id, max_num_chunks );
+}
+```
+
+The variable `already_dense_vectors` indicates whether all the vectors accessed in this operation are already dense based on compile-time or runtime information.
+In addition, one variable is declared for each vector to indicate whether a vector is already dense, i.e., the variables `already_dense_output` and `already_dense_input` are initialised to `true`, assuming that the vectors are already dense.
+If `already_dense_vectors` evaluates to `true`, the state of the local views is not read, and the assumption that the vectors are already dense holds.
+Otherwise, it is necessary to check whether each vector accessed in the operation is already dense; if this is not the case, the state of the local view is read by invoking `asyncSubset`.
+The state of the local view is updated via `asyncJoinSubset` once the computation is completed, and only when the output vector is not already dense.
+
+#### Out-of-place operations
+
+For the implementation of the optimisation for dense vectors of an out-of-place operation, we use the example of the `grb::eWiseApply` operation defined in `blas1.hpp`.
+There exist four main scenarios we need to consider, depending on whether the output vector for a tile needs to become empty, dense, or both empty and dense, and whether the operation receives a mask.
+
+##### Out-of-place operation with a potentially sparse output vector
+
+In the case that the input consists of three vectors, the output vector will have an a-priori unknown sparsity structure.
+Therefore, unless all vectors are already dense, it is necessary to initialise the state of the output vector via `asyncSubset` and clear the coordinates of each local view by invoking `local_clear`.
+In contrast to an in-place operation, the decision about reading and updating the state of the output vector does not depend on whether the output vector is already dense,
+since an already dense output vector may become sparse depending on the sparsity structure of the input vectors.
+
+Since the current design for nonblocking execution does not allow the number of nonzeroes to decrease, it is necessary to reset the global counter of nonzeroes by invoking `reset_global_nnz_counter`.
+The `local_clear` function properly updates the number of new nonzeroes that should later be written to the global stack by `joinSubset`, i.e., all the nonzeroes of the local view are considered new.
+In addition, the output vector is marked as potentially sparse by invoking `markMaybeSparseContainer`.
+Both `reset_global_nnz_counter` and `markMaybeSparseContainer` are invoked only by the thread that executes the first tile, i.e., when `lower_bound = 0`.
+
+```cpp
+template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+>
+RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ ...
+) {
+ const size_t n = internal::getCoordinates( z ).size();
+
+ ...
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func = [&z, &x, &y, &monoid, phase] (
+ internal::Pipeline &pipeline,
+ const size_t active_chunk_id, const size_t max_num_chunks,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+ RC rc = SUCCESS;
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+ const Coords * const local_null_mask = nullptr;
+
+ Coords local_x, local_y, local_z;
+
+ const bool already_dense_vectors = dense_descr || pipeline.allAlreadyDenseVectors();
+
+ bool already_dense_input_x = true;
+ bool already_dense_input_y = true;
+
+ if( !already_dense_vectors ) {
+ local_z = internal::getCoordinates( z ).asyncSubset( active_chunk_id, max_num_chunks, lower_bound, upper_bound );
+
+ already_dense_input_x = pipeline.containsAlreadyDenseContainer( &internal::getCoordinates( x ) );
+ if( !already_dense_input_x ) {
+ local_x = internal::getCoordinates( x ).asyncSubset( active_chunk_id, max_num_chunks, lower_bound, upper_bound );
+ }
+
+ already_dense_input_y = pipeline.containsAlreadyDenseContainer( &internal::getCoordinates( y ) );
+ if( !already_dense_input_y ) {
+ local_y = internal::getCoordinates( y ).asyncSubset( active_chunk_id, max_num_chunks, lower_bound, upper_bound );
+ }
+ }
+
+ const internal::Wrapper< false, InputType1, Coords > x_wrapper( x );
+ const internal::Wrapper< false, InputType2, Coords > y_wrapper( y );
+
+ const auto op = monoid.getOperator();
+
+ if( !already_dense_vectors ) {
+ local_z.local_clear();
+ if( lower_bound == 0 ) {
+ internal::getCoordinates( z ).reset_global_nnz_counter();
+ pipeline.markMaybeSparseContainer( &internal::getCoordinates( z ) );
+ }
+ }
+
+ // performs the computation
+ ...
+
+ if( !already_dense_vectors ) {
+ internal::getCoordinates( z ).asyncJoinSubset( local_z, active_chunk_id, max_num_chunks );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ), internal::Opcode::BLAS1_EWISEAPPLY,
+ n, sizeof( OutputType ), dense_descr, true,
+ &z, nullptr,
+ &internal::getCoordinates( z ), nullptr,
+ &x, &y, nullptr, nullptr,
+ &internal::getCoordinates( x ), &internal::getCoordinates( y ), nullptr, nullptr
+ );
+
+ return ret;
+}
+```
+
+##### Out-of-place operation with a dense output vector
+
+In the case that the input consists of a scalar and a monoid, it is guaranteed that the output vector will be dense.
+Therefore, the only criterion to avoid the usage of the local views is whether the output vector is already dense.
+If the output vector is not already dense, then the state of the local view is read, all not-yet-assigned coordinates are assigned by invoking `local_assignAllNotAlreadyAssigned`, and the state is updated via `asyncJoinSubset`.
+
+```cpp
+
+template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+>
+RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const InputType1 alpha,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ ...
+) {
+ const size_t n = internal::getCoordinates( z ).size();
+
+ ...
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func = [&z, alpha, &y, &monoid] (
+ internal::Pipeline &pipeline,
+ const size_t active_chunk_id, const size_t max_num_chunks,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+ RC rc = SUCCESS;
+
+ Coords local_x, local_y, local_z;
+
+ const bool already_dense_vectors = dense_descr || pipeline.allAlreadyDenseVectors();
+
+ bool already_dense_output = true;
+ bool already_dense_input_y = true;
+
+ already_dense_output = pipeline.containsAlreadyDenseContainer( &internal::getCoordinates( z ) );
+ if( !already_dense_output ) {
+ local_z = internal::getCoordinates( z ).asyncSubset( active_chunk_id, max_num_chunks, lower_bound, upper_bound );
+ }
+
+ if( !already_dense_vectors ) {
+ already_dense_input_y = pipeline.containsAlreadyDenseContainer( &internal::getCoordinates( y ) );
+ if( !already_dense_input_y ) {
+ local_y = internal::getCoordinates( y ).asyncSubset( active_chunk_id, max_num_chunks, lower_bound, upper_bound );
+ }
+ }
+
+ const internal::Wrapper< true, InputType1, Coords > x_wrapper( alpha );
+ const internal::Wrapper< false, InputType2, Coords > y_wrapper( y );
+
+ const auto &op = monoid.getOperator();
+
+ if( !already_dense_output ) {
+ local_z.local_assignAllNotAlreadyAssigned();
+ }
+
+ // performs the computation
+ ...
+
+ if( !already_dense_output ) {
+ internal::getCoordinates( z ).asyncJoinSubset( local_z, active_chunk_id, max_num_chunks );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ), internal::Opcode::BLAS1_EWISEAPPLY,
+ n, sizeof( OutputType ), dense_descr, true,
+ &z, nullptr,
+ &internal::getCoordinates( z ), nullptr,
+ &y, nullptr, nullptr, nullptr,
+ &internal::getCoordinates( y ), nullptr, nullptr, nullptr
+ );
+
+ return ret;
+}
+```
+
+##### Out-of-place operation with an output vector that consists of some potentially sparse tiles and some dense tiles
+
+In the case that the input consists of an operator instead of a monoid, the output vector may become sparse after the computation unless all vectors are already dense.
+Therefore, the global counter of nonzeroes is reset, and the decision about clearing the local coordinates or assigning all of them is made separately for each local view.
+The vector is marked as potentially sparse when the local coordinates are cleared for at least one of the tiles.
+
+```cpp
+template<
+ Descriptor descr = descriptors::no_operation, class OP,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+>
+RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const InputType1 alpha,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const OP &op = OP(),
+ ...
+) {
+ const size_t n = internal::getCoordinates( z ).size();
+
+ ...
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func = [&z, alpha, &y, &op] (
+ internal::Pipeline &pipeline,
+ const size_t active_chunk_id, const size_t max_num_chunks,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+ RC rc = SUCCESS;
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+ const Coords * const local_null_mask = nullptr;
+
+ Coords local_mask, local_x, local_y, local_z;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_y_nz = local_n;
+
+ const bool already_dense_vectors = dense_descr || pipeline.allAlreadyDenseVectors();
+
+ bool already_dense_input_y = true;
+
+ if( !already_dense_vectors ) {
+ local_z = internal::getCoordinates( z ).asyncSubset( active_chunk_id, max_num_chunks, lower_bound, upper_bound );
+
+ already_dense_input_y = pipeline.containsAlreadyDenseContainer( &internal::getCoordinates( y ) );
+ if( !already_dense_input_y ) {
+ local_y = internal::getCoordinates( y ).asyncSubset( active_chunk_id, max_num_chunks, lower_bound, upper_bound );
+ local_y_nz = local_y.nonzeroes();
+ }
+ }
+
+ const internal::Wrapper< true, InputType1, Coords > x_wrapper( alpha );
+ const internal::Wrapper< false, InputType2, Coords > y_wrapper( y );
+
+ if( !already_dense_vectors ) {
+ if( lower_bound == 0 ) {
+ internal::getCoordinates( z ).reset_global_nnz_counter();
+ }
+ }
+
+ if( (descr & descriptors::dense) || local_y_nz == local_n ) {
+ if( !already_dense_vectors ) {
+ local_z.local_assignAll( );
+ }
+
+ // performs the computation for the dense case
+ ...
+ } else {
+ if( !already_dense_vectors ) {
+ local_z.local_clear();
+ pipeline.markMaybeSparseContainer( &internal::getCoordinates( z ) );
+ }
+
+ // performs the computation for the sparse case
+ ...
+ }
+
+ if( !already_dense_vectors ) {
+ internal::getCoordinates( z ).asyncJoinSubset( local_z, active_chunk_id, max_num_chunks );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ), internal::Opcode::BLAS1_EWISEAPPLY,
+ n, sizeof( OutputType ), dense_descr, true,
+ &z, nullptr,
+ &internal::getCoordinates( z ), nullptr,
+ &y, nullptr, nullptr, nullptr,
+ &internal::getCoordinates( y ), nullptr, nullptr, nullptr
+ );
+
+ return ret;
+}
+```
+
+##### Out-of-place operation that receives a mask
+
+In the case that an out-of-place operation receives a mask, a second variable, `mask_is_dense`, is used to indicate whether the mask is dense based on compile-time information from descriptors or the runtime analysis for already dense vectors.
+Then, all the decisions about the output vector are made based on this variable.
+In addition, the function `markMaybeSparseDenseDescriptorVerification` is invoked to mark the output vector as potentially sparse when the `dense` descriptor is provided and the elements of the mask may evaluate to `false`, as explained in the section about the dense descriptor verification.
+
+```cpp
+template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+>
+RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const InputType1 alpha,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ ...
+) {
+ const size_t n = internal::getCoordinates( z ).size();
+
+ ...
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+ constexpr const bool dense_mask = dense_descr && (descr & descriptors::structural) && !(descr & descriptors::invert_mask);
+
+ internal::Pipeline::stage_type func = [&z, &mask, alpha, &y, &monoid] (
+ internal::Pipeline &pipeline,
+ const size_t active_chunk_id, const size_t max_num_chunks,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+ RC rc = SUCCESS;
+
+ Coords local_mask, local_x, local_y, local_z;
+ const size_t local_n = upper_bound - lower_bound;
+
+ const bool already_dense_vectors = dense_descr || pipeline.allAlreadyDenseVectors();
+
+ const bool mask_is_dense = (descr & descriptors::structural) &&
+ !(descr & descriptors::invert_mask) && already_dense_vectors;
+
+ bool already_dense_mask = true;
+ bool already_dense_input_y = true;
+
+ if( !mask_is_dense ) {
+ local_z = internal::getCoordinates( z ).asyncSubset( active_chunk_id, max_num_chunks, lower_bound, upper_bound );
+ if( dense_descr && local_z.nonzeroes() < local_n ) {
+ return ILLEGAL;
+ }
+ }
+
+ if( !already_dense_vectors ) {
+ already_dense_mask = pipeline.containsAlreadyDenseContainer( &internal::getCoordinates( mask ) );
+ if( !already_dense_mask ) {
+ local_mask = internal::getCoordinates( mask ).asyncSubset( active_chunk_id, max_num_chunks, lower_bound, upper_bound );
+ }
+
+ already_dense_input_y = pipeline.containsAlreadyDenseContainer( &internal::getCoordinates( y ) );
+ if( !already_dense_input_y ) {
+ local_y = internal::getCoordinates( y ).asyncSubset( active_chunk_id, max_num_chunks, lower_bound, upper_bound );
+ }
+ }
+
+ const internal::Wrapper< true, InputType1, Coords > x_wrapper( alpha );
+ const internal::Wrapper< false, InputType2, Coords > y_wrapper( y );
+
+ const InputType2 right_identity = monoid.template getIdentity< InputType2 >();
+ const auto &op = monoid.getOperator();
+
+ if( !mask_is_dense ) {
+ local_z.local_clear();
+ if( lower_bound == 0 ) {
+ internal::getCoordinates( z ).reset_global_nnz_counter();
+ pipeline.markMaybeSparseContainer( &internal::getCoordinates( z ) );
+ if( dense_descr ) {
+ pipeline.markMaybeSparseDenseDescriptorVerification( &internal::getCoordinates( z ) );
+ }
+ }
+ }
+
+ // performs the computation
+ ...
+
+ if( !mask_is_dense ) {
+ internal::getCoordinates( z ).asyncJoinSubset( local_z, active_chunk_id, max_num_chunks );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ), internal::Opcode::BLAS1_MASKED_EWISEAPPLY,
+ n, sizeof( OutputType ), dense_descr, dense_mask,
+ &z, nullptr,
+ &internal::getCoordinates( z ), nullptr,
+ &y, &mask, nullptr, nullptr,
+ &internal::getCoordinates( y ), &internal::getCoordinates( mask ), nullptr, nullptr
+ );
+
+ return ret;
+}
+```
+
+
+## Pipeline execution
+
+The nonblocking execution in ALP/GraphBLAS expresses operations as a linear sequence of stages that form a pipeline. A pipeline is executed only when its results are required for the sound execution of the program. Opaqueness guarantees that lazy evaluation is safe whenever the output of an operation is a container, i.e., a vector or a matrix. The current version of ALP/GraphBLAS follows [version 1.3.0](https://graphblas.org/docs/GraphBLAS_API_C_v1.3.0.pdf) of the C API specification and therefore does not implement scalars as opaque data types. Opaque scalars were introduced later, in [version 2.0.0](https://graphblas.org/docs/GraphBLAS_API_C_v2.0.0.pdf), and may further improve the performance of nonblocking execution.
+
+A pipeline must be executed in the following cases:
+
+* the user explicitly extracts data from a container by using the ALP/GraphBLAS API, e.g., when reading the elements of a vector by using iterators;
+
+* the user invokes the constructor of a container;
+
+* memory is deallocated due to a destructor invocation;
+
+* the invoked operation returns a scalar, e.g., the `grb::dot` operation; in this case, the operation is first added to the pipeline, and the pipeline is then executed immediately before the scalar is returned;
+
+* a sparse matrix–vector multiplication (SpMV) operation is added to a pipeline together with another operation that overwrites the input vector of the SpMV;
+
+* the user explicitly forces the execution of a pipeline via a call to `grb::wait`.
+
+Although level-3 operations are not yet implemented for nonblocking execution, a sparse matrix–sparse matrix multiplication (SpMSpM) operation implies the same constraint as SpMV, i.e., an SpMSpM operation cannot be fused with another operation that overwrites any of its input matrices.
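+
+As an illustration of these triggers, consider the following minimal sketch; it assumes the nonblocking backend is selected, a vector size `n`, and a suitable semiring `ring` and binary operator `plus` in scope:
+
+```cpp
+grb::Vector< double > x( n ), y( n ), z( n );
+
+grb::set( x, 1.5 );               // added to a pipeline, not yet executed
+grb::set( y, 2.5 );               // added to the same pipeline
+grb::eWiseApply( z, x, y, plus ); // still only recorded
+
+double alpha = 0.0;
+grb::dot( alpha, x, y, ring );    // returns a scalar: the pipeline executes
+                                  // immediately before alpha is returned
+
+for( auto it = z.cbegin(); it != z.cend(); ++it ) {
+	// reading z via iterators would likewise have forced execution
+}
+```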
+
+When a new stage is added to a pipeline, any pipeline execution this triggers is performed within the `addStage` function of `lazy_evaluation.cpp`, which implements the dynamic data dependence analysis and identifies any data shared between operations. Pipeline executions caused by the explicit use of iterators, by constructor invocations, or by memory deallocation are performed in `vector.hpp`. The execution of a pipeline caused by `grb::wait` is implemented in `io.hpp`.
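+
+A minimal sketch of the kind of check such a dependence analysis performs is given below; the function name and the flat set-based bookkeeping are illustrative only and do not mirror the actual implementation:
+
+```cpp
+#include <set>
+
+// hypothetical: a pipeline remembers (pointers to) the containers that its
+// recorded stages read or write; a new stage depends on the pipeline exactly
+// when their container sets intersect
+bool sharesData(
+	const std::set< const void * > &pipeline_containers,
+	const std::set< const void * > &stage_containers
+) {
+	for( const void * const c : stage_containers ) {
+		if( pipeline_containers.count( c ) > 0 ) {
+			return true;
+		}
+	}
+	return false;
+}
+```
+
+Conceptually, a stage that shares data with an existing pipeline is appended to it, while the unsafe cases listed above instead force the existing pipeline to execute first.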
+
+The code for the pipeline execution is found in the `execution` method of `pipeline.cpp`. The execution proceeds in four main steps, three of which may be omitted when the pipeline does not include any out-of-place operation and all accessed vectors are dense. Simplified code for these four steps is shown below.
+
+```cpp
+bool initialized_coordinates = false;
+
+#pragma omp parallel for private(vt, pt) schedule(dynamic) num_threads(nthreads)
+for( size_t tile_id = 0; tile_id < tiles; ++tile_id ) {
+ ...
+ for( vt = vbegin(); vt != vend(); ++vt ) {
+ ...
+ (**vt).asyncSubsetInit( tile_id, tiles, lower_bound, upper_bound );
+ initialized_coordinates = true;
+ }
+}
+
+#pragma omp parallel for private(vt, pt) schedule(dynamic) num_threads(nthreads)
+for( size_t tile_id = 0; tile_id < tiles; ++tile_id ) {
+ ...
+ RC local_ret = SUCCESS;
+ for( pt = pbegin(); pt != pend(); ++pt ) {
+ local_ret = local_ret ? local_ret : (*pt)( *this, tile_id, tiles, lower_bound, upper_bound );
+ }
+ if( local_ret != SUCCESS ) {
+ ret = local_ret;
+ }
+}
+
+if( initialized_coordinates ) {
+ bool new_nnz = false;
+
+ for( vt = vbegin(); vt != vend(); ++vt ) {
+ ...
+ if( (**vt).newNonZeroes( tiles ) ) {
+ new_nnz = true;
+ (**vt).prefixSumComputation( tiles );
+ }
+ }
+
+ if( new_nnz ) {
+ #pragma omp parallel for private(vt) schedule(dynamic) num_threads(nthreads)
+ for( size_t tile_id = 0; tile_id < tiles; ++tile_id ) {
+ ...
+ for( vt = vbegin(); vt != vend(); ++vt ) {
+ ...
+ if( (**vt).newNonZeroes( tiles ) ) {
+ (**vt).joinSubset( tile_id, tiles, lower_bound, upper_bound );
+ }
+ }
+ }
+ }
+}
+```
+The local views of each vector accessed in the pipeline are initialised via `asyncSubsetInit`, and then the pipeline stages are executed. Once the execution completes, the local views may contain new nonzeroes that must be pushed to the global stack by `joinSubset`. Before this step, a prefix-sum computation over the number of new nonzeroes of each local view is performed by `prefixSumComputation`. As the OpenMP directives show, all of these steps may be executed in parallel over different tiles of the vectors, except for the prefix-sum computation, which is parallelised internally. The OpenMP scheduling policy is dynamic to handle load imbalance, and the performance parameters, i.e., the number of threads and the tile size used in the lambda functions, are selected automatically by the analytic model (see `analytic_model.cpp`).
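+
+The following self-contained sketch illustrates the join mechanism described above; the type and function names are hypothetical and do not mirror the actual `Coordinates` API:
+
+```cpp
+#include <algorithm>
+#include <cstddef>
+#include <vector>
+
+struct TileView {
+	std::vector< std::size_t > new_nonzeroes; // indices found by this tile
+};
+
+void joinTiles(
+	std::vector< std::size_t > &global_stack,
+	const std::vector< TileView > &tiles
+) {
+	// a prefix sum over the per-tile counts yields each tile's write offset
+	// (the role of prefixSumComputation)
+	std::vector< std::size_t > offsets( tiles.size() + 1, global_stack.size() );
+	for( std::size_t t = 0; t < tiles.size(); ++t ) {
+		offsets[ t + 1 ] = offsets[ t ] + tiles[ t ].new_nonzeroes.size();
+	}
+	global_stack.resize( offsets.back() );
+	// each tile appends at its own offset (the role of joinSubset); the
+	// writes are conflict-free, so this loop may run in parallel
+	for( std::size_t t = 0; t < tiles.size(); ++t ) {
+		std::copy(
+			tiles[ t ].new_nonzeroes.begin(), tiles[ t ].new_nonzeroes.end(),
+			global_stack.begin() + offsets[ t ]
+		);
+	}
+}
+```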
+
+
+## Analytic performance model
+
+The analytic performance model used for nonblocking execution consists of the `getPerformanceParameters` function defined in `analytic_model.cpp`, which is invoked before the pipeline execution within the `execution` method of `pipeline.cpp`. The analytic model estimates the number of threads and the tile size that lead to good performance for a given pipeline, based on parameters such as the number of vectors accessed in the pipeline, the data types of the vectors, and their sizes. Two parameters of special importance are the size of the L1 cache and the number of cores available in the system, since the selected tile size must allow the accessed data to fit in the L1 cache, and there should be sufficient work to utilise as many cores as possible.
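+
+The following is a rough sketch of this kind of estimate; the formula and the function name are illustrative and are not the ones implemented in `analytic_model.cpp`:
+
+```cpp
+#include <algorithm>
+#include <cstddef>
+
+// hypothetical: choose a tile small enough that one tile of every accessed
+// vector fits in L1, then use as many threads as there are tiles, capped by
+// the number of available cores
+void estimatePerformanceParameters(
+	std::size_t &nthreads, std::size_t &tile_size,
+	const std::size_t vector_size, const std::size_t num_vectors,
+	const std::size_t elem_size, const std::size_t l1_size,
+	const std::size_t num_cores
+) {
+	tile_size = std::max< std::size_t >(
+		1, l1_size / ( num_vectors * elem_size ) );
+	const std::size_t tiles = ( vector_size + tile_size - 1 ) / tile_size;
+	nthreads = std::min( num_cores, tiles );
+}
+```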
+
+The analytic model relies on two environment variables:
+
+* `OMP_NUM_THREADS`
+* `GRB_NONBLOCKING_TILE_SIZE`
+
+for the number of threads used by OpenMP and the tile size used by the nonblocking backend, respectively. The number of threads given by `OMP_NUM_THREADS` is an upper bound on the number of threads the analytic model may select. If `GRB_NONBLOCKING_TILE_SIZE` is set, the given tile size is used, fixed, for all executed pipelines, e.g., invoking a program as `GRB_NONBLOCKING_TILE_SIZE=512 ./program` fixes the tile size to 512 for every pipeline. Otherwise, the analytic model automatically selects a suitable tile size depending on the parameters of the executed pipeline.
+
+The initialisation for the number of threads used by OpenMP and the manual tile size is performed in `init.cpp`, and the data of the analytic model are handled by the `ANALYTIC_MODEL` and `IMPLEMENTATION` classes of `config.hpp`.
+
+
+## Dense descriptor verification
+
+For blocking execution, the correct usage of the `dense` descriptor is checked at the beginning of each ALP/GraphBLAS operation.
+If at least one input or output vector is not dense, the `grb::ILLEGAL` error code is returned, as shown in the example below.
+
+```cpp
+const size_t n = size( x );
+if( (descr & descriptors::dense) && nnz( x ) < n ) {
+ return ILLEGAL;
+}
+```
+
+For the nonblocking execution, checking the correct usage of the `dense` descriptor requires a different process, since the number of nonzeroes in the vectors may not be up to date due to lazy evaluation.
+In particular, the check is moved into the lambda function defined for each operation, and the check of the sparsity structure is based on the local views.
+However, the optimisation employed by the nonblocking execution for already dense vectors implies that the local views are not always available.
+Therefore, it is not always possible to perform the check for correct usage of the `dense` descriptor within the lambda function of an operation.
+
+The verification process for correct usage of the `dense` descriptor relies on the following property:
+
+*A vector that should be dense when an operation is invoked should remain dense after the execution of the pipeline, unless this vector is the output of an out-of-place operation that receives a mask with elements that may evaluate to `false`*.
+
+Therefore, the `nonblocking` backend delays the check and performs the verification for correct usage of the `dense` descriptor after the pipeline execution.
+To keep track of the vectors that should be dense after the execution of the pipeline, the addition of a lambda function as a pipeline stage is accompanied by a boolean variable, called `dense_descr`, that indicates whether the `dense` descriptor is given for this operation.
+In the case of an out-of-place operation that receives a mask, e.g., the `grb::eWiseApply` discussed earlier, the output vector may be marked as potentially sparse when the `dense` descriptor is provided, by invoking `markMaybeSparseDenseDescriptorVerification` as shown in the example of `grb::eWiseApply` above.
+In that case, the dense descriptor verification is disabled for the output vector of this specific operation.
+
+This solution is efficient and catches most cases of an illegal `dense` descriptor.
+However, it cannot catch illegal usage of the `dense` descriptor by an operation that receives a sparse vector that only becomes dense during the execution of the pipeline, since it is then impossible to detect that the vector was not dense earlier.
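+
+A minimal sketch of such an undetectable case, assuming the nonblocking backend, a vector size `n`, and a binary operator `plus` in scope:
+
+```cpp
+grb::Vector< double > x( n ), y( n ), z( n );
+grb::set( y, 1.0 );  // y becomes dense
+
+// x has no nonzeroes at this point
+grb::set( x, 2.0 );  // lazily recorded: will make x dense
+
+// illegal: x is sparse when this call is made, yet the dense descriptor is
+// given; by the time the fused pipeline executes, the local views of x are
+// dense, so the delayed verification cannot flag the violation
+grb::eWiseApply< grb::descriptors::dense >( z, x, y, plus );
+```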
+
diff --git a/docs/Suppressions.md b/docs/Suppressions.md
index 1915147b5..630b044ab 100644
--- a/docs/Suppressions.md
+++ b/docs/Suppressions.md
@@ -48,41 +48,17 @@ if( masked ) {
```
4. `include/graphblas/base/internalops.hpp`, multiple sources:
-- mul::apply, add::apply, add::foldl, equal::apply, not_equal::apply.
+- mul::apply, add::apply, add::foldl, equal::apply, not_equal::apply, and
+ logical_and::foldl.
These are indirectly caused by the following calls:
- `include/graphblas/blas0.hpp`, apply;
- `include/graphblas/reference/blas1.hpp`, dot_generic, masked_apply_generic,
- and sparse_apply_generic.
+ sparse_apply_generic, and fold_from_vector_to_scalar_generic.
These are all OK to suppress since the reads are masked.
-5. `include/graphblas/reference/blas1.hpp`, fold_from_vector_to_scalar_generic:
-```
-GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // the below code ensures to set local
-IOType local; // whenever our local block is
-GRB_UTIL_RESTORE_WARNINGS // non-empty
-if( end > 0 ) {
- if( i < end ) {
- local = static_cast< IOType >( internal::getRaw( to_fold )[ i ] );
- } else {
- local = static_cast< IOType >( internal::getRaw( to_fold )[ 0 ] );
- }
-}
-```
-and
-```
-if( root == s ) {
- // then I should be non-empty
- assert( !empty );
- // set global value to locally computed value
- GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // one is only root if the local
- global = local; // chunk is non-empty, in which case
- GRB_UTIL_RESTORE_WARNINGS // local will be initialised (above)
- }
-```
-
-6. `include/graphblas/reference/blas1.hpp`, masked_apply_generic:
+5. `include/graphblas/reference/blas1.hpp`, masked_apply_generic:
```
if( mask_b[ t ] ) {
// ...
@@ -91,3 +67,18 @@ if( mask_b[ t ] ) {
GRB_UTIL_RESTORE_WARNINGS // if mask_b is true
```
+6. `include/graphblas/nonblocking/blas1.hpp`, masked_apply_generic:
+```
+for( size_t k = 0; k < block_size; ++k ) {
+ const size_t index = i + k;
+ assert( index < local_n + lower_bound );
+ if( mask_b[ k ] ) {
+ (void) local_z.assign( index - lower_bound );
+ GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // This is only triggered with
+ *( z_p + index ) = z_b[ k ]; // mask_b[ k ], which in the above
+ GRB_UTIL_RESTORE_WARNINGS // loop also triggers initialising
+ // z_b[ k ]
+ }
+}
+```
+
diff --git a/docs/doxy.conf b/docs/doxy.conf
index d91dae080..d1e63f220 100644
--- a/docs/doxy.conf
+++ b/docs/doxy.conf
@@ -1,20 +1,4 @@
-# Doxyfile 1.8.14
-
-#
-# Copyright 2021 Huawei Technologies Co., Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
+# Doxyfile 1.9.3
# This file describes the settings to be used by the documentation system
# doxygen (www.doxygen.org) for a project.
@@ -33,10 +17,10 @@
# Project related configuration options
#---------------------------------------------------------------------------
-# This tag specifies the encoding used for all characters in the config file
-# that follow. The default is UTF-8 which is also the encoding used for all text
-# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
-# built into libc) for the transcoding. See
+# This tag specifies the encoding used for all characters in the configuration
+# file that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
# https://www.gnu.org/software/libiconv/ for the list of possible encodings.
# The default value is: UTF-8.
@@ -48,19 +32,19 @@ DOXYFILE_ENCODING = UTF-8
# title of most generated pages and in a few other places.
# The default value is: My Project.
-PROJECT_NAME = "ALP/GraphBLAS"
+PROJECT_NAME = "ALP Developer Documentation"
# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
# could be handy for archiving the generated documentation or if some version
# control system is used.
-PROJECT_NUMBER = 0.6.0
+PROJECT_NUMBER = 0.7.0
# Using the PROJECT_BRIEF tag one can provide an optional one line description
# for a project that appears at the top of each page and should give viewer a
# quick idea about the purpose of the project. Keep the description short.
-PROJECT_BRIEF =
+PROJECT_BRIEF = "Algebraic Programming Developer Documentation"
# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
# in the documentation. The maximum height of the logo should not exceed 55
@@ -74,7 +58,7 @@ PROJECT_LOGO =
# entered, it will be relative to the location where doxygen was started. If
# left blank the current directory will be used.
-OUTPUT_DIRECTORY = docs/code
+OUTPUT_DIRECTORY = docs/developer
# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
# directories (in 2 levels) under the output directory of each output format and
@@ -149,7 +133,7 @@ ALWAYS_DETAILED_SEC = NO
# operators of the base classes will not be shown.
# The default value is: NO.
-INLINE_INHERITED_MEMB = NO
+INLINE_INHERITED_MEMB = YES
# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path
# before files name in the file list and in the header files. If set to NO the
@@ -195,6 +179,16 @@ SHORT_NAMES = NO
JAVADOC_AUTOBRIEF = YES
+# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line
+# such as
+# /***************
+# as being the beginning of a Javadoc-style comment "banner". If set to NO, the
+# Javadoc-style will behave just like regular comments and it will not be
+# interpreted by doxygen.
+# The default value is: NO.
+
+JAVADOC_BANNER = NO
+
# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
# line (until the first dot) of a Qt-style comment as the brief description. If
# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
@@ -215,6 +209,14 @@ QT_AUTOBRIEF = NO
MULTILINE_CPP_IS_BRIEF = NO
+# By default Python docstrings are displayed as preformatted text and doxygen's
+# special commands cannot be used. By setting PYTHON_DOCSTRING to NO the
+# doxygen's special commands can be used and the contents of the docstring
+# documentation blocks is shown as doxygen documentation.
+# The default value is: YES.
+
+PYTHON_DOCSTRING = YES
+
# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
# documentation from any documented member that it re-implements.
# The default value is: YES.
@@ -238,21 +240,19 @@ TAB_SIZE = 4
# the documentation. An alias has the form:
# name=value
# For example adding
-# "sideeffect=@par Side Effects:\n"
+# "sideeffect=@par Side Effects:^^"
# will allow you to put the command \sideeffect (or @sideeffect) in the
# documentation, which will result in a user-defined paragraph with heading
-# "Side Effects:". You can put \n's in the value part of an alias to insert
-# newlines (in the resulting output). You can put ^^ in the value part of an
-# alias to insert a newline as if a physical newline was in the original file.
+# "Side Effects:". Note that you cannot put \n's in the value part of an alias
+# to insert newlines (in the resulting output). You can put ^^ in the value part
+# of an alias to insert a newline as if a physical newline was in the original
+# file. When you need a literal { or } or , in the value part of an alias you
+# have to escape them by means of a backslash (\), this can lead to conflicts
+# with the commands \{ and \} for these it is advised to use the version @{ and
+# @} or use a double escape (\\{ and \\})
ALIASES =
-# This tag can be used to specify a number of word-keyword mappings (TCL only).
-# A mapping has the form "name=value". For example adding "class=itcl::class"
-# will allow you to use the command class in the itcl::class meaning.
-
-TCL_SUBST =
-
# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
# only. Doxygen will then generate output that is more tailored for C. For
# instance, some of the names that are used will be different. The list of all
@@ -281,28 +281,40 @@ OPTIMIZE_FOR_FORTRAN = NO
OPTIMIZE_OUTPUT_VHDL = NO
+# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice
+# sources only. Doxygen will then generate output that is more tailored for that
+# language. For instance, namespaces will be presented as modules, types will be
+# separated into more groups, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_SLICE = NO
+
# Doxygen selects the parser to use depending on the extension of the files it
# parses. With this tag you can assign which parser to use for a given
# extension. Doxygen has a built-in mapping, but you can override or extend it
# using this tag. The format is ext=language, where ext is a file extension, and
-# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
-# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran:
-# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran:
-# Fortran. In the later case the parser tries to guess whether the code is fixed
-# or free formatted code, this is the default for Fortran type files), VHDL. For
-# instance to make doxygen treat .inc files as Fortran files (default is PHP),
-# and .f files as C (default is Fortran), use: inc=Fortran f=C.
+# language is one of the parsers supported by doxygen: IDL, Java, JavaScript,
+# Csharp (C#), C, C++, Lex, D, PHP, md (Markdown), Objective-C, Python, Slice,
+# VHDL, Fortran (fixed format Fortran: FortranFixed, free formatted Fortran:
+# FortranFree, unknown formatted Fortran: Fortran. In the later case the parser
+# tries to guess whether the code is fixed or free formatted code, this is the
+# default for Fortran type files). For instance to make doxygen treat .inc files
+# as Fortran files (default is PHP), and .f files as C (default is Fortran),
+# use: inc=Fortran f=C.
#
# Note: For files without extension you can use no_extension as a placeholder.
#
# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
-# the files are not read by doxygen.
+# the files are not read by doxygen. When specifying no_extension you should add
+# * to the FILE_PATTERNS.
+#
+# Note see also the list of default file extension mappings.
EXTENSION_MAPPING =
# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
# according to the Markdown format, which allows for more readable
-# documentation. See http://daringfireball.net/projects/markdown/ for details.
+# documentation. See https://daringfireball.net/projects/markdown/ for details.
# The output of markdown processing is further processed by doxygen, so you can
# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
# case of backward compatibilities issues.
@@ -314,7 +326,7 @@ MARKDOWN_SUPPORT = YES
# to that level are automatically included in the table of contents, even if
# they do not have an id attribute.
# Note: This feature currently applies only to Markdown headings.
-# Minimum value: 0, maximum value: 99, default value: 0.
+# Minimum value: 0, maximum value: 99, default value: 5.
# This tag requires that the tag MARKDOWN_SUPPORT is set to YES.
TOC_INCLUDE_HEADINGS = 0
@@ -430,6 +442,19 @@ TYPEDEF_HIDES_STRUCT = NO
LOOKUP_CACHE_SIZE = 0
+# The NUM_PROC_THREADS specifies the number threads doxygen is allowed to use
+# during processing. When set to 0 doxygen will based this on the number of
+# cores available in the system. You can set it explicitly to a value larger
+# than 0 to get more control over the balance between CPU load and processing
+# speed. At this moment only the input processing can be done using multiple
+# threads. Since this is still an experimental feature the default is set to 1,
+# which effectively disables parallel processing. Please report any issues you
+# encounter. Generating dot graphs in parallel is controlled by the
+# DOT_NUM_THREADS setting.
+# Minimum value: 0, maximum value: 32, default value: 1.
+
+NUM_PROC_THREADS = 1
+
#---------------------------------------------------------------------------
# Build related configuration options
#---------------------------------------------------------------------------
@@ -448,13 +473,19 @@ EXTRACT_ALL = NO
# be included in the documentation.
# The default value is: NO.
-EXTRACT_PRIVATE = NO
+EXTRACT_PRIVATE = YES
+
+# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual
+# methods of a class will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIV_VIRTUAL = NO
# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
# scope will be included in the documentation.
# The default value is: NO.
-EXTRACT_PACKAGE = NO
+EXTRACT_PACKAGE = YES
# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be
# included in the documentation.
@@ -476,7 +507,7 @@ EXTRACT_LOCAL_CLASSES = YES
# included.
# The default value is: NO.
-EXTRACT_LOCAL_METHODS = NO
+EXTRACT_LOCAL_METHODS = YES
# If this flag is set to YES, the members of anonymous namespaces will be
# extracted and appear in the documentation as a namespace called
@@ -487,6 +518,13 @@ EXTRACT_LOCAL_METHODS = NO
EXTRACT_ANON_NSPACES = NO
+# If this flag is set to YES, the name of an unnamed parameter in a declaration
+# will be determined by the corresponding definition. By default unnamed
+# parameters remain unnamed in the output.
+# The default value is: YES.
+
+RESOLVE_UNNAMED_PARAMS = YES
+
# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
# undocumented members inside documented classes or files. If set to NO these
# members will be included in the various overviews, but no documentation
@@ -504,8 +542,8 @@ HIDE_UNDOC_MEMBERS = NO
HIDE_UNDOC_CLASSES = NO
# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
-# (class|struct|union) declarations. If set to NO, these declarations will be
-# included in the documentation.
+# declarations. If set to NO, these declarations will be included in the
+# documentation.
# The default value is: NO.
HIDE_FRIEND_COMPOUNDS = NO
@@ -522,13 +560,20 @@ HIDE_IN_BODY_DOCS = NO
# will be excluded. Set it to YES to include the internal documentation.
# The default value is: NO.
-INTERNAL_DOCS = NO
-
-# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
-# names in lower-case letters. If set to YES, upper-case letters are also
-# allowed. This is useful if you have classes or files whose names only differ
-# in case and if your file system supports case sensitive file names. Windows
-# and Mac users are advised to set this option to NO.
+INTERNAL_DOCS = YES
+
+# With the correct setting of option CASE_SENSE_NAMES doxygen will better be
+# able to match the capabilities of the underlying filesystem. In case the
+# filesystem is case sensitive (i.e. it supports files in the same directory
+# whose names only differ in casing), the option must be set to YES to properly
+# deal with such files in case they appear in the input. For filesystems that
+# are not case sensitive the option should be be set to NO to properly deal with
+# output files written for symbols that only differ in casing, such as for two
+# classes, one named CLASS and the other named Class, and to also support
+# references to files without having to specify the exact matching casing. On
+# Windows (including Cygwin) and MacOS, users should typically set this option
+# to NO, whereas on Linux or other Unix flavors it should typically be set to
+# YES.
# The default value is: system dependent.
CASE_SENSE_NAMES = YES
@@ -547,6 +592,12 @@ HIDE_SCOPE_NAMES = YES
HIDE_COMPOUND_REFERENCE= NO
+# If the SHOW_HEADERFILE tag is set to YES then the documentation for a class
+# will show which file needs to be included to use the class.
+# The default value is: YES.
+
+SHOW_HEADERFILE = YES
+
# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
# the files that are included by a file in the documentation of that file.
# The default value is: YES.
@@ -704,7 +755,8 @@ FILE_VERSION_FILTER =
# output files in an output format independent way. To create the layout file
# that represents doxygen's defaults, run doxygen with the -l option. You can
# optionally specify a file name after the option, if omitted DoxygenLayout.xml
-# will be used as the name of the layout file.
+# will be used as the name of the layout file. See also section "Changing the
+# layout of pages" for information.
#
# Note that if you run doxygen from a directory containing a file called
# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
@@ -750,23 +802,35 @@ WARNINGS = YES
WARN_IF_UNDOCUMENTED = YES
# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
-# potential errors in the documentation, such as not documenting some parameters
-# in a documented function, or documenting parameters that don't exist or using
-# markup commands wrongly.
+# potential errors in the documentation, such as documenting some parameters in
+# a documented function twice, or documenting parameters that don't exist or
+# using markup commands wrongly.
# The default value is: YES.
WARN_IF_DOC_ERROR = YES
+# If WARN_IF_INCOMPLETE_DOC is set to YES, doxygen will warn about incomplete
+# function parameter documentation. If set to NO, doxygen will accept that some
+# parameters have no documentation without warning.
+# The default value is: YES.
+
+WARN_IF_INCOMPLETE_DOC = YES
+
# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
# are documented, but have no documentation for their parameters or return
-# value. If set to NO, doxygen will only warn about wrong or incomplete
-# parameter documentation, but not about the absence of documentation.
+# value. If set to NO, doxygen will only warn about wrong parameter
+# documentation, but not about the absence of documentation. If EXTRACT_ALL is
+# set to YES then this flag will automatically be disabled. See also
+# WARN_IF_INCOMPLETE_DOC
# The default value is: NO.
WARN_NO_PARAMDOC = NO
# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when
-# a warning is encountered.
+# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS
+# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but
+# at the end of the doxygen process doxygen will return with a non-zero status.
+# Possible values are: NO, YES and FAIL_ON_WARNINGS.
# The default value is: NO.
WARN_AS_ERROR = NO
@@ -783,7 +847,10 @@ WARN_FORMAT = "$file:$line: $text"
# The WARN_LOGFILE tag can be used to specify a file to which warning and error
# messages should be written. If left blank the output is written to standard
-# error (stderr).
+# error (stderr). In case the file specified cannot be opened for writing the
+# warning and error messages are written to standard error. When as file - is
+# specified the warning and error messages are written to standard output
+# (stdout).
WARN_LOGFILE =
@@ -802,8 +869,8 @@ INPUT = include/
# This tag can be used to specify the character encoding of the source files
# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
-# documentation (see: https://www.gnu.org/software/libiconv/) for the list of
-# possible encodings.
+# documentation (see:
+# https://www.gnu.org/software/libiconv/) for the list of possible encodings.
# The default value is: UTF-8.
INPUT_ENCODING = UTF-8
@@ -816,11 +883,15 @@ INPUT_ENCODING = UTF-8
# need to set EXTENSION_MAPPING for the extension otherwise the files are not
# read by doxygen.
#
+# Note the list of default checked file patterns might differ from the list of
+# default file extension mappings.
+#
# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp,
# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
-# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
-# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08,
-# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf and *.qsf.
+# *.hh, *.hxx, *.hpp, *.h++, *.l, *.cs, *.d, *.php, *.php4, *.php5, *.phtml,
+# *.inc, *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C
+# comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd,
+# *.vhdl, *.ucf, *.qsf and *.ice.
FILE_PATTERNS = *.hpp \
*.cpp \
@@ -862,7 +933,7 @@ EXCLUDE_PATTERNS =
# (namespaces, classes, functions, etc.) that should be excluded from the
# output. The symbol name can be a fully qualified name, a word, or if the
# wildcard * is used, a substring. Examples: ANamespace, AClass,
-# AClass::ANamespace, ANamespace::*Test
+# ANamespace::AClass, ANamespace::*Test
#
# Note that the wildcards are matched against the file with absolute path, so to
# exclude all test directories use the pattern */test/*
@@ -980,7 +1051,7 @@ INLINE_SOURCES = NO
STRIP_CODE_COMMENTS = YES
# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
-# function all documented functions referencing it will be listed.
+# entity all documented functions referencing it will be listed.
# The default value is: NO.
REFERENCED_BY_RELATION = NO
@@ -1017,7 +1088,7 @@ SOURCE_TOOLTIPS = YES
#
# To use it do the following:
# - Install the latest version of global
-# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
+# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file
# - Make sure the INPUT points to the root of the source tree
# - Run doxygen as normal
#
@@ -1050,13 +1121,6 @@ VERBATIM_HEADERS = YES
ALPHABETICAL_INDEX = YES
-# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
-# which the alphabetical index list will be split.
-# Minimum value: 1, maximum value: 20, default value: 5.
-# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
-
-COLS_IN_ALPHA_INDEX = 5
-
# In case all classes in a project start with a common prefix, all classes will
# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
# can be used to specify a prefix (or a list of prefixes) that should be ignored
@@ -1156,7 +1220,7 @@ HTML_EXTRA_FILES =
# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
# will adjust the colors in the style sheet and background images according to
-# this color. Hue is specified as an angle on a colorwheel, see
+# this color. Hue is specified as an angle on a color-wheel, see
# https://en.wikipedia.org/wiki/Hue for more information. For instance the value
# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
# purple, and 360 is red again.
@@ -1166,7 +1230,7 @@ HTML_EXTRA_FILES =
HTML_COLORSTYLE_HUE = 220
# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
-# in the HTML output. For a value of 0 the output will use grayscales only. A
+# in the HTML output. For a value of 0 the output will use gray-scales only. A
# value of 255 will produce the most vivid colors.
# Minimum value: 0, maximum value: 255, default value: 100.
# This tag requires that the tag GENERATE_HTML is set to YES.
@@ -1195,9 +1259,9 @@ HTML_TIMESTAMP = YES
# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML
# documentation will contain a main index with vertical navigation menus that
-# are dynamically created via Javascript. If disabled, the navigation index will
+# are dynamically created via JavaScript. If disabled, the navigation index will
# consists of multiple levels of tabs that are statically embedded in every HTML
-# page. Disable this option to support browsers that do not have Javascript,
+# page. Disable this option to support browsers that do not have JavaScript,
# like the Qt help browser.
# The default value is: YES.
# This tag requires that the tag GENERATE_HTML is set to YES.
@@ -1227,13 +1291,14 @@ HTML_INDEX_NUM_ENTRIES = 100
# If the GENERATE_DOCSET tag is set to YES, additional index files will be
# generated that can be used as input for Apple's Xcode 3 integrated development
-# environment (see: https://developer.apple.com/tools/xcode/), introduced with
-# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
-# Makefile in the HTML output directory. Running make will produce the docset in
-# that directory and running make install will install the docset in
+# environment (see:
+# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To
+# create a documentation set, doxygen will generate a Makefile in the HTML
+# output directory. Running make will produce the docset in that directory and
+# running make install will install the docset in
# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
-# startup. See https://developer.apple.com/tools/creatingdocsetswithdoxygen.html
-# for more information.
+# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy
+# genXcode/_index.html for more information.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.
@@ -1247,6 +1312,13 @@ GENERATE_DOCSET = NO
DOCSET_FEEDNAME = "Doxygen generated docs"
+# This tag determines the URL of the docset feed. A documentation feed provides
+# an umbrella under which multiple documentation sets from a single provider
+# (such as a company or product suite) can be grouped.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_FEEDURL =
+
# This tag specifies a string that should uniquely identify the documentation
# set bundle. This should be a reverse domain-name style string, e.g.
# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
@@ -1272,8 +1344,12 @@ DOCSET_PUBLISHER_NAME = Publisher
# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
-# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
-# Windows.
+# on Windows. In the beginning of 2021 Microsoft took the original page, with
+# a.o. the download links, offline the HTML help workshop was already many years
+# in maintenance mode). You can download the HTML help workshop from the web
+# archives at Installation executable (see:
+# http://web.archive.org/web/20160201063255/http://download.microsoft.com/downlo
+# ad/0/A/9/0A939EF6-E31C-430F-A3DF-DFAE7960D564/htmlhelp.exe).
#
# The HTML Help Workshop contains a compiler that can convert all HTML output
# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
@@ -1303,7 +1379,7 @@ CHM_FILE =
HHC_LOCATION =
# The GENERATE_CHI flag controls if a separate .chi index file is generated
-# (YES) or that it should be included in the master .chm file (NO).
+# (YES) or that it should be included in the main .chm file (NO).
# The default value is: NO.
# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
@@ -1348,7 +1424,8 @@ QCH_FILE =
# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
# Project output. For more information please see Qt Help Project / Namespace
-# (see: http://doc.qt.io/qt-4.8/qthelpproject.html#namespace).
+# (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace).
# The default value is: org.doxygen.Project.
# This tag requires that the tag GENERATE_QHP is set to YES.
@@ -1356,7 +1433,8 @@ QHP_NAMESPACE = org.doxygen.Project
# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
# Help Project output. For more information please see Qt Help Project / Virtual
-# Folders (see: http://doc.qt.io/qt-4.8/qthelpproject.html#virtual-folders).
+# Folders (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders).
# The default value is: doc.
# This tag requires that the tag GENERATE_QHP is set to YES.
@@ -1364,28 +1442,30 @@ QHP_VIRTUAL_FOLDER = doc
# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
# filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://doc.qt.io/qt-4.8/qthelpproject.html#custom-filters).
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
# This tag requires that the tag GENERATE_QHP is set to YES.
QHP_CUST_FILTER_NAME =
# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
# custom filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://doc.qt.io/qt-4.8/qthelpproject.html#custom-filters).
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
# This tag requires that the tag GENERATE_QHP is set to YES.
QHP_CUST_FILTER_ATTRS =
# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
# project's filter section matches. Qt Help Project / Filter Attributes (see:
-# http://doc.qt.io/qt-4.8/qthelpproject.html#filter-attributes).
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes).
# This tag requires that the tag GENERATE_QHP is set to YES.
QHP_SECT_FILTER_ATTRS =
-# The QHG_LOCATION tag can be used to specify the location of Qt's
-# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
-# generated .qhp file.
+# The QHG_LOCATION tag can be used to specify the location (absolute path
+# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to
+# run qhelpgenerator on the generated .qhp file.
# This tag requires that the tag GENERATE_QHP is set to YES.
QHG_LOCATION =
@@ -1428,16 +1508,28 @@ DISABLE_INDEX = NO
# to work a browser that supports JavaScript, DHTML, CSS and frames is required
# (i.e. any modern browser). Windows users are probably better off using the
# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
-# further fine-tune the look of the index. As an example, the default style
-# sheet generated by doxygen has an example that shows how to put an image at
-# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
-# the same information as the tab index, you could consider setting
-# DISABLE_INDEX to YES when enabling this option.
+# further fine tune the look of the index (see "Fine-tuning the output"). As an
+# example, the default style sheet generated by doxygen has an example that
+# shows how to put an image at the root of the tree instead of the PROJECT_NAME.
+# Since the tree basically has the same information as the tab index, you could
+# consider setting DISABLE_INDEX to YES when enabling this option.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.
GENERATE_TREEVIEW = NO
+# When both GENERATE_TREEVIEW and DISABLE_INDEX are set to YES, then the
+# FULL_SIDEBAR option determines if the side bar is limited to only the treeview
+# area (value NO) or if it should extend to the full height of the window (value
+# YES). Setting this to YES gives a layout similar to
+# https://docs.readthedocs.io with more room for contents, but less room for the
+# project logo, title, and description. If either GENERATE_TREEVIEW or
+# DISABLE_INDEX is set to NO, this option has no effect.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FULL_SIDEBAR = NO
+
# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
# doxygen will group on one line in the generated HTML documentation.
#
@@ -1462,6 +1554,24 @@ TREEVIEW_WIDTH = 250
EXT_LINKS_IN_WINDOW = NO
+# If the OBFUSCATE_EMAILS tag is set to YES, doxygen will obfuscate email
+# addresses.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+OBFUSCATE_EMAILS = YES
+
+# If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg
+# tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see
+# https://inkscape.org) to generate formulas as SVG images instead of PNGs for
+# the HTML output. These images will generally look nicer at scaled resolutions.
+# Possible values are: png (the default) and svg (looks nicer but requires the
+# pdf2svg or inkscape tool).
+# The default value is: png.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FORMULA_FORMAT = png
+
# Use this tag to change the font size of LaTeX formulas included as images in
# the HTML documentation. When you change the font size after a successful
# doxygen run you need to manually remove any form_*.png images from the HTML
@@ -1482,8 +1592,14 @@ FORMULA_FONTSIZE = 10
FORMULA_TRANSPARENT = YES
+# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands
+# to create new LaTeX commands to be used in formulas as building blocks. See
+# the section "Including formulas" for details.
+
+FORMULA_MACROFILE =
+
# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
-# https://www.mathjax.org) which uses client side Javascript for the rendering
+# https://www.mathjax.org) which uses client side JavaScript for the rendering
# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
# installed or if you want to formulas look prettier in the HTML output. When
# enabled you may also need to install MathJax separately and configure the path
@@ -1493,11 +1609,29 @@ FORMULA_TRANSPARENT = YES
USE_MATHJAX = NO
+# With MATHJAX_VERSION it is possible to specify the MathJax version to be used.
+# Note that the different versions of MathJax have different requirements with
+# regards to the different settings, so it is possible that also other MathJax
+# settings have to be changed when switching between the different MathJax
+# versions.
+# Possible values are: MathJax_2 and MathJax_3.
+# The default value is: MathJax_2.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_VERSION = MathJax_2
+
# When MathJax is enabled you can set the default output format to be used for
-# the MathJax output. See the MathJax site (see:
-# http://docs.mathjax.org/en/latest/output.html) for more details.
+# the MathJax output. For more details about the output format see MathJax
+# version 2 (see:
+# http://docs.mathjax.org/en/v2.7-latest/output.html) and MathJax version 3
+# (see:
+# http://docs.mathjax.org/en/latest/web/components/output.html).
# Possible values are: HTML-CSS (which is slower, but has the best
-# compatibility), NativeMML (i.e. MathML) and SVG.
+# compatibility. This is the name for Mathjax version 2, for MathJax version 3
+# this will be translated into chtml), NativeMML (i.e. MathML. Only supported
+# for NathJax 2. For MathJax version 3 chtml will be used instead.), chtml (This
+# is the name for Mathjax version 3, for MathJax version 2 this will be
+# translated into HTML-CSS) and SVG.
# The default value is: HTML-CSS.
# This tag requires that the tag USE_MATHJAX is set to YES.
@@ -1510,22 +1644,29 @@ MATHJAX_FORMAT = HTML-CSS
# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
# Content Delivery Network so you can quickly see the result without installing
# MathJax. However, it is strongly recommended to install a local copy of
-# MathJax from https://www.mathjax.org before deployment.
-# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/.
+# MathJax from https://www.mathjax.org before deployment. The default value is:
+# - in case of MathJax version 2: https://cdn.jsdelivr.net/npm/mathjax@2
+# - in case of MathJax version 3: https://cdn.jsdelivr.net/npm/mathjax@3
# This tag requires that the tag USE_MATHJAX is set to YES.
MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest
# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
# extension names that should be enabled during MathJax rendering. For example
+# for MathJax version 2 (see
+# https://docs.mathjax.org/en/v2.7-latest/tex.html#tex-and-latex-extensions):
# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# For example for MathJax version 3 (see
+# http://docs.mathjax.org/en/latest/input/tex/extensions/index.html):
+# MATHJAX_EXTENSIONS = ams
# This tag requires that the tag USE_MATHJAX is set to YES.
MATHJAX_EXTENSIONS =
# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
# of code that will be used on startup of the MathJax code. See the MathJax site
-# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# (see:
+# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an
# example see the documentation.
# This tag requires that the tag USE_MATHJAX is set to YES.
@@ -1553,7 +1694,7 @@ MATHJAX_CODEFILE =
SEARCHENGINE = YES
# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
-# implemented using a web server instead of a web client using Javascript. There
+# implemented using a web server instead of a web client using JavaScript. There
# are two flavors of web server based searching depending on the EXTERNAL_SEARCH
# setting. When disabled, doxygen will generate a PHP script for searching and
# an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
@@ -1572,7 +1713,8 @@ SERVER_BASED_SEARCH = NO
#
# Doxygen ships with an example indexer (doxyindexer) and search engine
# (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: https://xapian.org/).
+# Xapian (see:
+# https://xapian.org/).
#
# See the section "External Indexing and Searching" for details.
# The default value is: NO.
@@ -1585,8 +1727,9 @@ EXTERNAL_SEARCH = NO
#
# Doxygen ships with an example indexer (doxyindexer) and search engine
# (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: https://xapian.org/). See the section "External Indexing and
-# Searching" for details.
+# Xapian (see:
+# https://xapian.org/). See the section "External Indexing and Searching" for
+# details.
# This tag requires that the tag SEARCHENGINE is set to YES.
SEARCHENGINE_URL =
@@ -1637,21 +1780,35 @@ LATEX_OUTPUT = latex
# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
# invoked.
#
-# Note that when enabling USE_PDFLATEX this option is only used for generating
-# bitmaps for formulas in the HTML output, but not in the Makefile that is
-# written to the output directory.
-# The default file is: latex.
+# Note that when not enabling USE_PDFLATEX the default is latex when enabling
+# USE_PDFLATEX the default is pdflatex and when in the later case latex is
+# chosen this is overwritten by pdflatex. For specific output languages the
+# default can have been set differently, this depends on the implementation of
+# the output language.
# This tag requires that the tag GENERATE_LATEX is set to YES.
LATEX_CMD_NAME = latex
# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
# index for LaTeX.
+# Note: This tag is used in the Makefile / make.bat.
+# See also: LATEX_MAKEINDEX_CMD for the part in the generated output file
+# (.tex).
# The default file is: makeindex.
# This tag requires that the tag GENERATE_LATEX is set to YES.
MAKEINDEX_CMD_NAME = makeindex
+# The LATEX_MAKEINDEX_CMD tag can be used to specify the command name to
+# generate index for LaTeX. In case there is no backslash (\) as first character
+# it will be automatically added in the LaTeX code.
+# Note: This tag is used in the generated output file (.tex).
+# See also: MAKEINDEX_CMD_NAME for the part in the Makefile / make.bat.
+# The default value is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_MAKEINDEX_CMD = makeindex
+
# If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
# documents. This may be useful for small projects and may help to save some
# trees in general.
@@ -1681,29 +1838,31 @@ PAPER_TYPE = a4
EXTRA_PACKAGES = amsmath
-# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
-# generated LaTeX document. The header should contain everything until the first
-# chapter. If it is left blank doxygen will generate a standard header. See
-# section "Doxygen usage" for information on how to let doxygen write the
-# default header to a separate file.
+# The LATEX_HEADER tag can be used to specify a user-defined LaTeX header for
+# the generated LaTeX document. The header should contain everything until the
+# first chapter. If it is left blank doxygen will generate a standard header. It
+# is highly recommended to start with a default header using
+# doxygen -w latex new_header.tex new_footer.tex new_stylesheet.sty
+# and then modify the file new_header.tex. See also section "Doxygen usage" for
+# information on how to generate the default header that doxygen normally uses.
#
-# Note: Only use a user-defined header if you know what you are doing! The
-# following commands have a special meaning inside the header: $title,
-# $datetime, $date, $doxygenversion, $projectname, $projectnumber,
-# $projectbrief, $projectlogo. Doxygen will replace $title with the empty
-# string, for the replacement values of the other commands the user is referred
-# to HTML_HEADER.
+# Note: Only use a user-defined header if you know what you are doing!
+# Note: The header is subject to change so you typically have to regenerate the
+# default header when upgrading to a newer version of doxygen. The following
+# commands have a special meaning inside the header (and footer): For a
+# description of the possible markers and block names see the documentation.
# This tag requires that the tag GENERATE_LATEX is set to YES.
LATEX_HEADER =
-# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
-# generated LaTeX document. The footer should contain everything after the last
-# chapter. If it is left blank doxygen will generate a standard footer. See
+# The LATEX_FOOTER tag can be used to specify a user-defined LaTeX footer for
+# the generated LaTeX document. The footer should contain everything after the
+# last chapter. If it is left blank doxygen will generate a standard footer. See
# LATEX_HEADER for more information on how to generate a default footer and what
-# special commands can be used inside the footer.
-#
-# Note: Only use a user-defined footer if you know what you are doing!
+# special commands can be used inside the footer. See also section "Doxygen
+# usage" for information on how to generate the default footer that doxygen
+# normally uses. Note: Only use a user-defined footer if you know what you are
+# doing!
# This tag requires that the tag GENERATE_LATEX is set to YES.
LATEX_FOOTER =
@@ -1736,9 +1895,11 @@ LATEX_EXTRA_FILES =
PDF_HYPERLINKS = YES
-# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
-# the PDF file directly from the LaTeX files. Set this option to YES, to get a
-# higher quality PDF documentation.
+# If the USE_PDFLATEX tag is set to YES, doxygen will use the engine as
+# specified with LATEX_CMD_NAME to generate the PDF file directly from the LaTeX
+# files. Set this option to YES, to get a higher quality PDF documentation.
+#
+# See also section LATEX_CMD_NAME for selecting the engine.
# The default value is: YES.
# This tag requires that the tag GENERATE_LATEX is set to YES.
@@ -1746,8 +1907,7 @@ USE_PDFLATEX = YES
# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
# command to the generated LaTeX files. This will instruct LaTeX to keep running
-# if errors occur, instead of asking the user for help. This option is also used
-# when generating formulas in HTML.
+# if errors occur, instead of asking the user for help.
# The default value is: NO.
# This tag requires that the tag GENERATE_LATEX is set to YES.
@@ -1760,16 +1920,6 @@ LATEX_BATCHMODE = NO
LATEX_HIDE_INDICES = NO
-# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source
-# code with syntax highlighting in the LaTeX output.
-#
-# Note that which sources are shown also depends on other settings such as
-# SOURCE_BROWSER.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_SOURCE_CODE = NO
-
# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
# bibliography, e.g. plainnat, or ieeetr. See
# https://en.wikipedia.org/wiki/BibTeX and \cite for more info.
@@ -1786,6 +1936,14 @@ LATEX_BIB_STYLE = plain
LATEX_TIMESTAMP = NO
+# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute)
+# path from which the emoji images will be read. If a relative path is entered,
+# it will be relative to the LATEX_OUTPUT directory. If left blank the
+# LATEX_OUTPUT directory will be used.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EMOJI_DIRECTORY =
+
#---------------------------------------------------------------------------
# Configuration options related to the RTF output
#---------------------------------------------------------------------------
@@ -1825,9 +1983,9 @@ COMPACT_RTF = NO
RTF_HYPERLINKS = NO
-# Load stylesheet definitions from file. Syntax is similar to doxygen's config
-# file, i.e. a series of assignments. You only have to provide replacements,
-# missing definitions are set to their default value.
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# configuration file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
#
# See also section "Doxygen usage" for information on how to generate the
# default style sheet that doxygen normally uses.
@@ -1836,22 +1994,12 @@ RTF_HYPERLINKS = NO
RTF_STYLESHEET_FILE =
# Set optional variables used in the generation of an RTF document. Syntax is
-# similar to doxygen's config file. A template extensions file can be generated
-# using doxygen -e rtf extensionFile.
+# similar to doxygen's configuration file. A template extensions file can be
+# generated using doxygen -e rtf extensionFile.
# This tag requires that the tag GENERATE_RTF is set to YES.
RTF_EXTENSIONS_FILE =
-# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code
-# with syntax highlighting in the RTF output.
-#
-# Note that which sources are shown also depends on other settings such as
-# SOURCE_BROWSER.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_SOURCE_CODE = NO
-
#---------------------------------------------------------------------------
# Configuration options related to the man page output
#---------------------------------------------------------------------------
@@ -1923,6 +2071,13 @@ XML_OUTPUT = xml
XML_PROGRAMLISTING = YES
+# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, doxygen will include
+# namespace members in file scope as well, matching the HTML output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_NS_MEMB_FILE_SCOPE = NO
+
#---------------------------------------------------------------------------
# Configuration options related to the DOCBOOK output
#---------------------------------------------------------------------------
@@ -1941,15 +2096,6 @@ GENERATE_DOCBOOK = NO
DOCBOOK_OUTPUT = docbook
-# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the
-# program listings (including syntax highlighting and cross-referencing
-# information) to the DOCBOOK output. Note that enabling this will significantly
-# increase the size of the DOCBOOK output.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
-
-DOCBOOK_PROGRAMLISTING = NO
-
#---------------------------------------------------------------------------
# Configuration options for the AutoGen Definitions output
#---------------------------------------------------------------------------
@@ -2124,34 +2270,10 @@ EXTERNAL_GROUPS = YES
EXTERNAL_PAGES = YES
-# The PERL_PATH should be the absolute path and name of the perl script
-# interpreter (i.e. the result of 'which perl').
-# The default file (with absolute path) is: /usr/bin/perl.
-
-PERL_PATH = /usr/bin/perl
-
#---------------------------------------------------------------------------
# Configuration options related to the dot tool
#---------------------------------------------------------------------------
-# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram
-# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
-# NO turns the diagrams off. Note that this option also works with HAVE_DOT
-# disabled, but it is recommended to install and use dot, since it yields more
-# powerful graphs.
-# The default value is: YES.
-
-CLASS_DIAGRAMS = YES
-
-# You can define message sequence charts within doxygen comments using the \msc
-# command. Doxygen will then run the mscgen tool (see:
-# http://www.mcternan.me.uk/mscgen/)) to produce the chart and insert it in the
-# documentation. The MSCGEN_PATH tag allows you to specify the directory where
-# the mscgen tool resides. If left empty the tool is assumed to be found in the
-# default search path.
-
-MSCGEN_PATH =
-
# You can include diagrams made with dia in doxygen documentation. Doxygen will
# then run dia to produce the diagram and insert it in the documentation. The
# DIA_PATH tag allows you to specify the directory where the dia binary resides.
@@ -2208,11 +2330,14 @@ DOT_FONTSIZE = 10
DOT_FONTPATH =
-# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
-# each documented class showing the direct and indirect inheritance relations.
-# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO.
+# If the CLASS_GRAPH tag is set to YES (or GRAPH) then doxygen will generate a
+# graph for each documented class showing the direct and indirect inheritance
+# relations. In case HAVE_DOT is set as well dot will be used to draw the graph,
+# otherwise the built-in generator will be used. If the CLASS_GRAPH tag is set
+# to TEXT the direct and indirect inheritance relations will be shown as texts /
+# links.
+# Possible values are: NO, YES, TEXT and GRAPH.
# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
CLASS_GRAPH = YES
@@ -2249,10 +2374,32 @@ UML_LOOK = NO
# but if the number exceeds 15, the total amount of fields shown is limited to
# 10.
# Minimum value: 0, maximum value: 100, default value: 10.
-# This tag requires that the tag HAVE_DOT is set to YES.
+# This tag requires that the tag UML_LOOK is set to YES.
UML_LIMIT_NUM_FIELDS = 10
+# If the DOT_UML_DETAILS tag is set to NO, doxygen will show attributes and
+# methods without types and arguments in the UML graphs. If the DOT_UML_DETAILS
+# tag is set to YES, doxygen will add type and arguments for attributes and
+# methods in the UML graphs. If the DOT_UML_DETAILS tag is set to NONE, doxygen
+# will not generate fields with class member information in the UML graphs. The
+# class diagrams will look similar to the default class diagrams but using UML
+# notation for the relationships.
+# Possible values are: NO, YES and NONE.
+# The default value is: NO.
+# This tag requires that the tag UML_LOOK is set to YES.
+
+DOT_UML_DETAILS = NO
+
+# The DOT_WRAP_THRESHOLD tag can be used to set the maximum number of characters
+# to display on a single line. If the actual line length exceeds this threshold
+# significantly it will be wrapped across multiple lines. Some heuristics are
+# applied to avoid ugly line breaks.
+# Minimum value: 0, maximum value: 1000, default value: 17.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_WRAP_THRESHOLD = 17
+
# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
# collaboration graphs will show the relations between templates and their
# instances.
@@ -2319,6 +2466,13 @@ GRAPHICAL_HIERARCHY = YES
DIRECTORY_GRAPH = YES
+# The DIR_GRAPH_MAX_DEPTH tag can be used to limit the maximum number of levels
+# of child directories generated in directory dependency graphs by dot.
+# Minimum value: 1, maximum value: 25, default value: 1.
+# This tag requires that the tag DIRECTORY_GRAPH is set to YES.
+
+DIR_GRAPH_MAX_DEPTH = 1
+
# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
# generated by dot. For an explanation of the image formats see the section
# output formats in the documentation of the dot tool (Graphviz (see:
@@ -2372,10 +2526,10 @@ MSCFILE_DIRS =
DIAFILE_DIRS =
# When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
-# path where java can find the plantuml.jar file. If left blank, it is assumed
-# PlantUML is not used or called during a preprocessing step. Doxygen will
-# generate a warning when it encounters a \startuml command in this case and
-# will not generate output for the diagram.
+# path where java can find the plantuml.jar file or to the filename of the jar
+# file to be used. If left blank, it is assumed PlantUML is not used or called
+# during a preprocessing step. Doxygen will generate a warning when it
+# encounters a \startuml command in this case and will not generate output for
+# the diagram.
PLANTUML_JAR_PATH =
@@ -2437,14 +2591,18 @@ DOT_MULTI_TARGETS = YES
# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
# explaining the meaning of the various boxes and arrows in the dot generated
# graphs.
+# Note: This tag requires that UML_LOOK isn't set, i.e. the doxygen internal
+# graphical representation for inheritance and collaboration diagrams is used.
# The default value is: YES.
# This tag requires that the tag HAVE_DOT is set to YES.
GENERATE_LEGEND = YES
-# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot
+# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate
# files that are used to generate the various graphs.
+#
+# Note: This setting is not only used for dot files but also for msc temporary
+# files.
# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
DOT_CLEANUP = YES
diff --git a/docs/user.conf b/docs/user.conf
new file mode 100644
index 000000000..c39f53a38
--- /dev/null
+++ b/docs/user.conf
@@ -0,0 +1,2634 @@
+# Doxyfile 1.9.3
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a double hash (##) is considered a comment and is placed in
+# front of the TAG it is preceding.
+#
+# All text after a single hash (#) is considered a comment and will be ignored.
+# The format is:
+# TAG = value [value, ...]
+# For lists, items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (\" \").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the configuration
+# file that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# https://www.gnu.org/software/libiconv/ for the list of possible encodings.
+# The default value is: UTF-8.
+
+DOXYFILE_ENCODING = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by
+# double-quotes, unless you are using Doxywizard) that should identify the
+# project for which the documentation is generated. This name is used in the
+# title of most generated pages and in a few other places.
+# The default value is: My Project.
+
+PROJECT_NAME = "ALP User Documentation"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
+# could be handy for archiving the generated documentation or if some version
+# control system is used.
+
+PROJECT_NUMBER = 0.7.0
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give viewer a
+# quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF = "Algebraic Programming User Documentation"
+
+# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
+# in the documentation. The maximum height of the logo should not exceed 55
+# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy
+# the logo to the output directory.
+
+PROJECT_LOGO =
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
+# into which the generated documentation will be written. If a relative path is
+# entered, it will be relative to the location where doxygen was started. If
+# left blank the current directory will be used.
+
+OUTPUT_DIRECTORY = docs/user
+
+# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
+# directories (in 2 levels) under the output directory of each output format and
+# will distribute the generated files over these directories. Enabling this
+# option can be useful when feeding doxygen a huge amount of source files, where
+# putting all generated files in the same directory would otherwise cause
+# performance problems for the file system.
+# The default value is: NO.
+
+CREATE_SUBDIRS = NO
+
+# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII
+# characters to appear in the names of generated files. If set to NO, non-ASCII
+# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode
+# U+3044.
+# The default value is: NO.
+
+ALLOW_UNICODE_NAMES = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese,
+# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States),
+# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian,
+# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages),
+# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian,
+# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian,
+# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish,
+# Ukrainian and Vietnamese.
+# The default value is: English.
+
+OUTPUT_LANGUAGE = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
+# descriptions after the members that are listed in the file and class
+# documentation (similar to Javadoc). Set to NO to disable this.
+# The default value is: YES.
+
+BRIEF_MEMBER_DESC = YES
+
+# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief
+# description of a member or function before the detailed description
+#
+# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+# The default value is: YES.
+
+REPEAT_BRIEF = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator that is
+# used to form the text in various listings. Each string in this list, if found
+# as the leading text of the brief description, will be stripped from the text
+# and the result, after processing the whole list, is used as the annotated
+# text. Otherwise, the brief description is used as-is. If left blank, the
+# following values are used ($name is automatically replaced with the name of
+# the entity):The $name class, The $name widget, The $name file, is, provides,
+# specifies, contains, represents, a, an and the.
+
+ABBREVIATE_BRIEF =
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# doxygen will generate a detailed section even if there is only a brief
+# description.
+# The default value is: NO.
+
+ALWAYS_DETAILED_SEC = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+# The default value is: NO.
+
+INLINE_INHERITED_MEMB = NO
+
+# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path
+# before files name in the file list and in the header files. If set to NO the
+# shortest path that makes the file name unique will be used
+# The default value is: YES.
+
+FULL_PATH_NAMES = NO
+
+# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
+# Stripping is only done if one of the specified strings matches the left-hand
+# part of the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the path to
+# strip.
+#
+# Note that you can specify absolute paths here, but also relative paths, which
+# will be relative from the directory where doxygen is started.
+# This tag requires that the tag FULL_PATH_NAMES is set to YES.
+
+STRIP_FROM_PATH =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
+# path mentioned in the documentation of a class, which tells the reader which
+# header file to include in order to use a class. If left blank only the name of
+# the header file containing the class definition is used. Otherwise one should
+# specify the list of include paths that are normally passed to the compiler
+# using the -I flag.
+
+STRIP_FROM_INC_PATH =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
+# less readable) file names. This can be useful if your file system doesn't
+# support long names like on DOS, Mac, or CD-ROM.
+# The default value is: NO.
+
+SHORT_NAMES = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the
+# first line (until the first dot) of a Javadoc-style comment as the brief
+# description. If set to NO, the Javadoc-style will behave just like regular Qt-
+# style comments (thus requiring an explicit @brief command for a brief
+# description.)
+# The default value is: NO.
+
+JAVADOC_AUTOBRIEF = YES
+
+# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line
+# such as
+# /***************
+# as being the beginning of a Javadoc-style comment "banner". If set to NO, the
+# Javadoc-style will behave just like regular comments and it will not be
+# interpreted by doxygen.
+# The default value is: NO.
+
+JAVADOC_BANNER = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
+# line (until the first dot) of a Qt-style comment as the brief description. If
+# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
+# requiring an explicit \brief command for a brief description.)
+# The default value is: NO.
+
+QT_AUTOBRIEF = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a
+# multi-line C++ special comment block (i.e. a block of //! or /// comments) as
+# a brief description. This used to be the default behavior. The new default is
+# to treat a multi-line C++ comment block as a detailed description. Set this
+# tag to YES if you prefer the old behavior instead.
+#
+# Note that setting this tag to YES also means that Rational Rose comments are
+# no longer recognized.
+# The default value is: NO.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# By default Python docstrings are displayed as preformatted text and doxygen's
+# special commands cannot be used. By setting PYTHON_DOCSTRING to NO, doxygen's
+# special commands can be used and the contents of the docstring documentation
+# blocks are shown as doxygen documentation.
+# The default value is: YES.
+
+PYTHON_DOCSTRING = YES
+
+# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
+# documentation from any documented member that it re-implements.
+# The default value is: YES.
+
+INHERIT_DOCS = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new
+# page for each member. If set to NO, the documentation of a member will be part
+# of the file/class/namespace that contains it.
+# The default value is: NO.
+
+SEPARATE_MEMBER_PAGES = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen
+# uses this value to replace tabs by spaces in code fragments.
+# Minimum value: 1, maximum value: 16, default value: 4.
+
+TAB_SIZE = 4
+
+# This tag can be used to specify a number of aliases that act as commands in
+# the documentation. An alias has the form:
+# name=value
+# For example adding
+# "sideeffect=@par Side Effects:^^"
+# will allow you to put the command \sideeffect (or @sideeffect) in the
+# documentation, which will result in a user-defined paragraph with heading
+# "Side Effects:". Note that you cannot put \n's in the value part of an alias
+# to insert newlines (in the resulting output). You can put ^^ in the value part
+# of an alias to insert a newline as if a physical newline was in the original
+# file. When you need a literal { or } or , in the value part of an alias you
+# have to escape them by means of a backslash (\), this can lead to conflicts
+# with the commands \{ and \} for these it is advised to use the version @{ and
+# @} or use a double escape (\\{ and \\})
+
+ALIASES =
+
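+# For illustration only (this project defines no aliases), the sideeffect alias
+# described in the comment above would be declared as:
+#
+#   ALIASES += "sideeffect=@par Side Effects:^^"
+#
+# after which \sideeffect (or @sideeffect) in a doc comment renders a paragraph
+# with the heading "Side Effects:".
+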
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
+# only. Doxygen will then generate output that is more tailored for C. For
+# instance, some of the names that are used will be different. The list of all
+# members will be omitted, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_FOR_C = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or
+# Python sources only. Doxygen will then generate output that is more tailored
+# for that language. For instance, namespaces will be presented as packages,
+# qualified scopes will look different, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_JAVA = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources. Doxygen will then generate output that is tailored for Fortran.
+# The default value is: NO.
+
+OPTIMIZE_FOR_FORTRAN = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for VHDL.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_VHDL = NO
+
+# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice
+# sources only. Doxygen will then generate output that is more tailored for that
+# language. For instance, namespaces will be presented as modules, types will be
+# separated into more groups, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_SLICE = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given
+# extension. Doxygen has a built-in mapping, but you can override or extend it
+# using this tag. The format is ext=language, where ext is a file extension, and
+# language is one of the parsers supported by doxygen: IDL, Java, JavaScript,
+# Csharp (C#), C, C++, Lex, D, PHP, md (Markdown), Objective-C, Python, Slice,
+# VHDL, Fortran (fixed format Fortran: FortranFixed, free formatted Fortran:
+# FortranFree, unknown formatted Fortran: Fortran. In the latter case the parser
+# tries to guess whether the code is fixed or free formatted code, this is the
+# default for Fortran type files). For instance to make doxygen treat .inc files
+# as Fortran files (default is PHP), and .f files as C (default is Fortran),
+# use: inc=Fortran f=C.
+#
+# Note: For files without extension you can use no_extension as a placeholder.
+#
+# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
+# the files are not read by doxygen. When specifying no_extension you should add
+# * to the FILE_PATTERNS.
+#
+# Note see also the list of default file extension mappings.
+
+EXTENSION_MAPPING =
+
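+# For illustration, the mapping mentioned in the comment above would be written
+# as:
+#
+#   EXTENSION_MAPPING = inc=Fortran f=C
+#
+# making doxygen parse .inc files as Fortran and .f files as C.
+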
+# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
+# according to the Markdown format, which allows for more readable
+# documentation. See https://daringfireball.net/projects/markdown/ for details.
+# The output of markdown processing is further processed by doxygen, so you can
+# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
+# case of backward compatibility issues.
+# The default value is: YES.
+
+MARKDOWN_SUPPORT = YES
+
+# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up
+# to that level are automatically included in the table of contents, even if
+# they do not have an id attribute.
+# Note: This feature currently applies only to Markdown headings.
+# Minimum value: 0, maximum value: 99, default value: 5.
+# This tag requires that the tag MARKDOWN_SUPPORT is set to YES.
+
+TOC_INCLUDE_HEADINGS = 0
+
+# When enabled doxygen tries to link words that correspond to documented
+# classes, or namespaces to their corresponding documentation. Such a link can
+# be prevented in individual cases by putting a % sign in front of the word or
+# globally by setting AUTOLINK_SUPPORT to NO.
+# The default value is: YES.
+
+AUTOLINK_SUPPORT = YES
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should set this
+# tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string);
+# versus func(std::string) {}). This also makes the inheritance and
+# collaboration diagrams that involve STL classes more complete and accurate.
+# The default value is: NO.
+
+BUILTIN_STL_SUPPORT = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+# The default value is: NO.
+
+CPP_CLI_SUPPORT = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
+# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen
+# will parse them like normal C++ but will assume all classes use public instead
+# of private inheritance when no explicit protection keyword is present.
+# The default value is: NO.
+
+SIP_SUPPORT = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate
+# getter and setter methods for a property. Setting this option to YES will make
+# doxygen replace the get and set methods by a property in the documentation.
+# This will only work if the methods are indeed getting or setting a simple
+# type. If this is not the case, or you want to show the methods anyway, you
+# should set this option to NO.
+# The default value is: YES.
+
+IDL_PROPERTY_SUPPORT = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+# The default value is: NO.
+
+DISTRIBUTE_GROUP_DOC = NO
+
+# If one adds a struct or class to a group and this option is enabled, then also
+# any nested class or struct is added to the same group. By default this option
+# is disabled and one has to add nested compounds explicitly via \ingroup.
+# The default value is: NO.
+
+GROUP_NESTED_COMPOUNDS = NO
+
+# Set the SUBGROUPING tag to YES to allow class member groups of the same type
+# (for instance a group of public functions) to be put as a subgroup of that
+# type (e.g. under the Public Functions section). Set it to NO to prevent
+# subgrouping. Alternatively, this can be done per class using the
+# \nosubgrouping command.
+# The default value is: YES.
+
+SUBGROUPING = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
+# are shown inside the group in which they are included (e.g. using \ingroup)
+# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
+# and RTF).
+#
+# Note that this feature does not work in combination with
+# SEPARATE_MEMBER_PAGES.
+# The default value is: NO.
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions
+# with only public data fields or simple typedef fields will be shown inline in
+# the documentation of the scope in which they are defined (i.e. file,
+# namespace, or group documentation), provided this scope is documented. If set
+# to NO, structs, classes, and unions are shown on a separate page (for HTML and
+# Man pages) or section (for LaTeX and RTF).
+# The default value is: NO.
+
+INLINE_SIMPLE_STRUCTS = NO
+
+# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
+# enum is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically be
+# useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+# The default value is: NO.
+
+TYPEDEF_HIDES_STRUCT = NO
+
+# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
+# cache is used to resolve symbols given their name and scope. Since this can be
+# an expensive process and often the same symbol appears multiple times in the
+# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
+# doxygen will become slower. If the cache is too large, memory is wasted. The
+# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
+# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
+# symbols. At the end of a run doxygen will report the cache usage and suggest
+# the optimal cache size from a speed point of view.
+# Minimum value: 0, maximum value: 9, default value: 0.
+
+LOOKUP_CACHE_SIZE = 0
+
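+# As a worked example of the formula above: a value of 3 gives a cache of
+# 2^(16+3) = 2^19 = 524288 symbols, eight times the default of 2^16 = 65536.
+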
+# The NUM_PROC_THREADS tag specifies the number of threads doxygen is allowed to
+# use during processing. When set to 0 doxygen will base this on the number of
+# cores available in the system. You can set it explicitly to a value larger
+# than 0 to get more control over the balance between CPU load and processing
+# speed. At this moment only the input processing can be done using multiple
+# threads. Since this is still an experimental feature the default is set to 1,
+# which effectively disables parallel processing. Please report any issues you
+# encounter. Generating dot graphs in parallel is controlled by the
+# DOT_NUM_THREADS setting.
+# Minimum value: 0, maximum value: 32, default value: 1.
+
+NUM_PROC_THREADS = 1
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in
+# documentation are documented, even if no documentation was available. Private
+# class members and static file members will be hidden unless the
+# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.
+# Note: This will also disable the warnings about undocumented members that are
+# normally produced when WARNINGS is set to YES.
+# The default value is: NO.
+
+EXTRACT_ALL = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will
+# be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIVATE = NO
+
+# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual
+# methods of a class will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIV_VIRTUAL = NO
+
+# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
+# scope will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PACKAGE = NO
+
+# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be
+# included in the documentation.
+# The default value is: NO.
+
+EXTRACT_STATIC = YES
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined
+# locally in source files will be included in the documentation. If set to NO,
+# only classes defined in header files are included. Does not have any effect
+# for Java sources.
+# The default value is: YES.
+
+EXTRACT_LOCAL_CLASSES = YES
+
+# This flag is only useful for Objective-C code. If set to YES, local methods,
+# which are defined in the implementation section but not in the interface are
+# included in the documentation. If set to NO, only methods in the interface are
+# included.
+# The default value is: NO.
+
+EXTRACT_LOCAL_METHODS = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base name of
+# the file that contains the anonymous namespace. By default anonymous namespace
+# are hidden.
+# The default value is: NO.
+
+EXTRACT_ANON_NSPACES = NO
+
+# If this flag is set to YES, the name of an unnamed parameter in a declaration
+# will be determined by the corresponding definition. By default unnamed
+# parameters remain unnamed in the output.
+# The default value is: YES.
+
+RESOLVE_UNNAMED_PARAMS = YES
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
+# undocumented members inside documented classes or files. If set to NO these
+# members will be included in the various overviews, but no documentation
+# section is generated. This option has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_MEMBERS = YES
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy. If set
+# to NO, these classes will be included in the various overviews. This option
+# has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_CLASSES = YES
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
+# declarations. If set to NO, these declarations will be included in the
+# documentation.
+# The default value is: NO.
+
+HIDE_FRIEND_COMPOUNDS = YES
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
+# documentation blocks found inside the body of a function. If set to NO, these
+# blocks will be appended to the function's detailed documentation block.
+# The default value is: NO.
+
+HIDE_IN_BODY_DOCS = NO
+
+# The INTERNAL_DOCS tag determines if documentation that is typed after a
+# \internal command is included. If the tag is set to NO then the documentation
+# will be excluded. Set it to YES to include the internal documentation.
+# The default value is: NO.
+
+INTERNAL_DOCS = NO
+
+# With the correct setting of the CASE_SENSE_NAMES option doxygen will be better
+# able to match the capabilities of the underlying filesystem. In case the
+# filesystem is case sensitive (i.e. it supports files in the same directory
+# whose names only differ in casing), the option must be set to YES to properly
+# deal with such files in case they appear in the input. For filesystems that
+# are not case sensitive the option should be set to NO to properly deal with
+# output files written for symbols that only differ in casing, such as for two
+# classes, one named CLASS and the other named Class, and to also support
+# references to files without having to specify the exact matching casing. On
+# Windows (including Cygwin) and MacOS, users should typically set this option
+# to NO, whereas on Linux or other Unix flavors it should typically be set to
+# YES.
+# The default value is: system dependent.
+
+CASE_SENSE_NAMES = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
+# their full class and namespace scopes in the documentation. If set to YES, the
+# scope will be hidden.
+# The default value is: NO.
+
+HIDE_SCOPE_NAMES = YES
+
+# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will
+# append additional text to a page's title, such as Class Reference. If set to
+# YES the compound reference will be hidden.
+# The default value is: NO.
+
+HIDE_COMPOUND_REFERENCE= NO
+
+# If the SHOW_HEADERFILE tag is set to YES then the documentation for a class
+# will show which file needs to be included to use the class.
+# The default value is: YES.
+
+SHOW_HEADERFILE = YES
+
+# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
+# the files that are included by a file in the documentation of that file.
+# The default value is: YES.
+
+SHOW_INCLUDE_FILES = NO
+
+# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each
+# grouped member an include statement to the documentation, telling the reader
+# which file to include in order to use the member.
+# The default value is: NO.
+
+SHOW_GROUPED_MEMB_INC = NO
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
+# files with double quotes in the documentation rather than with sharp brackets.
+# The default value is: NO.
+
+FORCE_LOCAL_INCLUDES = NO
+
+# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the
+# documentation for inline members.
+# The default value is: YES.
+
+INLINE_INFO = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
+# (detailed) documentation of file and class members alphabetically by member
+# name. If set to NO, the members will appear in declaration order.
+# The default value is: YES.
+
+SORT_MEMBER_DOCS = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
+# descriptions of file, namespace and class members alphabetically by member
+# name. If set to NO, the members will appear in declaration order. Note that
+# this will also influence the order of the classes in the class list.
+# The default value is: NO.
+
+SORT_BRIEF_DOCS = YES
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the
+# (brief and detailed) documentation of class members so that constructors and
+# destructors are listed first. If set to NO the constructors will appear in the
+# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS.
+# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief
+# member documentation.
+# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting
+# detailed member documentation.
+# The default value is: NO.
+
+SORT_MEMBERS_CTORS_1ST = YES
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy
+# of group names into alphabetical order. If set to NO the group names will
+# appear in their defined order.
+# The default value is: NO.
+
+SORT_GROUP_NAMES = YES
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by
+# fully-qualified names, including namespaces. If set to NO, the class list will
+# be sorted only by class name, not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the alphabetical
+# list.
+# The default value is: NO.
+
+SORT_BY_SCOPE_NAME = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper
+# type resolution of all parameters of a function it will reject a match between
+# the prototype and the implementation of a member function even if there is
+# only one candidate or it is obvious which candidate to choose by doing a
+# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still
+# accept a match between prototype and implementation in such cases.
+# The default value is: NO.
+
+STRICT_PROTO_MATCHING = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo
+# list. This list is created by putting \todo commands in the documentation.
+# The default value is: YES.
+
+GENERATE_TODOLIST = NO
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test
+# list. This list is created by putting \test commands in the documentation.
+# The default value is: YES.
+
+GENERATE_TESTLIST = NO
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug
+# list. This list is created by putting \bug commands in the documentation.
+# The default value is: YES.
+
+GENERATE_BUGLIST = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO)
+# the deprecated list. This list is created by putting \deprecated commands in
+# the documentation.
+# The default value is: YES.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional documentation
+# sections, marked by \if ... \endif and \cond
+# ... \endcond blocks.
+
+ENABLED_SECTIONS =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
+# initial value of a variable or macro / define can have for it to appear in the
+# documentation. If the initializer consists of more lines than specified here
+# it will be hidden. Use a value of 0 to hide initializers completely. The
+# appearance of the value of individual variables and macros / defines can be
+# controlled using \showinitializer or \hideinitializer command in the
+# documentation regardless of this setting.
+# Minimum value: 0, maximum value: 10000, default value: 30.
+
+MAX_INITIALIZER_LINES = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
+# the bottom of the documentation of classes and structs. If set to YES, the
+# list will mention the files that were used to generate the documentation.
+# The default value is: YES.
+
+SHOW_USED_FILES = YES
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
+# will remove the Files entry from the Quick Index and from the Folder Tree View
+# (if specified).
+# The default value is: YES.
+
+SHOW_FILES = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
+# page. This will remove the Namespaces entry from the Quick Index and from the
+# Folder Tree View (if specified).
+# The default value is: YES.
+
+SHOW_NAMESPACES = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output is used as
+# the file version. For an example see the documentation.
+
+FILE_VERSION_FILTER =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option. You can
+# optionally specify a file name after the option, if omitted DoxygenLayout.xml
+# will be used as the name of the layout file. See also section "Changing the
+# layout of pages" for information.
+#
+# Note that if you run doxygen from a directory containing a file called
+# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
+# tag is left empty.
+
+LAYOUT_FILE =
+
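+# For illustration, a default layout file can be produced with the -l option
+# mentioned above and then referenced (the name alp_layout.xml is a
+# hypothetical example, not a file in this repository):
+#
+#   doxygen -l alp_layout.xml
+#   LAYOUT_FILE = alp_layout.xml
+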
+# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
+# the reference definitions. This must be a list of .bib files. The .bib
+# extension is automatically appended if omitted. This requires the bibtex tool
+# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info.
+# For LaTeX the style of the bibliography can be controlled using
+# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
+# search path. See also \cite for info how to create references.
+
+CITE_BIB_FILES =
+
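+# For illustration (docs/alp_refs.bib is a hypothetical path, not a file that
+# exists in this repository), bibliography files would be listed as:
+#
+#   CITE_BIB_FILES = docs/alp_refs.bib
+#   LATEX_BIB_STYLE = plain
+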
+#---------------------------------------------------------------------------
+# Configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated to
+# standard output by doxygen. If QUIET is set to YES this implies that the
+# messages are off.
+# The default value is: NO.
+
+QUIET = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES
+# this implies that the warnings are on.
+#
+# Tip: Turn warnings on while writing the documentation.
+# The default value is: YES.
+
+WARNINGS = YES
+
+# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate
+# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
+# will automatically be disabled.
+# The default value is: YES.
+
+WARN_IF_UNDOCUMENTED = YES
+
+# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as documenting some parameters in
+# a documented function twice, or documenting parameters that don't exist or
+# using markup commands wrongly.
+# The default value is: YES.
+
+WARN_IF_DOC_ERROR = YES
+
+# If WARN_IF_INCOMPLETE_DOC is set to YES, doxygen will warn about incomplete
+# function parameter documentation. If set to NO, doxygen will accept that some
+# parameters have no documentation without warning.
+# The default value is: YES.
+
+WARN_IF_INCOMPLETE_DOC = YES
+
+# The WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
+# are documented, but have no documentation for their parameters or return
+# value. If set to NO, doxygen will only warn about wrong parameter
+# documentation, but not about the absence of documentation. If EXTRACT_ALL is
+# set to YES then this flag will automatically be disabled. See also
+# WARN_IF_INCOMPLETE_DOC.
+# The default value is: NO.
+
+WARN_NO_PARAMDOC = NO
+
+# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when
+# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS
+# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but
+# at the end of the doxygen process doxygen will return with a non-zero status.
+# Possible values are: NO, YES and FAIL_ON_WARNINGS.
+# The default value is: NO.
+
+WARN_AS_ERROR = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that doxygen
+# can produce. The string should contain the $file, $line, and $text tags, which
+# will be replaced by the file and line number from which the warning originated
+# and the warning text. Optionally the format may contain $version, which will
+# be replaced by the version of the file (if it could be obtained via
+# FILE_VERSION_FILTER)
+# The default value is: $file:$line: $text.
+
+WARN_FORMAT = "$file:$line: $text"
+
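+# For illustration, a format that also prints the file version obtained via
+# FILE_VERSION_FILTER would look like:
+#
+#   WARN_FORMAT = "$file:$line ($version): $text"
+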
+# The WARN_LOGFILE tag can be used to specify a file to which warning and error
+# messages should be written. If left blank the output is written to standard
+# error (stderr). In case the file specified cannot be opened for writing the
+# warning and error messages are written to standard error. When as file - is
+# specified the warning and error messages are written to standard output
+# (stdout).
+
+WARN_LOGFILE =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag is used to specify the files and/or directories that contain
+# documented source files. You may enter file names like myfile.cpp or
+# directories like /usr/src/myproject. Separate the files or directories with
+# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
+# Note: If this tag is empty the current directory is searched.
+
+INPUT = include/graphblas.hpp \
+ include/graphblas/base \
+ include/graphblas/algorithms \
+ include/graphblas/interfaces \
+ include/transition \
+ include/graphblas/descriptors.hpp \
+ include/graphblas/semiring.hpp \
+ include/graphblas/monoid.hpp \
+ include/graphblas/iomode.hpp \
+ include/graphblas/ops.hpp \
+ include/graphblas/descriptors.hpp \
+ include/graphblas/rc.hpp \
+ include/graphblas/reference/config.hpp \
+ include/graphblas/nonblocking/config.hpp \
+ include/graphblas/bsp1d/config.hpp \
+ include/graphblas/identities.hpp \
+ include/graphblas/phase.hpp \
+ include/graphblas/type_traits.hpp \
+ include/graphblas/backends.hpp \
+ include/graphblas/blas0.hpp #\
+# include/graphblas/utils \
+# include/graphblas/utils.hpp
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
+# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
+# documentation (see:
+# https://www.gnu.org/software/libiconv/) for the list of possible encodings.
+# The default value is: UTF-8.
+
+INPUT_ENCODING = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
+# *.h) to filter out the source-files in the directories.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# read by doxygen.
+#
+# Note the list of default checked file patterns might differ from the list of
+# default file extension mappings.
+#
+# If left blank the following patterns are tested: *.c, *.cc, *.cxx, *.cpp,
+# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
+# *.hh, *.hxx, *.hpp, *.h++, *.l, *.cs, *.d, *.php, *.php4, *.php5, *.phtml,
+# *.inc, *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C
+# comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd,
+# *.vhdl, *.ucf, *.qsf and *.ice.
+
+FILE_PATTERNS = *.hpp \
+ *.cpp \
+ *.h \
+ *.c
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories should
+# be searched for input files as well.
+# The default value is: NO.
+
+RECURSIVE = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+#
+# Note that relative paths are relative to the directory from which doxygen is
+# run.
+
+EXCLUDE = include/graphblas/base/alloc.hpp \
+ include/graphblas/base/coordinates.hpp \
+ include/graphblas/base/distribution.hpp \
+ include/graphblas/base/internalops.hpp \
+ include/graphblas/algorithms/hpcg #\
+# include/graphblas/base/init.hpp
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+# The default value is: NO.
+
+EXCLUDE_SYMLINKS = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories.
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories for example use the pattern */test/*
+
+EXCLUDE_PATTERNS =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# ANamespace::AClass, ANamespace::*Test
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories use the pattern */test/*
+
+EXCLUDE_SYMBOLS = internal
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or directories
+# that contain example code fragments that are included (see the \include
+# command).
+
+EXAMPLE_PATH = examples/sp.cpp \
+ include/graphblas/ops.hpp \
+ include/graphblas/internalops.hpp
+
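+# For illustration, with examples/sp.cpp listed above a doc comment can embed
+# the full example program via the \include command:
+#
+#   /** \include sp.cpp */
+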
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
+# *.h) to filter out the source-files in the directories. If left blank all
+# files are included.
+
+EXAMPLE_PATTERNS =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude commands
+# irrespective of the value of the RECURSIVE tag.
+# The default value is: NO.
+
+EXAMPLE_RECURSIVE = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or directories
+# that contain images that are to be included in the documentation (see the
+# \image command).
+
+IMAGE_PATH =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command:
+#
+#   <filter> <input-file>
+#
+# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
+# name of an input file. Doxygen will then use the output that the filter
+# program writes to standard output. If FILTER_PATTERNS is specified, this tag
+# will be ignored.
+#
+# Note that the filter must not add or remove lines; it is applied before the
+# code is scanned, but not when the output code is generated. If lines are added
+# or removed, the anchors will not be placed correctly.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
+
+INPUT_FILTER =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form: pattern=filter
+# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
+# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
+# patterns match the file name, INPUT_FILTER is applied.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
+
+FILTER_PATTERNS =
+
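+# For illustration (my_cpp_filter is the hypothetical filter program from the
+# comment above; no filter is used in this configuration), a per-pattern filter
+# would be written as:
+#
+#   FILTER_PATTERNS = *.cpp=my_cpp_filter
+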
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will also be used to filter the input files that are used for
+# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
+# The default value is: NO.
+
+FILTER_SOURCE_FILES = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and
+# it is also possible to disable source filtering for a specific pattern using
+# *.ext= (so without naming a filter).
+# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
+
+FILTER_SOURCE_PATTERNS =
+
+# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
+# is part of the input, its contents will be placed on the main page
+# (index.html). This can be useful if you have a project on for instance GitHub
+# and want to reuse the introduction page also for the doxygen output.
+
+USE_MDFILE_AS_MAINPAGE =
+
+#---------------------------------------------------------------------------
+# Configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
+# generated. Documented entities will be cross-referenced with these sources.
+#
+# Note: To get rid of all source code in the generated output, make sure that
+# also VERBATIM_HEADERS is set to NO.
+# The default value is: NO.
+
+SOURCE_BROWSER = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body of functions,
+# classes and enums directly into the documentation.
+# The default value is: NO.
+
+INLINE_SOURCES = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
+# special comment blocks from generated source code fragments. Normal C, C++ and
+# Fortran comments will always remain visible.
+# The default value is: YES.
+
+STRIP_CODE_COMMENTS = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
+# entity all documented functions referencing it will be listed.
+# The default value is: NO.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES then for each documented function
+# all documented entities called/used by that function will be listed.
+# The default value is: NO.
+
+REFERENCES_RELATION = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
+# to YES then the hyperlinks from functions in REFERENCES_RELATION and
+# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
+# link to the documentation.
+# The default value is: YES.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
+# source code will show a tooltip with additional information such as prototype,
+# brief description and links to the definition and documentation. Since this
+# will make the HTML file larger and loading of large files a bit slower, you
+# can opt to disable this feature.
+# The default value is: YES.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+SOURCE_TOOLTIPS = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code will
+# point to the HTML generated by the htags(1) tool instead of doxygen built-in
+# source browser. The htags tool is part of GNU's global source tagging system
+# (see https://www.gnu.org/software/global/global.html). You will need version
+# 4.8.6 or higher.
+#
+# To use it do the following:
+# - Install the latest version of global
+# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file
+# - Make sure the INPUT points to the root of the source tree
+# - Run doxygen as normal
+#
+# Doxygen will invoke htags (and that will in turn invoke gtags), so these
+# tools must be available from the command line (i.e. in the search path).
+#
+# The result: instead of the source browser generated by doxygen, the links to
+# source code will now point to the output of htags.
+# The default value is: NO.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+USE_HTAGS = NO
+
+# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a
+# verbatim copy of the header file for each class for which an include is
+# specified. Set to NO to disable this.
+# See also: Section \class.
+# The default value is: YES.
+
+VERBATIM_HEADERS = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
+# compounds will be generated. Enable this if the project contains a lot of
+# classes, structs, unions or interfaces.
+# The default value is: YES.
+
+ALPHABETICAL_INDEX = YES
+
+# In case all classes in a project start with a common prefix, all classes will
+# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
+# can be used to specify a prefix (or a list of prefixes) that should be ignored
+# while generating the index headers.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+IGNORE_PREFIX =
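+
+# For illustration (the prefix below is hypothetical): a project whose classes
+# all start with xyz_ could set
+#   IGNORE_PREFIX = xyz_
+# so that xyz_Matrix is indexed under M rather than X.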
+
+#---------------------------------------------------------------------------
+# Configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output
+# The default value is: YES.
+
+GENERATE_HTML = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_OUTPUT = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
+# generated HTML page (for example: .htm, .php, .asp).
+# The default value is: .html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FILE_EXTENSION = .html
+
+# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
+# each generated HTML page. If the tag is left blank doxygen will generate a
+# standard header.
+#
+# To get valid HTML, the header file must include any scripts and style sheets
+# that doxygen needs, which depend on the configuration options used (e.g.
+# the setting GENERATE_TREEVIEW). It is highly recommended to start with a
+# default header using
+# doxygen -w html new_header.html new_footer.html new_stylesheet.css
+# YourConfigFile
+# and then modify the file new_header.html. See also section "Doxygen usage"
+# for information on how to generate the default header that doxygen normally
+# uses.
+# Note: The header is subject to change so you typically have to regenerate the
+# default header when upgrading to a newer version of doxygen. For a description
+# of the possible markers and block names see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_HEADER =
+
+# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
+# generated HTML page. If the tag is left blank doxygen will generate a standard
+# footer. See HTML_HEADER for more information on how to generate a default
+# footer and what special commands can be used inside the footer. See also
+# section "Doxygen usage" for information on how to generate the default footer
+# that doxygen normally uses.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FOOTER =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
+# sheet that is used by each HTML page. It can be used to fine-tune the look of
+# the HTML output. If left blank doxygen will generate a default style sheet.
+# See also section "Doxygen usage" for information on how to generate the style
+# sheet that doxygen normally uses.
+# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
+# it is more robust and this tag (HTML_STYLESHEET) will in the future become
+# obsolete.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_STYLESHEET =
+
+# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined
+# cascading style sheets that are included after the standard style sheets
+# created by doxygen. Using this option one can overrule certain style aspects.
+# This is preferred over using HTML_STYLESHEET since it does not replace the
+# standard style sheet and is therefore more robust against future updates.
+# Doxygen will copy the style sheet files to the output directory.
+# Note: The order of the extra style sheet files is of importance (e.g. the last
+# style sheet in the list overrules the setting of the previous ones in the
+# list). For an example see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_STYLESHEET =
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
+# files will be copied as-is; there are no commands or markers available.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_FILES =
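+
+# A hedged example (logo.png and the custom header are hypothetical): after
+# setting
+#   HTML_EXTRA_FILES = logo.png
+# the file can be referenced from a custom HTML_HEADER as
+#   <img src="$relpath^logo.png" alt="logo"/>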
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
+# will adjust the colors in the style sheet and background images according to
+# this color. Hue is specified as an angle on a color-wheel, see
+# https://en.wikipedia.org/wiki/Hue for more information. For instance the value
+# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
+# purple, and 360 is red again.
+# Minimum value: 0, maximum value: 359, default value: 220.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_HUE = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
+# in the HTML output. For a value of 0 the output will use gray-scales only. A
+# value of 255 will produce the most vivid colors.
+# Minimum value: 0, maximum value: 255, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_SAT = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
+# luminance component of the colors in the HTML output. Values below 100
+# gradually make the output lighter, whereas values above 100 make the output
+# darker. The value divided by 100 is the actual gamma applied, so 80 represents
+# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not
+# change the gamma.
+# Minimum value: 40, maximum value: 240, default value: 80.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_GAMMA = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting this
+# to YES can help to show when doxygen was last run and thus if the
+# documentation is up to date.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_TIMESTAMP = YES
+
+# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML
+# documentation will contain a main index with vertical navigation menus that
+# are dynamically created via JavaScript. If disabled, the navigation index will
+# consist of multiple levels of tabs that are statically embedded in every HTML
+# page. Disable this option to support browsers that do not have JavaScript,
+# like the Qt help browser.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_MENUS = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_SECTIONS = NO
+
+# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
+# shown in the various tree structured indices initially; the user can expand
+# and collapse entries dynamically later on. Doxygen will expand the tree to
+# such a level that at most the specified number of entries are visible (unless
+# a fully collapsed tree already exceeds this amount). So setting the number of
+# entries to 1 will produce a fully collapsed tree by default. 0 is a special
+# value representing an infinite number of entries and will result in a fully
+# expanded tree by default.
+# Minimum value: 0, maximum value: 9999, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_INDEX_NUM_ENTRIES = 100
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files will be
+# generated that can be used as input for Apple's Xcode 3 integrated development
+# environment (see:
+# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To
+# create a documentation set, doxygen will generate a Makefile in the HTML
+# output directory. Running make will produce the docset in that directory and
+# running make install will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
+# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy
+# genXcode/_index.html for more information.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_DOCSET = NO
+
+# This tag determines the name of the docset feed. A documentation feed provides
+# an umbrella under which multiple documentation sets from a single provider
+# (such as a company or product suite) can be grouped.
+# The default value is: Doxygen generated docs.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_FEEDNAME = "Doxygen generated docs"
+
+# This tag determines the URL of the docset feed. A documentation feed provides
+# an umbrella under which multiple documentation sets from a single provider
+# (such as a company or product suite) can be grouped.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_FEEDURL =
+
+# This tag specifies a string that should uniquely identify the documentation
+# set bundle. This should be a reverse domain-name style string, e.g.
+# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_BUNDLE_ID = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+# The default value is: org.doxygen.Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_ID = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+# The default value is: Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_NAME = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
+# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
+# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
+# on Windows. In the beginning of 2021 Microsoft took the original page, with
+# among others the download links, offline (the HTML help workshop was already
+# many years in maintenance mode). You can download the HTML help workshop
+# from the web archives at Installation executable (see:
+# http://web.archive.org/web/20160201063255/http://download.microsoft.com/downlo
+# ad/0/A/9/0A939EF6-E31C-430F-A3DF-DFAE7960D564/htmlhelp.exe).
+#
+# The HTML Help Workshop contains a compiler that can convert all HTML output
+# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
+# files are now used as the Windows 98 help format, and will replace the old
+# Windows help format (.hlp) on all Windows platforms in the future. Compressed
+# HTML files also contain an index, a table of contents, and you can search for
+# words in the documentation. The HTML workshop also contains a viewer for
+# compressed HTML files.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_HTMLHELP = NO
+
+# The CHM_FILE tag can be used to specify the file name of the resulting .chm
+# file. You can add a path in front of the file if the result should not be
+# written to the html output directory.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_FILE =
+
+# The HHC_LOCATION tag can be used to specify the location (absolute path
+# including file name) of the HTML help compiler (hhc.exe). If non-empty,
+# doxygen will try to run the HTML help compiler on the generated index.hhp.
+# The file has to be specified with full path.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+HHC_LOCATION =
+
+# The GENERATE_CHI flag controls if a separate .chi index file is generated
+# (YES) or that it should be included in the main .chm file (NO).
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+GENERATE_CHI = NO
+
+# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)
+# and project file content.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_INDEX_ENCODING =
+
+# The BINARY_TOC flag controls whether a binary table of contents is generated
+# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
+# enables the Previous and Next buttons.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+BINARY_TOC = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members to
+# the table of contents of the HTML help documentation and to the tree view.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+TOC_EXPAND = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
+# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
+# (.qch) of the generated HTML documentation.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_QHP = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
+# the file name of the resulting .qch file. The path specified is relative to
+# the HTML output folder.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QCH_FILE =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
+# Project output. For more information please see Qt Help Project / Namespace
+# (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace).
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_NAMESPACE = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
+# Help Project output. For more information please see Qt Help Project / Virtual
+# Folders (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders).
+# The default value is: doc.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_VIRTUAL_FOLDER = doc
+
+# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
+# filter to add. For more information please see Qt Help Project / Custom
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_NAME =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see Qt Help Project / Custom
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_ATTRS =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's filter section matches. Qt Help Project / Filter Attributes (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_SECT_FILTER_ATTRS =
+
+# The QHG_LOCATION tag can be used to specify the location (absolute path
+# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to
+# run qhelpgenerator on the generated .qhp file.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHG_LOCATION =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
+# generated that, together with the HTML files, form an Eclipse help plugin. To
+# install this plugin and make it available under the help contents menu in
+# Eclipse, the contents of the directory containing the HTML and XML files needs
+# to be copied into the plugins directory of eclipse. The name of the directory
+# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
+# After copying Eclipse needs to be restarted before the help appears.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_ECLIPSEHELP = NO
+
+# A unique identifier for the Eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have this
+# name. Each documentation set should have its own identifier.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
+
+ECLIPSE_DOC_ID = org.doxygen.Project
+
+# If you want full control over the layout of the generated HTML pages it might
+# be necessary to disable the index and replace it with your own. The
+# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
+# of each HTML page. A value of NO enables the index and the value YES disables
+# it. Since the tabs in the index contain the same information as the navigation
+# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+DISABLE_INDEX = NO
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information. If the tag
+# value is set to YES, a side panel will be generated containing a tree-like
+# index structure (just like the one that is generated for HTML Help). For this
+# to work a browser that supports JavaScript, DHTML, CSS and frames is required
+# (i.e. any modern browser). Windows users are probably better off using the
+# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
+# further fine tune the look of the index (see "Fine-tuning the output"). As an
+# example, the default style sheet generated by doxygen has an example that
+# shows how to put an image at the root of the tree instead of the PROJECT_NAME.
+# Since the tree basically has the same information as the tab index, you could
+# consider setting DISABLE_INDEX to YES when enabling this option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_TREEVIEW = NO
+
+# When both GENERATE_TREEVIEW and DISABLE_INDEX are set to YES, then the
+# FULL_SIDEBAR option determines if the side bar is limited to only the treeview
+# area (value NO) or if it should extend to the full height of the window (value
+# YES). Setting this to YES gives a layout similar to
+# https://docs.readthedocs.io with more room for contents, but less room for the
+# project logo, title, and description. If either GENERATE_TREEVIEW or
+# DISABLE_INDEX is set to NO, this option has no effect.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FULL_SIDEBAR = NO
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
+# doxygen will group on one line in the generated HTML documentation.
+#
+# Note that a value of 0 will completely suppress the enum values from appearing
+# in the overview section.
+# Minimum value: 0, maximum value: 20, default value: 4.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+ENUM_VALUES_PER_LINE = 4
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
+# to set the initial width (in pixels) of the frame in which the tree is shown.
+# Minimum value: 0, maximum value: 1500, default value: 250.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+TREEVIEW_WIDTH = 250
+
+# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to
+# external symbols imported via tag files in a separate window.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+EXT_LINKS_IN_WINDOW = NO
+
+# If the OBFUSCATE_EMAILS tag is set to YES, doxygen will obfuscate email
+# addresses.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+OBFUSCATE_EMAILS = YES
+
+# If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg
+# tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see
+# https://inkscape.org) to generate formulas as SVG images instead of PNGs for
+# the HTML output. These images will generally look nicer at scaled resolutions.
+# Possible values are: png (the default) and svg (looks nicer but requires the
+# pdf2svg or inkscape tool).
+# The default value is: png.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FORMULA_FORMAT = png
+
+# Use this tag to change the font size of LaTeX formulas included as images in
+# the HTML documentation. When you change the font size after a successful
+# doxygen run you need to manually remove any form_*.png images from the HTML
+# output directory to force them to be regenerated.
+# Minimum value: 8, maximum value: 50, default value: 10.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_FONTSIZE = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are not
+# supported properly for IE 6.0, but are supported on all modern browsers.
+#
+# Note that when changing this option you need to delete any form_*.png files in
+# the HTML output directory before the changes have effect.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_TRANSPARENT = YES
+
+# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands
+# to create new LaTeX commands to be used in formulas as building blocks. See
+# the section "Including formulas" for details.
+
+FORMULA_MACROFILE =
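+
+# A minimal sketch (file name and macros are hypothetical): a file
+# formula_macros.tex containing
+#   \newcommand{\R}{\mathbb{R}}
+#   \newcommand{\norm}[1]{\lVert #1 \rVert}
+# could be activated with FORMULA_MACROFILE = formula_macros.tex, making \R
+# and \norm available inside \f$ ... \f$ formulas.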
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
+# https://www.mathjax.org) which uses client side JavaScript for the rendering
+# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
+# installed or if you want the formulas to look prettier in the HTML output.
+# When
+# enabled you may also need to install MathJax separately and configure the path
+# to it using the MATHJAX_RELPATH option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+USE_MATHJAX = YES
+
+# With MATHJAX_VERSION it is possible to specify the MathJax version to be used.
+# Note that the different versions of MathJax have different requirements with
+# regards to the different settings, so it is possible that also other MathJax
+# settings have to be changed when switching between the different MathJax
+# versions.
+# Possible values are: MathJax_2 and MathJax_3.
+# The default value is: MathJax_2.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_VERSION = MathJax_2
+
+# When MathJax is enabled you can set the default output format to be used for
+# the MathJax output. For more details about the output format see MathJax
+# version 2 (see:
+# http://docs.mathjax.org/en/v2.7-latest/output.html) and MathJax version 3
+# (see:
+# http://docs.mathjax.org/en/latest/web/components/output.html).
+# Possible values are: HTML-CSS (which is slower, but has the best
+# compatibility. This is the name for Mathjax version 2, for MathJax version 3
+# this will be translated into chtml), NativeMML (i.e. MathML. Only supported
+# for MathJax 2. For MathJax version 3 chtml will be used instead.), chtml (This
+# is the name for Mathjax version 3, for MathJax version 2 this will be
+# translated into HTML-CSS) and SVG.
+# The default value is: HTML-CSS.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_FORMAT = HTML-CSS
+
+# When MathJax is enabled you need to specify the location relative to the HTML
+# output directory using the MATHJAX_RELPATH option. The destination directory
+# should contain the MathJax.js script. For instance, if the mathjax directory
+# is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
+# Content Delivery Network so you can quickly see the result without installing
+# MathJax. However, it is strongly recommended to install a local copy of
+# MathJax from https://www.mathjax.org before deployment. The default value is:
+# - in case of MathJax version 2: https://cdn.jsdelivr.net/npm/mathjax@2
+# - in case of MathJax version 3: https://cdn.jsdelivr.net/npm/mathjax@3
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_RELPATH = https://cdn.jsdelivr.net/npm/mathjax@2
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
+# extension names that should be enabled during MathJax rendering. For example
+# for MathJax version 2 (see
+# https://docs.mathjax.org/en/v2.7-latest/tex.html#tex-and-latex-extensions):
+# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# For example for MathJax version 3 (see
+# http://docs.mathjax.org/en/latest/input/tex/extensions/index.html):
+# MATHJAX_EXTENSIONS = ams
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_EXTENSIONS =
+
+# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
+# of code that will be used on startup of the MathJax code. See the MathJax site
+# (see:
+# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow, in which
+# case enabling SERVER_BASED_SEARCH may provide a better solution. It is
+# possible to search using the keyboard; to jump to the search box use
+# <access key> + S (what the <access key> is depends on the OS and browser,
+# but it is typically <CTRL>, <ALT>/<option>, or both). Inside the search box
+# use the <cursor down key> to jump into the search results window, the
+# results can be navigated using the <cursor keys>. Press <Enter> to select an
+# item or <escape> to cancel the search. The filter options can be selected
+# when the cursor is inside the search box by pressing <Shift>+<cursor down>.
+# Also here use the <cursor keys> to select a filter and <Enter> or <escape>
+# to activate or cancel the filter option.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+SEARCHENGINE = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a web server instead of a web client using JavaScript. There
+# are two flavors of web server based searching depending on the EXTERNAL_SEARCH
+# setting. When disabled, doxygen will generate a PHP script for searching and
+# an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
+# and searching needs to be provided by external tools. See the section
+# "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SERVER_BASED_SEARCH = NO
+
+# When EXTERNAL_SEARCH tag is enabled doxygen will no longer generate the PHP
+# script for searching. Instead the search results are written to an XML file
+# which needs to be processed by an external indexer. Doxygen will invoke an
+# external search engine pointed to by the SEARCHENGINE_URL option to obtain the
+# search results.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see:
+# https://xapian.org/).
+#
+# See the section "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH = NO
+
+# The SEARCHENGINE_URL should point to a search engine hosted by a web server
+# which will return the search results when EXTERNAL_SEARCH is enabled.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see:
+# https://xapian.org/). See the section "External Indexing and Searching" for
+# details.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHENGINE_URL =
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
+# search data is written to a file for indexing by an external tool. With the
+# SEARCHDATA_FILE tag the name of this file can be specified.
+# The default file is: searchdata.xml.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHDATA_FILE = searchdata.xml
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the
+# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is
+# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple
+# projects and redirect the results back to the right project.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH_ID =
+
+# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
+# projects other than the one defined by this configuration file, but that are
+# all added to the same external search index. Each project needs to have a
+# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id
+# to a relative location where the documentation can be found. The format is:
+# EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTRA_SEARCH_MAPPINGS =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES, doxygen will generate LaTeX output.
+# The default value is: YES.
+
+GENERATE_LATEX = YES
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: latex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_OUTPUT = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked.
+#
+# Note that when not enabling USE_PDFLATEX the default is latex; when
+# enabling USE_PDFLATEX the default is pdflatex, and if in the latter case
+# latex is chosen this is overwritten by pdflatex. For specific output
+# languages the default may have been set differently; this depends on the
+# implementation of the output language.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_CMD_NAME = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
+# index for LaTeX.
+# Note: This tag is used in the Makefile / make.bat.
+# See also: LATEX_MAKEINDEX_CMD for the part in the generated output file
+# (.tex).
+# The default file is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+MAKEINDEX_CMD_NAME = makeindex
+
+# The LATEX_MAKEINDEX_CMD tag can be used to specify the command name to
+# generate index for LaTeX. In case there is no backslash (\) as first character
+# it will be automatically added in the LaTeX code.
+# Note: This tag is used in the generated output file (.tex).
+# See also: MAKEINDEX_CMD_NAME for the part in the Makefile / make.bat.
+# The default value is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_MAKEINDEX_CMD = makeindex
+
+# If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+COMPACT_LATEX = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used by the
+# printer.
+# Possible values are: a4 (210 x 297 mm), letter (8.5 x 11 inches), legal (8.5 x
+# 14 inches) and executive (7.25 x 10.5 inches).
+# The default value is: a4.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PAPER_TYPE = a4
+
+# The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names
+# that should be included in the LaTeX output. The package can be specified just
+# by its name or with the correct syntax as to be used with the LaTeX
+# \usepackage command. To get the times font for instance you can specify:
+# EXTRA_PACKAGES=times or EXTRA_PACKAGES={times}
+# To use the option intlimits with the amsmath package you can specify:
+# EXTRA_PACKAGES=[intlimits]{amsmath}
+# If left blank no extra packages will be included.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+EXTRA_PACKAGES = amsmath
+
+# The LATEX_HEADER tag can be used to specify a user-defined LaTeX header for
+# the generated LaTeX document. The header should contain everything until the
+# first chapter. If it is left blank doxygen will generate a standard header. It
+# is highly recommended to start with a default header using
+# doxygen -w latex new_header.tex new_footer.tex new_stylesheet.sty
+# and then modify the file new_header.tex. See also section "Doxygen usage" for
+# information on how to generate the default header that doxygen normally uses.
+#
+# Note: Only use a user-defined header if you know what you are doing!
+# Note: The header is subject to change so you typically have to regenerate the
+# default header when upgrading to a newer version of doxygen. The following
+# commands have a special meaning inside the header (and footer); for a
+# description of the possible markers and block names see the documentation.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HEADER =
+
+# The LATEX_FOOTER tag can be used to specify a user-defined LaTeX footer for
+# the generated LaTeX document. The footer should contain everything after the
+# last chapter. If it is left blank doxygen will generate a standard footer. See
+# LATEX_HEADER for more information on how to generate a default footer and what
+# special commands can be used inside the footer. See also section "Doxygen
+# usage" for information on how to generate the default footer that doxygen
+# normally uses. Note: Only use a user-defined footer if you know what you are
+# doing!
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_FOOTER =
+
+# The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined
+# LaTeX style sheets that are included after the standard style sheets created
+# by doxygen. Using this option one can overrule certain style aspects. Doxygen
+# will copy the style sheet files to the output directory.
+# Note: The order of the extra style sheet files is of importance (e.g. the last
+# style sheet in the list overrules the setting of the previous ones in the
+# list).
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_STYLESHEET =
+
+# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the LATEX_OUTPUT output
+# directory. Note that the files will be copied as-is; there are no commands or
+# markers available.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_FILES =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
+# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
+# contain links (just like the HTML output) instead of page references. This
+# makes the output suitable for online browsing using a PDF viewer.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PDF_HYPERLINKS = YES
+
+# If the USE_PDFLATEX tag is set to YES, doxygen will use the engine as
+# specified with LATEX_CMD_NAME to generate the PDF file directly from the LaTeX
+# files. Set this option to YES to get higher quality PDF documentation.
+#
+# See also section LATEX_CMD_NAME for selecting the engine.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+USE_PDFLATEX = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep running
+# if errors occur, instead of asking the user for help.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BATCHMODE = NO
+
+# If the LATEX_HIDE_INDICES tag is set to YES then doxygen will not include the
+# index chapters (such as File Index, Compound Index, etc.) in the output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HIDE_INDICES = NO
+
+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
+# bibliography, e.g. plainnat, or ieeetr. See
+# https://en.wikipedia.org/wiki/BibTeX and \cite for more info.
+# The default value is: plain.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BIB_STYLE = plain
+
+# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated
+# page will contain the date and time when the page was generated. Setting this
+# to NO can help when comparing the output of multiple runs.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_TIMESTAMP = NO
+
+# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute)
+# path from which the emoji images will be read. If a relative path is entered,
+# it will be relative to the LATEX_OUTPUT directory. If left blank the
+# LATEX_OUTPUT directory will be used.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EMOJI_DIRECTORY =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES, doxygen will generate RTF output. The
+# RTF output is optimized for Word 97 and may not look too pretty with other RTF
+# readers/editors.
+# The default value is: NO.
+
+GENERATE_RTF = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: rtf.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_OUTPUT = rtf
+
+# If the COMPACT_RTF tag is set to YES, doxygen generates more compact RTF
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+COMPACT_RTF = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated will
+# contain hyperlink fields. The RTF file will contain links (just like the HTML
+# output) instead of page references. This makes the output suitable for online
+# browsing using Word or some other Word compatible readers that support those
+# fields.
+#
+# Note: WordPad (write) and others do not support links.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_HYPERLINKS = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# configuration file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+#
+# See also section "Doxygen usage" for information on how to generate the
+# default style sheet that doxygen normally uses.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_STYLESHEET_FILE =
+
+# Set optional variables used in the generation of an RTF document. Syntax is
+# similar to doxygen's configuration file. A template extensions file can be
+# generated using doxygen -e rtf extensionFile.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_EXTENSIONS_FILE =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES, doxygen will generate man pages for
+# classes and files.
+# The default value is: NO.
+
+GENERATE_MAN = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it. A directory man3 will be created inside the directory specified by
+# MAN_OUTPUT.
+# The default directory is: man.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_OUTPUT = man
+
+# The MAN_EXTENSION tag determines the extension that is added to the generated
+# man pages. In case the manual section does not start with a number, the number
+# 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is
+# optional.
+# The default value is: .3.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_EXTENSION = .3
+
+# The MAN_SUBDIR tag determines the name of the directory created within
+# MAN_OUTPUT in which the man pages are placed. It defaults to man followed by
+# MAN_EXTENSION with the initial . removed.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_SUBDIR =
+
+# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
+# will generate one additional man file for each entity documented in the real
+# man page(s). These additional files only source the real man page, but without
+# them the man command would be unable to find the correct page.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_LINKS = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES, doxygen will generate an XML file that
+# captures the structure of the code including all documentation.
+# The default value is: NO.
+
+GENERATE_XML = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: xml.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_OUTPUT = xml
+
+# If the XML_PROGRAMLISTING tag is set to YES, doxygen will dump the program
+# listings (including syntax highlighting and cross-referencing information) to
+# the XML output. Note that enabling this will significantly increase the size
+# of the XML output.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_PROGRAMLISTING = YES
+
+# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, doxygen will include
+# namespace members in file scope as well, matching the HTML output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_NS_MEMB_FILE_SCOPE = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the DOCBOOK output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_DOCBOOK tag is set to YES, doxygen will generate Docbook files
+# that can be used to generate PDF.
+# The default value is: NO.
+
+GENERATE_DOCBOOK = NO
+
+# The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be put in
+# front of it.
+# The default directory is: docbook.
+# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
+
+DOCBOOK_OUTPUT = docbook
+
+#---------------------------------------------------------------------------
+# Configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an
+# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures
+# the structure of the code including all documentation. Note that this feature
+# is still experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_AUTOGEN_DEF = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES, doxygen will generate a Perl module
+# file that captures the structure of the code including all documentation.
+#
+# Note that this feature is still experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_PERLMOD = NO
+
+# If the PERLMOD_LATEX tag is set to YES, doxygen will generate the necessary
+# Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI
+# output from the Perl module output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_LATEX = NO
+
+# If the PERLMOD_PRETTY tag is set to YES, the Perl module output will be nicely
+# formatted so it can be parsed by a human reader. This is useful if you want to
+# understand what is going on. On the other hand, if this tag is set to NO, the
+# size of the Perl module output will be much smaller and Perl will parse it
+# just the same.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_PRETTY = YES
+
+# The names of the make variables in the generated doxyrules.make file are
+# prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. This is useful
+# so different doxyrules.make files included by the same Makefile don't
+# overwrite each other's variables.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES, doxygen will evaluate all
+# C-preprocessor directives found in the sources and include files.
+# The default value is: YES.
+
+ENABLE_PREPROCESSING = YES
+
+# If the MACRO_EXPANSION tag is set to YES, doxygen will expand all macro names
+# in the source code. If set to NO, only conditional compilation will be
+# performed. Macro expansion can be done in a controlled way by setting
+# EXPAND_ONLY_PREDEF to YES.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+MACRO_EXPANSION = NO
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
+# the macro expansion is limited to the macros specified with the PREDEFINED and
+# EXPAND_AS_DEFINED tags.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_ONLY_PREDEF = NO
+
+# If the SEARCH_INCLUDES tag is set to YES, the include files in the
+# INCLUDE_PATH will be searched if a #include is found.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SEARCH_INCLUDES = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by the
+# preprocessor.
+# This tag requires that the tag SEARCH_INCLUDES is set to YES.
+
+INCLUDE_PATH =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will be
+# used.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+INCLUDE_FILE_PATTERNS =
+
+# The PREDEFINED tag can be used to specify one or more macro names that are
+# defined before the preprocessor is started (similar to the -D option of e.g.
+# gcc). The argument of the tag is a list of macros of the form: name or
+# name=definition (no spaces). If the definition and the "=" are omitted, "=1"
+# is assumed. To prevent a macro definition from being undefined via #undef or
+# recursively expanded use the := operator instead of the = operator.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+PREDEFINED = __DOXYGEN__
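+
+# As a sketch (the extra macros below are hypothetical), further macros can be
+# added space-separated in name or name=definition form:
+#   PREDEFINED = __DOXYGEN__ MY_API_EXPORT= MY_VERSION=2
+# where MY_API_EXPORT= expands to nothing and MY_VERSION expands to 2.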
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
+# tag can be used to specify a list of macro names that should be expanded. The
+# macro definition that is found in the sources will be used. Use the PREDEFINED
+# tag if you want to use a different macro definition that overrules the
+# definition found in the source code.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_AS_DEFINED =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
+# remove all references to function-like macros that are alone on a line, have
+# an all uppercase name, and do not end with a semicolon. Such function macros
+# are typically used for boiler-plate code, and will confuse the parser if not
+# removed.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SKIP_FUNCTION_MACROS = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES tag can be used to specify one or more tag files. For each tag
+# file the location of the external documentation should be added. The format of
+# a tag file without this location is as follows:
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where loc1 and loc2 can be relative or absolute paths or URLs. See the
+# section "Linking to external documentation" for more information about the use
+# of tag files.
+# Note: Each tag file must have a unique name (where the name does NOT include
+# the path). If a tag file is not located in the directory in which doxygen is
+# run, you must also specify the path to the tagfile here.
+
+TAGFILES =
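+
+# A hypothetical example (tag file name and location are illustrative only):
+#   TAGFILES = ../otherproject/otherproject.tag=../../otherproject/html
+# makes references to entities documented in otherproject link to the HTML
+# output found at the given relative location.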
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
+# tag file that is based on the input files it reads. See section "Linking to
+# external documentation" for more information about the usage of tag files.
+
+GENERATE_TAGFILE =
+
+# If the ALLEXTERNALS tag is set to YES, all external class will be listed in
+# the class index. If set to NO, only the inherited external classes will be
+# listed.
+# The default value is: NO.
+
+ALLEXTERNALS = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES, all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will be
+# listed.
+# The default value is: YES.
+
+EXTERNAL_GROUPS = YES
+
+# If the EXTERNAL_PAGES tag is set to YES, all external pages will be listed in
+# the related pages index. If set to NO, only the current project's pages will
+# be listed.
+# The default value is: YES.
+
+EXTERNAL_PAGES = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# You can include diagrams made with dia in doxygen documentation. Doxygen will
+# then run dia to produce the diagram and insert it in the documentation. The
+# DIA_PATH tag allows you to specify the directory where the dia binary resides.
+# If left empty dia is assumed to be found in the default search path.
+
+DIA_PATH =
+
+# If set to YES the inheritance and collaboration graphs will hide inheritance
+# and usage relations if the target is undocumented or is not a class.
+# The default value is: YES.
+
+HIDE_UNDOC_RELATIONS = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz (see:
+# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
+# Bell Labs. The other options in this section have no effect if this option is
+# set to NO.
+# The default value is: NO.
+
+HAVE_DOT = NO
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed
+# to run in parallel. When set to 0 doxygen will base this on the number of
+# processors available in the system. You can set it explicitly to a value
+# larger than 0 to get control over the balance between CPU load and processing
+# speed.
+# Minimum value: 0, maximum value: 32, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_NUM_THREADS = 0
+
+# When you want a differently looking font in the dot files that doxygen
+# generates you can specify the font name using DOT_FONTNAME. You need to make
+# sure dot is able to find the font, which can be done by putting it in a
+# standard location or by setting the DOTFONTPATH environment variable or by
+# setting DOT_FONTPATH to the directory containing the font.
+# The default value is: Helvetica.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTNAME = Helvetica
+
+# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of
+# dot graphs.
+# Minimum value: 4, maximum value: 24, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTSIZE = 10
+
+# By default doxygen will tell dot to use the default font as specified with
+# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set
+# the path where dot can find it using this tag.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTPATH =
+
+# If the CLASS_GRAPH tag is set to YES (or GRAPH) then doxygen will generate a
+# graph for each documented class showing the direct and indirect inheritance
+# relations. In case HAVE_DOT is set as well dot will be used to draw the graph,
+# otherwise the built-in generator will be used. If the CLASS_GRAPH tag is set
+# to TEXT the direct and indirect inheritance relations will be shown as texts /
+# links.
+# Possible values are: NO, YES, TEXT and GRAPH.
+# The default value is: YES.
+
+CLASS_GRAPH = YES
+
+# If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a
+# graph for each documented class showing the direct and indirect implementation
+# dependencies (inheritance, containment, and class references variables) of the
+# class with other documented classes.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+COLLABORATION_GRAPH = YES
+
+# If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for
+# groups, showing the direct groups dependencies.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GROUP_GRAPHS = YES
+
+# If the UML_LOOK tag is set to YES, doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+UML_LOOK = NO
+
+# If the UML_LOOK tag is enabled, the fields and methods are shown inside the
+# class node. If there are many fields or methods and many nodes the graph may
+# become too big to be useful. The UML_LIMIT_NUM_FIELDS threshold limits the
+# number of items for each type to make the size more manageable. Set this to 0
+# for no limit. Note that the threshold may be exceeded by 50% before the limit
+# is enforced. So when you set the threshold to 10, up to 15 fields may appear,
+# but if the number exceeds 15, the total amount of fields shown is limited to
+# 10.
+# Minimum value: 0, maximum value: 100, default value: 10.
+# This tag requires that the tag UML_LOOK is set to YES.
+
+UML_LIMIT_NUM_FIELDS = 10
+
+# If the DOT_UML_DETAILS tag is set to NO, doxygen will show attributes and
+# methods without types and arguments in the UML graphs. If the DOT_UML_DETAILS
+# tag is set to YES, doxygen will add type and arguments for attributes and
+# methods in the UML graphs. If the DOT_UML_DETAILS tag is set to NONE, doxygen
+# will not generate fields with class member information in the UML graphs. The
+# class diagrams will look similar to the default class diagrams but using UML
+# notation for the relationships.
+# Possible values are: NO, YES and NONE.
+# The default value is: NO.
+# This tag requires that the tag UML_LOOK is set to YES.
+
+DOT_UML_DETAILS = NO
+
+# The DOT_WRAP_THRESHOLD tag can be used to set the maximum number of characters
+# to display on a single line. If the actual line length exceeds this threshold
+# significantly it will be wrapped across multiple lines. Some heuristics are
+# applied to avoid ugly line breaks.
+# Minimum value: 0, maximum value: 1000, default value: 17.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_WRAP_THRESHOLD = 17
+
+# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
+# collaboration graphs will show the relations between templates and their
+# instances.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+TEMPLATE_RELATIONS = NO
+
+# If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to
+# YES then doxygen will generate a graph for each documented file showing the
+# direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDE_GRAPH = YES
+
+# If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are
+# set to YES then doxygen will generate a graph for each documented file showing
+# the direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDED_BY_GRAPH = YES
+
+# If the CALL_GRAPH tag is set to YES then doxygen will generate a call
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable call graphs for selected
+# functions only using the \callgraph command. Disabling a call graph can be
+# accomplished by means of the command \hidecallgraph.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALL_GRAPH = NO
+
+# If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable caller graphs for selected
+# functions only using the \callergraph command. Disabling a caller graph can be
+# accomplished by means of the command \hidecallergraph.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALLER_GRAPH = NO
+
+# If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will show the
+# graphical hierarchy of all classes instead of a textual one.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GRAPHICAL_HIERARCHY = YES
+
+# If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the
+# dependencies a directory has on other directories in a graphical way. The
+# dependency relations are determined by the #include relations between the
+# files in the directories.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DIRECTORY_GRAPH = YES
+
+# The DIR_GRAPH_MAX_DEPTH tag can be used to limit the maximum number of levels
+# of child directories generated in directory dependency graphs by dot.
+# Minimum value: 1, maximum value: 25, default value: 1.
+# This tag requires that the tag DIRECTORY_GRAPH is set to YES.
+
+DIR_GRAPH_MAX_DEPTH = 1
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. For an explanation of the image formats see the section
+# output formats in the documentation of the dot tool (Graphviz (see:
+# http://www.graphviz.org/)).
+# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
+# to make the SVG files visible in IE 9+ (other browsers do not have this
+# requirement).
+# Possible values are: png, jpg, gif, svg, png:gd, png:gd:gd, png:cairo,
+# png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and
+# png:gdiplus:gdiplus.
+# The default value is: png.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_IMAGE_FORMAT = png
+
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
+# enable generation of interactive SVG images that allow zooming and panning.
+#
+# Note that this requires a modern browser other than Internet Explorer. Tested
+# and working are Firefox, Chrome, Safari, and Opera.
+# Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make
+# the SVG files visible. Older versions of IE do not have SVG support.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INTERACTIVE_SVG = NO
+
+# The DOT_PATH tag can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_PATH =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the \dotfile
+# command).
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOTFILE_DIRS =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the \mscfile
+# command).
+
+MSCFILE_DIRS =
+
+# The DIAFILE_DIRS tag can be used to specify one or more directories that
+# contain dia files that are included in the documentation (see the \diafile
+# command).
+
+DIAFILE_DIRS =
+
+# When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
+# path where java can find the plantuml.jar file or to the filename of jar file
+# to be used. If left blank, it is assumed PlantUML is not used or called during
+# a preprocessing step. Doxygen will generate a warning when it encounters a
+# \startuml command in this case and will not generate output for the diagram.
+
+PLANTUML_JAR_PATH =
+
+# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a
+# configuration file for plantuml.
+
+PLANTUML_CFG_FILE =
+
+# When using plantuml, the specified paths are searched for files specified by
+# the !include statement in a plantuml block.
+
+PLANTUML_INCLUDE_PATH =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
+# that will be shown in the graph. If the number of nodes in a graph becomes
+# larger than this value, doxygen will truncate the graph, which is visualized
+# by representing a node as a red box. Note that if the number of direct
+# children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note that
+# the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+# Minimum value: 0, maximum value: 10000, default value: 50.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_GRAPH_MAX_NODES = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs
+# generated by dot. A depth value of 3 means that only nodes reachable from the
+# root by following a path via at most 3 edges will be shown. Nodes that lie
+# further from the root node will be omitted. Note that setting this option to 1
+# or 2 may greatly reduce the computation time needed for large code bases. Also
+# note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+# Minimum value: 0, maximum value: 1000, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+MAX_DOT_GRAPH_DEPTH = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not seem
+# to support this out of the box.
+#
+# Warning: Depending on the platform used, enabling this option may lead to
+# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
+# read).
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_TRANSPARENT = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10) support
+# this, this feature is disabled by default.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_MULTI_TARGETS = YES
+
+# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
+# explaining the meaning of the various boxes and arrows in the dot generated
+# graphs.
+# Note: This tag requires that UML_LOOK isn't set, i.e. the doxygen internal
+# graphical representation for inheritance and collaboration diagrams is used.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GENERATE_LEGEND = YES
+
+# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate
+# files that are used to generate the various graphs.
+#
+# Note: This setting is not only used for dot files but also for msc temporary
+# files.
+# The default value is: YES.
+
+DOT_CLEANUP = YES
diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt
index 458eadbcb..c1268623b 100644
--- a/include/CMakeLists.txt
+++ b/include/CMakeLists.txt
@@ -21,8 +21,9 @@
# and definitions to compile against each backend, but MUST explicitly
# set a default backend (if they want to do so).
#
-assert_defined_variables( REFERENCE_INCLUDE_DEFS REFERENCE_OMP_INCLUDE_DEFS LPF_INCLUDE_DEFS
- WITH_REFERENCE_BACKEND_HEADERS WITH_OMP_BACKEND_HEADERS WITH_BSP1D_BACKEND WITH_HYBRID_BACKEND
+assert_defined_variables( REFERENCE_INCLUDE_DEFS REFERENCE_OMP_INCLUDE_DEFS NONBLOCKING_INCLUDE_DEFS LPF_INCLUDE_DEFS
+ WITH_REFERENCE_BACKEND_HEADERS WITH_OMP_BACKEND_HEADERS WITH_NONBLOCKING_BACKEND WITH_BSP1D_BACKEND WITH_HYBRID_BACKEND
+ HYPERDAGS_INCLUDE_DEFS WITH_HYPERDAGS_BACKEND_HEADERS WITH_HYPERDAGS_BACKEND
)
assert_valid_variables( INCLUDE_INSTALL_DIR NO_NUMA_DEF )
@@ -41,7 +42,7 @@ set( HEADERS_REGEX ".+\.(hpp|h|hxx|hh|h\\+\\+)$" )
# to avoid flaky acrobatics with regex or glob expressions, copy main files directly
install( FILES "graphblas.hpp" DESTINATION "${INCLUDE_INSTALL_DIR}" )
set( root_files
- "graphblas.hpp" "graphblas/backends.hpp" "graphblas/benchmark.hpp"
+ "graphblas/backends.hpp" "graphblas/benchmark.hpp"
"graphblas/blas0.hpp" "graphblas/blas1.hpp" "graphblas/blas2.hpp"
"graphblas/blas3.hpp" "graphblas/collectives.hpp" "graphblas/config.hpp"
"graphblas/coordinates.hpp" "graphblas/descriptors.hpp" "graphblas/distribution.hpp"
@@ -104,7 +105,6 @@ install( TARGETS alp_utils_headers EXPORT GraphBLASTargets
INCLUDES DESTINATION "${INCLUDE_INSTALL_DIR}"
)
-
if( WITH_REFERENCE_BACKEND_HEADERS )
add_library( backend_reference_headers INTERFACE )
target_link_libraries( backend_reference_headers INTERFACE backend_headers_nodefs )
@@ -137,9 +137,34 @@ if( WITH_OMP_BACKEND_HEADERS )
FILES_MATCHING REGEX "${HEADERS_REGEX}"
)
install( TARGETS backend_reference_omp_headers EXPORT GraphBLASTargets )
+endif()
+if( WITH_HYPERDAGS_BACKEND )
+ add_library( backend_hyperdags_headers INTERFACE )
+ target_link_libraries( backend_hyperdags_headers INTERFACE "backend_${WITH_HYPERDAGS_USING}_headers" )
+ target_compile_definitions( backend_hyperdags_headers INTERFACE "${HYPERDAGS_INCLUDE_DEFS}" )
+ install( TARGETS backend_hyperdags_headers EXPORT GraphBLASTargets )
+ install( DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/graphblas/hyperdags/"
+ DESTINATION "${GRB_INCLUDE_INSTALL_DIR}/hyperdags"
+ FILES_MATCHING REGEX "${HEADERS_REGEX}"
+ )
endif()
+if( WITH_NONBLOCKING_BACKEND )
+ add_library( backend_nonblocking_headers INTERFACE )
+ # the nonblocking backend depends on the reference backend
+ target_link_libraries( backend_nonblocking_headers INTERFACE backend_reference_headers )
+ target_link_libraries( backend_nonblocking_headers INTERFACE OpenMP::OpenMP_CXX )
+ target_compile_definitions( backend_nonblocking_headers INTERFACE
+ "${NONBLOCKING_INCLUDE_DEFS}"
+ )
+
+ install( DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/graphblas/nonblocking/"
+ DESTINATION "${GRB_INCLUDE_INSTALL_DIR}/nonblocking"
+ FILES_MATCHING REGEX "${HEADERS_REGEX}"
+ )
+ install( TARGETS backend_nonblocking_headers EXPORT GraphBLASTargets )
+endif()
if( WITH_BSP1D_BACKEND OR WITH_HYBRID_BACKEND )
# copy headers, which are common to both distributed backends
@@ -187,6 +212,11 @@ install( DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/graphblas/algorithms/"
FILES_MATCHING REGEX "${HEADERS_REGEX}"
)
+install( DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/graphblas/interfaces/"
+ DESTINATION "${GRB_INCLUDE_INSTALL_DIR}/interfaces"
+ FILES_MATCHING REGEX "${HEADERS_REGEX}"
+)
+
install( TARGETS algorithms EXPORT GraphBLASTargets )
# this target lists the transition path headers
diff --git a/include/graphblas.hpp b/include/graphblas.hpp
index 64df3b9e4..a539a5c0d 100644
--- a/include/graphblas.hpp
+++ b/include/graphblas.hpp
@@ -15,53 +15,178 @@
* limitations under the License.
*/
-/*
- * @author: A. N. Yzelman.
- * @date: 8th of August, 2016.
+/**
+ * @file
+ *
+ * The main header to include in order to use the ALP/GraphBLAS API.
+ *
+ * @author A. N. Yzelman.
+ * @date 8th of August, 2016.
*/
/** \mainpage
*
- * This is a GraphBLAS implementation in ANSI C++11. Authors:
- * -# A. N. Yzelman, Huawei Technologies France; 2016-2020.
- * -# A. N. Yzelman, Huawei Technologies Switzerland AG; 2020-current.
- * -# Aristeidis Mastoras, Huawei Technologies Switzerland AG; 2020-current.
- * -# Alberto Scolari, Huawei Technologies Switzerland AG; 2021-current.
- * -# Verner Vlacic, Huawei Technologies Switzerland AG; 2021-current.
- * -# Auke Booij, Huawei Technologies Switzerland AG; 2021.
- * -# Dan Iorga, Huawei Technologies Switzerland AG; 2021.
- * -# Daniel Di Nardo, Huawei Technologies France; 2017.
- * -# Jonathan M. Nash, Huawei Technologies France; 2017.
+ * The Algebraic Programming (ALP) project is a modern and humble C++
+ * programming framework that achieves scalable, high performance.
+ *
+ * With ALP, programmers are encouraged to express programs using algebraic
+ * concepts directly. ALP is a humble programming model in that it hides all
+ * optimisations pertaining to parallelisation, vectorisation, and other
+ * complexities of programming large-scale and heterogeneous systems.
+ *
+ * ALP presently exposes the following interfaces:
+ * -# generalised sparse linear algebra, \ref GraphBLAS;
+ * -# vertex-centric programming, \ref Pregel.
+ *
+ * Several other programming interfaces are under design at present.
*
- * Contact: albertjan.yzelman@huawei.com
+ * For authors who contributed to ALP, please see the NOTICE file.
*
- * This API exposes only two containers: a #grb::Vector and a #grb::Matrix.
+ * Contact:
+ * - https://github.com/Algebraic-Programming/ALP
+ * - https://gitee.com/CSL-ALP/graphblas/
+ * - albertjan.yzelman@huawei.com
*
- * All primitives defined on these containers must be given a (binary)
- * operator, a #grb::Monoid, or a #grb::Semiring. These monoid and semiring are
- * generalised from their mathematical counterpart in that they holds multiple
- * domains. The monoid consists of one binary operator and a corresponding
- * identity. The semiring consists of one additive operator, one multiplicative
- * operator, one identity under addition, and one identity under multiplication.
+ * @author A. N. Yzelman, Huawei Technologies France (2016-2020)
+ * @author A. N. Yzelman, Huawei Technologies Switzerland AG (2020-current)
+ *
+ * \defgroup GraphBLAS ALP/GraphBLAS
+ * @{
+ *
+ * @brief ALP/GraphBLAS enables sparse linear algebraic programming.
+ *
+ * \parblock
+ * \par API introduction
+ *
+ * ALP/GraphBLAS is an ANSI C++11 variant of the C GraphBLAS standard with a few
+ * different choices and an emphasis on portability and auto-parallelisation. It
+ * exposes only two containers: #grb::Vector and #grb::Matrix. A template
+ * argument controls the type of the values contained within a container.
+ *
+ * A container may have between \f$ 0 \f$ and \f$ c \f$ values, and each such
+ * value has a coordinate. The value \f$ c \f$ is the \em capacity of a
+ * container, and at most equals the \em size of that container. The size of a
+ * matrix is the product of its number of rows and its number of columns.
+ * Containers with fewer values than their size are considered \em sparse, while
+ * those with as many values as their size are considered \em dense. Scalars
+ * correspond to the standard C++ plain-old-data types, and, as such, have size,
+ * capacity, and number of values equal to one-- scalars are always dense.
+ *
+ * For matrices, their size can be derived from #grb::nrows and #grb::ncols,
+ * while for vectors their size may be immediately retrieved via #grb::size.
+ * For both vectors and matrices, their capacity and current number of values
+ * may be retrieved via #grb::capacity and #grb::nnz, respectively. Finally,
+ * containers have a unique identifier that may be retrieved via #grb::getID.
+ * These identifiers are assigned in a deterministic fashion, so that for
+ * deterministic programs executed with the same number of processes, the same
+ * containers will be assigned the same IDs.
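+ *
+ * For example, the following relations hold for any newly constructed
+ * vector (a sketch; the assertions follow directly from the above
+ * definitions):
+ * \code
+ * grb::Vector< double > x( 10 ), y( 10 );
+ * assert( grb::size( x ) == 10 );
+ * assert( grb::nnz( x ) == 0 );  // no values were ingested yet
+ * assert( grb::capacity( x ) <= grb::size( x ) );
+ * assert( grb::getID( x ) != grb::getID( y ) ); // IDs are unique
+ * \endcode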
+ *
+ * Containers may be populated using #grb::set or by using dedicated I/O
+ * routines such as #grb::buildVectorUnique or #grb::buildMatrixUnique. Here,
+ * \em unique refers to the collection of values that should be ingested having
+ * no duplicate coordinates; i.e., there are no two values that map to the same
+ * coordinate. The first argument to either function is the output container,
+ * which is followed by an iterator pair that points to a collection of values
+ * to be ingested into the output container.
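+ *
+ * A minimal ingestion sketch follows; the index and value arrays are
+ * illustrative, while #grb::SEQUENTIAL refers to one of the I/O modes
+ * discussed next:
+ * \code
+ * grb::Vector< double > x( 4 );
+ * const size_t inds[ 2 ] = { 0, 3 };
+ * const double vals[ 2 ] = { 3.14, 2.71 };
+ * grb::RC rc = grb::buildVectorUnique(
+ *     x, inds, inds + 2, vals, vals + 2, grb::SEQUENTIAL
+ * );
+ * \endcode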
+ *
+ * ALP/GraphBLAS supports multiple user processes \f$ P \f$. If \f$ P > 1 \f$,
+ * there is a difference between #grb::SEQUENTIAL and #grb::PARALLEL I/O. The
+ * default I/O mode is #grb::PARALLEL, which may be overridden by supplying
+ * #grb::SEQUENTIAL as a fourth and final argument to the input routines. In
+ * sequential I/O, the iterator pair must point to the exact same collection
+ * of input values on each of the \f$ P \f$ user processes. In the parallel
+ * mode, however, each iterator pair points to disjoint value sets at each of
+ * the processes, while their union is what is logically ingested into the
+ * output container.
+ *
+ * Output iteration is done using standard STL-style iterators. ALP,
+ * however, only supports const_iterators on output. Output iterators also
+ * default to sequential mode.
+ *
+ * Primitives perform algebraic operations on containers while using explicitly
+ * supplied algebraic structures. Primitives may be as simple as the
+ * element-wise application of a binary operator to two input vectors,
+ * generating values in a third output vector (\f$ z = x \odot y \f$,
+ * #grb::eWiseApply), or may be as rich as multiplying two matrices together
+ * whose result is to be added in-place to a third matrix
+ * (\f$ C \leftarrow C + AB \f$, #grb::mxm). The latter is typically deemed
+ * richer since it requires a semiring structure rather than a more basic binary
+ * operator.
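+ *
+ * For example, given initialised containers of matching sizes and a
+ * semiring \c ring over doubles (all names here are illustrative), a
+ * minimal sketch of both primitives reads:
+ * \code
+ * grb::RC rc = grb::eWiseApply( z, x, y,
+ *     grb::operators::add< double >() ); // z = x .+ y
+ * if( rc == grb::SUCCESS ) {
+ *     rc = grb::mxm( C, A, B, ring );    // C += AB
+ * }
+ * \endcode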
+ *
+ * Primitives are grouped according to their classical BLAS levels:
+ * - \ref BLAS0
+ * - \ref BLAS1
+ * - \ref BLAS2
+ * - \ref BLAS3
+ *
+ * The "level-0" primitives operate on scalars, and in terms of arithmetic
+ * intensity match those of level-1 primitives-- however, since standard BLAS
+ * need not define scalar operations this specification groups them separately.
+ * All primitives except for #grb::set and #grb::eWiseApply are \em in-place,
+ * meaning that new output values are "added" to any pre-existing contents in
+ * output containers. The operator used for addition is derived from the
+ * algebraic structure that the primitive is called with.
+ *
+ * ALP requires that every primitive is \em parallelisable. Every backend that
+ * implements a primitive for a specific system furthermore must specify its
+ * performance semantics. Contrary to the functional semantics that this
+ * reference specifies, performance semantics guarantee certain observable
+ * behaviours when it comes to the amount of work, data movement,
+ * synchronisation across parallel systems, and/or memory use.
+ *
+ * @see perfSemantics
+ * \endparblock
+ *
+ * \parblock
+ * \par Algebraic Structures
+ *
+ * ALP/GraphBLAS defines three types of algebra structures, namely, a
+ * -# binary operator such as #grb::operators::add (numerical addition),
+ * -# #grb::Monoid, and
+ * -# #grb::Semiring.
+ *
+ * Binary operators are parametrised in two input domains and one output domain,
+ * \f$ D_1 \times D_2 \to D_3 \f$. The \f$ D_i \f$ are given as template
+ * arguments to the operator. A #grb::Monoid is composed from a binary operator
+ * coupled with an identity. For example, the additive monoid is defined as
+ * \code
+ * grb::Monoid<
+ * grb::operators::add< double >,
+ * grb::identities::zero
+ * >
+ * \endcode
+ * Note that passing a single domain as a template argument to a binary operator
+ * is a short-hand for an operator with \f$ D_{\{1,2,3\}} \f$ equal to the same
+ * domain.
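+ *
+ * For instance, a mixed-domain operator may be declared as follows (a
+ * sketch):
+ * \code
+ * // D1 = int, D2 = double, D3 = double
+ * grb::operators::add< int, double, double > mixedAdd;
+ * \endcode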
+ *
+ * Likewise, a #grb::Semiring is composed from two monoids, where the first,
+ * the so-called additive monoid, furthermore must be commutative. The classic
+ * semiring over integers taught in elementary school, for example, reads
+ * \code
+ * grb::Semiring<
+ * grb::operators::add< unsigned int >,
+ * grb::operators::mul< unsigned int >,
+ * grb::identities::zero,
+ * grb::identities::one
+ * >
+ * \endcode
*
* Monoids and semirings must comply with their regular axioms-- a type system
* assists users by checking for incorrect operators acting as additive or
- * multiplicative operators. Standard operators and identities are found in
- * their respective namespaces, #grb::operators and #grb::identities,
- * respectively.
+ * multiplicative monoids. Errors are reported at compile time, through
+ * the use of algebraic type traits such as #grb::is_associative.
*
- * Monoids and semirings must be supplied with the domain(s) it will operate
- * on. These must be available at compile time. Also the element type of
- * GraphBLAS containers must be set at compile time. The size of a container is
- * set at run-time, but may not change during its life time.
+ * @see typeTraits
*
- * This implementation provides various \ref BLAS1 and \ref BLAS2 primitives. To
- * simplify writing generalised algebraic routines, it also provides \ref BLAS0
- * primitives.
+ * Standard operators and identities are found in their respective namespaces,
+ * #grb::operators and #grb::identities, respectively. The ALP monoids and
+ * semirings are generalised from their standard mathematical definitions in
+ * that they hold multiple domains. The description of #grb::Semiring details
+ * the underlying mathematical structure that nevertheless can be identified.
+ * \endparblock
*
- * The three aforementioned ingredients, namely, containers, algebraic relations
- * (such as semirings), and level-{1,2,3} primitives make up the full interface
- * of this DSL.
+ * \parblock
+ * \par ALP/GraphBLAS by example
*
* An example is provided within examples/sp.cpp. It demonstrates usage of this
* API. We now follow with some code snippets from that example. First, the
@@ -104,64 +229,249 @@
* Full example use case:
*
* \snippet sp.cpp Example shortest-paths with semiring adapted to find the most reliable route instead
+ * \endparblock
+ *
+ * @author A. N. Yzelman, Huawei Technologies France (2016-2020)
+ * @author A. N. Yzelman, Huawei Technologies Switzerland AG (2020-current)
+ * @}
+ *
+ * \defgroup typeTraits Algebraic Type Traits
+ * @{
*
- * Any GraphBLAS code may execute using any of the backends this implementation
- * defines. Currently, the following backends are stable:
+ * Algebraic type traits allow compile-time reasoning on algebraic structures.
+ *
+ * Under algebraic type traits, ALP defines two classes of type traits:
+ * 1. classical type traits, akin to, e.g., std::is_integral, defined
+ * over the ALP-specific algebraic objects such as #grb::Semiring, and
+ * 2. algebraic type traits that allow for the compile-time introspection of
+ * algebraic structures.
+ *
+ * Under the first class, the following type traits are defined by ALP:
+ * - #grb::is_operator, #grb::is_monoid, and #grb::is_semiring, but also
+ * - #grb::is_container and #grb::is_object.
+ *
+ * Under the second class, the following type traits are defined by ALP:
+ * - #grb::is_associative, #grb::is_commutative, #grb::is_idempotent, and
+ * #grb::has_immutable_nonzeroes.
+ *
+ * Algebraic type traits are a central concept to ALP; depending on algebraic
+ * properties, ALP applies different optimisations. Properties such as
+ * associativity furthermore often define whether primitives may be
+ * automatically parallelised. Therefore, some primitives only allow algebraic
+ * structures with certain properties.
+ *
+ * Since algebraic type traits are evaluated at compile time, the composition
+ * of invalid structures (e.g., composing a monoid out of a non-associative
+ * binary operator), or the calling of a primitive using an incompatible
+ * algebraic structure, results in a compile-time error. Such errors are
+ * furthermore accompanied by clear messages and suggestions.
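+ *
+ * The traits may also be inspected directly; a minimal sketch:
+ * \code
+ * static_assert(
+ *     grb::is_associative< grb::operators::add< double > >::value,
+ *     "numerical addition should be associative"
+ * );
+ * \endcode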
+ *
+ * @}
+ *
+ * \defgroup backends Backends
+ * @{
+ *
+ * ALP code is compiled using a compiler wrapper, which optionally takes a
+ * backend parameter as an argument. The backend selection controls for which
+ * use case the code is compiled. Options that are always included are:
* -# #grb::reference, a single-process, auto-vectorising, sequential backend;
* -# #grb::reference_omp, a single-process, auto-parallelising, shared-memory
* parallel backend based on OpenMP and the aforementioned vectorising
* backend;
+ * -# #grb::hyperdags, a backend that captures the meta-data of computations
+ * while delegating the actual work to the #grb::reference backend. At
+ * program exit, the #grb::hyperdags backend dumps a HyperDAG of the
+ * computations performed.
+ *
+ * Additionally, the following backends may be enabled by providing their
+ * dependences before building ALP:
* -# #grb::BSP1D, an auto-parallelising, distributed-memory parallel
* backend based on the Lightweight Parallel Foundations (LPF). This is a
* multi-process backend and may rely on any single-process backend for
- * process-local computations. Its combination with the #grb::reference_omp
+ * process-local computations, which by default is #grb::reference.
+ * Distributed-memory auto-parallelisation is achieved using a row-wise
+ * one-dimensional block-cyclic distribution.
+ * Its combination with the #grb::reference_omp
* backend results in a fully hybrid shared- and distributed-memory
* GraphBLAS implementation.
- *
- * Backends that are currently under development:
+ * -# #grb::hybrid, essentially the same backend as #grb::BSP1D, but now
+ * composed with the #grb::reference_omp backend for process-local
+ * computations. This backend facilitates full hybrid shared- and
+ * distributed-memory parallelisation.
* -# #grb::banshee, a single-process, reference-based backend for the Banshee
* RISC-V hardware simulator making use of indirection stream semantic
- * registers (ISSR, in collaboration with Prof. Benini at ETHZ);
+ * registers (ISSR). Written by Dan Iorga in collaboration with ETHZ. This
+ * backend is outdated but, when last tested, remained functional.
+ *
+ * The #grb::Backend enum lists all backends known to ALP. Properties of a
+ * backend that may affect more advanced user code are collected in
+ * #grb::Properties.
*
- * @author A. N. Yzelman, Huawei Technologies France (2016-2020)
* @author A. N. Yzelman, Huawei Technologies Switzerland AG (2020-current)
+ * @}
+ *
+ * \defgroup perfSemantics Performance Semantics
+ * @{
+ *
+ * Each ALP primitive, every constructor, and every destructor come with
+ * performance semantics, in addition to functional semantics.
+ *
+ * Performance semantics may differ for different backends-- ALP stringently
+ * mandates that backends define them, thus imposing a significant degree of
+ * predictability on implementations of ALP, but does not significantly limit
+ * possible implementation choices.
+ *
+ * \warning Performance semantics should not be mistaken for performance
+ * \em guarantees. The vast majority of computing platforms exhibit
+ * performance variabilities that preclude defining stringent such
+ * guarantees.
+ *
+ * Performance semantics includes classical asymptotic work analysis in the
+ * style of Cormen et alii, as commonly taught as part of basic computer science
+ * courses. Aside from making the reasonable (although arguably too uncommon)
+ * demand that ALP libraries must clearly document the work complexity of the
+ * primitives it defines, ALP furthermore demands such analyses for the
+ * following quantities:
+ * - how many times operator(s) may be applied,
+ * - intra-process data movement from main memory to processing units,
+ * - new dynamic memory allocations and/or releases of previously allocated
+ * memory, and
+ * - whether system calls may occur during a call to the given primitive.
+ *
+ * \note Typically (but not always) the amount of work is proportional to the
+ * number of operator applications.
+ *
+ * \note Typically (but not necessarily always) if primitives are allowed to
+ * allocate or free dynamic memory, then they may also make system
+ * calls.
+ *
+ * For backends that allow for more than one user process, the following
+ * additional performance semantics must be defined:
+ * - inter-process data movement, and
+ * - how many synchronisation steps a primitive requires to complete.
+ *
+ * Defining such performance semantics is crucial to
+ * 1. allow algorithm designers to design the best possible algorithms even if
+ * the target platforms and target use cases vary,
+ * 2. allow users to determine scalability under increasing problem sizes, and
+ * 3. allow system architects to determine the qualitative effect of scaling up
+ * system resources in an a-priori fashion.
+ *
+ * These advantages furthermore do not require expensive experimentation on the
+ * part of algorithm designers, users, or system architects. However, it puts a
+ * significant demand on the implementers and maintainers of ALP.
+ *
+ * @see backends
+ *
+ * @author A. N. Yzelman, Huawei Technologies Switzerland AG (2020-current)
+ * @}
*/
#ifdef __DOXYGEN__
+
/**
- * Define this macro to disable libnuma use.
+ * Define this macro to disable the dependence on libnuma.
+ *
+ * \warning Defining this macro is discouraged and not tested thoroughly.
+ *
+ * \note The CMake bootstrap treats libnuma as a non-optional dependence.
*/
#define _GRB_NO_LIBNUMA
/**
+ * \internal
* Define this macro to disable thread pinning.
+ * \todo Make sure this macro is taken into account for backends that perform
+ * automatic pinning.
+ * \endinternal
*/
#define _GRB_NO_PINNING
/**
- * Defie this macro to compile with PlatformBSP support.
+ * Define this macro to turn off standard input/output support.
+ *
+ * \warning This macro has only been fully supported within the #grb::banshee
+ * backend, where neither standard iostream nor stdio.h were
+ * available. If support throughout the full ALP
+ * implementation would be useful, please raise an issue through
+ * GitHub or Gitee so that we may consider and plan for supporting
+ * this macro more fully.
*/
-#define _GRB_WITH_LPF
+#define _GRB_NO_STDIO
/**
- * Which GraphBLAS backend should be default.
+ * Define this macro to turn off reliance on standard C++ exceptions.
+ *
+ * \deprecated Support for this macro is being phased out.
+ *
+ * \note Its intended use is to support ALP/GraphBLAS deployments on platforms
+ * that do not support C++ exceptions, such as some older Android SDK
+ * applications.
*
- * Known single user-process options:
- * -# reference
- * -# reference_omp
+ * \warning The safe usage of ALP/GraphBLAS while exceptions are disabled
+ * relies, at present, on the inspection of internal states and the
+ * usage of internal functions. We have no standardised exception-free
+ * way of using ALP/GraphBLAS at present and have no plans to
+ * (continue and/or extend) support for it.
+ */
+#define _GRB_NO_EXCEPTIONS
+
+/**
+ * Define this macro to compile with LPF support.
+ *
+ * \note The CMake bootstrap automatically defines this flag when a valid LPF
+ * installation is found. This flag is also defined by the ALP/GraphBLAS
+ * compiler wrapper whenever an LPF-enabled backend is selected.
+ */
+#define _GRB_WITH_LPF
+
+/**
+ * \internal
+ * Which ALP/GraphBLAS backend should be the default.
*
- * Known multiple user-process options:
- * -# BSP1D
+ * This flag is overridden by the compiler wrapper, and it is set by the base
+ * config.hpp header.
+ * \endinternal
*/
#define _GRB_BACKEND reference
/**
- * Which GraphBLAS backend the BSP1D backend should use within a single user
- * process. For possible values, see the single user process options for
- * #_GRB_BACKEND.
+ * Which ALP/GraphBLAS backend the BSP1D backend should use for computations
+ * within a single user process. The ALP/GraphBLAS compiler wrapper sets this
+ * value automatically depending on the choice of backend-- compare, e.g., the
+ * #grb::BSP1D backend versus the #grb::hybrid backend.
*/
#define _GRB_BSP1D_BACKEND
-#endif
+
+/**
+ * The ALP/GraphBLAS namespace.
+ *
+ * All ALP/GraphBLAS primitives, container types, algebraic structures, and type
+ * traits are defined within.
+ */
+namespace grb {
+
+ /**
+ * The namespace for ALP/GraphBLAS algorithms.
+ */
+ namespace algorithms {
+
+ /**
+ * The namespace for ALP/Pregel algorithms.
+ */
+ namespace pregel {}
+
+ }
+
+ /**
+ * The namespace for programming APIs that automatically translate to
+ * ALP/GraphBLAS.
+ */
+ namespace interfaces {}
+
+}
+
+#endif // end ``#ifdef __DOXYGEN__''
#ifndef _H_GRAPHBLAS
#define _H_GRAPHBLAS
diff --git a/include/graphblas/algorithms/bicgstab.hpp b/include/graphblas/algorithms/bicgstab.hpp
index a4f338156..289a53554 100644
--- a/include/graphblas/algorithms/bicgstab.hpp
+++ b/include/graphblas/algorithms/bicgstab.hpp
@@ -15,11 +15,17 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Implements the BiCGstab algorithm.
+ *
* @author A. N. Yzelman
* @date 15th of February, 2022
*
- * Implementation time, to be taken with a pinch of salt:
+ * \par Implementation time
+ *
+ * To be taken with a pinch of salt, as it is highly subjective:
* - 50 minutes, excluding error handling, documentation, and testing.
* - 10 minutes to get it to compile, once the smoke test was generated.
* - 15 minutes to incorporate proper error handling plus printing of warnings
@@ -87,12 +93,12 @@ namespace grb {
*
* Additional outputs of this algorithm:
*
- * @param[out] iterations When #grb::SUCCESS is returned, the number of
- * iterations that were required to obtain an
- * acceptable approximate solution.
- * @param[out] residual When #grb::SUCCESS is returned, the square of the
- * 2-norm of the residual; i.e., \f$ (r,r) \f$,
- * where \f$ r = b - Ax \f$.
+ * @param[out] iterations When #grb::SUCCESS is returned, the number of
+ * iterations that were required to obtain an
+ * acceptable approximate solution.
+ * @param[out] residual When #grb::SUCCESS is returned, the square of the
+ * 2-norm of the residual; i.e., \f$ (r,r) \f$,
+ * where \f$ r = b - Ax \f$.
*
* To operate, this algorithm requires a workspace consisting of six vectors
* of length and capacity \f$ n \f$. If vectors with less capacity are passed
@@ -100,6 +106,18 @@ namespace grb {
*
* @param[in] r, rhat, p, v, s, t Workspace vectors required for BiCGstab.
*
+ * The BiCGstab algorithm operates over a field defined by the following
+ * algebraic structures:
+ *
+ * @param[in] semiring Defines the domains as well as the additive and the
+ * multicative monoid.
+ * @param[in] minus The inverse of the additive operator.
+ * @param[in] divide The inverse of the multiplicative operator.
+ *
+ * \note When compiling with the _DEBUG macro defined, the print-out
+ * statements require sqrt as an additional algebraic concept.
+ * This concept presently lives "outside" of ALP.
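+ *
+ * A sketch of the default structures over doubles, matching the template
+ * defaults of this algorithm:
+ * \code
+ * grb::Semiring<
+ *     grb::operators::add< double >, grb::operators::mul< double >,
+ *     grb::identities::zero, grb::identities::one
+ * > semiring;
+ * grb::operators::subtract< double > minus;
+ * grb::operators::divide< double > divide;
+ * \endcode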
+ *
* Valid descriptors to this algorithm are:
* -# descriptors::no_casting
* -# descriptors::transpose
@@ -120,6 +138,7 @@ namespace grb {
* output as well as the state of ALP/GraphBLAS is
* undefined.
*
+ * \parblock
* \par Performance semantics
*
* -# This function does not allocate nor free dynamic memory, nor shall it
@@ -130,8 +149,10 @@ namespace grb {
* the specification of the ALP primitives this function relies on. These
* performance semantics, with the exception of getters such as #grb::nnz, are
* specific to the backend selected during compilation.
+ * \endparblock
*/
- template< Descriptor descr = descriptors::no_operation,
+ template<
+ Descriptor descr = descriptors::no_operation,
typename IOType, typename NonzeroType, typename InputType,
typename ResidualType,
class Semiring = Semiring<
@@ -142,7 +163,8 @@ namespace grb {
class Minus = operators::subtract< ResidualType >,
class Divide = operators::divide< ResidualType >
>
- RC bicgstab( grb::Vector< IOType > &x,
+ RC bicgstab(
+ grb::Vector< IOType > &x,
const grb::Matrix< NonzeroType > &A,
const grb::Vector< InputType > &b,
const size_t max_iterations,
@@ -330,13 +352,13 @@ namespace grb {
// p = r + beta ( p - omega * v )
ret = ret ? ret : eWiseLambda(
- [&r,&beta,&p,&v,&omega,&semiring,&minus] (const size_t i) {
+ [&r,beta,&p,&v,omega,&semiring,&minus] (const size_t i) {
InputType tmp;
apply( tmp, omega, v[i], semiring.getMultiplicativeOperator() );
foldl( p[ i ], tmp, minus );
foldr( beta, p[ i ], semiring.getMultiplicativeOperator() );
foldr( r[ i ], p[ i ], semiring.getAdditiveOperator() );
- }, v, b
+ }, v, p, r
);
// v = Ap
@@ -371,9 +393,10 @@ namespace grb {
// check residual
residual = zero;
ret = ret ? ret : dot< dense_descr >( residual, s, s, semiring );
- assert( residual > zero ); // we just assert this one rather than checking for it
+ assert( residual > zero );
#ifdef _DEBUG
- std::cout << "\t\t running residual, pre-stabilisation: " << sqrt(residual) << "\n";
+ std::cout << "\t\t running residual, pre-stabilisation: " << sqrt(residual)
+ << "\n";
#endif
if( ret == SUCCESS && residual < tol ) {
// update result (x += alpha * p) and exit
@@ -400,7 +423,7 @@ namespace grb {
#ifdef _DEBUG
std::cout << "\t\t (t, t) = " << omega << "\n";
#endif
- assert( omega > zero ); // we just assert this one rather than checking for it
+ assert( omega > zero );
ret = ret ? ret : foldr( temp, omega, divide );
#ifdef _DEBUG
std::cout << "\t\t omega = " << omega << "\n";
@@ -421,9 +444,10 @@ namespace grb {
// check residual
residual = zero;
ret = ret ? ret : dot< dense_descr >( residual, r, r, semiring );
- assert( residual > zero ); // we just assert this one rather than checking for it
+ assert( residual > zero );
#ifdef _DEBUG
- std::cout << "\t\t running residual, post-stabilisation: " << sqrt(residual) << ". "
+ std::cout << "\t\t running residual, post-stabilisation: "
+ << sqrt(residual) << ". "
<< "Residual squared: " << residual << ".\n";
#endif
if( ret == SUCCESS ) {
diff --git a/include/graphblas/algorithms/conjugate_gradient.hpp b/include/graphblas/algorithms/conjugate_gradient.hpp
index 7ed2f3888..9a68f248e 100644
--- a/include/graphblas/algorithms/conjugate_gradient.hpp
+++ b/include/graphblas/algorithms/conjugate_gradient.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Implements the Conjugate Gradient (CG) algorithm.
+ *
* @author Aristeidis Mastoras
*/
@@ -85,7 +89,7 @@ namespace grb {
* Additional outputs (besides \a x):
*
* @param[out] iterations The number of iterations the algorithm has
- * performed.
+ * started.
* @param[out] residual The residual corresponding to output \a x.
*
* The CG algorithm requires three workspace buffers with capacity \f$ n \f$:
@@ -118,6 +122,17 @@ namespace grb {
* output as well as the state of ALP/GraphBLAS is
* undefined.
*
+ * On output, the contents of the workspace \a r, \a u, and \a temp are
+ * always undefined. For non-#grb::SUCCESS error codes, additional containers
+ * or states may be left undefined:
+ * -# when #grb::PANIC is returned, the entire program state, including the
+ * contents of all containers, become undefined;
+ * -# when #grb::ILLEGAL or #grb::MISMATCH are returned and \a iterations
+ * equals zero, then all outputs are left unmodified compared to their
+ * contents at function entry;
+ * -# when #grb::ILLEGAL or #grb::MISMATCH are returned and \a iterations is
+ * nonzero, then the contents of \a x are undefined.
+ *
* \par Performance semantics
*
* -# This function does not allocate nor free dynamic memory, nor shall it
@@ -234,6 +249,15 @@ namespace grb {
}
}
+ // set pure output fields to neutral defaults
+ iterations = 0;
+ residual = std::numeric_limits< double >::infinity();
+
+ // trivial shortcuts
+ if( max_iterations == 0 ) {
+ return FAILED;
+ }
+
// make x and b structurally dense (if not already) so that the remainder
// algorithm can safely use the dense descriptor for faster operations
{
@@ -283,7 +307,7 @@ namespace grb {
} else {
ret = ret ? ret : grb::dot< descr_dense >( sigma, r, r, ring );
}
-
+
assert( ret == SUCCESS );
// bnorm = b' * b;
@@ -306,6 +330,9 @@ namespace grb {
size_t iter = 0;
do {
+ assert( iter < max_iterations );
+ (void) ++iter;
+
// temp = 0
ret = ret ? ret : grb::set( temp, 0 );
assert( ret == SUCCESS );
@@ -363,7 +390,7 @@ namespace grb {
assert( ret == SUCCESS );
if( ret == SUCCESS ) {
- if( sqrt( residual ) < tol ) {
+ if( sqrt( residual ) < tol || iter >= max_iterations ) {
break;
}
}
@@ -383,17 +410,19 @@ namespace grb {
std::swap( u, temp );
sigma = beta;
+ } while( ret == SUCCESS );
- } while( iter++ < max_iterations && ret == SUCCESS );
-
- // output
+ // output that is independent of error code
iterations = iter;
- if( ret != SUCCESS ) {
- return FAILED;
- } else {
- return SUCCESS;
+ // return correct error code
+ if( ret == SUCCESS ) {
+ if( sqrt( residual ) >= tol ) {
+ // did not converge within iterations
+ return FAILED;
+ }
}
+ return ret;
}
} // namespace algorithms
diff --git a/include/graphblas/algorithms/cosine_similarity.hpp b/include/graphblas/algorithms/cosine_similarity.hpp
index a51f16bc6..6dff53af2 100644
--- a/include/graphblas/algorithms/cosine_similarity.hpp
+++ b/include/graphblas/algorithms/cosine_similarity.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Implements cosine similarity.
+ *
* @author: A. N. Yzelman.
* @date: 13th of December, 2017.
*/
@@ -77,14 +81,16 @@ namespace grb {
* The argument \a div is optional. It will map to grb::operators::divide by
* default.
*
- * @returns SUCCESS If the computation was successful.
- * @returns MISMATCH If the vector sizes do not match. The output
- * \a similarity is undefined.
- * @returns ILLEGAL In case \a x is all zero, and/or when \a y is all zero.
- * The output \a similarity is undefined.
- * @returns PANIC If an unrecoverable error has been encountered. The
- * output as well as the state of ALP/GraphBLAS is
- * undefined.
+ * @returns #grb::SUCCESS If the computation was successful.
+ * @returns #grb::MISMATCH If the vector sizes do not match. The output
+ * \a similarity is untouched -- the call to this
+ * algorithm will have no other effects than returning
+ * #grb::MISMATCH.
+ * @returns #grb::ILLEGAL In case \a x is all zero, and/or when \a y is all zero.
+ * The output \a similarity is undefined.
+ * @returns #grb::PANIC If an unrecoverable error has been encountered. The
+ * output as well as the state of ALP/GraphBLAS is
+ * undefined.
*
* \par Performance semantics
*
@@ -97,7 +103,8 @@ namespace grb {
* performance semantics, with the exception of getters such as #grb::nnz, are
* specific to the backend selected during compilation.
*/
- template< Descriptor descr = descriptors::no_operation,
+ template<
+ Descriptor descr = descriptors::no_operation,
typename OutputType,
typename InputType1,
typename InputType2,
@@ -161,14 +168,14 @@ namespace grb {
const auto &mul = ring.getMultiplicativeOperator();
const auto &add = ring.getAdditiveOperator();
OutputType temp;
- (void)grb::apply( temp, x[ i ], y[ i ], mul );
- (void)grb::foldl( nominator, temp, add );
- (void)grb::apply( temp, x[ i ], x[ i ], mul );
- (void)grb::foldl( norm1, temp, add );
- (void)grb::apply( temp, y[ i ], y[ i ], mul );
- (void)grb::foldl( norm2, temp, add );
- },
- x, y );
+ (void) grb::apply( temp, x[ i ], y[ i ], mul );
+ (void) grb::foldl( nominator, temp, add );
+ (void) grb::apply( temp, x[ i ], x[ i ], mul );
+ (void) grb::foldl( norm1, temp, add );
+ (void) grb::apply( temp, y[ i ], y[ i ], mul );
+ (void) grb::foldl( norm2, temp, add );
+ }, x, y
+ );
denominator = sqrt( norm1 ) * sqrt( norm2 );
} else {
// cannot stream each vector once, stream each one twice instead using
diff --git a/include/graphblas/algorithms/kcore_decomposition.hpp b/include/graphblas/algorithms/kcore_decomposition.hpp
new file mode 100644
index 000000000..e17fcc5f3
--- /dev/null
+++ b/include/graphblas/algorithms/kcore_decomposition.hpp
@@ -0,0 +1,296 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Implements the algebraic k-core decomposition algorithm by Li et al.
+ *
+ * @author Anders Hansson
+ * @date January, 2023
+ */
+
+
+#ifndef _H_GRB_KCORE_DECOMPOSITION
+#define _H_GRB_KCORE_DECOMPOSITION
+
+#include <graphblas.hpp>
+
+
+namespace grb {
+
+ namespace algorithms {
+
+ /**
+ * The \f$ k \f$-core decomposition algorithm.
+ *
+ * \note This algorithm is smoke-tested using a ground-truth output coreness
+ * vector corresponding to the EPA matrix. However, the ground truth
+ * was generated using an earlier version of this algorithm, run using
+ * an earlier version of ALP/GraphBLAS. This solution was manually
+ * verified against an external algorithm. A better testing methodology
+ * compares against a ground truth generated by such an external
+ * baseline-- see GitHub issue #160, to which contributions would be
+ * warmly received.
+ *
+ * Divides the input graph into subgraphs according to coreness level. The
+ * coreness level of a node is the largest \f$ k \f$ such that the node
+ * belongs to a subgraph in which every node has at least \f$ k \f$
+ * neighbours within that subgraph.
+ *
+ * @tparam IOType The value type of the \f$ k \f$-core vectors,
+ * usually an integer type.
+ * @tparam NZType The type of the nonzero elements in the matrix.
+ *
+ * @param[in] A Matrix representing a graph with nonzero value at
+ * \f$ (i, j) \f$ an edge between node \f$ i \f$ and
+ * \f$ j \f$.
+ * @param[out] core Empty vector of size and capacity \f$ n \f$. On
+ * output, if #grb::SUCCESS is returned, stores the
+ * coreness level for each node.
+ * @param[out] k The number of coreness levels that were found in the
+ * graph.
+ *
+ * To operate, this algorithm requires a workspace of four vectors. The size
+ * \em and capacities of these must equal \f$ n \f$. The contents on input are
+ * ignored, and the contents on output are undefined. The work space consists
+ * of the buffer vectors \a distances, \a temp, \a update, and \a status.
+ *
+ * @param[in,out] distances Distance buffer
+ * @param[in,out] temp First node update buffer
+ * @param[in,out] update Second node update buffer
+ * @param[in,out] status Finished/unfinished buffer
+ *
+ * @returns #grb::SUCCESS If the coreness for all nodes are found.
+ * @returns #grb::ILLEGAL If \a A is not square. All outputs are left
+ * untouched.
+ * @returns #grb::MISMATCH If the dimensions of \a core or any of the buffer
+ * vectors does not match \a A. All outputs are left
+ * untouched.
+ * @returns #grb::ILLEGAL If the capacity of one or more of \a core and the
+ * buffer vectors is less than \f$ n \f$.
+ * @returns #grb::PANIC If an unrecoverable error has been encountered. The
+ * output as well as the state of ALP/GraphBLAS is
+ * undefined.
+ *
+ * If any non #grb::SUCCESS error code is returned, then the contents of
+ * \a core are undefined, while \a k will be untouched by the algorithm.
+ *
+ * \note For undirected, unweighted graphs, use a pattern matrix for \a A;
+ * i.e., use \a NZType void.
+ *
+ * \note For unweighted graphs, \a IOType should be an unsigned integer
+ * type. The value of any \a IOType element will be no more than the
+ * maximum degree found in the graph \a A.
+ *
+ * @tparam criticalSection The original MR had an eWiseLambda-based
+ * implementation that contains a critical section.
+ * This may or may not be faster than a pure
+ * ALP/GraphBLAS implementation, depending also on
+ * which backend is selected. Setting this template
+ * argument to true selects the original
+ * eWiseLambda-based implementation, while otherwise
+ * a pure ALP/GraphBLAS implementation takes effect.
+ *
+ * \note In some non-exhaustive experiments, setting \a criticalSection to
+ * false leads to better performance on shared-memory parallel
+ * systems (using #grb::reference_omp).
+ *
+ * \warning Setting \a criticalSection to true is not supported for
+ * the distributed-memory backends #grb::BSP1D and #grb::hybrid; see
+ * the corresponding code comment in the below algorithm for details.
+ *
+ * Given the above considerations, the default for \a criticalSection is
+ * presently set to false.
+ *
+ * \parblock
+ * \par Performance semantics
+ *
+ * -# This function does not allocate nor free dynamic memory, nor shall it
+ * make any system calls.
+ *
+ * For additional performance semantics regarding work, inter-process data
+ * movement, intra-process data movement, synchronisations, and memory use,
+ * please see the specification of the ALP primitives this function relies on.
+ * These performance semantics, with the exception of getters such as
+ * #grb::nnz, are specific to the backend selected during compilation.
+ * \endparblock
+ *
+ * This algorithm is modelled after Li et al., "The K-Core Decomposition
+ * Algorithm Under the Framework of GraphBLAS", 2021 IEEE High Performance
+ * Extreme Computing Conference (HPEC), doi: 10.1109/HPEC49654.2021.9622845.
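+ *
+ * \par Usage sketch
+ *
+ * Assuming a pre-existing square pattern matrix \a A of size
+ * \f$ n \times n \f$, a minimal call sequence could read as follows (all
+ * names are illustrative):
+ * \code
+ * grb::Vector< unsigned int > core( n ), dists( n ), tmp( n ), upd( n );
+ * grb::Vector< bool > status( n );
+ * unsigned int k = 0;
+ * grb::RC rc = grb::algorithms::kcore_decomposition(
+ *     A, core, dists, tmp, upd, status, k
+ * );
+ * \endcode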
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ bool criticalSection = false,
+ typename IOType, typename NZType
+ >
+ RC kcore_decomposition(
+ const Matrix< NZType > &A,
+ Vector< IOType > &core,
+ Vector< IOType > &distances,
+ Vector< IOType > &temp,
+ Vector< IOType > &update,
+ Vector< bool > &status,
+ IOType &k
+ ) {
+ // Add constants/expressions
+ Semiring<
+ operators::add< IOType >, operators::mul< IOType >,
+ identities::zero, identities::one
+ > ring;
+ Monoid<
+ operators::logical_or< bool >,
+ identities::logical_false
+ > lorMonoid;
+
+ // Runtime sanity checks
+ const size_t n = nrows(A);
+ {
+ // Verify that A is square
+ if( n != ncols( A ) ) {
+ return ILLEGAL;
+ }
+ // Verify sizes of vectors
+ if( size( core ) != n ||
+ size( distances ) != n ||
+ size( temp ) != n ||
+ size( update ) != n ||
+ size( status ) != n
+ ) {
+ return MISMATCH;
+ }
+ // Verify capacity
+ if( capacity( core ) != n ||
+ capacity( distances ) != n ||
+ capacity( temp ) != n ||
+ capacity( update ) != n ||
+ capacity( status ) != n
+ ) {
+ return ILLEGAL;
+ }
+ }
+
+ // Initialise
+ IOType current_k = 0; // current coreness level
+
+ // Set initial values
+ RC ret = grb::SUCCESS;
+ ret = ret ? ret : set( temp, static_cast< IOType >( 1 ) );
+ ret = ret ? ret : set( distances, static_cast< IOType >( 0 ) );
+ ret = ret ? ret : set( core, static_cast< IOType >( 0 ) );
+ ret = ret ? ret : set( status, true );
+ ret = ret ? ret : clear( update );
+ assert( ret == SUCCESS );
+
+ ret = ret ? ret : grb::mxv< descr | descriptors::dense >(
+ distances, A, temp, ring );
+ assert( ret == SUCCESS );
+
+ if( SUCCESS != ret ) {
+ std::cerr << " Initialization of k-core decomposition failed with error "
+ << grb::toString( ret ) << "\n";
+ return ret;
+ }
+
+ size_t count = 0;
+ while( count < n && SUCCESS == ret ) {
+ bool flag = true;
+
+ // Update filter to exclude completed nodes
+ ret = ret ? ret : set( update, status, status );
+
+ while( flag ) {
+ flag = false;
+
+ // Update nodes in parallel
+ if( criticalSection ) {
+ ret = ret ? ret : clear( temp );
+ ret = ret ? ret : eWiseLambda( [ &, current_k ]( const size_t i ) {
+ if( status[ i ] && distances[ i ] <= current_k ) {
+ core[ i ] = current_k;
+ // Remove node from checking
+ status[ i ] = false;
+ // Set update
+ flag = true;
+ #pragma omp critical
+ {
+ // Add node index to update neighbours
+ setElement( temp, 1, i );
+ }
+ }
+ }, update,
+ status, distances, core, temp
+ );
+ // WARN: even with the below, this variant does not auto-parallelise in
+ // the distributed-memory sense. The reason is a performance
+ // contract violation by the above critical section -- setElement
+ // should be a collective call, but its use from within eWiseLambda
+ // does not ensure a collective call. The result is that PANIC will
+ // at some point be returned.
+ //ret = ret ? ret : collectives<>::allreduce( flag,
+ // lorMonoid.getOperator() );
+ } else {
+ ret = ret ? ret : eWiseApply( temp, status, distances, current_k,
+ operators::leq< IOType >() );
+ ret = ret ? ret : foldl( core, temp, current_k,
+ operators::right_assign< IOType >() );
+ ret = ret ? ret : foldl( status, temp, false,
+ operators::right_assign< bool >() );
+ ret = ret ? ret : foldl( flag, temp, lorMonoid );
+ ret = ret ? ret : set( update, temp, 1 );
+ if( ret == SUCCESS ) {
+ std::swap( update, temp );
+ }
+ }
+ assert( ret == SUCCESS );
+
+ if( ret == SUCCESS && flag ) {
+ ret = clear( update );
+ assert( ret == SUCCESS );
+
+ // Increase number of nodes completed
+ count += nnz( temp );
+
+ // Get the neighbours of the updated nodes
+ ret = ret ? ret : grb::mxv< descr >( update, A, temp, ring );
+ assert( ret == SUCCESS );
+
+ // Decrease distances of the neighbours
+ ret = ret ? ret : grb::eWiseApply( distances, distances, update,
+ operators::subtract< IOType >() );
+ assert( ret == SUCCESS );
+ }
+ }
+ (void) ++current_k;
+ }
+
+ if( SUCCESS != ret ) {
+ std::cerr << " Execution of k-core decomposition failed with error "
+ << grb::toString( ret ) << "\n";
+ } else {
+ k = current_k;
+ }
+
+ return ret;
+ }
+
+ } // namespace algorithms
+
+} // namespace grb
+
+#endif // end _H_GRB_KCORE_DECOMPOSITION
+
diff --git a/include/graphblas/algorithms/kmeans.hpp b/include/graphblas/algorithms/kmeans.hpp
index b94b09e90..061d1e0d0 100644
--- a/include/graphblas/algorithms/kmeans.hpp
+++ b/include/graphblas/algorithms/kmeans.hpp
@@ -15,7 +15,12 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Implements k-means. The state of the algorithms defined within is
+ * \em experimental.
+ *
* @author Verner Vlacic
*/
@@ -40,8 +45,8 @@ namespace grb {
* @param[in,out] K k by m matrix containing the current k means as row vectors
* @param[in] X m by n matrix containing the n points to be classified as
* column vectors
- * @param[in] op coordinatewise distance operator, squared difference by
- * default
+ * @param[in] dist_op Coordinatewise distance operator, squared difference by
+ * default
*
* \todo more efficient implementation using Walker's alias method
*
@@ -52,14 +57,20 @@ namespace grb {
typename IOType = double,
class Operator = operators::square_diff< IOType, IOType, IOType >
>
- RC kpp_initialisation( Matrix< IOType > &K, const Matrix< IOType > &X,
+ RC kpp_initialisation(
+ Matrix< IOType > &K,
+ const Matrix< IOType > &X,
const Operator &dist_op = Operator()
) {
// declare monoids and semirings
Monoid< grb::operators::add< IOType >, grb::identities::zero > add_monoid;
- Monoid< grb::operators::min< IOType >, grb::identities::infinity > min_monoid;
+ Monoid<
+ grb::operators::min< IOType >,
+ grb::identities::infinity
+ > min_monoid;
Semiring<
- grb::operators::add< IOType >, grb::operators::right_assign_if< bool, IOType, IOType >,
+ grb::operators::add< IOType >,
+ grb::operators::right_assign_if< bool, IOType, IOType >,
grb::identities::zero, grb::identities::logical_true
> pattern_sum;
@@ -117,23 +128,30 @@ namespace grb {
ret = ret ? ret : grb::setElement( col_select, true, i );
- ret = ret ? ret : grb::vxm< grb::descriptors::transpose_matrix >( selected, col_select, X, pattern_sum );
+ ret = ret ? ret : grb::vxm< grb::descriptors::transpose_matrix >(
+ selected, col_select, X, pattern_sum );
- ret = ret ? ret : grb::vxm( selected_distances, selected, X, add_monoid, dist_op );
+ ret = ret ? ret : grb::vxm( selected_distances, selected, X, add_monoid,
+ dist_op );
- ret = ret ? ret : grb::foldl( min_distances, selected_distances, min_monoid );
+ ret = ret ? ret : grb::foldl( min_distances, selected_distances,
+ min_monoid );
- // TODO the remaining part of the loop should be replaced with the alias algorithm
+ // TODO the remaining part of the loop should be replaced with the alias
+ // algorithm
IOType range = add_monoid.template getIdentity< IOType >();
ret = ret ? ret : grb::foldl( range, min_distances, add_monoid );
double sample = -1;
if( ret == SUCCESS ) {
- const size_t seed = std::chrono::system_clock::now().time_since_epoch().count();
- std::default_random_engine generator( seed );
- std::uniform_real_distribution< double > uniform( 0, 1 );
- sample = uniform( generator );
+ {
+ const size_t seed =
+ std::chrono::system_clock::now().time_since_epoch().count();
+ std::default_random_engine generator( seed );
+ std::uniform_real_distribution< double > uniform( 0, 1 );
+ sample = uniform( generator );
+ }
ret = grb::collectives<>::broadcast( sample, 0 );
}
assert( sample >= 0 );
@@ -152,7 +170,8 @@ namespace grb {
}
}
- // create the matrix K by selecting the columns of X indexed by selected_indices
+ // create the matrix K by selecting the columns of X indexed by
+ // selected_indices
// declare pattern matrix
Matrix< void > M( k, n );
@@ -164,7 +183,8 @@ namespace grb {
return std::make_pair( ind, val );
}
);
- ret = grb::buildMatrixUnique( M, converter.begin(), converter.end(), PARALLEL );
+ ret = grb::buildMatrixUnique( M, converter.begin(), converter.end(),
+ PARALLEL );
}
ret = ret ? ret : grb::mxm< descriptors::transpose_right >( K, M, X,
@@ -182,23 +202,27 @@ namespace grb {
/**
* The kmeans iteration given an initialisation
*
- * @param[in,out] K k by m matrix containing the current k means as row vectors
+ * @param[in,out] K k by m matrix containing the current k means as row
+ * vectors
* @param[in] clusters_and_distances Vector containing the class and distance
* to centroid for each point
* @param[in] X m by n matrix containing the n points to be classified as
* column vectors
* @param[in] max_iter Maximum number of iterations
- * @param[in] op Coordinatewise distance operator, squared difference by
- * default
+ * @param[in] dist_op Coordinatewise distance operator, squared difference by
+ * default
*
+ * \internal
* \todo expand documentation
+ * \endinternal
*/
template<
Descriptor descr = descriptors::no_operation,
typename IOType = double,
class Operator = operators::square_diff< IOType, IOType, IOType >
>
- RC kmeans_iteration( Matrix< IOType > &K,
+ RC kmeans_iteration(
+ Matrix< IOType > &K,
Vector< std::pair< size_t, IOType > > &clusters_and_distances,
const Matrix< IOType > &X,
const size_t max_iter = 1000,
@@ -221,16 +245,19 @@ namespace grb {
> comparison_monoid;
Semiring<
- grb::operators::add< IOType >, grb::operators::right_assign_if< bool, IOType, IOType >,
+ grb::operators::add< IOType >,
+ grb::operators::right_assign_if< bool, IOType, IOType >,
grb::identities::zero, grb::identities::logical_true
> pattern_sum;
Semiring<
- grb::operators::add< size_t >, grb::operators::right_assign_if< size_t, size_t, size_t >,
+ grb::operators::add< size_t >,
+ grb::operators::right_assign_if< size_t, size_t, size_t >,
grb::identities::zero, grb::identities::logical_true
> pattern_count;
- // runtime sanity checks: the row dimension of X should match the column dimension of K
+ // runtime sanity checks: the row dimension of X should match the column
+ // dimension of K
if( ncols( K ) != nrows( X ) ) {
return MISMATCH;
}
@@ -274,12 +301,12 @@ namespace grb {
bool converged;
do {
- ++iter;
+ (void) ++iter;
- ret = ret ? ret : grb::set( clusters_and_distances_prev, clusters_and_distances );
+ ret = ret ? ret : grb::set( clusters_and_distances_prev,
+ clusters_and_distances );
- ret = ret ? ret : mxm( Dist, K, X, add_monoid, dist_op,
- RESIZE );
+ ret = ret ? ret : mxm( Dist, K, X, add_monoid, dist_op, RESIZE );
ret = ret ? ret : mxm( Dist, K, X, add_monoid, dist_op );
ret = ret ? ret : vxm( clusters_and_distances, labels, Dist, argmin_monoid,
@@ -287,15 +314,15 @@ namespace grb {
auto converter = grb::utils::makeVectorToMatrixConverter<
void, indexIOType
- >(
+ > (
clusters_and_distances,
- []( const size_t & ind, const indexIOType & pair ) {
+ []( const size_t &ind, const indexIOType &pair ) {
return std::make_pair( pair.first, ind );
}
);
- ret = ret ? ret : grb::buildMatrixUnique( M,
- converter.begin(), converter.end(), PARALLEL );
+ ret = ret ? ret : grb::buildMatrixUnique( M, converter.begin(),
+ converter.end(), PARALLEL );
ret = ret ? ret : grb::mxm< descriptors::transpose_right >( K_aux, M, X,
pattern_sum, RESIZE );
diff --git a/include/graphblas/algorithms/knn.hpp b/include/graphblas/algorithms/knn.hpp
index fdffd5ba3..6df7d2fc7 100644
--- a/include/graphblas/algorithms/knn.hpp
+++ b/include/graphblas/algorithms/knn.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Implements the \f$ k \f$-hop nearest neighbours from a given source vertex.
+ *
* @author A. N. Yzelman
* @date: 27th of April, 2017
*/
@@ -27,6 +31,7 @@
#include
+
namespace grb {
namespace algorithms {
@@ -48,7 +53,6 @@ namespace grb {
* This algorithm requires the following workspace:
*
* @param[in,out] buf1 A buffer vector. Must match the size of \a A.
- * @param[in,out] buf2 A buffer vector. Must match the size of \a A.
*
* For \f$ n \times n \f$ matrices \a A, the capacity of \a u, \a buf1, and
* \a buf2 must equal \f$ n \f$.
diff --git a/include/graphblas/algorithms/label.hpp b/include/graphblas/algorithms/label.hpp
index cfebc824f..a58ebad9c 100644
--- a/include/graphblas/algorithms/label.hpp
+++ b/include/graphblas/algorithms/label.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Implements label propagation.
+ *
* @author J. M. Nash
* @date 21st of March, 2017
*/
@@ -116,10 +120,11 @@ namespace grb {
* accelerating the PageRank computation', ACM Press, 2003.
*/
template< typename IOType >
- RC label( Vector< IOType > &out,
+ RC label(
+ Vector< IOType > &out,
const Vector< IOType > &y, const Matrix< IOType > &W,
const size_t n, const size_t l,
- const size_t MaxIterations = 1000
+ const size_t maxIterations = 1000
) {
// label propagation vectors and matrices operate over the real domain
Semiring<
@@ -198,7 +203,7 @@ namespace grb {
// compute f as P*f
// main loop completes when function f is stable
size_t iter = 1;
- while( ret == SUCCESS && different && iter < MaxIterations ) {
+ while( ret == SUCCESS && different && iter < maxIterations ) {
#ifdef _DEBUG
if( n < MaxAnyPrinting ) {
@@ -230,7 +235,12 @@ namespace grb {
<< "nnz( mask ) = " << nnz( mask ) << "\n";
#endif
// clamps the first l labelled nodes
- ret = ret ? ret : set( fNext, mask, f );
+ ret = ret ? ret : foldl(
+ fNext, mask,
+ f,
+ grb::operators::right_assign< IOType >()
+ );
+ assert( ret == SUCCESS );
#ifdef _DEBUG
std::cerr << "\t post-set nnz( fNext ) = " << nnz( fNext ) << "\n";
printVector(
@@ -246,31 +256,36 @@ namespace grb {
#ifdef _DEBUG
std::cerr << "\t pre-set nnz(f) = " << nnz( f ) << "\n";
#endif
- ret = ret ? ret : set( f, fNext );
+ std::swap( f, fNext );
#ifdef _DEBUG
std::cerr << "\t post-set nnz(f) = " << nnz( f ) << "\n";
#endif
// go to next iteration
- (void)++iter;
+ (void) ++iter;
}
if( ret == SUCCESS ) {
if( different ) {
if( s == 0 ) {
- std::cout << "Warning: label propagation did not converge after "
+ std::cerr << "Info: label propagation did not converge after "
<< (iter-1) << " iterations\n";
}
return FAILED;
} else {
if( s == 0 ) {
- std::cout << "Info: label propagation converged in "
+ std::cerr << "Info: label propagation converged in "
<< (iter-1) << " iterations\n";
}
- return set( out, f );
+ std::swap( out, f );
+ return SUCCESS;
}
}
// done
+ if( s == 0 ) {
+ std::cerr << "Warning: label propagation exiting with " << toString(ret)
+ << "\n";
+ }
return ret;
}
diff --git a/include/graphblas/algorithms/mpv.hpp b/include/graphblas/algorithms/mpv.hpp
index 22bde7cf2..78ae3a1db 100644
--- a/include/graphblas/algorithms/mpv.hpp
+++ b/include/graphblas/algorithms/mpv.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Implements the matrix powers kernel \f$ y=A^kx \f$ over arbitrary semirings.
+ *
* @author A. N. Yzelman
* @date 30th of March 2017
*/
@@ -46,6 +50,7 @@ namespace grb {
* supplied vector must match the row dimension size of \a A.
* @param[in] A The square input matrix A. The supplied matrix must match
* the dimensions of \a u and \a v.
+ * @param[in] k How many matrix--vector multiplications are requested.
* @param[in] v The input vector v. The supplied vector must match the
* column dimension size of \a A. It may not be the same
* vector as \a u.
@@ -144,7 +149,7 @@ namespace grb {
ret = mxv< descr >( temp, A, u, ring );
// check if this was the final multiplication
assert( iterate <= k );
- if( iterate == k || ret != SUCCESS ) {
+ if( iterate + 1 == k || ret != SUCCESS ) {
break;
}
// multiply with output into u
diff --git a/include/graphblas/algorithms/norm.hpp b/include/graphblas/algorithms/norm.hpp
index 33f2df836..c74ff910a 100644
--- a/include/graphblas/algorithms/norm.hpp
+++ b/include/graphblas/algorithms/norm.hpp
@@ -15,14 +15,20 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Implements the 2-norm.
+ *
* @author A. N. Yzelman
* @date 17th of March 2022
*
+ * \internal
* Factored out of graphblas/blas1.hpp, promoted to a (simple) algorithm since
* semiring structures are insufficient to capture sqrt .
*
* \todo Provide implementations of other norms.
+ * \endinternal
*/
#ifndef _H_GRB_ALGORITHMS_NORM
diff --git a/include/graphblas/algorithms/pregel_connected_components.hpp b/include/graphblas/algorithms/pregel_connected_components.hpp
new file mode 100644
index 000000000..8d134bd89
--- /dev/null
+++ b/include/graphblas/algorithms/pregel_connected_components.hpp
@@ -0,0 +1,178 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Implements the (strongly) connected components algorithm over undirected
+ * graphs using the ALP/Pregel interface.
+ *
+ * @author: A. N. Yzelman.
+ */
+
+#ifndef _H_GRB_PREGEL_CONNECTEDCOMPONENTS
+#define _H_GRB_PREGEL_CONNECTEDCOMPONENTS
+
+#include
+
+
+namespace grb {
+
+ namespace algorithms {
+
+ namespace pregel {
+
+ /**
+ * A vertex-centric Connected Components algorithm.
+ *
+ * @tparam VertexIDType A type large enough to assign an ID to each vertex
+ * in the graph the algorithm is to run on.
+ *
+ * \ingroup Pregel
+ */
+ template< typename VertexIDType >
+ struct ConnectedComponents {
+
+ /**
+ * This vertex-centric Connected Components algorithm does not require any
+ * algorithm parameters.
+ */
+ struct Data {};
+
+ /**
+ * The vertex-centric program for computing connected components. On
+ * termination, the number of individual IDs in \a current_max_ID signifies
+ * the number of components, while the value at each entry signifies which
+ * component the vertex corresponds to.
+ *
+ * @param[in,out] current_max_ID On input: each entry is set to an unique
+ * ID, corresponding to a unique ID for each
+ * vertex. On output: the ID of the component
+ * the corresponding vertex belongs to.
+ * @param[in] incoming_message A buffer for incoming messages to a vertex
+ * program.
+ * @param[in] outgoing_message A buffer for outgoing messages to a vertex
+ * program.
+ * @param[in] parameters Global algorithm parameters, currently an
+ * instance of an empty struct (no
+ * parameters).
+ * @param[in,out] pregel The Pregel state the program may refer to.
+ *
+ * This program 1) broadcasts its current ID to its neighbours, 2) checks
+ * if any received IDs are larger than the current ID, then 3a) if not,
+ * votes to halt; 3b) if yes, replaces the current ID with the received
+ * maximum. It is meant to be executed using a max monoid as message
+ * aggregator.
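+ *
+ * For example, on a path graph over three vertices with initial IDs 1, 2,
+ * and 3, successive rounds propagate the maximum ID 3 through the path
+ * until every vertex holds it, after which all vertices vote to halt.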
+ */
+ static void program(
+ VertexIDType &current_max_ID,
+ const VertexIDType &incoming_message,
+ VertexIDType &outgoing_message,
+ const Data &parameters,
+ grb::interfaces::PregelState &pregel
+ ) {
+ (void) parameters;
+ if( pregel.round > 0 ) {
+ if( pregel.indegree == 0 ) {
+ pregel.voteToHalt = true;
+ } else if( current_max_ID < incoming_message ) {
+ current_max_ID = incoming_message;
+ } else {
+ pregel.voteToHalt = true;
+ }
+ }
+ if( pregel.outdegree > 0 ) {
+ outgoing_message = current_max_ID;
+ } else {
+ pregel.voteToHalt = true;
+ }
+ }
+
+ /**
+ * A convenience function that, given a Pregel instance, executes the
+ * #program.
+ *
+ * @param[in,out] pregel A Pregel instance over which to execute the
+ * program.
+ * @param[out] group_ids The ID of the component the corresponding vertex
+ * belongs to.
+ * @param[in] max_steps A maximum number of rounds the program is allowed
+ * to run. If \a 0, no maximum number of rounds will
+ * be in effect.
+ *
+ * On successful termination, the number of rounds is optionally written
+ * out:
+ *
+ * @param[out] steps_taken A pointer to where the number of rounds should
+ * be recorded. Will not be used if equal to
+ * nullptr .
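+ *
+ * \par Example
+ *
+ * A minimal usage sketch, assuming a pre-constructed Pregel instance
+ * \a pregel (its construction is not shown here):
+ *
+ * \code
+ * grb::Vector< size_t > group_ids( pregel.num_vertices() );
+ * size_t rounds;
+ * grb::RC rc = grb::algorithms::pregel::ConnectedComponents< size_t >::
+ * 	execute( pregel, group_ids, 0, &rounds );
+ * \endcode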
+ */
+ template< typename PregelType >
+ static grb::RC execute(
+ grb::interfaces::Pregel< PregelType > &pregel,
+ grb::Vector< VertexIDType > &group_ids,
+ const size_t max_steps = 0,
+ size_t * const steps_taken = nullptr
+ ) {
+ const size_t n = pregel.num_vertices();
+ if( grb::size( group_ids ) != n ) {
+ return MISMATCH;
+ }
+
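+ // the use_index descriptor assigns each vertex its own index, yielding
+ // a unique initial component ID per vertex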
+ grb::RC ret = grb::set< grb::descriptors::use_index >( group_ids, 1 );
+ if( ret != SUCCESS ) {
+ return ret;
+ }
+
+ grb::Vector< VertexIDType > in( n );
+ grb::Vector< VertexIDType > out( n );
+ grb::Vector< VertexIDType > out_buffer = interfaces::config::out_sparsify
+ ? grb::Vector< VertexIDType >( n )
+ : grb::Vector< VertexIDType >( 0 );
+
+ size_t steps;
+
+ ret = pregel.template execute<
+ grb::operators::max< VertexIDType >,
+ grb::identities::negative_infinity
+ > (
+ program,
+ group_ids,
+ Data(),
+ in, out,
+ steps,
+ out_buffer,
+ max_steps
+ );
+
+ if( ret == grb::SUCCESS && steps_taken != nullptr ) {
+ *steps_taken = steps;
+ }
+
+ return ret;
+ }
+
+ };
+
+ } // end namespace ``grb::algorithms::pregel''
+
+ } // end namespace ``grb::algorithms''
+
+} // end namespace ``grb''
+
+#endif
+
diff --git a/include/graphblas/algorithms/pregel_pagerank.hpp b/include/graphblas/algorithms/pregel_pagerank.hpp
new file mode 100644
index 000000000..5064f7f24
--- /dev/null
+++ b/include/graphblas/algorithms/pregel_pagerank.hpp
@@ -0,0 +1,224 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Implements a traditional vertex-centric page ranking algorithm using
+ * ALP/Pregel.
+ *
+ * @author A. N. Yzelman
+ */
+
+#ifndef _H_GRB_PREGEL_PAGERANK
+#define _H_GRB_PREGEL_PAGERANK
+
+#include
+
+
+namespace grb {
+
+ namespace algorithms {
+
+ namespace pregel {
+
+ /**
+ * A Pregel-style PageRank-like algorithm.
+ *
+ * This vertex-centric program does not correspond to the canonical PageRank
+ * algorithm by Brin and Page. In particular, it misses corrections for
+ * dangling nodes and does not perform convergence checks in any norm.
+ *
+ * @tparam IOType The type of the PageRank scores (e.g., double ).
+ * @tparam localConverge Whether vertices become inactive once their local
+ * scores have converged, or whether to terminate only
+ * when all vertices have converged.
+ *
+ * \ingroup Pregel
+ */
+ template< typename IOType, bool localConverge >
+ struct PageRank {
+
+ /**
+ * The algorithm parameters.
+ */
+ struct Data {
+
+ /**
+ * The probability of jumping to a random page instead of a linked page.
+ */
+ IOType alpha = 0.15;
+
+ /**
+ * The local convergence criterion.
+ */
+ IOType tolerance = 0.00001;
+
+ };
+
+ /**
+ * The vertex-centric PageRank-like program.
+ *
+ * @param[out] current_score The current rank corresponding to this
+ * vertex.
+ * @param[in] incoming_message Neighbour contributions to our score.
+ * @param[out] outgoing_message The score contribution to send to our
+ * neighbours.
+ * @param[in] parameters The algorithm parameters.
+ * @param[in,out] pregel The state of the Pregel interface.
+ *
+ * The Pregel program expects incoming messages to be aggregated using a
+ * plus monoid over elements of \a IOType.
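+ *
+ * For example, with the default \f$ \alpha = 0.15 \f$, a vertex that
+ * receives an aggregated incoming contribution of \f$ 0.5 \f$ computes
+ * the new score \f$ 0.15 + 0.85 \cdot 0.5 = 0.575 \f$.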
+ */
+ static void program(
+ IOType &current_score,
+ const IOType &incoming_message,
+ IOType &outgoing_message,
+ const Data &parameters,
+ grb::interfaces::PregelState &pregel
+ ) {
+ // initialise
+ if( pregel.round == 0 ) {
+ current_score = static_cast< IOType >( 1 );
+ }
+
+#ifdef _DEBUG
+ // in debug mode one typically does not wish to track the state of every
+ // vertex individually, hence we include a simple guard by default:
+ const bool dbg = pregel.vertexID == 0;
+ if( dbg ) {
+ std::cout << "ID: " << pregel.vertexID << "\n"
+ << "\t active: " << pregel.active << "\n"
+ << "\t round: " << pregel.round << "\n"
+ << "\t previous score: " << current_score << "\n"
+ << "\t incoming message: " << incoming_message << "\n";
+ }
+#endif
+
+ // compute
+ if( pregel.round > 0 ) {
+ const IOType old_score = current_score;
+ current_score = parameters.alpha +
+ (static_cast< IOType >(1) - parameters.alpha) * incoming_message;
+ if( fabs(current_score-old_score) < parameters.tolerance ) {
+#ifdef _DEBUG
+ std::cout << "\t\t vertex " << pregel.vertexID << " converged\n";
+#endif
+ if( localConverge ) {
+ pregel.active = false;
+ } else {
+ pregel.voteToHalt = true;
+ }
+ }
+ }
+
+ // broadcast
+ if( pregel.outdegree > 0 ) {
+ outgoing_message =
+ current_score /
+ static_cast< IOType >(pregel.outdegree);
+ }
+
+#ifdef _DEBUG
+ if( dbg ) {
+ std::cout << "\t current score: " << current_score << "\n"
+ << "\t voteToHalt: " << pregel.voteToHalt << "\n"
+ << "\t outgoing message: " << outgoing_message << "\n";
+ }
+#endif
+
+ }
+
+ /**
+ * A convenience function for launching a PageRank algorithm over a given
+ * Pregel instance.
+ *
+ * @tparam PregelType The nonzero type of an edge in the Pregel instance.
+ *
+ * This convenience function materialises the buffers expected to be passed
+ * into the Pregel instance, and selects the expected monoid for executing
+ * this program.
+ *
+ * \warning In performance-critical code, one may want to pre-allocate the
+ * buffers instead of having this convenience function allocate
+ * those. In such cases, please call the Pregel execute function
+ * manually, i.e., #grb::interfaces::Pregel< PregelType >::execute.
+ *
+ * The following arguments are mandatory:
+ *
+ * @param[in] pregel The Pregel instance that this program should
+ * execute on.
+ * @param[out] scores A vector holding one score per vertex. It
+ * must be of size
+ * equal to the number of vertices \f$ n \f$ in the
+ * \a pregel instance, and must have \f$ n \f$
+ * capacity \em and values. The initial contents are
+ * ignored by this algorithm.
+ * @param[out] steps_taken How many rounds the program took until
+ * termination.
+ *
+ * The following arguments are optional:
+ *
+ * @param[in] parameters The algorithm parameters. If not given, default
+ * values will be substituted.
+ * @param[in] max_steps The maximum number of rounds this program may take.
+ * If not given, the number of rounds will be
+ * unlimited.
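+ *
+ * \par Example
+ *
+ * A minimal usage sketch, assuming a pre-constructed Pregel instance
+ * \a pregel (its construction is not shown here):
+ *
+ * \code
+ * grb::Vector< double > scores( pregel.num_vertices() );
+ * grb::RC rc = grb::set( scores, 0.0 ); // ensure scores is dense
+ * size_t rounds;
+ * PageRank< double, true >::Data params; // alpha = 0.15, tolerance = 1e-5
+ * rc = rc ? rc : PageRank< double, true >::execute(
+ * 	pregel, scores, rounds, params );
+ * \endcode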
+ */
+ template< typename PregelType >
+ static grb::RC execute(
+ grb::interfaces::Pregel< PregelType > &pregel,
+ grb::Vector< IOType > &scores,
+ size_t &steps_taken,
+ const Data ¶meters = Data(),
+ const size_t max_steps = 0
+ ) {
+ const size_t n = pregel.num_vertices();
+ if( grb::size( scores ) != n ) {
+ return MISMATCH;
+ }
+
+ grb::Vector< IOType > in( n );
+ grb::Vector< IOType > out( n );
+ grb::Vector< IOType > out_buffer = interfaces::config::out_sparsify
+ ? grb::Vector< IOType >( n )
+ : grb::Vector< IOType >( 0 );
+
+ return pregel.template execute<
+ grb::operators::add< IOType >,
+ grb::identities::zero
+ > (
+ program,
+ scores,
+ parameters,
+ in, out,
+ steps_taken,
+ out_buffer,
+ max_steps
+ );
+ }
+
+ };
+
+ } // end namespace ``grb::algorithms::pregel''
+
+ } // end namespace ``grb::algorithms''
+
+} // end namespace ``grb''
+
+#endif
+
diff --git a/include/graphblas/algorithms/simple_pagerank.hpp b/include/graphblas/algorithms/simple_pagerank.hpp
index c1b9243e8..268e088f3 100644
--- a/include/graphblas/algorithms/simple_pagerank.hpp
+++ b/include/graphblas/algorithms/simple_pagerank.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Implements the canonical PageRank algorithm by Brin and Page.
+ *
* @author A. N. Yzelman
* @date: 21st of March, 2017
*/
@@ -29,6 +33,7 @@
#include
#endif
+
namespace grb {
namespace algorithms {
@@ -86,7 +91,7 @@ namespace grb {
* @param[out] iterations If not nullptr , the number of iterations
* the call to this algorithm took will be written to
* the location pointed to.
- * @param[out] quality If not nullptr,/tt>, the last computed residual
+ * @param[out] quality If not nullptr , the last computed residual
* will be written to the location pointed to.
*
* @returns #grb::SUCCESS If the computation converged within \a max
diff --git a/include/graphblas/algorithms/sparse_nn_single_inference.hpp b/include/graphblas/algorithms/sparse_nn_single_inference.hpp
index 64d132982..ff9b11f31 100644
--- a/include/graphblas/algorithms/sparse_nn_single_inference.hpp
+++ b/include/graphblas/algorithms/sparse_nn_single_inference.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Implements (non-batched) sparse neural network inference.
+ *
* @author Aristeidis Mastoras
*/
@@ -25,6 +29,7 @@
#include
#include
+
namespace grb {
namespace algorithms {
@@ -216,8 +221,6 @@ namespace grb {
* inference proceeds:
*
* @param[in] relu The non-linear ReLU function to apply element-wise.
- * @param[in] min Operator used for thresholding. Maximum feature value
- * is hard-coded to 32, as per the GraphChallenge.
* @param[in] ring The semiring under which to perform the inference.
*
* The default algebraic structures are standard \a relu (i.e., max), \a min
diff --git a/include/graphblas/algorithms/spy.hpp b/include/graphblas/algorithms/spy.hpp
index 6bc6d5939..413622780 100644
--- a/include/graphblas/algorithms/spy.hpp
+++ b/include/graphblas/algorithms/spy.hpp
@@ -15,11 +15,19 @@
* limitations under the License.
*/
+/**
+ * @file
+ *
+ * Implements a simple matrix spy algorithm.
+ *
+ * @author A. N. Yzelman
+ */
#ifndef _H_GRB_ALGORITHMS_SPY
#define _H_GRB_ALGORITHMS_SPY
#include
+#include
#include
diff --git a/include/graphblas/backends.hpp b/include/graphblas/backends.hpp
index 72f1bcec1..3fd2f0ec1 100644
--- a/include/graphblas/backends.hpp
+++ b/include/graphblas/backends.hpp
@@ -16,140 +16,203 @@
*/
/**
+ * @file
+ *
+ * This file contains a register of all backends that are either implemented,
+ * under implementation, or conceived and recorded for future consideration to
+ * implement.
+ *
* @author: A. N. Yzelman
* @date 21st of December, 2016
- *
- * @file This file contains a register of all backends that are either
- * implemented, under implementation, or were at any point in time
- * conceived and noteworthy enough to be recorded for future
- * consideration to implement. It does so via the grb::Backend
- * enum.
*/
#ifndef _H_GRB_BACKENDS
#define _H_GRB_BACKENDS
+
namespace grb {
/**
- * This enum collects all implemented backends. Depending on compile flags,
- * some of these options may be disabled.
+ * A collection of all backends. Depending on which dependencies were
+ * configured during the bootstrapping of this ALP installation, some of these
+ * backends may be disabled.
+ *
+ * \internal
+ * The collection includes backend identifiers that are for internal use only.
+ * \endinternal
+ *
+ * \ingroup backends
*/
enum Backend {
/**
* The sequential reference implementation. Supports fast operations with
- * both sparse and dense vectors.
+ * both sparse and dense vectors, and employs auto-vectorisation.
*/
reference,
/**
* The threaded reference implementation. Supports fast operations with both
- * sparse and dense vectors.
+ * sparse and dense vectors. Employs OpenMP used with a mixture of fork/join
+ * and SPMD programming styles.
*/
reference_omp,
/**
- * A shared-memory parallel distribution based on a row-wise 1D data
- * distribution using shared vector data.
+ * A backend that automatically extracts hyperDAGs from user computations. It
+ * only captures metadata for recording the hyperDAG, and relies on another
+ * backend to actually execute the requested computations-- by default, this
+ * is the #reference backend.
+ */
+ hyperdags,
+
+ /**
+ * The threaded nonblocking implementation. Supports fast operations with both
+ * sparse and dense vectors. This backend is currently under development.
+ */
+ nonblocking,
+
+ /**
+ * \internal
+ * A shared-memory parallel distribution based on a row-wise 1D block-cyclic
+ * data distribution using shared vector data.
+ * \endinternal
*/
shmem1D,
/**
+ * \internal
* Like shmem1D, but using interleaved vector allocation. Useful for multi-
* socket single-node targets. From experience, this is a good choice for up
* to four sockets-- after which BSP2D becomes preferred.
+ * \endinternal
*/
NUMA1D,
/**
- * A superclass of all BSP-based implementations.
+ * \internal
+ * A superclass of all LPF-based implementations. Not a "real" (selectable)
+ * backend.
+ * \endinternal
*/
GENERIC_BSP,
/**
* A parallel implementation based on a row-wise 1D data distribution,
- * implemented using PlatformBSP.
+ * implemented using LPF.
+ *
+ * This backend manages multiple user processes, manages data distributions
+ * of containers between those user processes, and decomposes primitives into
+ * local compute phases with intermittent communications. For local compute
+ * phases it composes with a single user process backend, #reference by
+ * default.
*/
BSP1D,
/**
+ * \internal
* Like BSP1D, but stores each matrix twice. Combined with the normal
* reference implementation, this actually stores all matrices four times
* This implementation is useful for maximum performance, at the cost of
* the additional memory usage.
+ * \endinternal
*/
doublyBSP1D,
/**
+ * \internal
* A parallel implementation based on a block-cyclic 2D data distribution,
* implemented using PlatformBSP. This implementation will likely outperform
* BSP1D and doublyBSP1D as the number of nodes involved in the computation
* increases with the problem sizes.
+ * \endinternal
*/
BSP2D,
/**
+ * \internal
* Like BSP2D, but automatically improves the distribution while executing
* user code-- while initial computations are slowed down, the user
* application will speed up as this GraphBLAS implementation infers more
* information about the best data distribution.
* When enough statistics are gathered, data is redistributed and all future
* operations execute much faster than with BSP2D alone.
+ * \endinternal
*/
autoBSP,
/**
+ * \internal
* Like autoBSP, except that the best distribution is precomputed whenever a
* matrix is read in. This pre-processing step is very expensive. Use autoBSP
* when unsure if the costs of a full preprocessing stage is worth it.
+ * \endinternal
*/
optBSP,
/**
- * A hybrid that uses shmem1D within each socket and BSP1D between sockets.
+ * A composed backend that uses #reference_omp within each user process and
+ * #BSP1D between sockets.
+ *
+ * This backend is implemented using the #BSP1D code, with the process-local
+ * backend overridden from #reference to #reference_omp.
+ */
+ hybrid,
+
+ /**
+ * \internal
+ * A hybrid that uses #shmem1D within each socket and #BSP1D between sockets.
* Recommended for a limited number of sockets and a limited amount of nodes,
* i.e., for a small cluster.
+ * \endinternal
*/
hybridSmall,
/**
- * A hybrid that uses numa1D within each socket and BSP1D between sockets.
+ * \internal
+ * A hybrid that uses #numa1D within each socket and #BSP1D between sockets.
* Recommended for a limited number of nodes with up to two sockets each.
*
- * This variant is expected to perform better than hybrid1D for middle-sized
- * clusters.
+ * This variant is expected to perform better than #hybridSmall for
+ * middle-sized clusters.
+ * \endinternal
*/
hybridMid,
/**
- * A hybrid that uses numa1D within each socket and autoBSP between sockets.
+ * \internal
+ * A hybrid that uses #numa1D within each socket and #autoBSP between sockets.
* Recommended for a large number of nodes with up to two sockets each.
*
- * This variant is expected to perform better than hybridSmall and hybridMid
+ * This variant is expected to perform better than #hybridSmall and #hybridMid
* for larger clusters.
*
* If there are many nodes each with many sockets (four or more) each, then
* the use of flat (non-hybrid) #BSP2D or #autoBSP is recommended instead.
+ * \endinternal
*/
hybridLarge,
/**
+ * \internal
* A hybrid variant that is optimised for a minimal memory footprint.
+ * \endinternal
*/
minFootprint,
/**
- * A variant for RISC-V processors.
- *
- * Collaboration with ETH Zurich (ongoing).
+ * A variant for Snitch RISC-V cores. It is based on an older #reference
+ * backend.
*/
banshee,
/**
- * A variant for RISC-V processors with (I)SSR extensions
+ * \internal
+ * A variant for RISC-V processors with (I)SSR extensions.
*
- * Collaboration with ETH Zurich (ongoing).
+ * \note This backend is used internally by the #banshee backend; it is not
+ * selectable.
+ * \endinternal
*/
banshee_ssr
@@ -158,3 +221,4 @@ namespace grb {
} // namespace grb
#endif
+
diff --git a/include/graphblas/banshee/config.hpp b/include/graphblas/banshee/config.hpp
index b70b0b9d2..c4a9a8baf 100644
--- a/include/graphblas/banshee/config.hpp
+++ b/include/graphblas/banshee/config.hpp
@@ -29,6 +29,7 @@
#include
+
namespace grb {
/**
@@ -46,3 +47,4 @@ namespace grb {
} // namespace grb
#endif // end ``_H_GRB_BANSHEE_CONFIG''
+
diff --git a/include/graphblas/base/benchmark.hpp b/include/graphblas/base/benchmark.hpp
index 74c666ec7..56a2fade6 100644
--- a/include/graphblas/base/benchmark.hpp
+++ b/include/graphblas/base/benchmark.hpp
@@ -15,7 +15,12 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * This file contains a variant on the #grb::Launcher specialised for
+ * benchmarks.
+ *
* @author J. W. Nash & A. N. Yzelman
* @date 17th of April, 2017
*/
@@ -39,298 +44,557 @@
#include "exec.hpp"
#ifndef _GRB_NO_STDIO
-#include
+ #include
#endif
#ifndef _GRB_NO_EXCEPTIONS
-#include
+ #include
#endif
#include
+
+/**
+ * \defgroup benchmarking Benchmarking
+ *
+ * ALP has a specialised class for benchmarking ALP programs, grb::Benchmarker,
+ * which is a variant on the #grb::Launcher. It encodes a particular
+ * benchmarking strategy for any given ALP program, as described below.
+ *
+ * The program is called \a inner times, and this batch of calls is in turn
+ * repeated \a outer times. Between any two batches of \a inner repetitions
+ * there is a one-second sleep, so that machine variability is taken into
+ * account. Several statistics are measured across the \a outer repetitions:
+ * the minimum, maximum, average, and the (unbiased) sample standard
+ * deviation. By contrast, for the \a inner repetitions only an average is
+ * computed -- the sole function of the \a inner repetitions is to avoid
+ * timing programs whose execution time is of the same order as the time it
+ * takes to query the system timer.
+ *
+ * \note As a result, \a inner should always equal \em one when benchmarking
+ * any non-trivial ALP program, while for benchmarking ALP kernels on
+ * small data \a inner may be taken (much) larger.
+ *
+ * \note In published experiments, \a inner is chosen such that a single
+ * outer repetition takes 10 to 100 milliseconds.
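+ *
+ * An illustrative usage sketch follows. It assumes an ALP program
+ * my_program with a matching input/output type pair (all names here
+ * are hypothetical), and relies on the backend template argument of the
+ * benchmarker taking its configured default:
+ *
+ * \code
+ * grb::Benchmarker< AUTOMATIC > bench;
+ * MyInput in; MyOutput out;
+ * grb::RC rc = bench.exec( &my_program, in, out, 1, 10, true );
+ * \endcode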
+ */
+
namespace grb {
namespace internal {
+ /**
+ * The common functionalities used by all #grb::Benchmarker classes.
+ *
+ * \ingroup benchmarking
+ */
class BenchmarkerBase {
- protected:
+ protected:
+
#ifndef _GRB_NO_STDIO
- /** \todo TODO add documentation. */
- static void printTimeSinceEpoch( const bool printHeader = true ) {
- const auto now = std::chrono::system_clock::now();
- const auto since = now.time_since_epoch();
- if( printHeader ) {
- std::cout << "Time since epoch (in ms.): ";
+ /**
+ * A helper function that prints the time elapsed since epoch.
+ *
+ * @param[in] printHeader An optional Boolean parameter with default value
+ * true . If set, this function prints
+ * a human-readable header before outputting the
+ * time-since-epoch.
+ */
+ static void printTimeSinceEpoch( const bool printHeader = true ) {
+ const auto now = std::chrono::system_clock::now();
+ const auto since = now.time_since_epoch();
+ if( printHeader ) {
+ std::cout << "Time since epoch (in ms.): ";
+ }
+ std::cout << std::chrono::duration_cast<
+ std::chrono::milliseconds
+ >( since ).count() << "\n";
}
- std::cout << std::chrono::duration_cast< std::chrono::milliseconds >( since ).count() << "\n";
- }
#endif
- // calculate inner loop performance stats
- static void benchmark_calc_inner( const size_t loop,
- const size_t total,
- grb::utils::TimerResults & inner_times,
- grb::utils::TimerResults & total_times,
- grb::utils::TimerResults & min_times,
- grb::utils::TimerResults & max_times,
- grb::utils::TimerResults * sdev_times ) {
- inner_times.normalize( total );
- total_times.accum( inner_times );
- min_times.min( inner_times );
- max_times.max( inner_times );
- sdev_times[ loop ] = inner_times;
- }
-
- // calculate outer loop performance stats
- static void benchmark_calc_outer( const size_t total,
- grb::utils::TimerResults & total_times,
- grb::utils::TimerResults & min_times,
- grb::utils::TimerResults & max_times,
- grb::utils::TimerResults * sdev_times,
- const size_t pid ) {
- total_times.normalize( total );
- grb::utils::TimerResults sdev;
- // compute standard dev of average times, leaving sqrt calculation until the output of the values
- sdev.set( 0 );
- for( size_t i = 0; i < total; i++ ) {
- double diff = sdev_times[ i ].io - total_times.io;
- sdev.io += diff * diff;
- diff = sdev_times[ i ].preamble - total_times.preamble;
- sdev.preamble += diff * diff;
- diff = sdev_times[ i ].useful - total_times.useful;
- sdev.useful += diff * diff;
- diff = sdev_times[ i ].postamble - total_times.postamble;
- sdev.postamble += diff * diff;
+ /**
+ * Calculate inner loop performance stats
+ */
+ static void benchmark_calc_inner(
+ const size_t loop,
+ const size_t total,
+ grb::utils::TimerResults &inner_times,
+ grb::utils::TimerResults &total_times,
+ grb::utils::TimerResults &min_times,
+ grb::utils::TimerResults &max_times,
+ grb::utils::TimerResults * sdev_times
+ ) {
+ inner_times.normalize( total );
+ total_times.accum( inner_times );
+ min_times.min( inner_times );
+ max_times.max( inner_times );
+ sdev_times[ loop ] = inner_times;
}
- // unbiased normalisation of the standard deviation
- sdev.normalize( total - 1 );
+
+ /**
+ * Calculate outer loop performance stats
+ */
+ static void benchmark_calc_outer(
+ const size_t total,
+ grb::utils::TimerResults &total_times,
+ grb::utils::TimerResults &min_times,
+ grb::utils::TimerResults &max_times,
+ grb::utils::TimerResults * sdev_times,
+ const size_t pid
+ ) {
+ total_times.normalize( total );
+ grb::utils::TimerResults sdev;
+ // compute standard dev of average times, leaving sqrt calculation until
+ // the output of the values
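+ // i.e., per timing category: sdev = sum_i ( t_i - avg )^2 / ( total - 1 ),
+ // with the square root taken only when printing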
+ sdev.set( 0 );
+ for( size_t i = 0; i < total; i++ ) {
+ double diff = sdev_times[ i ].io - total_times.io;
+ sdev.io += diff * diff;
+ diff = sdev_times[ i ].preamble - total_times.preamble;
+ sdev.preamble += diff * diff;
+ diff = sdev_times[ i ].useful - total_times.useful;
+ sdev.useful += diff * diff;
+ diff = sdev_times[ i ].postamble - total_times.postamble;
+ sdev.postamble += diff * diff;
+ }
+ // unbiased normalisation of the standard deviation
+ sdev.normalize( total - 1 );
#ifndef _GRB_NO_STDIO
- // output results
- if( pid == 0 ) {
- std::cout << "Overall timings (io, preamble, useful, "
- "postamble):\n"
- << std::scientific;
- std::cout << "Avg: " << total_times.io << ", " << total_times.preamble << ", " << total_times.useful << ", " << total_times.postamble << "\n";
- std::cout << "Min: " << min_times.io << ", " << min_times.preamble << ", " << min_times.useful << ", " << min_times.postamble << "\n";
- std::cout << "Max: " << max_times.io << ", " << max_times.preamble << ", " << max_times.useful << ", " << max_times.postamble << "\n";
- std::cout << "Std: " << sqrt( sdev.io ) << ", " << sqrt( sdev.preamble ) << ", " << sqrt( sdev.useful ) << ", " << sqrt( sdev.postamble ) << "\n";
-#if __GNUC__ > 4
- std::cout << std::defaultfloat;
-#endif
- printTimeSinceEpoch();
- }
+ // output results
+ if( pid == 0 ) {
+ std::cout << "Overall timings (io, preamble, useful, postamble):\n"
+ << std::scientific;
+ std::cout << "Avg: " << total_times.io << ", " << total_times.preamble
+ << ", " << total_times.useful << ", " << total_times.postamble << "\n";
+ std::cout << "Min: " << min_times.io << ", " << min_times.preamble << ", "
+ << min_times.useful << ", " << min_times.postamble << "\n";
+ std::cout << "Max: " << max_times.io << ", " << max_times.preamble << ", "
+ << max_times.useful << ", " << max_times.postamble << "\n";
+ std::cout << "Std: " << sqrt( sdev.io ) << ", " << sqrt( sdev.preamble )
+ << ", " << sqrt( sdev.useful ) << ", " << sqrt( sdev.postamble ) << "\n";
+ #if __GNUC__ > 4
+ std::cout << std::defaultfloat;
+ #endif
+ printTimeSinceEpoch();
+ }
#else
- // write to file(?)
- (void)min_times;
- (void)max_times;
- (void)pid;
+ // we ran the benchmark, but may not have a way to output the results in
+ // this case; this code path is currently only reached by the #grb::banshee
+ // backend, which provides other timing mechanisms.
+ (void) min_times;
+ (void) max_times;
+ (void) pid;
#endif
- }
-
- template< typename U, enum Backend implementation = config::default_backend >
- static RC benchmark( void ( *grb_program )( const void *,
- const size_t,
- U & ), // user GraphBLAS program
- const void * data_in,
- const size_t in_size,
- U & data_out, // input & output data
- const size_t inner,
- const size_t outer,
- const size_t pid ) {
- const double inf = std::numeric_limits< double >::infinity();
- grb::utils::TimerResults total_times, min_times, max_times;
- grb::utils::TimerResults * sdev_times = new grb::utils::TimerResults[ outer ];
- total_times.set( 0 );
- min_times.set( inf );
- max_times.set( 0 );
-
- // outer loop
- for( size_t out = 0; out < outer; out++ ) {
- grb::utils::TimerResults inner_times;
- inner_times.set( 0 );
-
- // inner loop
- for( size_t in = 0; in < inner; in++ ) {
- data_out.times.set( 0 );
- ( *grb_program )( data_in, in_size, data_out );
- grb::collectives< implementation >::reduce( data_out.times.io, 0, grb::operators::max< double >() );
- grb::collectives< implementation >::reduce( data_out.times.preamble, 0, grb::operators::max< double >() );
- grb::collectives< implementation >::reduce( data_out.times.useful, 0, grb::operators::max< double >() );
- grb::collectives< implementation >::reduce( data_out.times.postamble, 0, grb::operators::max< double >() );
- inner_times.accum( data_out.times );
- }
+ }
- // calculate performance stats
- benchmark_calc_inner( out, inner, inner_times, total_times, min_times, max_times, sdev_times );
+ /**
+ * Benchmarks a given ALP program.
+ *
+ * This variant applies to input data as a byte blob and output data as a
+ * user-defined POD struct.
+ *
+ * @tparam U Output type of the given user program.
+ * @tparam backend Which backend the program is using.
+ *
+ * @param[in] alp_program The user program to be benchmarked
+ * @param[in] data_in Input data as a raw data blob
+ * @param[in] in_size The size, in bytes, of the input data
+ * @param[out] data_out Output data
+ * @param[in] inner The number of inner repetitions of the benchmark
+ * @param[in] outer The number of outer repetitions of the benchmark
+ * @param[in] pid Unique ID of the calling user process
+ *
+ * @see benchmarking
+ *
+ * @ingroup benchmarking
+ */
+ template<
+ typename U,
+ enum Backend implementation = config::default_backend
+ >
+ static RC benchmark(
+ void ( *alp_program )( const void *, const size_t, U & ),
+ const void * data_in,
+ const size_t in_size,
+ U &data_out,
+ const size_t inner,
+ const size_t outer,
+ const size_t pid
+ ) {
+ const double inf = std::numeric_limits< double >::infinity();
+ grb::utils::TimerResults total_times, min_times, max_times;
+ grb::utils::TimerResults * sdev_times =
+ new grb::utils::TimerResults[ outer ];
+ total_times.set( 0 );
+ min_times.set( inf );
+ max_times.set( 0 );
+
+ // outer loop
+ for( size_t out = 0; out < outer; ++out ) {
+ grb::utils::TimerResults inner_times;
+ inner_times.set( 0 );
+
+ // inner loop
+ for( size_t in = 0; in < inner; in++ ) {
+ data_out.times.set( 0 );
+ ( *alp_program )( data_in, in_size, data_out );
+ grb::collectives< implementation >::reduce(
+ data_out.times.io, 0, grb::operators::max< double >() );
+ grb::collectives< implementation >::reduce(
+ data_out.times.preamble, 0, grb::operators::max< double >() );
+ grb::collectives< implementation >::reduce(
+ data_out.times.useful, 0, grb::operators::max< double >() );
+ grb::collectives< implementation >::reduce(
+ data_out.times.postamble, 0, grb::operators::max< double >() );
+ inner_times.accum( data_out.times );
+ }
+
+ // calculate performance stats
+ benchmark_calc_inner( out, inner, inner_times, total_times, min_times,
+ max_times, sdev_times );
#ifndef _GRB_NO_STDIO
- // give experiment output line
- if( pid == 0 ) {
- std::cout << "Outer iteration #" << out
- << " timings (io, preamble, useful, "
- "postamble, time since epoch): ";
- std::cout << inner_times.io << ", " << inner_times.preamble << ", " << inner_times.useful << ", " << inner_times.postamble << ", ";
- printTimeSinceEpoch( false );
- }
+ // give experiment output line
+ if( pid == 0 ) {
+ std::cout << "Outer iteration #" << out << " timings (io, preamble, "
+ << "useful, postamble, time since epoch): ";
+ std::cout << inner_times.io << ", " << inner_times.preamble << ", "
+ << inner_times.useful << ", " << inner_times.postamble << ", ";
+ printTimeSinceEpoch( false );
+ }
#endif
- // pause for next outer loop
- if( sleep( 1 ) != 0 ) {
+ // pause for next outer loop
+ if( sleep( 1 ) != 0 ) {
#ifndef _GRB_NO_STDIO
- std::cerr << "Sleep interrupted, assume benchmark is "
- "unreliable and exiting.\n";
+ std::cerr << "Sleep interrupted, assume benchmark is unreliable; "
+ << "exiting.\n";
#endif
- abort();
+ abort();
+ }
}
- }
- // calculate performance stats
- benchmark_calc_outer( outer, total_times, min_times, max_times, sdev_times, pid );
- delete[] sdev_times;
+ // calculate performance stats
+ benchmark_calc_outer( outer, total_times, min_times, max_times, sdev_times,
+ pid );
+ delete [] sdev_times;
- return SUCCESS;
- }
+ return SUCCESS;
+ }
- template< typename T, typename U, enum Backend implementation = config::default_backend >
- static RC benchmark( void ( *grb_program )( const T &, U & ), // user GraphBLAS program
- const T & data_in,
- U & data_out, // input & output data
- const size_t inner,
- const size_t outer,
- const size_t pid ) {
- const double inf = std::numeric_limits< double >::infinity();
- grb::utils::TimerResults total_times, min_times, max_times;
- grb::utils::TimerResults * sdev_times = new grb::utils::TimerResults[ outer ];
- total_times.set( 0 );
- min_times.set( inf );
- max_times.set( 0 );
-
- // outer loop
- for( size_t out = 0; out < outer; out++ ) {
- grb::utils::TimerResults inner_times;
- inner_times.set( 0 );
-
- // inner loop
- for( size_t in = 0; in < inner; in++ ) {
- data_out.times.set( 0 );
-
- ( *grb_program )( data_in, data_out );
- grb::collectives< implementation >::reduce( data_out.times.io, 0, grb::operators::max< double >() );
- grb::collectives< implementation >::reduce( data_out.times.preamble, 0, grb::operators::max< double >() );
- grb::collectives< implementation >::reduce( data_out.times.useful, 0, grb::operators::max< double >() );
- grb::collectives< implementation >::reduce( data_out.times.postamble, 0, grb::operators::max< double >() );
- inner_times.accum( data_out.times );
- }
-
- // calculate performance stats
- benchmark_calc_inner( out, inner, inner_times, total_times, min_times, max_times, sdev_times );
+ /**
+ * Benchmarks a given ALP program.
+ *
+ * This variant applies to input data as a user-defined POD struct and
+ * output data as a user-defined POD struct.
+ *
+ * @tparam T Input type of the given user program.
+ * @tparam U Output type of the given user program.
+ *
+ * @param[in] alp_program The user program to be benchmarked
+ * @param[in] data_in Input data as a user-defined POD struct
+ * @param[out] data_out Output data
+ * @param[in] inner The number of inner repetitions of the benchmark
+ * @param[in] outer The number of outer repetitions of the benchmark
+ * @param[in] pid Unique ID of the calling user process
+ *
+ * @see benchmarking
+ *
+ * @ingroup benchmarking
+ */
+ template<
+ typename T, typename U,
+ enum Backend implementation = config::default_backend
+ >
+ static RC benchmark(
+ void ( *alp_program )( const T &, U & ),
+ const T &data_in,
+ U &data_out,
+ const size_t inner,
+ const size_t outer,
+ const size_t pid
+ ) {
+ const double inf = std::numeric_limits< double >::infinity();
+ grb::utils::TimerResults total_times, min_times, max_times;
+ grb::utils::TimerResults * sdev_times =
+ new grb::utils::TimerResults[ outer ];
+ total_times.set( 0 );
+ min_times.set( inf );
+ max_times.set( 0 );
+
+ // outer loop
+ for( size_t out = 0; out < outer; ++out ) {
+ grb::utils::TimerResults inner_times;
+ inner_times.set( 0 );
+
+ // inner loop
+ for( size_t in = 0; in < inner; ++in ) {
+ data_out.times.set( 0 );
+
+ ( *alp_program )( data_in, data_out );
+ grb::collectives< implementation >::reduce( data_out.times.io, 0,
+ grb::operators::max< double >() );
+ grb::collectives< implementation >::reduce( data_out.times.preamble, 0,
+ grb::operators::max< double >() );
+ grb::collectives< implementation >::reduce( data_out.times.useful, 0,
+ grb::operators::max< double >() );
+ grb::collectives< implementation >::reduce( data_out.times.postamble, 0,
+ grb::operators::max< double >() );
+ inner_times.accum( data_out.times );
+ }
+
+ // calculate performance stats
+ benchmark_calc_inner( out, inner, inner_times, total_times, min_times,
+ max_times, sdev_times );
#ifndef _GRB_NO_STDIO
- // give experiment output line
- if( pid == 0 ) {
- std::cout << "Outer iteration #" << out
- << " timings (io, preamble, useful, "
- "postamble, time since epoch): "
- << std::fixed;
- std::cout << inner_times.io << ", " << inner_times.preamble << ", " << inner_times.useful << ", " << inner_times.postamble << ", ";
- printTimeSinceEpoch( false );
- std::cout << std::scientific;
- }
+ // give experiment output line
+ if( pid == 0 ) {
+ std::cout << "Outer iteration #" << out << " timings "
+ << "(io, preamble, useful, postamble, time since epoch): " << std::fixed
+ << inner_times.io << ", " << inner_times.preamble << ", "
+ << inner_times.useful << ", " << inner_times.postamble << ", ";
+ printTimeSinceEpoch( false );
+ std::cout << std::scientific;
+ }
#endif
- // pause for next outer loop
- if( sleep( 1 ) != 0 ) {
+ // pause for next outer loop
+ if( sleep( 1 ) != 0 ) {
#ifndef _GRB_NO_STDIO
- std::cerr << "Sleep interrupted, assume benchmark is "
- "unreliable and exiting.\n";
+ std::cerr << "Sleep interrupted, assume benchmark is unreliable; "
+ << "exiting.\n";
#endif
- abort();
+ abort();
+ }
}
+
+ // calculate performance stats
+ benchmark_calc_outer( outer, total_times, min_times, max_times, sdev_times,
+ pid );
+ delete[] sdev_times;
+
+ return SUCCESS;
}
- // calculate performance stats
- benchmark_calc_outer( outer, total_times, min_times, max_times, sdev_times, pid );
- delete[] sdev_times;
- return SUCCESS;
- }
+ public:
- public:
- BenchmarkerBase() {
+ BenchmarkerBase() {
#ifndef _GRB_NO_STDIO
- printTimeSinceEpoch();
+ printTimeSinceEpoch();
#endif
- }
+ }
+
};
} // namespace internal
/**
- * Benchmarking function, called from an exec function.
- * Takes the grbProgram and its input and output data and accumultes times
- * given in the output structure.
+ * A class that follows the API of the #grb::Launcher, but instead of launching
+ * the given ALP program once, it launches it multiple times while benchmarking
+ * its execution times.
+ *
+ * @ingroup benchmarking
+ * @see benchmarking
*/
template< enum EXEC_MODE mode, enum Backend implementation >
class Benchmarker {
public :
- Benchmarker( size_t process_id = 0, // user process ID
- size_t nprocs = 1, // total number of user processes
- std::string hostname = "localhost", // one of the user process hostnames
- std::string port = "0" // a free port at hostname
- ) { (void)process_id; (void)nprocs; (void)hostname; (void)port;
+ /**
+ * Constructs an instance of the benchmarker class.
+ *
+ * @param[in] process_id A unique ID for the calling user process.
+ * @param[in] nprocs The total number of user processes participating in
+ * the benchmark. The given \a process_id must be
+ * strictly smaller than this given value.
+ * @param[in] hostname The hostname where one of the user processes
+ * participating in the benchmark resides.
+ * @param[in] port A free TCP/IP port at the host corresponding to
+ * the given \a hostname.
+ *
+ * The \a hostname and \a port arguments are unused if \a nprocs equals one.
+ *
+ * All arguments are optional-- their defaults are:
+ * - 0 for \a process_id,
+ * - 1 for \a nprocs,
+ * - \em localhost for \a hostname, and
+ * - 0 for \a port.
+ *
+ * This constructor may throw the same errors as #grb::Launcher.
+ *
+ * @see #grb::Launcher
+ * @see benchmarking
+ *
+ * \internal This is the base class which should be overridden by given
+ * backend implementations.
+ */
+ Benchmarker(
+ const size_t process_id = 0,
+ size_t nprocs = 1,
+ std::string hostname = "localhost",
+ std::string port = "0"
+ ) {
+ (void) process_id; (void) nprocs; (void) hostname; (void) port;
#ifndef _GRB_NO_EXCEPTIONS
- throw std::logic_error( "Benchmarker class called with unsupported "
- "mode or implementation" );
+ throw std::logic_error( "Benchmarker class called with unsupported mode or "
+ "implementation" );
#endif
}
- template< typename T, typename U >
- RC exec( void ( *grb_program )( const T &, U & ), // user GraphBLAS program
- const T & data_in,
- U & data_out, // input & output data
- const size_t inner,
- const size_t outer,
- const bool broadcast = false ) const {
- (void)grb_program;
- (void)data_in;
- (void)data_out;
- (void)inner;
- (void)outer;
- (void)broadcast;
- // stub implementation, should be overridden by specialised implementation,
- // so return error code
- return PANIC;
- }
-
- template< typename U >
- RC exec( void ( *grb_program )( const void *, const size_t, U & ), const void * data_in, const size_t in_size, U & data_out, const size_t inner, const size_t outer, const bool broadcast = false )
- const {
- (void)grb_program;
- (void)data_in;
- (void)in_size;
- (void)data_out;
- (void)inner;
- (void)outer;
- (void)broadcast;
- return PANIC;
- }
+ /**
+ * Benchmarks a given ALP program.
+ *
+ * This variant applies to input data as a user-defined POD struct and
+ * output data as a user-defined POD struct.
+ *
+ * @tparam T Input type of the given user program.
+ * @tparam U Output type of the given user program.
+ *
+ * @param[in] alp_program The ALP program to be benchmarked
+ * @param[in] data_in Input data as a user-defined POD struct
+ * @param[out] data_out Output data
+ * @param[in] inner The number of inner repetitions of the benchmark
+ * @param[in] outer The number of outer repetitions of the benchmark
+ * @param[in] broadcast An optional argument that dictates whether the
+ * \a data_in argument should be broadcast across all
+ * user processes participating in the benchmark,
+ * prior to \em each invocation of \a alp_program.
+ *
+ * The default value of \a broadcast is false .
+ *
+ * @returns #grb::SUCCESS The benchmarking has completed successfully.
+ * @returns #grb::FAILED An error during benchmarking has occurred. The
+ * benchmark attempt could be retried, and an error
+ * for the failure is reported to the standard error
+ * stream.
+ * @returns #grb::PANIC If an unrecoverable error was encountered while
+ * starting the benchmark, while benchmarking, or
+ * while aggregating the final results.
+ *
+ * @see benchmarking
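+ *
+ * \par Example
+ *
+ * A minimal sketch, assuming a benchmarker instance bench and an
+ * ALP program my_program with the matching typed signature (both
+ * names are hypothetical):
+ *
+ * \code
+ * // void my_program( const MyInput &in, MyOutput &out );
+ * grb::RC rc = bench.exec( &my_program, in, out, 1, 10 );
+ * \endcode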
+ *
+ * \internal This is the base implementation that should be specialised by
+ * each backend separately.
+ */
+ template< typename T, typename U >
+ RC exec(
+ void ( *alp_program )( const T &, U & ),
+ const T &data_in,
+ U &data_out,
+ const size_t inner,
+ const size_t outer,
+ const bool broadcast = false
+ ) const {
+ (void) alp_program;
+ (void) data_in;
+ (void) data_out;
+ (void) inner;
+ (void) outer;
+ (void) broadcast;
+
+ // stub implementation, should be overridden by specialised implementation.
+ // furthermore, it should be impossible to call this function without
+ // triggering an exception during construction of this stub class, so we
+ // just return PANIC here
+ return PANIC;
+ }
- /**
- * Releases all GraphBLAS resources. After a call to this function, no
- * GraphBLAS library functions may be called any longer.
- *
- * @return SUCCESS A call to this function may never fail.
- */
- static RC finalize() {
- return Launcher< mode, implementation >::finalize();
- }
+ /**
+ * Benchmarks a given ALP program.
+ *
+ * This variant takes input data as a byte blob and output data as a
+ * user-defined POD struct.
+ *
+ * @tparam U Output type of the given user program.
+ *
+ * @param[in] alp_program The user program to be benchmarked
+ * @param[in] data_in Input data as a raw data blob
+ * @param[in] in_size The size, in bytes, of the input data
+ * @param[out] data_out Output data
+ * @param[in] inner The number of inner repetitions of the benchmark
+ * @param[in] outer The number of outer repetitions of the benchmark
+ * @param[in] broadcast An optional argument that dictates whether the
+ * \a data_in argument should be broadcast across all
+ * user processes participating in the benchmark,
+ * prior to \em each invocation of \a alp_program.
+ *
+ * The default value of \a broadcast is false .
+ *
+ * @returns #grb::SUCCESS The benchmarking has completed successfully.
+ * @returns #grb::ILLEGAL If \a in_size is nonzero but \a data_in compares
+ * equal to nullptr .
+ * @returns #grb::FAILED An error during benchmarking has occurred. The
+ * benchmark attempt could be retried; an error
+ * describing the failure is reported to the standard error
+ * stream.
+ * @returns #grb::PANIC If an unrecoverable error was encountered while
+ * starting the benchmark, while benchmarking, or
+ * while aggregating the final results.
+ *
+ * @see benchmarking
+ *
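+ * \par Example
+ *
+ * An illustrative sketch of the raw-blob variant; the helper that produces
+ * the blob is hypothetical:
+ * \code
+ * void myProgram( const void * data, const size_t size, double &out );
+ *
+ * grb::Benchmarker< grb::AUTOMATIC > bench;
+ * std::vector< char > blob = serialiseInput(); // hypothetical helper
+ * double out = 0.0;
+ * const grb::RC rc = bench.exec(
+ * &myProgram, blob.data(), blob.size(), out, 10, 3, true
+ * );
+ * \endcode
+ *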
+ * \internal This is the base implementation that should be specialised by
+ * each backend separately.
+ */
+ template< typename U >
+ RC exec(
+ void ( *alp_program )( const void *, const size_t, U & ),
+ const void * data_in, const size_t in_size,
+ U &data_out,
+ const size_t inner, const size_t outer,
+ const bool broadcast = false
+ ) const {
+ (void) alp_program;
+ (void) data_in;
+ (void) in_size;
+ (void) data_out;
+ (void) inner;
+ (void) outer;
+ (void) broadcast;
+
+ // stub implementation, should be overridden by specialised implementation.
+ // furthermore, it should be impossible to call this function without
+ // triggering an exception during construction of this stub class, so we
+ // just return PANIC here
+ return PANIC;
+ }
-}; // namespace grb
+ /**
+ * Releases all ALP resources.
+ *
+ * Calling this function is equivalent to calling #grb::Launcher::finalize.
+ *
+ * After a call to this function, no further ALP programs may be benchmarked
+ * nor launched-- i.e., both the #grb::Launcher and #grb::Benchmarker
+ * functionalities may no longer be used.
+ *
+ * A well-behaving program calls this function, or #grb::Launcher::finalize,
+ * exactly once and just before exiting (or just before the guaranteed last
+ * invocation of an ALP program).
+ *
+ * @return #grb::SUCCESS The resources have successfully and permanently been
+ * released.
+ * @return #grb::PANIC An unrecoverable error has been encountered and the
+ * user program is encouraged to exit as quickly as
+ * possible. The state of the ALP library has become
+ * undefined and should no longer be used.
+ *
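+ * \par Example
+ *
+ * A sketch of a well-behaving exit sequence, assuming automatic mode:
+ * \code
+ * // ...the last call to bench.exec has completed...
+ * const grb::RC rc = grb::Benchmarker< grb::AUTOMATIC >::finalize();
+ * assert( rc == grb::SUCCESS );
+ * \endcode
+ *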
+ * \internal This is the base implementation that should be specialised by
+ * each backend separately.
+ */
+ static RC finalize() {
+ return Launcher< mode, implementation >::finalize();
+ }
+
+ };
} // end namespace ``grb''
#endif // end _H_GRB_BENCH_BASE
+
diff --git a/include/graphblas/base/blas1.hpp b/include/graphblas/base/blas1.hpp
index e3d4649af..9d451c1f6 100644
--- a/include/graphblas/base/blas1.hpp
+++ b/include/graphblas/base/blas1.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Defines the ALP/GraphBLAS level-1 API
+ *
* @author A. N. Yzelman
* @date 5th of December 2016
*/
@@ -38,160 +42,3573 @@
namespace grb {
/**
- * \defgroup BLAS1 The Level-1 ALP/GraphBLAS routines
- *
- * A collection of functions that allow ALP/GraphBLAS operators, monoids, and
- * semirings work on a mix of zero-dimensional and one-dimensional containers;
- * i.e., allows various linear algebra operations on scalars and objects of
- * type #grb::Vector.
+ * \defgroup BLAS1 Level-1 Primitives
+ * \ingroup GraphBLAS
+ *
+ * A collection of functions that allow ALP/GraphBLAS operators, monoids, and
+ * semirings to work on a mix of zero-dimensional and one-dimensional containers;
+ * i.e., allows various linear algebra operations on scalars and objects of
+ * type #grb::Vector.
+ *
+ * All functions return an error code of the enum-type #grb::RC.
+ *
+ * Primitives which produce vector output:
+ * -# #grb::set (three variants);
+ * -# #grb::foldr (in-place reduction to the right, scalar-to-vector and
+ * vector-to-vector);
+ * -# #grb::foldl (in-place reduction to the left, scalar-to-vector and
+ * vector-to-vector);
+ * -# #grb::eWiseApply (out-of-place application of a binary function);
+ * -# #grb::eWiseAdd (in-place addition of two vectors, or of a vector and
+ * a scalar, into a vector); and
+ * -# #grb::eWiseMul (in-place multiplication of two vectors, or of a vector
+ * and a scalar, into a vector).
+ *
+ * \note When #grb::eWiseAdd or #grb::eWiseMul with two input scalars is
+ * required, consider first forming the resulting scalar using level-0
+ * primitives, and then using #grb::set, #grb::foldl, or #grb::foldr, as
+ * appropriate.
+ *
+ * Primitives that produce scalar output:
+ * -# #grb::foldr (reduction to the right, vector-to-scalar);
+ * -# #grb::foldl (reduction to the left, vector-to-scalar).
+ *
+ * Primitives that do not require an operator, monoid, or semiring:
+ * -# #grb::set (three variants).
+ *
+ * Primitives that could take an operator (see #grb::operators):
+ * -# #grb::foldr, #grb::foldl, and #grb::eWiseApply.
+ * Such operators typically can only be applied on \em dense vectors, i.e.,
+ * vectors with #grb::nnz equal to their #grb::size. Operations on sparse
+ * vectors require an interpretation of missing vector elements, which monoids
+ * or semirings provide.
+ *
+ * Therefore, all aforementioned functions are also defined for monoids instead
+ * of operators.
+ *
+ * The following functions are defined for monoids and semirings, but not for
+ * operators alone:
+ * -# #grb::eWiseAdd (in-place addition).
+ *
+ * The following functions require a semiring, and are not defined for
+ * operators or monoids alone:
+ * -# #grb::dot (in-place reduction of two vectors into a scalar); and
+ * -# #grb::eWiseMul (in-place multiplication).
+ *
+ * Sometimes, we would like operations that are defined for semirings to also
+ * be enabled on \em improper semirings. ALP/GraphBLAS statically checks most
+ * properties required for composing proper semirings, and as such, attempts to
+ * compose improper ones will result in a compilation error. In such cases, we
+ * allow passing an additive monoid and a multiplicative operator instead of a
+ * semiring. The following functions allow this:
+ * -# #grb::dot, #grb::eWiseAdd, #grb::eWiseMul.
+ * The given multiplicative operator can be any binary operator, and in
+ * particular does not need to be associative.
+ *
+ * The algebraic structures lost with improper semirings typically correspond to
+ * distributivity, zero being an annihilator of multiplication, as well as the
+ * concept of \em one. Due to the latter lost structure, the above functions on
+ * improper semirings are \em not defined for pattern inputs.
+ *
+ * \warning I.e., any attempt to use containers of the form
+ * \code
+ * grb::Vector< void >
+ * grb::Matrix< void >
+ * \endcode
+ * with an improper semiring will result in a compile-time error.
+ *
+ * \note Pattern containers are perfectly fine to use with proper semirings.
+ *
+ * \warning If an improper semiring does not have the property that the zero
+ * identity acts as an annihilator over the multiplicative operator,
+ * then the result of #grb::eWiseMul may be unintuitive. Please take
+ * great care in the use of improper semirings.
+ *
+ * For fusing multiple BLAS-1 style operations on any number of inputs and
+ * outputs, users can pass their own operator function to be executed for
+ * every index \a i.
+ * -# #grb::eWiseLambda.
+ * This requires manual application of operators, monoids, and/or semirings
+ * via the level-0 interface -- see #grb::apply, #grb::foldl, and #grb::foldr.
+ *
+ * For all of these functions, the element types of the input and output
+ * containers do not have to match the domains of the given operator, monoid, or
+ * semiring unless the #grb::descriptors::no_casting descriptor was passed.
+ *
+ * An implementation, whether blocking or non-blocking, should have clear
+ * performance semantics for every sequence of GraphBLAS calls, no matter
+ * whether those are made from sequential or parallel contexts. Backends
+ * may define different performance semantics depending on the #grb::Phase
+ * in which primitives execute.
+ *
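+ * \par Example
+ *
+ * As an illustrative sketch of the level-1 style, the following computes a
+ * dot product of two dense vectors under the standard plus-times semiring
+ * (all names assume the standard ALP operators and identities):
+ * \code
+ * const size_t n = 100;
+ * grb::Vector< double > x( n ), y( n );
+ * grb::set( x, 1.0 );
+ * grb::set( y, 2.0 );
+ * double alpha = 0.0;
+ * grb::Semiring<
+ * grb::operators::add< double >, grb::operators::mul< double >,
+ * grb::identities::zero, grb::identities::one
+ * > plusTimes;
+ * const grb::RC rc = grb::dot( alpha, x, y, plusTimes );
+ * // on success, alpha equals 2.0 * n
+ * \endcode
+ *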
+ * @{
+ */
+
+ /**
+ * A standard vector to use for mask parameters.
+ *
+ * Indicates that no mask shall be used.
+ *
+ * \internal Do not use this symbol within backend implementations.
+ */
+ #define NO_MASK Vector< bool >( 0 )
+
+ /**
+ * Computes \f$ z = \alpha \odot \beta \f$, out of place, operator version.
+ *
+ * @tparam descr The descriptor to be used. Equal to
+ * descriptors::no_operation if left unspecified.
+ * @tparam OP The operator to use.
+ * @tparam InputType1 The value type of the left-hand scalar.
+ * @tparam InputType2 The value type of the right-hand scalar.
+ * @tparam OutputType The value type of the output vector.
+ *
+ * @param[out] z The output vector.
+ * @param[in] alpha The left-hand input scalar.
+ * @param[in] beta The right-hand input scalar.
+ * @param[in] op The operator \f$ \odot \f$.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * Specialisation for scalar inputs, unmasked operator version.
+ *
+ * A call to this function is equivalent to the following code:
+ *
+ * \code
+ * typename OP::D3 tmp;
+ * grb::apply( tmp, alpha, beta, op );
+ * grb::set( z, tmp, phase );
+ * \endcode
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP, enum Backend backend,
+ typename OutputType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, backend, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "In eWiseApply ([T1]<-T2<-T3), operator, base\n";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseApplyOpASS_base = false;
+ assert( should_not_call_eWiseApplyOpASS_base );
+#endif
+ (void) z;
+ (void) alpha;
+ (void) beta;
+ (void) op;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Computes \f$ z = \alpha \odot \beta \f$, out of place, operator and masked
+ * version.
+ *
+ * @tparam descr The descriptor to be used. Equal to
+ * descriptors::no_operation if left unspecified.
+ * @tparam OP The operator to use.
+ * @tparam InputType1 The value type of the left-hand scalar.
+ * @tparam InputType2 The value type of the right-hand scalar.
+ * @tparam OutputType The value type of the output vector.
+ * @tparam MaskType The value type of the output mask vector.
+ *
+ * @param[out] z The output vector.
+ * @param[in] mask The output mask.
+ * @param[in] alpha The left-hand input scalar.
+ * @param[in] beta The right-hand input scalar.
+ * @param[in] op The operator \f$ \odot \f$.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * Specialisation for scalar inputs, masked operator version.
+ *
+ * A call to this function is equivalent to the following code:
+ *
+ * \code
+ * typename OP::D3 tmp;
+ * grb::apply( tmp, alpha, beta, op );
+ * grb::set( z, mask, tmp, phase );
+ * \endcode
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP, enum Backend backend,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< MaskType, backend, Coords > &mask,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "In masked eWiseApply ([T1]<-T2<-T3), operator, base\n";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseApplyOpAMSS_base = false;
+ assert( should_not_call_eWiseApplyOpAMSS_base );
+#endif
+ (void) z;
+ (void) mask;
+ (void) alpha;
+ (void) beta;
+ (void) op;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Computes \f$ z = \alpha \odot \beta \f$, out of place, monoid version.
+ *
+ * @tparam descr The descriptor to be used. Equal to
+ * descriptors::no_operation if left unspecified.
+ * @tparam Monoid The monoid to use.
+ * @tparam InputType1 The value type of the left-hand scalar.
+ * @tparam InputType2 The value type of the right-hand scalar.
+ * @tparam OutputType The value type of the output vector.
+ *
+ * @param[out] z The output vector.
+ * @param[in] alpha The left-hand input scalar.
+ * @param[in] beta The right-hand input scalar.
+ * @param[in] monoid The monoid with underlying operator \f$ \odot \f$.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * Specialisation for scalar inputs, unmasked monoid version.
+ *
+ * A call to this function is equivalent to the following code:
+ *
+ * \code
+ * typename Monoid::D3 tmp;
+ * grb::apply( tmp, alpha, beta, monoid.getOperator() );
+ * grb::set( z, tmp, phase );
+ * \endcode
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid, enum Backend backend,
+ typename OutputType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, backend, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "In eWiseApply ([T1]<-T2<-T3), monoid, base\n";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseApplyMonASS_base = false;
+ assert( should_not_call_eWiseApplyMonASS_base );
+#endif
+ (void) z;
+ (void) alpha;
+ (void) beta;
+ (void) monoid;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Computes \f$ z = \alpha \odot \beta \f$, out of place, masked monoid
+ * version.
+ *
+ * @tparam descr The descriptor to be used. Equal to
+ * descriptors::no_operation if left unspecified.
+ * @tparam Monoid The monoid to use.
+ * @tparam InputType1 The value type of the left-hand scalar.
+ * @tparam InputType2 The value type of the right-hand scalar.
+ * @tparam OutputType The value type of the output vector.
+ * @tparam MaskType The value type of the output mask vector.
+ *
+ * @param[out] z The output vector.
+ * @param[in] mask The output mask.
+ * @param[in] alpha The left-hand input scalar.
+ * @param[in] beta The right-hand input scalar.
+ * @param[in] monoid The monoid with underlying operator \f$ \odot \f$.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * Specialisation for scalar inputs, masked monoid version.
+ *
+ * A call to this function is equivalent to the following code:
+ *
+ * \code
+ * typename Monoid::D3 tmp;
+ * grb::apply( tmp, alpha, beta, monoid.getOperator() );
+ * grb::set( z, mask, tmp, phase );
+ * \endcode
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid, enum Backend backend,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< MaskType, backend, Coords > &mask,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "In masked eWiseApply ([T1]<-T2<-T3), monoid, base\n";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseApplyMonAMSS_base = false;
+ assert( should_not_call_eWiseApplyMonAMSS_base );
+#endif
+ (void) z;
+ (void) mask;
+ (void) alpha;
+ (void) beta;
+ (void) monoid;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Computes \f$ z = \alpha \odot y \f$, out of place, operator version.
+ *
+ * Calculates the element-wise operation between one scalar and the elements of
+ * one vector, \f$ z = \alpha \odot y \f$, using the given operator. The input and
+ * output vectors must be of equal length.
+ *
+ * For all indices \a i of \a z, its element \f$ z_i \f$ after the call to this
+ * function completes equals \f$ \alpha \odot y_i \f$. Any old entries of \a z
+ * are removed. Entries \a i for which \a y has no nonzero will be skipped.
+ *
+ * After a successful call to this primitive, the sparsity structure of \a z
+ * shall match that of \a y.
+ *
+ * \note When applying element-wise operators on sparse vectors using
+ * semirings, there is a difference between interpreting missing values
+ * as an annihilating identity or as a neutral identity-- intuitively,
+ * such identities are known as `zero' or `one', respectively. As a
+ * consequence, there are two different variants for element-wise
+ * operations whose names correspond to their intuitive meanings:
+ * - #grb::eWiseAdd (neutral), and
+ * - #grb::eWiseMul (annihilating).
+ * The above two primitives require a semiring. The same functionality is
+ * provided by #grb::eWiseApply depending on whether a monoid or operator
+ * is provided:
+ * - #grb::eWiseApply using monoids (neutral),
+ * - #grb::eWiseApply using operators (annihilating).
+ *
+ * \note However, #grb::eWiseAdd and #grb::eWiseMul provide in-place semantics,
+ * while #grb::eWiseApply does not.
+ *
+ * \note An #grb::eWiseAdd with some semiring and a #grb::eWiseApply using its
+ * additive monoid thus are equivalent when operating on
+ * empty outputs.
+ *
+ * \note An #grb::eWiseMul with some semiring and a #grb::eWiseApply using its
+ * multiplicative operator thus are equivalent when operating on empty
+ * outputs.
+ *
+ * @tparam descr The descriptor to be used. Equal to
+ * descriptors::no_operation if left unspecified.
+ * @tparam OP The operator to use.
+ * @tparam InputType1 The value type of the left-hand scalar.
+ * @tparam InputType2 The value type of the right-hand vector.
+ * @tparam OutputType The value type of the output vector.
+ *
+ * @param[out] z The output vector.
+ * @param[in] alpha The left-hand input scalar.
+ * @param[in] y The right-hand input vector.
+ * @param[in] op The operator \f$ \odot \f$.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
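+ * \par Example
+ *
+ * An illustrative sketch, assuming the standard multiplication operator:
+ * \code
+ * const size_t n = 100;
+ * grb::Vector< double > y( n ), z( n );
+ * grb::setElement( y, 3.0, 0 ); // y has a single nonzero at index 0
+ * const grb::RC rc = grb::eWiseApply(
+ * z, 2.0, y, grb::operators::mul< double >()
+ * );
+ * // on success, z has a single nonzero: 6.0 at index 0
+ * \endcode
+ *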
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a y and \a z do not
+ * match. All input data containers are left untouched
+ * if this exit code is returned; it will be as though
+ * this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP, enum Backend backend,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, backend, Coords > &z,
+ const InputType1 alpha,
+ const Vector< InputType2, backend, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "In eWiseApply ([T1]<-T2<-[T3]), operator, base\n";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseApplyOpASA_base = false;
+ assert( should_not_call_eWiseApplyOpASA_base );
+#endif
+ (void) z;
+ (void) alpha;
+ (void) y;
+ (void) op;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Computes \f$ z = \alpha \odot y \f$, out of place, masked operator version.
+ *
+ * Calculates the element-wise operation between one scalar and the elements of
+ * one vector, \f$ z = \alpha \odot y \f$, using the given operator. The input and
+ * output vectors must be of equal length.
+ *
+ * For all indices \a i of \a z, its element \f$ z_i \f$ after the call to this
+ * function completes equals \f$ \alpha \odot y_i \f$. Any old entries of \a z
+ * are removed. Entries \a i for which \a y has no nonzero will be skipped, as
+ * will entries \a i for which \a mask evaluates false .
+ *
+ * \note When applying element-wise operators on sparse vectors using
+ * semirings, there is a difference between interpreting missing values
+ * as an annihilating identity or as a neutral identity-- intuitively,
+ * such identities are known as `zero' or `one', respectively. As a
+ * consequence, there are two different variants for element-wise
+ * operations whose names correspond to their intuitive meanings:
+ * - #grb::eWiseAdd (neutral), and
+ * - #grb::eWiseMul (annihilating).
+ * The above two primitives require a semiring. The same functionality is
+ * provided by #grb::eWiseApply depending on whether a monoid or operator
+ * is provided:
+ * - #grb::eWiseApply using monoids (neutral),
+ * - #grb::eWiseApply using operators (annihilating).
+ *
+ * \note However, #grb::eWiseAdd and #grb::eWiseMul provide in-place semantics,
+ * while #grb::eWiseApply does not.
+ *
+ * \note An #grb::eWiseAdd with some semiring and a #grb::eWiseApply using its
+ * additive monoid thus are equivalent when operating on
+ * empty outputs.
+ *
+ * \note An #grb::eWiseMul with some semiring and a #grb::eWiseApply using its
+ * multiplicative operator thus are equivalent when operating on empty
+ * outputs.
+ *
+ * @tparam descr The descriptor to be used. Equal to
+ * descriptors::no_operation if left unspecified.
+ * @tparam OP The operator to use.
+ * @tparam InputType1 The value type of the left-hand scalar.
+ * @tparam InputType2 The value type of the right-hand vector.
+ * @tparam OutputType The value type of the output vector.
+ * @tparam MaskType The value type of the mask vector.
+ *
+ * @param[out] z The output vector.
+ * @param[in] mask The output mask.
+ * @param[in] alpha The left-hand input scalar.
+ * @param[in] y The right-hand input vector.
+ * @param[in] op The operator \f$ \odot \f$.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
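+ * \par Example
+ *
+ * An illustrative sketch of masked application, assuming the standard
+ * multiplication operator:
+ * \code
+ * const size_t n = 100;
+ * grb::Vector< double > y( n ), z( n );
+ * grb::Vector< bool > mask( n );
+ * grb::set( y, 1.5 );
+ * grb::setElement( mask, true, 0 ); // only index 0 may be written
+ * const grb::RC rc = grb::eWiseApply(
+ * z, mask, 2.0, y, grb::operators::mul< double >()
+ * );
+ * // on success, z has a single nonzero: 3.0 at index 0
+ * \endcode
+ *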
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a y and \a z do not
+ * match. All input data containers are left untouched
+ * if this exit code is returned; it will be as though
+ * this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP, enum Backend backend,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< MaskType, backend, Coords > &mask,
+ const InputType1 alpha,
+ const Vector< InputType2, backend, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "In masked eWiseApply ([T1]<-T2<-[T3], operator, base)\n";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseApplyOpAMSA_base = false;
+ assert( should_not_call_eWiseApplyOpAMSA_base );
+#endif
+ (void) z;
+ (void) mask;
+ (void) alpha;
+ (void) y;
+ (void) op;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Computes \f$ z = \alpha \odot y \f$, out of place, monoid version.
+ *
+ * Calculates the element-wise operation between one scalar and the elements of
+ * one vector, \f$ z = \alpha \odot y \f$, using the given operator. The input and
+ * output vectors must be of equal length.
+ *
+ * For all indices \a i of \a z, its element \f$ z_i \f$ after the call to this
+ * function completes equals \f$ \alpha \odot y_i \f$. Any old entries of \a z
+ * are removed.
+ *
+ * After a successful call to this primitive, \a z shall be dense.
+ *
+ * \note When applying element-wise operators on sparse vectors using
+ * semirings, there is a difference between interpreting missing values
+ * as an annihilating identity or as a neutral identity-- intuitively,
+ * such identities are known as `zero' or `one', respectively. As a
+ * consequence, there are two different variants for element-wise
+ * operations whose names correspond to their intuitive meanings:
+ * - #grb::eWiseAdd (neutral), and
+ * - #grb::eWiseMul (annihilating).
+ * The above two primitives require a semiring. The same functionality is
+ * provided by #grb::eWiseApply depending on whether a monoid or operator
+ * is provided:
+ * - #grb::eWiseApply using monoids (neutral),
+ * - #grb::eWiseApply using operators (annihilating).
+ *
+ * \note However, #grb::eWiseAdd and #grb::eWiseMul provide in-place semantics,
+ * while #grb::eWiseApply does not.
+ *
+ * \note An #grb::eWiseAdd with some semiring and a #grb::eWiseApply using its
+ * additive monoid thus are equivalent when operating on
+ * empty outputs.
+ *
+ * \note An #grb::eWiseMul with some semiring and a #grb::eWiseApply using its
+ * multiplicative operator thus are equivalent when operating on empty
+ * outputs.
+ *
+ * @tparam descr The descriptor to be used. Equal to
+ * descriptors::no_operation if left unspecified.
+ * @tparam Monoid The monoid to use.
+ * @tparam InputType1 The value type of the left-hand scalar.
+ * @tparam InputType2 The value type of the right-hand vector.
+ * @tparam OutputType The value type of the output vector.
+ *
+ * @param[out] z The output vector.
+ * @param[in] alpha The left-hand input scalar.
+ * @param[in] y The right-hand input vector.
+ * @param[in] monoid The monoid that provides the operator \f$ \odot \f$.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
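+ * \par Example
+ *
+ * An illustrative sketch that contrasts this monoid variant with the
+ * operator variant: under a monoid, missing entries of \a y are interpreted
+ * as the monoid identity, hence \a z becomes dense. It assumes the standard
+ * additive monoid:
+ * \code
+ * const size_t n = 100;
+ * grb::Vector< double > y( n ), z( n );
+ * grb::setElement( y, 3.0, 0 ); // y is sparse
+ * grb::Monoid<
+ * grb::operators::add< double >, grb::identities::zero
+ * > plusMonoid;
+ * const grb::RC rc = grb::eWiseApply( z, 2.0, y, plusMonoid );
+ * // on success, z is dense: 5.0 at index 0, 2.0 everywhere else
+ * \endcode
+ *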
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a y and \a z do not
+ * match. All input data containers are left untouched
+ * if this exit code is returned; it will be as though
+ * this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid, enum Backend backend,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, backend, Coords > &z,
+ const InputType1 alpha,
+ const Vector< InputType2, backend, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "In unmasked eWiseApply ([T1]<-T2<-[T3], monoid, base)\n";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseApplyMonoidASA_base = false;
+ assert( should_not_call_eWiseApplyMonoidASA_base );
+#endif
+ (void) z;
+ (void) alpha;
+ (void) y;
+ (void) monoid;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Computes \f$ z = \alpha \odot y \f$, out of place, masked monoid variant.
+ *
+ * Calculates the element-wise operation between one scalar and the elements of
+ * one vector, \f$ z = \alpha \odot y \f$, using the given operator. The input and
+ * output vectors must be of equal length.
+ *
+ * For all indices \a i of \a z, its element \f$ z_i \f$ after the call to this
+ * function completes equals \f$ \alpha \odot y_i \f$. Any old entries of \a z
+ * are removed. Entries \a i for which \a mask evaluates false will be
+ * skipped.
+ *
+ * After a successful call to this primitive, the sparsity structure of \a z
+ * shall match that of \a mask (after interpretation).
+ *
+ * \note When applying element-wise operators on sparse vectors using
+ * semirings, there is a difference between interpreting missing values
+ * as an annihilating identity or as a neutral identity-- intuitively,
+ * such identities are known as `zero' or `one', respectively. As a
+ * consequence, there are two different variants for element-wise
+ * operations whose names correspond to their intuitive meanings:
+ * - #grb::eWiseAdd (neutral), and
+ * - #grb::eWiseMul (annihilating).
+ * The above two primitives require a semiring. The same functionality is
+ * provided by #grb::eWiseApply depending on whether a monoid or operator
+ * is provided:
+ * - #grb::eWiseApply using monoids (neutral),
+ * - #grb::eWiseApply using operators (annihilating).
+ *
+ * \note However, #grb::eWiseAdd and #grb::eWiseMul provide in-place semantics,
+ * while #grb::eWiseApply does not.
+ *
+ * \note An #grb::eWiseAdd with some semiring and a #grb::eWiseApply using its
+ * additive monoid thus are equivalent when operating on
+ * empty outputs.
+ *
+ * \note An #grb::eWiseMul with some semiring and a #grb::eWiseApply using its
+ * multiplicative operator thus are equivalent when operating on empty
+ * outputs.
+ *
+ * @tparam descr The descriptor to be used. Equal to
+ * descriptors::no_operation if left unspecified.
+ * @tparam Monoid The monoid to use.
+ * @tparam InputType1 The value type of the left-hand scalar.
+ * @tparam InputType2 The value type of the right-hand vector.
+ * @tparam OutputType The value type of the output vector.
+ * @tparam MaskType The value type of the output mask vector.
+ *
+ * @param[out] z The output vector.
+ * @param[in] mask The output mask.
+ * @param[in] alpha The left-hand input scalar.
+ * @param[in] y The right-hand input vector.
+ * @param[in] monoid The monoid that provides the operator \f$ \odot \f$.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a mask, \a y, and \a z do
+ * not match. All input data containers are left
+ * untouched if this exit code is returned; it will be
+ * as though this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid, enum Backend backend,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< MaskType, backend, Coords > &mask,
+ const InputType1 alpha,
+ const Vector< InputType2, backend, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value,
+ void >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "In masked eWiseApply ([T1]<-T2<-[T3], using monoid)\n";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseApplyMonoidAMSA_base = false;
+ assert( should_not_call_eWiseApplyMonoidAMSA_base );
+#endif
+ (void) z;
+ (void) mask;
+ (void) alpha;
+ (void) y;
+ (void) monoid;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Computes \f$ z = x \odot \beta \f$, out of place, operator variant.
+ *
+ * Calculates the element-wise operation between the elements of one vector and
+ * one scalar, \f$ z = x \odot \beta \f$, using the given operator. The input and
+ * output vectors must be of equal length.
+ *
+ * For all valid indices \a i of \a z, its element \f$ z_i \f$ after the call
+ * to this function completes equals \f$ x_i \odot \beta \f$. Any old entries
+ * of \a z are removed.
+ *
+ * Entries \a i for which no nonzero exists in \a x are skipped. Therefore,
+ * after a successful call to this primitive, the nonzero structure of \a z
+ * will match that of \a x.
+ *
+ * \note When applying element-wise operators on sparse vectors using
+ * semirings, there is a difference between interpreting missing values
+ * as an annihilating identity or as a neutral identity-- intuitively,
+ * such identities are known as `zero' or `one', respectively. As a
+ * consequence, there are two different variants for element-wise
+ * operations whose names correspond to their intuitive meanings:
+ * - #grb::eWiseAdd (neutral), and
+ * - #grb::eWiseMul (annihilating).
+ * The above two primitives require a semiring. The same functionality is
+ * provided by #grb::eWiseApply depending on whether a monoid or operator
+ * is provided:
+ * - #grb::eWiseApply using monoids (neutral),
+ * - #grb::eWiseApply using operators (annihilating).
+ *
+ * \note However, #grb::eWiseAdd and #grb::eWiseMul provide in-place semantics,
+ * while #grb::eWiseApply does not.
+ *
+ * \note An #grb::eWiseAdd with some semiring and a #grb::eWiseApply using its
+ * additive monoid thus are equivalent when operating on
+ * empty outputs.
+ *
+ * \note An #grb::eWiseMul with some semiring and a #grb::eWiseApply using its
+ * multiplicative operator thus are equivalent when operating on empty
+ * outputs.
+ *
+ * @tparam descr The descriptor to be used. Equal to
+ * descriptors::no_operation if left unspecified.
+ * @tparam OP The operator to use.
+ * @tparam InputType1 The value type of the left-hand vector.
+ * @tparam InputType2 The value type of the right-hand scalar.
+ * @tparam OutputType The value type of the output vector.
+ *
+ * @param[out] z The output vector.
+ * @param[in] x The left-hand input vector.
+ * @param[in] beta The right-hand input scalar.
+ * @param[in] op The operator \f$ \odot \f$.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
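+ * \par Example
+ *
+ * An illustrative sketch with the scalar on the right-hand side, assuming
+ * the standard multiplication operator:
+ * \code
+ * const size_t n = 100;
+ * grb::Vector< double > x( n ), z( n );
+ * grb::setElement( x, 4.0, 2 ); // x has a single nonzero at index 2
+ * const grb::RC rc = grb::eWiseApply(
+ * z, x, 0.5, grb::operators::mul< double >()
+ * );
+ * // on success, z matches the sparsity of x: 2.0 at index 2
+ * \endcode
+ *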
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a x and \a z do not
+ * match. All input data containers are left untouched
+ * if this exit code is returned; it will be as though
+ * this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP, enum Backend backend,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< InputType1, backend, Coords > &x,
+ const InputType2 beta,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "In eWiseApply ([T1]<-[T2]<-T3), operator, base\n";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseApplyOpAAS_base = false;
+ assert( should_not_call_eWiseApplyOpAAS_base );
+#endif
+ (void) z;
+ (void) x;
+ (void) beta;
+ (void) op;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Computes \f$ z = x \odot \beta \f$, out of place, masked operator variant.
+ *
+ * Calculates the element-wise operation between the elements of one vector and
+ * one scalar, \f$ z = x \odot \beta \f$, using the given operator. The input and
+ * output vectors must be of equal length.
+ *
+ * For all valid indices \a i of \a z, its element \f$ z_i \f$ after the call
+ * to this function completes equals \f$ x_i \odot \beta \f$. Any old entries
+ * of \a z are removed.
+ *
+ * Entries \a i for which no nonzero exists in \a x are skipped. Entries \a i
+ * for which the mask evaluates false are skipped as well.
+ *
+ * \note When applying element-wise operators on sparse vectors using
+ * semirings, there is a difference between interpreting missing values
+ * as an annihilating identity or as a neutral identity-- intuitively,
+ * such identities are known as `zero' or `one', respectively. As a
+ * consequence, there are two different variants for element-wise
+ * operations whose names correspond to their intuitive meanings:
+ * - #grb::eWiseAdd (neutral), and
+ * - #grb::eWiseMul (annihilating).
+ * The above two primitives require a semiring. The same functionality is
+ * provided by #grb::eWiseApply depending on whether a monoid or operator
+ * is provided:
+ * - #grb::eWiseApply using monoids (neutral),
+ * - #grb::eWiseApply using operators (annihilating).
+ *
+ * \note However, #grb::eWiseAdd and #grb::eWiseMul provide in-place semantics,
+ * while #grb::eWiseApply does not.
+ *
+ * \note An #grb::eWiseAdd with some semiring and a #grb::eWiseApply using its
+ * additive monoid thus are equivalent when operating on
+ * empty outputs.
+ *
+ * \note An #grb::eWiseMul with some semiring and a #grb::eWiseApply using its
+ * multiplicative operator thus are equivalent when operating on empty
+ * outputs.
+ *
+ * @tparam descr The descriptor to be used. Equal to
+ * descriptors::no_operation if left unspecified.
+ * @tparam OP The operator to use.
+ * @tparam InputType1 The value type of the left-hand vector.
+ * @tparam InputType2 The value type of the right-hand scalar.
+ * @tparam OutputType The value type of the output vector.
+ * @tparam MaskType The value type of the output mask vector.
+ *
+ * @param[out] z The output vector.
+ * @param[in] mask The output mask.
+ * @param[in] x The left-hand input vector.
+ * @param[in] beta The right-hand input scalar.
+ * @param[in] op The operator \f$ \odot \f$.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a mask, \a x, and \a z do
+ * not match. All input data containers are left
+ * untouched if this exit code is returned; it will be
+ * as though this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP, enum Backend backend,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< MaskType, backend, Coords > &mask,
+ const Vector< InputType1, backend, Coords > &x,
+ const InputType2 beta,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "In masked eWiseApply ([T1]<-[T2]<-T3, operator, base)\n";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseApplyOpAMAS_base = false;
+ assert( should_not_call_eWiseApplyOpAMAS_base );
+#endif
+ (void) z;
+ (void) mask;
+ (void) x;
+ (void) beta;
+ (void) op;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Computes \f$ z = x \odot \beta \f$, out of place, monoid variant.
+ *
+ * Calculates the element-wise operation between the elements of one vector and
+ * one scalar, \f$ z = x \odot \beta \f$, using the given operator. The input and
+ * output vectors must be of equal length.
+ *
+ * For all indices \a i of \a z, its element \f$ z_i \f$ after the call to this
+ * function completes equals \f$ x_i \odot \beta \f$. Any old entries of \a z
+ * are removed.
+ *
+ * After a successful call to this primitive, \a z shall be dense.
+ *
+ * \note When applying element-wise operators on sparse vectors using
+ * semirings, there is a difference between interpreting missing values
+ * as an annihilating identity or as a neutral identity-- intuitively,
+ * such identities are known as `zero' or `one', respectively. As a
+ * consequence, there are two different variants for element-wise
+ * operations whose names correspond to their intuitive meanings:
+ * - #grb::eWiseAdd (neutral), and
+ * - #grb::eWiseMul (annihilating).
+ * The above two primitives require a semiring. The same functionality is
+ * provided by #grb::eWiseApply depending on whether a monoid or operator
+ * is provided:
+ * - #grb::eWiseApply using monoids (neutral),
+ * - #grb::eWiseApply using operators (annihilating).
+ *
+ * \note However, #grb::eWiseAdd and #grb::eWiseMul provide in-place semantics,
+ * while #grb::eWiseApply does not.
+ *
+ * \note An #grb::eWiseAdd with some semiring and a #grb::eWiseApply using its
+ * additive monoid thus are equivalent when operating on
+ * empty outputs.
+ *
+ * \note An #grb::eWiseMul with some semiring and a #grb::eWiseApply using its
+ * multiplicative operator thus are equivalent when operating on empty
+ * outputs.
+ *
+ * @tparam descr The descriptor to be used. Equal to
+ * descriptors::no_operation if left unspecified.
+ * @tparam Monoid The monoid to use.
+ * @tparam InputType1 The value type of the left-hand vector.
+ * @tparam InputType2 The value type of the right-hand scalar.
+ * @tparam OutputType The value type of the output vector.
+ *
+ * @param[out] z The output vector.
+ * @param[in] x The left-hand input vector.
+ * @param[in] beta The right-hand input scalar.
+ * @param[in] monoid The monoid that provides the operator \f$ \odot \f$.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a x and \a z do not
+ * match. All input data containers are left untouched
+ * if this exit code is returned; it will be as though
+ * this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid, enum Backend backend,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< InputType1, backend, Coords > &x,
+ const InputType2 beta,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value,
+ void >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "In unmasked eWiseApply ([T1]<-[T2]<-T3, monoid, base)\n";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseApplyMonoidAAS_base = false;
+ assert( should_not_call_eWiseApplyMonoidAAS_base );
+#endif
+ (void) z;
+ (void) x;
+ (void) beta;
+ (void) monoid;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Computes \f$ z = x \odot \beta \f$, out of place, masked monoid variant.
+ *
+ * Calculates the element-wise operation between the elements of one vector and
+ * one scalar, \f$ z = x \odot \beta \f$, using the given operator. The input and
+ * output vectors must be of equal length.
+ *
+ * For all indices \a i of \a z, its element \f$ z_i \f$ after the call to this
+ * function completes equals \f$ x_i \odot \beta \f$. Any old entries of \a z
+ * are removed. Entries \a i for which \a mask evaluates false will be
+ * skipped.
+ *
+ * After a successful call to this primitive, the sparsity structure of \a z
+ * matches that of \a mask (after interpretation).
+ *
+ * \note When applying element-wise operators on sparse vectors using
+ * semirings, there is a difference between interpreting missing values
+ * as an annihilating identity or as a neutral identity-- intuitively,
+ * such identities are known as `zero' or `one', respectively. As a
+ * consequence, there are two different variants for element-wise
+ * operations whose names correspond to their intuitive meanings:
+ * - #grb::eWiseAdd (neutral), and
+ * - #grb::eWiseMul (annihilating).
+ * The above two primitives require a semiring. The same functionality is
+ * provided by #grb::eWiseApply depending on whether a monoid or operator
+ * is provided:
+ * - #grb::eWiseApply using monoids (neutral),
+ * - #grb::eWiseApply using operators (annihilating).
+ *
+ * \note However, #grb::eWiseAdd and #grb::eWiseMul provide in-place semantics,
+ * while #grb::eWiseApply does not.
+ *
+ * \note An #grb::eWiseAdd with some semiring and a #grb::eWiseApply using its
+ * additive monoid thus are equivalent when operating on
+ * empty outputs.
+ *
+ * \note An #grb::eWiseMul with some semiring and a #grb::eWiseApply using its
+ * multiplicative operator thus are equivalent when operating on empty
+ * outputs.
+ *
+ * @tparam descr The descriptor to be used. Equal to
+ * descriptors::no_operation if left unspecified.
+ * @tparam Monoid The monoid to use.
+ * @tparam InputType1 The value type of the left-hand vector.
+ * @tparam InputType2 The value type of the right-hand scalar.
+ * @tparam OutputType The value type of the output vector.
+ * @tparam MaskType The value type of the mask vector.
+ *
+ * @param[out] z The output vector.
+ * @param[in] mask The output mask.
+ * @param[in] x The left-hand input vector.
+ * @param[in] beta The right-hand input scalar.
+ * @param[in] monoid The monoid that provides the operator \f$ \odot \f$.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a mask, \a x and \a z do
+ * not match. All input data containers are left
+ * untouched if this exit code is returned; it will be
+ * as though this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid, enum Backend backend,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< MaskType, backend, Coords > &mask,
+ const Vector< InputType1, backend, Coords > &x,
+ const InputType2 beta,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "In masked eWiseApply ([T1]<-[T2]<-T3, monoid, base)\n";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseApplyMonoidAMAS_base = false;
+ assert( should_not_call_eWiseApplyMonoidAMAS_base );
+#endif
+ (void) z;
+ (void) mask;
+ (void) x;
+ (void) beta;
+ (void) monoid;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Computes \f$ z = x \odot y \f$, out of place, operator variant.
+ *
+ * Calculates the element-wise operation on the elements of two vectors,
+ * \f$ z = x \odot y \f$, using the given operator. The input and output
+ * vectors must be of equal length.
+ *
+ * For all valid indices \a i of \a z, its element \f$ z_i \f$ after the call
+ * to this function completes equals \f$ x_i \odot y_i \f$. Any old entries
+ * of \a z are removed. Entries \a i that are missing from \a x, from \a y,
+ * or from both, are skipped.
+ *
+ * After a successful call to this primitive, the nonzero structure of \a z
+ * will match that of the intersection of \a x and \a y.
+ *
+ * \note When applying element-wise operators on sparse vectors using
+ * semirings, there is a difference between interpreting missing values
+ * as an annihilating identity or as a neutral identity; intuitively,
+ * such identities are known as `zero' or `one', respectively. As a
+ * consequence, there are two different variants for element-wise
+ * operations whose names correspond to their intuitive meanings:
+ * - #grb::eWiseAdd (neutral), and
+ * - #grb::eWiseMul (annihilating).
+ * The above two primitives require a semiring. The same functionality is
+ * provided by #grb::eWiseApply depending on whether a monoid or operator
+ * is provided:
+ * - #grb::eWiseApply using monoids (neutral),
+ * - #grb::eWiseApply using operators (annihilating).
+ *
+ * \note However, #grb::eWiseAdd and #grb::eWiseMul provide in-place semantics,
+ * while #grb::eWiseApply does not.
+ *
+ * \note An #grb::eWiseAdd with some semiring and a #grb::eWiseApply using its
+ * additive monoid are thus equivalent when operating on empty
+ * outputs.
+ *
+ * \note An #grb::eWiseMul with some semiring and a #grb::eWiseApply using its
+ * multiplicative operator thus are equivalent when operating on empty
+ * outputs.
+ *
+ * @tparam descr The descriptor to be used. Optional; the default is
+ * #grb::descriptors::no_operation.
+ * @tparam OP The operator to use.
+ * @tparam InputType1 The value type of the left-hand vector.
+ * @tparam InputType2 The value type of the right-hand vector.
+ * @tparam OutputType The value type of the output vector.
+ *
+ * @param[out] z The output vector.
+ * @param[in] x The left-hand input vector.
+ * @param[in] y The right-hand input vector.
+ * @param[in] op The operator \f$ \odot \f$.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a x, \a y and \a z do not
+ * match. All input data containers are left untouched
+ * if this exit code is returned; it will be as though
+ * this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
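+ *
+ * \par Example
+ * A sketch (illustrative only) of the intersection semantics of the
+ * operator variant; the vector names, values, and the times-operator are
+ * assumptions:
+ * \code
+ * const size_t n = 10;
+ * grb::Vector< double > z( n ), x( n ), y( n );
+ * grb::setElement( x, 1.0, 0 ); // x has its only nonzero at index 0
+ * grb::setElement( y, 2.0, 1 ); // y has its only nonzero at index 1
+ * grb::operators::mul< double > times;
+ * // the supports of x and y do not intersect, so z remains empty
+ * grb::RC rc = grb::eWiseApply( z, x, y, times );
+ * \endcode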
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP, enum Backend backend,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< InputType1, backend, Coords > &x,
+ const Vector< InputType2, backend, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "In eWiseApply ([T1]<-[T2]<-[T3]), operator variant\n";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseApplyOpAAA_base = false;
+ assert( should_not_call_eWiseApplyOpAAA_base );
+#endif
+ (void) z;
+ (void) x;
+ (void) y;
+ (void) op;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Computes \f$ z = x \odot y \f$, out of place, masked operator variant.
+ *
+ * Calculates the element-wise operation on the elements of two vectors,
+ * \f$ z = x \odot y \f$, using the given operator. The input and output
+ * vectors must be of equal length.
+ *
+ * For all valid indices \a i of \a z, its element \f$ z_i \f$ after the call
+ * to this function completes equals \f$ x_i \odot y_i \f$. Any old entries
+ * of \a z are removed. Entries \a i that are missing from \a x, from \a y,
+ * or from both, are skipped, as are entries \a i for which \a mask
+ * evaluates false.
+ *
+ * After a successful call to this primitive, the nonzero structure of \a z
+ * will match that of the intersection of \a x and \a y, restricted to
+ * entries where \a mask evaluates true.
+ *
+ * \note When applying element-wise operators on sparse vectors using
+ * semirings, there is a difference between interpreting missing values
+ * as an annihilating identity or as a neutral identity; intuitively,
+ * such identities are known as `zero' or `one', respectively. As a
+ * consequence, there are two different variants for element-wise
+ * operations whose names correspond to their intuitive meanings:
+ * - #grb::eWiseAdd (neutral), and
+ * - #grb::eWiseMul (annihilating).
+ * The above two primitives require a semiring. The same functionality is
+ * provided by #grb::eWiseApply depending on whether a monoid or operator
+ * is provided:
+ * - #grb::eWiseApply using monoids (neutral),
+ * - #grb::eWiseApply using operators (annihilating).
+ *
+ * \note However, #grb::eWiseAdd and #grb::eWiseMul provide in-place semantics,
+ * while #grb::eWiseApply does not.
+ *
+ * \note An #grb::eWiseAdd with some semiring and a #grb::eWiseApply using its
+ * additive monoid are thus equivalent when operating on empty
+ * outputs.
+ *
+ * \note An #grb::eWiseMul with some semiring and a #grb::eWiseApply using its
+ * multiplicative operator thus are equivalent when operating on empty
+ * outputs.
+ *
+ * @tparam descr The descriptor to be used. Optional; the default is
+ * #grb::descriptors::no_operation.
+ * @tparam OP The operator to use.
+ * @tparam InputType1 The value type of the left-hand vector.
+ * @tparam InputType2 The value type of the right-hand vector.
+ * @tparam OutputType The value type of the output vector.
+ * @tparam MaskType The value type of the output mask vector.
+ *
+ * @param[out] z The output vector.
+ * @param[in] mask The output mask.
+ * @param[in] x The left-hand input vector.
+ * @param[in] y The right-hand input vector.
+ * @param[in] op The operator \f$ \odot \f$.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a mask, \a x, \a y, and
+ * \a z do not match. All input data containers are left
+ * untouched if this exit code is returned; it will be
+ * as though this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP, enum Backend backend,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< MaskType, backend, Coords > &mask,
+ const Vector< InputType1, backend, Coords > &x,
+ const Vector< InputType2, backend, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "In masked eWiseApply ([T1]<-[T2]<-[T3], operator, base)\n";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseApplyOpAMAA_base = false;
+ assert( should_not_call_eWiseApplyOpAMAA_base );
+#endif
+ (void) z;
+ (void) mask;
+ (void) x;
+ (void) y;
+ (void) op;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Computes \f$ z = x \odot y \f$, out of place, monoid variant.
+ *
+ * Calculates the element-wise operation on the elements of two vectors,
+ * \f$ z = x \odot y \f$, using the given monoid. The input and output
+ * vectors must be of equal length.
+ *
+ * For all valid indices \a i of \a z, its element \f$ z_i \f$ after the call
+ * to this function completes equals \f$ x_i \odot y_i \f$. Any old entries
+ * of \a z are removed.
+ *
+ * After a successful call to this primitive, the nonzero structure of \a z
+ * will match that of the union of \a x and \a y. An implementing backend may
+ * skip processing indices \a i that are not in the union of the nonzero
+ * structure of \a x and \a y.
+ *
+ * \note When applying element-wise operators on sparse vectors using
+ * semirings, there is a difference between interpreting missing values
+ * as an annihilating identity or as a neutral identity; intuitively,
+ * such identities are known as `zero' or `one', respectively. As a
+ * consequence, there are two different variants for element-wise
+ * operations whose names correspond to their intuitive meanings:
+ * - #grb::eWiseAdd (neutral), and
+ * - #grb::eWiseMul (annihilating).
+ * The above two primitives require a semiring. The same functionality is
+ * provided by #grb::eWiseApply depending on whether a monoid or operator
+ * is provided:
+ * - #grb::eWiseApply using monoids (neutral),
+ * - #grb::eWiseApply using operators (annihilating).
+ *
+ * \note However, #grb::eWiseAdd and #grb::eWiseMul provide in-place semantics,
+ * while #grb::eWiseApply does not.
+ *
+ * \note An #grb::eWiseAdd with some semiring and a #grb::eWiseApply using its
+ * additive monoid are thus equivalent when operating on empty
+ * outputs.
+ *
+ * \note An #grb::eWiseMul with some semiring and a #grb::eWiseApply using its
+ * multiplicative operator thus are equivalent when operating on empty
+ * outputs.
+ *
+ * @tparam descr The descriptor to be used. Optional; the default is
+ * #grb::descriptors::no_operation.
+ * @tparam Monoid The monoid to use.
+ * @tparam InputType1 The value type of the left-hand vector.
+ * @tparam InputType2 The value type of the right-hand vector.
+ * @tparam OutputType The value type of the output vector.
+ *
+ * @param[out] z The output vector.
+ * @param[in] x The left-hand input vector.
+ * @param[in] y The right-hand input vector.
+ * @param[in] monoid The monoid structure that \f$ \odot \f$ corresponds to.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a x, \a y and \a z do not
+ * match. All input data containers are left untouched
+ * if this exit code is returned; it will be as though
+ * this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
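+ *
+ * \par Example
+ * A sketch (illustrative only) contrasting the monoid variant with the
+ * operator variant: missing entries act as the monoid identity, so the
+ * output structure is the union of the input structures. All names and
+ * values are assumptions:
+ * \code
+ * const size_t n = 10;
+ * grb::Vector< double > z( n ), x( n ), y( n );
+ * grb::setElement( x, 1.0, 0 );
+ * grb::setElement( y, 2.0, 1 );
+ * grb::Monoid<
+ *     grb::operators::add< double >, grb::identities::zero
+ * > plusM;
+ * // z will hold 1.0 at index 0 and 2.0 at index 1
+ * grb::RC rc = grb::eWiseApply( z, x, y, plusM );
+ * \endcode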
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid, enum Backend backend,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< InputType1, backend, Coords > &x,
+ const Vector< InputType2, backend, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "In unmasked eWiseApply ([T1]<-[T2]<-[T3], monoid, base)\n";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseApplyMonoidAAA_base = false;
+ assert( should_not_call_eWiseApplyMonoidAAA_base );
+#endif
+ (void) z;
+ (void) x;
+ (void) y;
+ (void) monoid;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Computes \f$ z = x \odot y \f$, out of place, masked monoid variant.
+ *
+ * Calculates the element-wise operation on the elements of two vectors,
+ * \f$ z = x \odot y \f$, using the given monoid. The input and output
+ * vectors must be of equal length.
+ *
+ * For all valid indices \a i of \a z, its element \f$ z_i \f$ after the call
+ * to this function completes equals \f$ x_i \odot y_i \f$. Any old entries
+ * of \a z are removed. Entries \a i for which \a mask evaluates false
+ * will be skipped.
+ *
+ * \note When applying element-wise operators on sparse vectors using
+ * semirings, there is a difference between interpreting missing values
+ * as an annihilating identity or as a neutral identity; intuitively,
+ * such identities are known as `zero' or `one', respectively. As a
+ * consequence, there are two different variants for element-wise
+ * operations whose names correspond to their intuitive meanings:
+ * - #grb::eWiseAdd (neutral), and
+ * - #grb::eWiseMul (annihilating).
+ * The above two primitives require a semiring. The same functionality is
+ * provided by #grb::eWiseApply depending on whether a monoid or operator
+ * is provided:
+ * - #grb::eWiseApply using monoids (neutral),
+ * - #grb::eWiseApply using operators (annihilating).
+ *
+ * \note However, #grb::eWiseAdd and #grb::eWiseMul provide in-place semantics,
+ * while #grb::eWiseApply does not.
+ *
+ * \note An #grb::eWiseAdd with some semiring and a #grb::eWiseApply using its
+ * additive monoid are thus equivalent when operating on empty
+ * outputs.
+ *
+ * \note An #grb::eWiseMul with some semiring and a #grb::eWiseApply using its
+ * multiplicative operator thus are equivalent when operating on empty
+ * outputs.
+ *
+ * @tparam descr The descriptor to be used. Optional; the default is
+ * #grb::descriptors::no_operation.
+ * @tparam Monoid The monoid to use.
+ * @tparam InputType1 The value type of the left-hand vector.
+ * @tparam InputType2 The value type of the right-hand vector.
+ * @tparam OutputType The value type of the output vector.
+ * @tparam MaskType The value type of the mask vector.
+ *
+ * @param[out] z The output vector.
+ * @param[in] mask The output mask.
+ * @param[in] x The left-hand input vector.
+ * @param[in] y The right-hand input vector.
+ * @param[in] monoid The monoid structure that \f$ \odot \f$ corresponds to.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a mask, \a x, \a y,
+ * and \a z do not match. All input data containers are
+ * left untouched if this exit code is returned; it
+ * will be as though this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
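+ *
+ * \par Example
+ * A sketch (illustrative only) combining masking with the monoid (union)
+ * semantics; all names and values are assumptions:
+ * \code
+ * const size_t n = 10;
+ * grb::Vector< double > z( n ), x( n ), y( n );
+ * grb::Vector< bool > mask( n );
+ * grb::setElement( mask, true, 0 ); // only index 0 may be written
+ * grb::setElement( x, 1.0, 0 );
+ * grb::setElement( y, 2.0, 1 );
+ * grb::Monoid<
+ *     grb::operators::add< double >, grb::identities::zero
+ * > plusM;
+ * // index 1 is in the union of x and y, but the mask filters it out;
+ * // z will hold 1.0 at index 0 only
+ * grb::RC rc = grb::eWiseApply( z, mask, x, y, plusM );
+ * \endcode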
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid, enum Backend backend,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< MaskType, backend, Coords > &mask,
+ const Vector< InputType1, backend, Coords > &x,
+ const Vector< InputType2, backend, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "In masked eWiseApply ([T1]<-[T2]<-[T3], monoid, base)\n";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseApplyMonoidAMAA_base = false;
+ assert( should_not_call_eWiseApplyMonoidAMAA_base );
+#endif
+ (void) z;
+ (void) mask;
+ (void) x;
+ (void) y;
+ (void) monoid;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Calculates the element-wise addition of two vectors, \f$ z += x .+ y \f$,
+ * under a given semiring.
+ *
+ * \note This is an in-place operation.
+ *
+ * \deprecated This function has been deprecated since v0.5. It may be removed
+ * at v1.0 of ALP/GraphBLAS, or at any earlier version.
+ *
+ * \note A call to this function is equivalent to two in-place fold operations
+ * using the additive monoid of the given semiring. Please update any
+ * code that calls #grb::eWiseAdd with such a sequence as soon as
+ * possible.
+ *
+ * \note We may consider providing this function as an algorithm in the
+ * #grb::algorithms namespace, similar to #grb::algorithms::mpv. Please
+ * let the maintainers know if you would prefer such a solution over
+ * outright removal and replacement with two folds.
+ *
+ * @tparam descr The descriptor to be used. Optional; default is
+ * #grb::descriptors::no_operation.
+ * @tparam Ring The semiring type to perform the element-wise addition
+ * on.
+ * @tparam InputType1 The left-hand side input type to the additive operator
+ * of the \a ring.
+ * @tparam InputType2 The right-hand side input type to the additive operator
+ * of the \a ring.
+ * @tparam OutputType The result type of the additive operator of the
+ * \a ring.
+ *
+ * @param[out] z The output vector of type \a OutputType. This may be a
+ * sparse vector.
+ * @param[in] x The left-hand input vector of type \a InputType1. This may
+ * be a sparse vector.
+ * @param[in] y The right-hand input vector of type \a InputType2. This may
+ * be a sparse vector.
+ * @param[in] ring The generalized semiring under which to perform this
+ * element-wise addition.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * \note There is also a masked variant of #grb::eWiseAdd, as well as variants
+ * where \a x and/or \a y are scalars.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a x, \a y, and \a z do
+ * not match. All input data containers are left
+ * untouched; it will be as though this call was never
+ * made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \parblock
+ * \par Valid descriptors
+ * grb::descriptors::no_operation, grb::descriptors::no_casting,
+ * grb::descriptors::dense.
+ *
+ * \note Invalid descriptors will be ignored.
+ *
+ * If #grb::descriptors::no_casting is specified, then 1) the third domain of
+ * \a ring must match \a InputType1, 2) the fourth domain of \a ring must match
+ * \a InputType2, 3) the fourth domain of \a ring must match \a OutputType. If
+ * one of these is not true, the code shall not compile.
+ * \endparblock
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
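+ *
+ * \par Example
+ * A migration sketch (illustrative only) of the fold-based replacement that
+ * the above note suggests. It assumes a plus-times semiring over doubles
+ * and that the semiring exposes its additive monoid via getAdditiveMonoid():
+ * \code
+ * grb::Semiring<
+ *     grb::operators::add< double >, grb::operators::mul< double >,
+ *     grb::identities::zero, grb::identities::one
+ * > ring;
+ * // instead of the deprecated grb::eWiseAdd( z, x, y, ring ):
+ * grb::RC rc = grb::foldl( z, x, ring.getAdditiveMonoid() );
+ * if( rc == grb::SUCCESS ) {
+ *     rc = grb::foldl( z, y, ring.getAdditiveMonoid() );
+ * }
+ * \endcode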
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring, enum Backend backend,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseAdd(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< InputType1, backend, Coords > &x,
+ const Vector< InputType2, backend, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "in eWiseAdd ([T1] <- [T2] + [T3]), unmasked, base";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseAddAAA_base = false;
+ assert( should_not_call_eWiseAddAAA_base );
+#endif
+ (void) z;
+ (void) x;
+ (void) y;
+ (void) ring;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Calculates the element-wise addition, \f$ z += \alpha .+ y \f$, under a
+ * given semiring.
+ *
+ * \note This is an in-place operation.
+ *
+ * \deprecated This function has been deprecated since v0.5. It may be removed
+ * at v1.0 of ALP/GraphBLAS, or at any earlier version.
+ *
+ * \note A call to this function is equivalent to two in-place fold operations
+ * using the additive monoid of the given semiring. Please update any
+ * code that calls #grb::eWiseAdd with such a sequence as soon as
+ * possible.
+ *
+ * \note We may consider providing this function as an algorithm in the
+ * #grb::algorithms namespace, similar to #grb::algorithms::mpv. Please
+ * let the maintainers know if you would prefer such a solution over
+ * outright removal and replacement with two folds.
+ *
+ * @tparam descr The descriptor to be used. Optional; default is
+ * #grb::descriptors::no_operation.
+ * @tparam Ring The semiring type to perform the element-wise addition
+ * on.
+ * @tparam InputType1 The left-hand side input type to the additive operator
+ * of the \a ring.
+ * @tparam InputType2 The right-hand side input type to the additive operator
+ * of the \a ring.
+ * @tparam OutputType The result type of the additive operator of the
+ * \a ring.
+ *
+ * @param[out] z The output vector of type \a OutputType. This may be a
+ * sparse vector.
+ * @param[in] alpha The left-hand input scalar of type \a InputType1.
+ * @param[in] y The right-hand input vector of type \a InputType2. This may
+ * be a sparse vector.
+ * @param[in] ring The generalized semiring under which to perform this
+ * element-wise addition.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a y and \a z do not
+ * match. All input data containers are left untouched;
+ * it will be as though this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \parblock
+ * \par Valid descriptors
+ * grb::descriptors::no_operation, grb::descriptors::no_casting,
+ * grb::descriptors::dense.
+ *
+ * \note Invalid descriptors will be ignored.
+ *
+ * If #grb::descriptors::no_casting is specified, then 1) the third domain of
+ * \a ring must match \a InputType1, 2) the fourth domain of \a ring must match
+ * \a InputType2, 3) the fourth domain of \a ring must match \a OutputType. If
+ * one of these is not true, the code shall not compile.
+ * \endparblock
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring, enum Backend backend,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename Coords
+ >
+ RC eWiseAdd(
+ Vector< OutputType, backend, Coords > &z,
+ const InputType1 alpha,
+ const Vector< InputType2, backend, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "in eWiseAdd ([T1] <- T2 + [T3]), unmasked, base";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseAddASA_base = false;
+ assert( should_not_call_eWiseAddASA_base );
+#endif
+ (void) z;
+ (void) alpha;
+ (void) y;
+ (void) ring;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Calculates the element-wise addition, \f$ z += x .+ \beta \f$, under a
+ * given semiring.
+ *
+ * \note This is an in-place operation.
+ *
+ * \deprecated This function has been deprecated since v0.5. It may be removed
+ * at v1.0 of ALP/GraphBLAS, or at any earlier version.
+ *
+ * \note A call to this function is equivalent to two in-place fold operations
+ * using the additive monoid of the given semiring. Please update any
+ * code that calls #grb::eWiseAdd with such a sequence as soon as
+ * possible.
+ *
+ * \note We may consider providing this function as an algorithm in the
+ * #grb::algorithms namespace, similar to #grb::algorithms::mpv. Please
+ * let the maintainers know if you would prefer such a solution over
+ * outright removal and replacement with two folds.
+ *
+ * @tparam descr The descriptor to be used. Optional; default is
+ * #grb::descriptors::no_operation.
+ * @tparam Ring The semiring type to perform the element-wise addition
+ * on.
+ * @tparam InputType1 The left-hand side input type to the additive operator
+ * of the \a ring.
+ * @tparam InputType2 The right-hand side input type to the additive operator
+ * of the \a ring.
+ * @tparam OutputType The result type of the additive operator of the
+ * \a ring.
+ *
+ * @param[out] z The output vector of type \a OutputType. This may be a
+ * sparse vector.
+ * @param[in] x The left-hand input vector of type \a InputType1. This may
+ * be a sparse vector.
+ * @param[in] beta The right-hand input scalar of type \a InputType2.
+ * @param[in] ring The generalized semiring under which to perform this
+ * element-wise addition.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a x and \a z do not
+ * match. All input data containers are left untouched;
+ * it will be as though this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \parblock
+ * \par Valid descriptors
+ * grb::descriptors::no_operation, grb::descriptors::no_casting,
+ * grb::descriptors::dense.
+ *
+ * \note Invalid descriptors will be ignored.
+ *
+ * If #grb::descriptors::no_casting is specified, then 1) the third domain of
+ * \a ring must match \a InputType1, 2) the fourth domain of \a ring must match
+ * \a InputType2, 3) the fourth domain of \a ring must match \a OutputType. If
+ * one of these is not true, the code shall not compile.
+ * \endparblock
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring, enum Backend backend,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename Coords
+ >
+ RC eWiseAdd(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< InputType1, backend, Coords > &x,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "in eWiseAdd ([T1] <- [T2] + T3), unmasked, base";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseAddAAS_base = false;
+ assert( should_not_call_eWiseAddAAS_base );
+#endif
+ (void) z;
+ (void) x;
+ (void) beta;
+ (void) ring;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Calculates the element-wise addition, \f$ z += \alpha .+ \beta \f$, under a
+ * given semiring.
+ *
+ * \note This is an in-place operation.
+ *
+ * \deprecated This function has been deprecated since v0.5. It may be removed
+ * at v1.0 of ALP/GraphBLAS, or at any earlier version.
+ *
+ * \note A call to this function is equivalent to two in-place fold operations
+ * using the additive monoid of the given semiring. Please update any
+ * code that calls #grb::eWiseAdd with such a sequence as soon as
+ * possible.
+ *
+ * \note We may consider providing this function as an algorithm in the
+ * #grb::algorithms namespace, similar to #grb::algorithms::mpv. Please
+ * let the maintainers know if you would prefer such a solution over
+ * outright removal and replacement with two folds.
+ *
+ * @tparam descr The descriptor to be used. Optional; default is
+ * #grb::descriptors::no_operation.
+ * @tparam Ring The semiring type to perform the element-wise addition
+ * on.
+ * @tparam InputType1 The left-hand side input type to the additive operator
+ * of the \a ring.
+ * @tparam InputType2 The right-hand side input type to the additive operator
+ * of the \a ring.
+ * @tparam OutputType The result type of the additive operator of the
+ * \a ring.
+ *
+ * @param[out] z The output vector of type \a OutputType. This may be a
+ * sparse vector.
+ * @param[in] alpha The left-hand input scalar of type \a InputType1.
+ * @param[in] beta The right-hand input scalar of type \a InputType2.
+ * @param[in] ring The generalized semiring under which to perform this
+ * element-wise addition.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \parblock
+ * \par Valid descriptors
+ * grb::descriptors::no_operation, grb::descriptors::no_casting,
+ * grb::descriptors::dense.
+ *
+ * \note Invalid descriptors will be ignored.
+ *
+ * If #grb::descriptors::no_casting is specified, then 1) the third domain of
+ * \a ring must match \a InputType1, 2) the fourth domain of \a ring must match
+ * \a InputType2, 3) the fourth domain of \a ring must match \a OutputType. If
+ * one of these is not true, the code shall not compile.
+ * \endparblock
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring, enum Backend backend,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename Coords
+ >
+ RC eWiseAdd(
+ Vector< OutputType, backend, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "in eWiseAdd ([T1] <- T2 + T3), unmasked, base";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseAddASS_base = false;
+ assert( should_not_call_eWiseAddASS_base );
+#endif
+ (void) z;
+ (void) alpha;
+ (void) beta;
+ (void) ring;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Calculates the element-wise addition of two vectors, \f$ z += x .+ y \f$,
+ * under a given semiring, masked variant.
+ *
+ * \note This is an in-place operation.
+ *
+ * \deprecated This function has been deprecated since v0.5. It may be removed
+ * at v1.0 of ALP/GraphBLAS, or at any earlier version.
+ *
+ * \note A call to this function is equivalent to two in-place fold operations
+ * using the additive monoid of the given semiring. Please update any
+ * code that calls #grb::eWiseAdd with such a sequence as soon as
+ * possible.
+ *
+ * \note We may consider providing this function as an algorithm in the
+ * #grb::algorithms namespace, similar to #grb::algorithms::mpv. Please
+ * let the maintainers know if you would prefer such a solution over
+ * outright removal and replacement with two folds.
+ *
+ * @tparam descr The descriptor to be used. Optional; default is
+ * #grb::descriptors::no_operation.
+ * @tparam Ring The semiring type to perform the element-wise addition
+ * on.
+ * @tparam InputType1 The left-hand side input type to the additive operator
+ * of the \a ring.
+ * @tparam InputType2 The right-hand side input type to the additive operator
+ * of the \a ring.
+ * @tparam OutputType The result type of the additive operator of the
+ * \a ring.
+ * @tparam MaskType The nonzero type of the output mask vector.
+ *
+ * @param[out] z The output vector of type \a OutputType. This may be a
+ * sparse vector.
+ * @param[in] mask The output mask vector of type \a MaskType.
+ * @param[in] x The left-hand input vector of type \a InputType1. This may
+ * be a sparse vector.
+ * @param[in] y The right-hand input vector of type \a InputType2. This may
+ * be a sparse vector.
+ * @param[in] ring The generalized semiring under which to perform this
+ * element-wise addition.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * \note There are also variants where \a x and/or \a y are scalars, as well
+ * as unmasked variants.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a mask, \a x, \a y, and
+ * \a z do not match. All input data containers are left
+ * untouched; it will be as though this call was never
+ * made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \parblock
+ * \par Valid descriptors
+ * - #grb::descriptors::no_operation,
+ * - #grb::descriptors::no_casting,
+ * - #grb::descriptors::dense,
+ * - #grb::descriptors::invert_mask,
+ * - #grb::descriptors::structural, and
+ * - #grb::descriptors::structural_complement.
+ *
+ * \note Invalid descriptors will be ignored.
+ *
+ * If #grb::descriptors::no_casting is specified, then 1) the third domain of
+ * \a ring must match \a InputType1, 2) the fourth domain of \a ring must match
+ * \a InputType2, 3) the fourth domain of \a ring must match \a OutputType. If
+ * one of these is not true, the code shall not compile.
+ * \endparblock
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
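+ *
+ * \par Example
+ * A sketch (illustrative only) of masked use; the inverted mask and the
+ * plus-times semiring are assumptions, not requirements:
+ * \code
+ * const size_t n = 10;
+ * grb::Vector< double > z( n ), x( n ), y( n );
+ * grb::Vector< bool > mask( n );
+ * grb::Semiring<
+ *     grb::operators::add< double >, grb::operators::mul< double >,
+ *     grb::identities::zero, grb::identities::one
+ * > ring;
+ * // updates z only at positions where mask evaluates false
+ * grb::RC rc = grb::eWiseAdd< grb::descriptors::invert_mask >(
+ *     z, mask, x, y, ring );
+ * \endcode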
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring, enum Backend backend,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseAdd(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< MaskType, backend, Coords > &mask,
+ const Vector< InputType1, backend, Coords > &x,
+ const Vector< InputType2, backend, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "in eWiseAdd ([T1] <- [T2] + [T3]), masked, base";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseAddAMAA_base = false;
+ assert( should_not_call_eWiseAddAMAA_base );
+#endif
+ (void) z;
+ (void) mask;
+ (void) x;
+ (void) y;
+ (void) ring;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Calculates the element-wise addition, \f$ z += \alpha .+ y \f$, under a
+ * given semiring, masked variant.
+ *
+ * \note This is an in-place operation.
+ *
+ * \deprecated This function has been deprecated since v0.5. It may be removed
+ * at v1.0 of ALP/GraphBLAS, or at any earlier version.
+ *
+ * \note A call to this function is equivalent to two in-place fold operations
+ * using the additive monoid of the given semiring. Please update any
+ * code that calls #grb::eWiseAdd with such a sequence as soon as
+ * possible.
+ *
+ * \note We may consider providing this function as an algorithm in the
+ * #grb::algorithms namespace, similar to #grb::algorithms::mpv. Please
+ * let the maintainers know if you would prefer such a solution over
+ * outright removal and replacement with two folds.
+ *
+ * @tparam descr The descriptor to be used. Optional; default is
+ * #grb::descriptors::no_operation.
+ * @tparam Ring The semiring type to perform the element-wise addition
+ * on.
+ * @tparam InputType1 The left-hand side input type to the additive operator
+ * of the \a ring.
+ * @tparam InputType2 The right-hand side input type to the additive operator
+ * of the \a ring.
+ * @tparam OutputType The result type of the additive operator of the
+ * \a ring.
+ * @tparam MaskType The nonzero type of the output mask vector.
+ *
+ * @param[out] z The output vector of type \a OutputType. This may be a
+ * sparse vector.
+ * @param[in] mask The output mask.
+ * @param[in] alpha The left-hand input scalar of type \a InputType1.
+ * @param[in] y The right-hand input vector of type \a InputType2. This may
+ * be a sparse vector.
+ * @param[in] ring The generalized semiring under which to perform this
+ * element-wise addition.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a mask, \a y, and \a z do
+ * not match. All input data containers are left
+ * untouched; it will be as though this call was never
+ * made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \parblock
+ * \par Valid descriptors
+ * - #grb::descriptors::no_operation,
+ * - #grb::descriptors::no_casting,
+ * - #grb::descriptors::dense,
+ * - #grb::descriptors::invert_mask,
+ * - #grb::descriptors::structural, and
+ * - #grb::descriptors::structural_complement.
+ *
+ * \note Invalid descriptors will be ignored.
+ *
+ * If #grb::descriptors::no_casting is specified, then 1) the third domain of
+ * \a ring must match \a InputType1, 2) the fourth domain of \a ring must match
+ * \a InputType2, 3) the fourth domain of \a ring must match \a OutputType. If
+ * one of these is not true, the code shall not compile.
+ * \endparblock
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring, enum Backend backend,
+ typename InputType1, typename InputType2,
+ typename OutputType, typename MaskType,
+ typename Coords
+ >
+ RC eWiseAdd(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< MaskType, backend, Coords > &mask,
+ const InputType1 alpha,
+ const Vector< InputType2, backend, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "in eWiseAdd ([T1] <- T2 + [T3]), masked, base";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseAddAMSA_base = false;
+ assert( should_not_call_eWiseAddAMSA_base );
+#endif
+ (void) z;
+ (void) mask;
+ (void) alpha;
+ (void) y;
+ (void) ring;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Calculates the element-wise addition, \f$ z += x .+ \beta \f$, under a
+ * given semiring, masked variant.
+ *
+ * \note This is an in-place operation.
+ *
+ * \deprecated This function has been deprecated since v0.5. It may be removed
+ * at v1.0 of ALP/GraphBLAS, or at any earlier version.
+ *
+ * \note A call to this function is equivalent to two in-place fold operations
+ * using the additive monoid of the given semiring. Please update any
+ * code that calls #grb::eWiseAdd with such a sequence as soon as
+ * possible.
+ *
+ * \note We may consider providing this function as an algorithm in the
+ * #grb::algorithms namespace, similar to #grb::algorithms::mpv. Please
+ * let the maintainers know if you would prefer such a solution over
+ * outright removal and replacement with two folds.
+ *
+ * @tparam descr The descriptor to be used. Optional; default is
+ * #grb::descriptors::no_operation.
+ * @tparam Ring The semiring type to perform the element-wise addition
+ * on.
+ * @tparam InputType1 The left-hand side input type to the additive operator
+ * of the \a ring.
+ * @tparam InputType2 The right-hand side input type to the additive operator
+ * of the \a ring.
+ * @tparam OutputType The result type of the additive operator of the
+ * \a ring.
+ * @tparam MaskType The nonzero type of the output mask vector.
+ *
+ * @param[out] z The output vector of type \a OutputType. This may be a
+ * sparse vector.
+ * @param[in] mask The output mask.
+ * @param[in] x The left-hand input vector of type \a InputType1. This may
+ * be a sparse vector.
+ * @param[in] beta The right-hand input scalar of type \a InputType2.
+ * @param[in] ring The generalized semiring under which to perform this
+ * element-wise addition.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a mask, \a x, and \a z do
+ * not match. All input data containers are left
+ * untouched; it will be as though this call was never
+ * made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \parblock
+ * \par Valid descriptors
+ * - #grb::descriptors::no_operation,
+ * - #grb::descriptors::no_casting,
+ * - #grb::descriptors::dense,
+ * - #grb::descriptors::invert_mask,
+ * - #grb::descriptors::structural, and
+ * - #grb::descriptors::structural_complement.
+ *
+ * \note Invalid descriptors will be ignored.
+ *
+ * If #grb::descriptors::no_casting is specified, then 1) the third domain of
+ * \a ring must match \a InputType1, 2) the fourth domain of \a ring must match
+ * \a InputType2, 3) the fourth domain of \a ring must match \a OutputType. If
+ * one of these is not true, the code shall not compile.
+ * \endparblock
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring, enum Backend backend,
+ typename InputType1, typename InputType2,
+ typename OutputType, typename MaskType,
+ typename Coords
+ >
+ RC eWiseAdd(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< MaskType, backend, Coords > &mask,
+ const Vector< InputType1, backend, Coords > &x,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "in eWiseAdd ([T1] <- [T2] + T3), masked, base";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseAddAMAS_base = false;
+ assert( should_not_call_eWiseAddAMAS_base );
+#endif
+ (void) z;
+ (void) mask;
+ (void) x;
+ (void) beta;
+ (void) ring;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Calculates the element-wise addition, \f$ z += \alpha .+ \beta \f$, under a
+ * given semiring, masked variant.
+ *
+ * \note This is an in-place operation.
+ *
+ * \deprecated This function has been deprecated since v0.5. It may be removed
+ * at v1.0 of ALP/GraphBLAS, or at any earlier version.
+ *
+ * \note A call to this function is equivalent to two in-place fold operations
+ * using the additive monoid of the given semiring. Please update any
+ * code that calls #grb::eWiseAdd with such a sequence as soon as
+ * possible.
+ *
+ * \note We may consider providing this function as an algorithm in the
+ * #grb::algorithms namespace, similar to #grb::algorithms::mpv. Please
+ * let the maintainers know if you would prefer such a solution over
+ * outright removal and replacement with two folds.
+ *
+ * @tparam descr The descriptor to be used. Optional; default is
+ * #grb::descriptors::no_operation.
+ * @tparam Ring The semiring type to perform the element-wise addition
+ * on.
+ * @tparam InputType1 The left-hand side input type to the additive operator
+ * of the \a ring.
+ * @tparam InputType2 The right-hand side input type to the additive operator
+ * of the \a ring.
+ * @tparam OutputType The result type of the additive operator of the
+ * \a ring.
+ * @tparam MaskType The nonzero type of the output mask vector.
+ *
+ * @param[out] z The output vector of type \a OutputType. This may be a
+ * sparse vector.
+ * @param[in] mask The output mask.
+ * @param[in] alpha The left-hand input scalar of type \a InputType1.
+ * @param[in] beta The right-hand input scalar of type \a InputType2.
+ * @param[in] ring The generalized semiring under which to perform this
+ * element-wise addition.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH If \a mask and \a z do not have the same size.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \parblock
+ * \par Valid descriptors
+ * - #grb::descriptors::no_operation,
+ * - #grb::descriptors::no_casting,
+ * - #grb::descriptors::dense,
+ * - #grb::descriptors::invert_mask,
+ * - #grb::descriptors::structural, and
+ * - #grb::descriptors::structural_complement.
+ *
+ * \note Invalid descriptors will be ignored.
+ *
+ * If #grb::descriptors::no_casting is specified, then 1) the third domain of
+ * \a ring must match \a InputType1, 2) the fourth domain of \a ring must match
+ * \a InputType2, 3) the fourth domain of \a ring must match \a OutputType. If
+ * one of these is not true, the code shall not compile.
+ * \endparblock
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring, enum Backend backend,
+ typename InputType1, typename InputType2,
+ typename OutputType, typename MaskType,
+ typename Coords
+ >
+ RC eWiseAdd(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< MaskType, backend, Coords > &mask,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "in eWiseAdd ([T1] <- T2 + T3), masked, base";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseAddAMSS_base = false;
+ assert( should_not_call_eWiseAddAMSS_base );
+#endif
+ (void) z;
+ (void) mask;
+ (void) alpha;
+ (void) beta;
+ (void) ring;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * In-place element-wise multiplication of two vectors, \f$ z += x .* y \f$,
+ * under a given semiring.
+ *
+ * @tparam descr The descriptor to be used. Optional; the default is
+ * #grb::descriptors::no_operation.
+ * @tparam Ring The semiring type to perform the element-wise multiply
+ * with.
+ * @tparam InputType1 The left-hand side input type.
+ * @tparam InputType2 The right-hand side input type.
+ * @tparam OutputType The output type.
+ *
+ * @param[out] z The output vector of type \a OutputType.
+ * @param[in] x The left-hand input vector of type \a InputType1.
+ * @param[in] y The right-hand input vector of type \a InputType2.
+ * @param[in] ring The generalized semiring under which to perform this
+ * element-wise multiplication.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a x, \a y, and \a z do
+ * not match. All input data containers are left
+ * untouched if this exit code is returned; it will be
+ * as though this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \warning Unlike #grb::eWiseApply using monoids, missing elements in
+ *          sparse input vectors are here interpreted as the zero identity,
+ *          and thus annihilate instead of acting as a neutral monoid
+ *          identity. Therefore, even when \a z is empty on input,
+ *          #grb::eWiseApply with monoids does not exhibit the same
+ *          behaviour as this function. The #grb::eWiseApply with operators
+ *          \em is similar, except that this function is in-place while
+ *          #grb::eWiseApply is not.
+ *
+ * \parblock
+ * \par Valid descriptors
+ * - #grb::descriptors::no_operation,
+ * - #grb::descriptors::no_casting, and
+ * - #grb::descriptors::dense.
+ *
+ * \note Invalid descriptors will be ignored.
+ *
+ * If #grb::descriptors::no_casting is specified, then 1) the first domain of
+ * \a ring must match \a InputType1, 2) the second domain of \a ring must match
+ * \a InputType2, 3) the third domain of \a ring must match \a OutputType. If
+ * one of these is not true, the code shall not compile.
+ * \endparblock
+ *
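+ * \par Example
+ * A minimal usage sketch, assuming equally sized vectors \a z, \a x, and
+ * \a y, and using the plus-times semiring over doubles for illustration:
+ * \code
+ * grb::Semiring<
+ *     grb::operators::add< double >, grb::operators::mul< double >,
+ *     grb::identities::zero, grb::identities::one
+ * > ring;
+ * grb::RC rc = grb::eWiseMul( z, x, y, ring );
+ * // on success: z_i += x_i * y_i wherever both x_i and y_i exist
+ * \endcode
+ *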
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring, enum Backend backend,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< InputType1, backend, Coords > &x,
+ const Vector< InputType2, backend, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "in eWiseMul ([T1] <- [T2] * [T3]), unmasked, base";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseMulAAA_base = false;
+ assert( should_not_call_eWiseMulAAA_base );
+#endif
+ (void) z;
+ (void) x;
+ (void) y;
+ (void) ring;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * In-place element-wise multiplication of a scalar and vector,
+ * \f$ z += \alpha .* y \f$, under a given semiring.
+ *
+ * @tparam descr The descriptor to be used. Optional; the default is
+ * #grb::descriptors::no_operation.
+ * @tparam Ring The semiring type to perform the element-wise multiply
+ * with.
+ * @tparam InputType1 The left-hand side input type.
+ * @tparam InputType2 The right-hand side input type.
+ * @tparam OutputType The output type.
+ *
+ * @param[out] z The output vector of type \a OutputType.
+ * @param[in] alpha The left-hand input scalar of type \a InputType1.
+ * @param[in] y The right-hand input vector of type \a InputType2.
+ * @param[in] ring The generalized semiring under which to perform this
+ * element-wise multiplication.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a y and \a z do not
+ * match. All input data containers are left untouched
+ * if this exit code is returned; it will be as though
+ * this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \warning Unlike #grb::eWiseApply using monoids, given sparse vectors,
+ * missing elements in sparse input vectors are here interpreted as
+ * the zero identity, thus annihilating instead of acting as a
+ * monoid identity. Hence, even when \a z is empty on input,
+ * #grb::eWiseApply with monoids does not exhibit the same behaviour as
+ * this function. The #grb::eWiseApply with operators \em is similar,
+ * except that this function is in-place and #grb::eWiseApply is not.
+ *
+ * \parblock
+ * \par Valid descriptors
+ * - #grb::descriptors::no_operation,
+ * - #grb::descriptors::no_casting, and
+ * - #grb::descriptors::dense.
+ *
+ * \note Invalid descriptors will be ignored.
+ *
+ * If #grb::descriptors::no_casting is specified, then 1) the first domain of
+ * \a ring must match \a InputType1, 2) the second domain of \a ring must match
+ * \a InputType2, 3) the third domain of \a ring must match \a OutputType. If
+ * one of these is not true, the code shall not compile.
+ * \endparblock
+ *
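+ * \par Example
+ * A minimal usage sketch, assuming equally sized vectors \a z and \a y and
+ * a plus-times semiring instance \a ring over doubles, as in the preceding
+ * example:
+ * \code
+ * grb::RC rc = grb::eWiseMul( z, 2.0, y, ring );
+ * // on success: z_i += 2.0 * y_i wherever y_i exists
+ * \endcode
+ *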
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring, enum Backend backend,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, backend, Coords > &z,
+ const InputType1 alpha,
+ const Vector< InputType2, backend, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "in eWiseMul ([T1] <- T2 * [T3]), unmasked, base";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseMulASA_base = false;
+ assert( should_not_call_eWiseMulASA_base );
+#endif
+ (void) z;
+ (void) alpha;
+ (void) y;
+ (void) ring;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * In-place element-wise multiplication of a vector and scalar,
+ * \f$ z += x .* \beta \f$, under a given semiring.
+ *
+ * @tparam descr The descriptor to be used. Optional; the default is
+ * #grb::descriptors::no_operation.
+ * @tparam Ring The semiring type to perform the element-wise multiply
+ * with.
+ * @tparam InputType1 The left-hand side input type.
+ * @tparam InputType2 The right-hand side input type.
+ * @tparam OutputType The output type.
+ *
+ * @param[out] z The output vector of type \a OutputType.
+ * @param[in] x The left-hand input vector of type \a InputType1.
+ * @param[in] beta The right-hand input scalar of type \a InputType2.
+ * @param[in] ring The generalized semiring under which to perform this
+ * element-wise multiplication.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a x and \a z do not
+ * match. All input data containers are left untouched
+ * if this exit code is returned; it will be as though
+ * this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \warning Unlike #grb::eWiseApply using monoids, given sparse vectors,
+ * missing elements in sparse input vectors are here interpreted as
+ * the zero identity, thus annihilating instead of acting as a
+ * monoid identity. Hence, even when \a z is empty on input,
+ * #grb::eWiseApply with monoids does not exhibit the same behaviour as
+ * this function. The #grb::eWiseApply with operators \em is similar,
+ * except that this function is in-place and #grb::eWiseApply is not.
+ *
+ * \parblock
+ * \par Valid descriptors
+ * - #grb::descriptors::no_operation,
+ * - #grb::descriptors::no_casting, and
+ * - #grb::descriptors::dense.
+ *
+ * \note Invalid descriptors will be ignored.
+ *
+ * If #grb::descriptors::no_casting is specified, then 1) the first domain of
+ * \a ring must match \a InputType1, 2) the second domain of \a ring must match
+ * \a InputType2, 3) the third domain of \a ring must match \a OutputType. If
+ * one of these is not true, the code shall not compile.
+ * \endparblock
+ *
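+ * \par Example
+ * A minimal usage sketch, assuming equally sized vectors \a z and \a x and
+ * a plus-times semiring instance \a ring over doubles:
+ * \code
+ * grb::RC rc = grb::eWiseMul( z, x, 0.5, ring );
+ * // on success: z_i += x_i * 0.5 wherever x_i exists
+ * \endcode
+ *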
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring, enum Backend backend,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< InputType1, backend, Coords > &x,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "in eWiseMul ([T1] <- [T2] * T3), unmasked, base";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseMulAAS_base = false;
+ assert( should_not_call_eWiseMulAAS_base );
+#endif
+ (void) z;
+ (void) x;
+ (void) beta;
+ (void) ring;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * In-place element-wise multiplication of two scalars,
+ * \f$ z += \alpha .* \beta \f$, under a given semiring.
+ *
+ * @tparam descr The descriptor to be used. Optional; the default is
+ * #grb::descriptors::no_operation.
+ * @tparam Ring The semiring type to perform the element-wise multiply
+ * with.
+ * @tparam InputType1 The left-hand side input type.
+ * @tparam InputType2 The right-hand side input type.
+ * @tparam OutputType The output type.
+ *
+ * @param[out] z The output vector of type \a OutputType.
+ * @param[in] alpha The left-hand input scalar of type \a InputType1.
+ * @param[in] beta The right-hand input scalar of type \a InputType2.
+ * @param[in] ring The generalized semiring under which to perform this
+ * element-wise multiplication.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \warning Unlike #grb::eWiseApply using monoids, given sparse vectors,
+ * missing elements in sparse input vectors are here interpreted as
+ * the zero identity, thus annihilating instead of acting as a
+ * monoid identity. Hence, even when \a z is empty on input,
+ * #grb::eWiseApply with monoids does not exhibit the same behaviour as
+ * this function. The #grb::eWiseApply with operators \em is similar,
+ * except that this function is in-place and #grb::eWiseApply is not.
+ *
+ * \parblock
+ * \par Valid descriptors
+ * - #grb::descriptors::no_operation,
+ * - #grb::descriptors::no_casting, and
+ * - #grb::descriptors::dense.
+ *
+ * \note Invalid descriptors will be ignored.
+ *
+ * If #grb::descriptors::no_casting is specified, then 1) the first domain of
+ * \a ring must match \a InputType1, 2) the second domain of \a ring must match
+ * \a InputType2, 3) the third domain of \a ring must match \a OutputType. If
+ * one of these is not true, the code shall not compile.
+ * \endparblock
+ *
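+ * \par Example
+ * A minimal usage sketch, assuming a vector \a z and a plus-times semiring
+ * instance \a ring over doubles:
+ * \code
+ * grb::RC rc = grb::eWiseMul( z, 2.0, 0.5, ring );
+ * // on success: the product 2.0 * 0.5 has been added element-wise into z
+ * \endcode
+ *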
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring, enum Backend backend,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, backend, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "in eWiseMul ([T1] <- T2 * T3), unmasked, base";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseMulASS_base = false;
+ assert( should_not_call_eWiseMulASS_base );
+#endif
+ (void) z;
+ (void) alpha;
+ (void) beta;
+ (void) ring;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * In-place element-wise multiplication of two vectors, \f$ z += x .* y \f$,
+ * under a given semiring, masked variant.
+ *
+ * @tparam descr The descriptor to be used. Optional; the default is
+ * #grb::descriptors::no_operation.
+ * @tparam Ring The semiring type to perform the element-wise multiply
+ * with.
+ * @tparam InputType1 The left-hand side input type.
+ * @tparam InputType2 The right-hand side input type.
+ * @tparam OutputType The output vector type.
+ * @tparam MaskType The output mask type.
+ *
+ * @param[in,out] z The output vector of type \a OutputType.
+ * @param[in] mask The output mask of type \a MaskType.
+ * @param[in] x The left-hand input vector of type \a InputType1.
+ * @param[in] y The right-hand input vector of type \a InputType2.
+ * @param[in] ring The generalized semiring under which to perform this
+ * element-wise multiplication.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a mask, \a x, \a y, and
+ * \a z do not match. All input data containers are left
+ * untouched if this exit code is returned; it will be
+ * as though this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \warning Unlike #grb::eWiseApply using monoids, given sparse vectors,
+ * missing elements in sparse input vectors are here interpreted as
+ * the zero identity, thus annihilating instead of acting as a
+ * monoid identity. Hence, even when \a z is empty on input,
+ * #grb::eWiseApply with monoids does not exhibit the same behaviour as
+ * this function. The #grb::eWiseApply with operators \em is similar,
+ * except that this function is in-place and #grb::eWiseApply is not.
+ *
+ * \parblock
+ * \par Valid descriptors
+ * - #grb::descriptors::no_operation,
+ * - #grb::descriptors::no_casting,
+ * - #grb::descriptors::dense,
+ * - #grb::descriptors::invert_mask,
+ * - #grb::descriptors::structural, and
+ * - #grb::descriptors::structural_complement.
+ *
+ * \note Invalid descriptors will be ignored.
+ *
+ * If #grb::descriptors::no_casting is specified, then 1) the first domain of
+ * \a ring must match \a InputType1, 2) the second domain of \a ring must match
+ * \a InputType2, 3) the third domain of \a ring must match \a OutputType. If
+ * one of these is not true, the code shall not compile.
+ * \endparblock
+ *
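+ * \par Example
+ * A minimal usage sketch, assuming equally sized vectors \a z, \a mask,
+ * \a x, and \a y, and a plus-times semiring instance \a ring over doubles;
+ * the optional descriptor inverts the mask:
+ * \code
+ * grb::RC rc = grb::eWiseMul< grb::descriptors::invert_mask >(
+ *     z, mask, x, y, ring );
+ * // on success: z_i += x_i * y_i only where the mask evaluates false
+ * \endcode
+ *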
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring, enum Backend backend,
+ typename InputType1, typename InputType2,
+ typename OutputType, typename MaskType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< MaskType, backend, Coords > &mask,
+ const Vector< InputType1, backend, Coords > &x,
+ const Vector< InputType2, backend, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "in eWiseMul ([T1] <- [T2] * [T3]), masked, base";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseMulAMAA_base = false;
+ assert( should_not_call_eWiseMulAMAA_base );
+#endif
+ (void) z;
+ (void) mask;
+ (void) x;
+ (void) y;
+ (void) ring;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * In-place element-wise multiplication of a scalar and vector,
+ * \f$ z += \alpha .* y \f$, under a given semiring, masked variant.
*
- * All functions return an error code of the enum-type #grb::RC.
+ * @tparam descr The descriptor to be used. Optional; the default is
+ * #grb::descriptors::no_operation.
+ * @tparam Ring The semiring type to perform the element-wise multiply
+ * with.
+ * @tparam InputType1 The left-hand side input type.
+ * @tparam InputType2 The right-hand side input type.
+ * @tparam OutputType The output vector type.
+ * @tparam MaskType The output mask type.
*
- * Primitives which produce vector output:
- * -# #grb::set (three variants);
- * -# #grb::foldr (in-place reduction to the right, scalar-to-vector and
- * vector-to-vector);
- * -# #grb::foldl (in-place reduction to the left, scalar-to-vector and
- * vector-to-vector);
- * -# #grb::eWiseApply (out-of-place application of a binary function);
- * -# #grb::eWiseAdd (in-place addition of two vectors, a vector and a
- * scalar, into a vector); and
- * -# #grb::eWiseMul (in-place multiplication of two vectors, a vector and a
- * scalar, into a vector).
+ * @param[in,out] z The output vector of type \a OutputType.
+ * @param[in] mask The output mask of type \a MaskType.
+ * @param[in] alpha The left-hand input scalar of type \a InputType1.
+ * @param[in] y The right-hand input vector of type \a InputType2.
+ * @param[in] ring The generalized semiring under which to perform this
+ * element-wise multiplication.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
*
- * \note When #grb::eWiseAdd or #grb::eWiseMul using two input scalars is
- * required, consider forming first the resulting scalar using level-0
- * primitives, and then using #grb::set, #grb::foldl, or #grb::foldr, as
- * appropriate.
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a mask, \a y, and \a z do
+ * not match. All input data containers are left
+ * untouched if this exit code is returned; it will be
+ * as though this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
*
- * Primitives that produce scalar output:
- * -# #grb::foldr (reduction to the right, vector-to-scalar);
- * -# #grb::foldl (reduction to the left, vector-to-scalar).
+ * \warning Unlike #grb::eWiseApply using monoids, given sparse vectors,
+ * missing elements in sparse input vectors are here interpreted as
+ * the zero identity, thus annihilating instead of acting as a
+ * monoid identity. Hence, even when \a z is empty on input,
+ * #grb::eWiseApply with monoids does not exhibit the same behaviour as
+ * this function. The #grb::eWiseApply with operators \em is similar,
+ * except that this function is in-place and #grb::eWiseApply is not.
*
- * Primitives that do not require an operator, monoid, or semiring:
- * -# #grb::set (three variants).
+ * \parblock
+ * \par Valid descriptors
+ * - #grb::descriptors::no_operation,
+ * - #grb::descriptors::no_casting,
+ * - #grb::descriptors::dense,
+ * - #grb::descriptors::invert_mask,
+ * - #grb::descriptors::structural, and
+ * - #grb::descriptors::structural_complement.
*
- * Primitives that could take an operator (see #grb::operators):
- * -# #grb::foldr, #grb::foldl, and #grb::eWiseApply.
- * Such operators typically can only be applied on \em dense vectors, i.e.,
- * vectors with #grb::nnz equal to its #grb::size. Operations on sparse
- * vectors require an intepretation of missing vector elements, which monoids
- * or semirings provide.
+ * \note Invalid descriptors will be ignored.
*
- * Therefore, all aforementioned functions are also defined for monoids instead
- * of operators.
+ * If #grb::descriptors::no_casting is specified, then 1) the first domain of
+ * \a ring must match \a InputType1, 2) the second domain of \a ring must match
+ * \a InputType2, 3) the third domain of \a ring must match \a OutputType. If
+ * one of these is not true, the code shall not compile.
+ * \endparblock
*
- * The following functions are defined for monoids and semirings, but not for
- * operators alone:
- * -# #grb::eWiseAdd (in-place addition).
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
*
- * The following functions require a semiring, and are not defined for
- * operators or monoids alone:
- * -# #grb::dot (in-place reduction of two vectors into a scalar); and
- * -# #grb::eWiseMul (in-place multiplication).
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring, enum Backend backend,
+ typename InputType1, typename InputType2,
+ typename OutputType, typename MaskType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< MaskType, backend, Coords > &mask,
+ const InputType1 alpha,
+ const Vector< InputType2, backend, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "in eWiseMul ([T1] <- T2 * [T3]), masked, base";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseMulAMSA_base = false;
+ assert( should_not_call_eWiseMulAMSA_base );
+#endif
+ (void) z;
+ (void) mask;
+ (void) alpha;
+ (void) y;
+ (void) ring;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * In-place element-wise multiplication of a vector and scalar,
+ * \f$ z += x .* \beta \f$, under a given semiring, masked variant.
*
- * Sometimes, operations that are defined for semirings we would sometimes also
- * like enabled on \em improper semirings. ALP/GraphBLAS statically checks most
- * properties required for composing proper semirings, and as such, attempts to
- * compose improper ones will result in a compilation error. In such cases, we
- * allow to pass an additive monoid and a multiplicative operator instead of a
- * semiring. The following functions allow this:
- * -# #grb::dot, #grb::eWiseAdd, #grb::eWiseMul.
- * The given multiplicative operator can be any binary operator, and in
- * particular does not need to be associative.
+ * @tparam descr The descriptor to be used. Optional; the default is
+ * #grb::descriptors::no_operation.
+ * @tparam Ring The semiring type to perform the element-wise multiply
+ * with.
+ * @tparam InputType1 The left-hand side input type.
+ * @tparam InputType2 The right-hand side input type.
+ * @tparam OutputType The output vector type.
+ * @tparam MaskType The output mask type.
*
- * The algebraic structures lost with improper semirings typically correspond to
- * distributivity, zero being an annihilator to multiplication, as well as the
- * concept of \em one. Due to the latter lost structure, the above functions on
- * impure semirings are \em not defined for pattern inputs.
+ * @param[in,out] z The output vector of type \a OutputType.
+ * @param[in] mask The output mask of type \a MaskType.
+ * @param[in] x The left-hand input vector of type \a InputType1.
+ * @param[in] beta The right-hand input scalar of type \a InputType2.
+ * @param[in] ring The generalized semiring under which to perform this
+ * element-wise multiplication.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
*
- * \warning I.e., any attempt to use containers of the form
- * \code
- * grb::Vector
- * grb::Matrix
- * \endcode
- * with an improper semiring will result in a compile-time error.
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH Whenever the dimensions of \a mask, \a x and \a z do
+ * not match. All input data containers are left
+ * untouched if this exit code is returned; it will be
+ * as though this call was never made.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
*
- * \note Pattern containers are perfectly fine to use with proper semirings.
+ * \warning Unlike #grb::eWiseApply using monoids, given sparse vectors,
+ * missing elements in sparse input vectors are here interpreted as
+ * the zero identity, thus annihilating instead of acting as a
+ * monoid identity. Hence, even when \a z is empty on input,
+ * #grb::eWiseApply with monoids does not exhibit the same behaviour as
+ * this function. The #grb::eWiseApply with operators \em is similar,
+ * except that this function is in-place and #grb::eWiseApply is not.
*
- * \warning If an improper semiring does not have the property that the zero
- * identity acts as an annihilator over the multiplicative operator,
- * then the result of #grb::eWiseMul may be unintuitive. Please take
- * great care in the use of improper semrings.
+ * \parblock
+ * \par Valid descriptors
+ * - #grb::descriptors::no_operation,
+ * - #grb::descriptors::no_casting,
+ * - #grb::descriptors::dense,
+ * - #grb::descriptors::invert_mask,
+ * - #grb::descriptors::structural, and
+ * - #grb::descriptors::structural_complement.
*
- * For fusing multiple BLAS-1 style operations on any number of inputs and
- * outputs, users can pass their own operator function to be executed for
- * every index \a i.
- * -# grb::eWiseLambda.
- * This requires manual application of operators, monoids, and/or semirings
- * via level-0 interface -- see #grb::apply, #grb::foldl, and #grb::foldr.
+ * \note Invalid descriptors will be ignored.
*
- * For all of these functions, the element types of input and output types
- * do not have to match the domains of the given operator, monoid, or
- * semiring unless the #grb::descriptors::no_casting descriptor was passed.
+ * If #grb::descriptors::no_casting is specified, then 1) the first domain of
+ * \a ring must match \a InputType1, 2) the second domain of \a ring must match
+ * \a InputType2, 3) the third domain of \a ring must match \a OutputType. If
+ * one of these is not true, the code shall not compile.
+ * \endparblock
*
- * An implementation, whether blocking or non-blocking, should have clear
- * performance semantics for every sequence of graphBLAS calls, no matter
- * whether those are made from sequential or parallel contexts. Backends
- * may define different performance semantics depending on which #grb::Phase
- * primitives execute in.
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
*
- * @{
+ * @see perfSemantics
*/
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring, enum Backend backend,
+ typename InputType1, typename InputType2,
+ typename OutputType, typename MaskType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< MaskType, backend, Coords > &mask,
+ const Vector< InputType1, backend, Coords > &x,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "in eWiseMul ([T1] <- [T2] * T3), masked, base";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseMulAMAS_base = false;
+ assert( should_not_call_eWiseMulAMAS_base );
+#endif
+ (void) z;
+ (void) mask;
+ (void) x;
+ (void) beta;
+ (void) ring;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
/**
- * A standard vector to use for mask parameters.
+ * In-place element-wise multiplication of two scalars,
+ * \f$ z += \alpha .* \beta \f$, under a given semiring, masked variant.
*
- * Indicates that no mask shall be used.
+ * @tparam descr The descriptor to be used. Optional; the default is
+ * #grb::descriptors::no_operation.
+ * @tparam Ring The semiring type to perform the element-wise multiply
+ * with.
+ * @tparam InputType1 The left-hand side input type.
+ * @tparam InputType2 The right-hand side input type.
+ * @tparam OutputType The output vector type.
+ * @tparam MaskType The output mask type.
*
- * \internal Do not use this symbol within backend implementations.
+ * @param[in,out] z The output vector of type \a OutputType.
+ * @param[in] mask The output mask of type \a MaskType.
+ * @param[in] alpha The left-hand input scalar of type \a InputType1.
+ * @param[in] beta The right-hand input scalar of type \a InputType2.
+ * @param[in] ring The generalized semiring under which to perform this
+ * element-wise multiplication.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH If \a mask and \a z have different size.
+ * @return #grb::FAILED If \a phase is #grb::EXECUTE, indicates that the
+ * capacity of \a z was insufficient. The output vector
+ * \a z is cleared, and the call to this function has no
+ * further effects.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE, indicates an
+ * out-of-memory exception. The call to this function
+ * shall have no other effects beyond returning this
+ * error code; the previous state of \a z is retained.
+ * @return #grb::PANIC A general unmitigable error has been encountered. If
+ * returned, ALP enters an undefined state and the user
+ * program is encouraged to exit as quickly as possible.
+ *
+ * \warning Unlike #grb::eWiseApply using monoids, given sparse vectors,
+ * missing elements in sparse input vectors are here interpreted as
+ * the zero identity, thus annihilating instead of acting as a
+ * monoid identity. Hence, even when \a z is empty on input,
+ * #grb::eWiseApply with monoids does not exhibit the same behaviour as
+ * this function. The #grb::eWiseApply with operators \em is similar,
+ * except that this function is in-place and #grb::eWiseApply is not.
+ *
+ * \parblock
+ * \par Valid descriptors
+ * - #grb::descriptors::no_operation,
+ * - #grb::descriptors::no_casting,
+ * - #grb::descriptors::dense,
+ * - #grb::descriptors::invert_mask,
+ * - #grb::descriptors::structural, and
+ * - #grb::descriptors::structural_complement.
+ *
+ * \note Invalid descriptors will be ignored.
+ *
+ * If #grb::descriptors::no_casting is specified, then 1) the first domain of
+ * \a ring must match \a InputType1, 2) the second domain of \a ring must match
+ * \a InputType2, 3) the third domain of \a ring must match \a OutputType. If
+ * one of these is not true, the code shall not compile.
+ * \endparblock
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
*/
- #define NO_MASK Vector< bool >( 0 )
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring, enum Backend backend,
+ typename InputType1, typename InputType2,
+ typename OutputType, typename MaskType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, backend, Coords > &z,
+ const Vector< MaskType, backend, Coords > &mask,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "in eWiseMul ([T1] <- T2 * T3), masked, base";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_eWiseMulAMSS_base = false;
+ assert( should_not_call_eWiseMulAMSS_base );
+#endif
+ (void) z;
+ (void) mask;
+ (void) alpha;
+ (void) beta;
+ (void) ring;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
/**
- * Executes an arbitrary element-wise user-defined function \a f using any
- * number of vectors of equal length, following the nonzero pattern of the
- * given vector \a x.
- *
- * The user-defined function is passed as a lambda which can capture, at
- * the very least, other instances of type grb::Vector. Use of this function
- * is preferable whenever multiple element-wise operations are requested that
- * use one or more identical input vectors. Performing the computation one
- * after the other in blocking mode would require the same vector to be
- * streamed multiple times, while with this function the operations can be
- * fused explicitly instead.
- *
- * It shall always be legal to capture non-GraphBLAS objects for read access
- * only. It shall \em not be legal to capture instances of type grb::Matrix
- * for read and/or write access.
- *
- * If grb::Properties::writableCaptured evaluates true then captured
- * non-GraphBLAS objects can also be written to, not just read from. The
- * captured variable is, however, completely local to the calling user process
- * only-- it will not be synchronised between user processes.
- * As a rule of thumb, data-centric GraphBLAS implementations \em cannot
- * support this and will thus have grb::Properties::writableCaptured evaluate
- * to false. A portable GraphBLAS algorithm should provide a different code
- * path to handle this case.
- * When it is legal to write to captured scalar, this function can, e.g., be
- * used to perform reduction-like operations on any number of equally sized
- * input vectors. This would be preferable to a chained number of calls to
- * grb::dot in case where some vectors are shared between subsequent calls,
- * for example; the shared vectors are streamed only once using this lambda-
- * enabled function.
- *
- * \warning The lambda shall only be executed on the data local to the user
- * process calling this function! This is different from the various
- * fold functions, or grb::dot, in that the semantics of those
- * functions always end with a globally synchronised result. To
- * achieve the same effect with user-defined lambdas, the users
- * should manually prescribe how to combine the local results into
- * global ones, for instance, by a subsequent call to
- * grb::collectives<>::allreduce.
- *
- * \note This is an addition to the GraphBLAS. It is alike user-defined
- * operators, monoids, and semirings, except it allows execution on
- * arbitrarily many inputs and arbitrarily many outputs.
+ * Executes an arbitrary element-wise user-defined function \a f on any number
+ * of vectors of equal length.
+ *
+ * \warning This is a relatively advanced function. It is recommended to read
+ * this specification and its warnings before using it, or to instead
+ * exclusively use the other primitives in \ref BLAS1.
+ *
+ * The vectors touched by \a f can be accessed in a read-only or a read/write
+ * fashion. The function \a f must be parametrised in a global index \em i, and
+ * \a f is only allowed to access elements of the captured vectors at that
+ * specific index.
+ *
+ * \warning Any attempt to access a vector element at a position differing
+ * from \em i will result in undefined behaviour.
+ *
+ * All vectors captured by \a f must furthermore be given as additional
+ * (variadic) arguments to this primitive. Captured vectors can only be used
+ * for dereferencing elements at a given position \em i; any other use invokes
+ * undefined behaviour.
+ *
+ * \warning In particular, captured vectors may not be passed to other
+ * ALP/GraphBLAS primitives \em within \a f.
+ *
+ * This primitive will execute \a f on all indices where the first given
+ * vector argument has nonzeroes. All other indices \em i will be ignored.
+ *
+ * \warning Therefore, any container of which \a f references the \em i-th
+ * element must indeed have a nonzero at position \em i, or otherwise
+ * undefined behaviour is invoked.
+ *
+ * This primitive hence allows a user to implement any level-1 like BLAS
+ * functionality over any number of input/output vectors, and also allows
+ * computing multiple level-1 (like) BLAS functionalities in a single pass
+ * over the involved containers.
+ *
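+ * For illustration, a fused element-wise multiply-add with a local partial
+ * reduction might read as follows; the vectors \a x, \a y, and \a z are
+ * assumed to share one common (dense) nonzero structure:
+ * \code
+ * double local_sum = 0.0; // writable capture; see the notes on
+ *                         // grb::Properties::writableCaptured below
+ * grb::RC rc = grb::eWiseLambda( [&x,&y,&z,&local_sum]( const size_t i ) {
+ *     z[ i ] += x[ i ] * y[ i ];
+ *     local_sum += z[ i ];
+ * }, x, y, z );
+ * \endcode
+ *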
+ * \note Since the introduction of the nonblocking backend, rewriting \a f in
+ * terms of native ALP/GraphBLAS primitives no longer implies performance
+ * penalties (when compiling for the nonblocking backend)-- rather, the
+ * nonblocking backend is likely to do better than manually fusing
+ * multiple level-1 like operations using this primitive, especially when
+ * the captured vectors are small relative to the private caches on the
+ * target architecture.
+ *
+ * The function \a f may also capture scalars for read-only access.
+ *
+ * \note As a convention, consider always passing scalars by value, since
+ * otherwise code compiled with a nonblocking backend may (and likely
+ * will) exhibit data races.
+ *
+ * If #grb::Properties::writableCaptured evaluates true then captured
+ * scalars may also safely be written to, instead of being restricted to
+ * read-only access.
+ *
+ * \note This is useful for fusing reductions within other level-1 like
+ * operations.
+ *
+ * \warning If updating scalars using this primitive, be aware that the
+ * updates are local to the current user process only.
+ *
+ * \note If, after execution of this primitive, an updated scalar is expected
+ * to be synchronised across all user processes, see #grb::collectives.
+ *
+ * \note As a rule of thumb, parallel GraphBLAS implementations, due to being
+ * data-centric, \em cannot support writable scalar captures and will
+ * have #grb::Properties::writableCaptured evaluate to false.
+ *
+ * \note A portable ALP/GraphBLAS algorithm should therefore either not rely on
+ * read/write captured scalars passed to this primitive, \em or provide
+ * different code paths to handle the two cases of the
+ * #grb::Properties::writableCaptured backend property.
+ *
+ * \note If the above sounds too tedious, consider rewriting \a f in terms of
+ * native ALP/GraphBLAS functions, with the scalar reductions performed
+ * by the scalar variants of #grb::foldl and #grb::foldr, for example.
+ *
+ * \warning When compiling with a blocking backend, rewriting \a f in terms of
+ * native GraphBLAS primitives typically results in a slowdown due to
+ * this primitive naturally fusing potentially multiple operations
+ * together (which was the original motivation of Yzelman et al., 2020
+ * for introducing this primitive). Rewriting \a f into a (sequence of)
+ * native GraphBLAS primitives does \em not carry a performance penalty
+ * when compiling with a nonblocking backend, however.
+ *
+ * \note This is an addition to the GraphBLAS C specification. It is alike
+ * user-defined operators, monoids, and semirings, except that this
+ * primitive allows execution on arbitrarily many inputs and arbitrarily
+ * many outputs.
*
* @tparam Func the user-defined lambda function type.
* @tparam DataType the type of the user-supplied vector example.
@@ -200,46 +3617,46 @@ namespace grb {
* @param[in] f The user-supplied lambda. This lambda should only capture
* and reference vectors of the same length as \a x. The lambda
* function should prescribe the operations required to execute
- * at a given index \a i. Captured GraphBLAS vectors can access
- * that element via the operator[]. It is illegal to access any
- * element not at position \a i. The lambda takes only the single
- * parameter \a i of type const size_t. Captured
- * scalars will not be globally updated-- the user must program
- * this explicitly. Scalars and other non-GraphBLAS containers
- * are always local to their user process.
+ * at a given index \a i. Captured ALP/GraphBLAS vectors can
+ * access that element via the operator[]. It is illegal to access
+ * any element not at position \a i. The lambda takes only the
+ * single parameter \a i of type const size_t.
+ * Captured scalars will not be globally updated-- the user must
+ * program this explicitly. Scalars and other non-GraphBLAS
+ * containers are always local to their user process.
* @param[in] x The vector the lambda will be executed on. This argument
* determines which indices \a i will be accessed during the
* elementwise operation-- elements with indices \a i that
* do not appear in \a x will be skipped during evaluation of
* \a f.
- * @param[in] args All vectors the lambda is to access elements of. Must be of
- * the same length as \a x. If this constraint is violated,
- * grb::MISMATCH shall be returned. This is a variadic
- * argument and can contain any number of containers of type
- * grb::Vector, passed as though they were separate
- * arguments.
- *
- * \note In future GraphBLAS implementations, \a args, apart from doing
- * dimension checking, should also facilitate any data distribution
- * necessary to successfully execute the element-wise operation. Current
- * implementations do not require this since they use the same static
- * distribution for all containers.
- *
- * \warning Using a grb::Vector inside a lambda passed to this function while
- * not passing that same vector into \a args, will result in undefined
- * behaviour.
- *
- * \note It would be natural to have \a x equal to one of the captured
- * GraphBLAS vectors in \a f.
+ *
+ * The remaining arguments must collect all vectors the lambda is to access
+ * elements of. Such vectors must be of the same length as \a x. If this
+ * constraint is violated, #grb::MISMATCH shall be returned.
+ *
+ * \note These are passed using variadic arguments and so can contain any
+ * number of containers of type #grb::Vector.
+ *
+ * \note Distributed-memory ALP/GraphBLAS backends, apart from performing
+ * dimension checking, may also require data redistribution in case
+ * different vectors are distributed differently.
+ *
+ * \warning Using a #grb::Vector inside a lambda passed to this function while
+ * not passing that same vector into its variadic argument list will
+ * result in undefined behaviour.
*
* \warning Due to the constraints on \a f described above, it is illegal to
* capture some vector \a y and have the following line in the body
 * of \a f: x[i] += x[i+1]. Vectors can only be
* dereferenced at position \a i and \a i alone.
*
- * @return grb::SUCCESS When the lambda is successfully executed.
- * @return grb::MISMATCH When two or more vectors passed to \a args are not of
- * equal length.
+ * @return #grb::SUCCESS When the lambda is successfully executed.
+ * @return #grb::MISMATCH When two or more vectors passed to this primitive
+ * are not of equal length.
+ * @return #grb::PANIC When ALP/GraphBLAS has encountered an unrecoverable
+ * error. The state of ALP becomes undefined after
+ * having returned this error code, and users can only
+ * attempt to exit the application gracefully.
*
* \parblock
* \par Example.
@@ -285,9 +3702,11 @@ namespace grb {
* grb::dot( alpha, x, y, ring );
* \endcode
*
- * The version using the lambdas, however, is expected to execute
- * faster as both \a x and \a y are streamed only once, while the
- * latter code may stream both vectors twice.
+ * When compiled using a blocking ALP/GraphBLAS backend, the version using the
+ * lambda is expected to execute faster, as both \a x and \a y are streamed
+ * only once, while the non-lambda code may stream both vectors twice. This
+ * performance difference disappears when the non-lambda code is compiled
+ * using a nonblocking backend instead.
* \endparblock
*
* \warning The following code is invalid:
@@ -306,17 +3725,16 @@ namespace grb {
* Only a Vector::lambda_reference to position exactly equal to \a i
* may be used within this function.
*
- * \warning There is no similar concept in the official GraphBLAS specs.
- *
- * \warning Captured scalars will be local to the user process executing the
- * lambda. To retrieve the global dot product, an allreduce must
- * explicitly be called.
- *
* @see Vector::operator[]()
* @see Vector::lambda_reference
*
- * \todo Revise specification regarding recent changes on phases, performance
- * semantics, and capacities.
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive. It is
+ * expected that the defined performance semantics depend on the given lambda
+ * function \a f, the size of the containers passed into this primitive, as
+ * well as how many containers are passed into this primitive.
+ *
+ * @see perfSemantics
*/
template<
typename Func,
@@ -333,8 +3751,8 @@ namespace grb {
const bool should_not_call_base_vector_ewiselambda = false;
assert( should_not_call_base_vector_ewiselambda );
#endif
- (void)f;
- (void)x;
+ (void) f;
+ (void) x;
return UNSUPPORTED;
}
@@ -375,9 +3793,9 @@ namespace grb {
* @tparam Monoid The monoid to use for reduction.
* @tparam InputType The type of the elements in the supplied ALP/GraphBLAS
* vector \a y.
+ * @tparam IOType The type of the output scalar \a x.
* @tparam MaskType The type of the elements in the supplied ALP/GraphBLAS
* vector \a mask.
- * @tparam IOType The type of the output scalar \a x.
*
* @param[out] x The result of the reduction.
* @param[in] y Any ALP/GraphBLAS vector. This vector may be sparse.
@@ -390,6 +3808,9 @@ namespace grb {
* @return grb::ILLEGAL If the provided input vector \a y was not dense, while
* #grb::descriptors::dense was given.
*
+ * @see grb::foldr provides similar in-place functionality.
+ * @see grb::eWiseApply provides out-of-place semantics.
+ *
* \parblock
* \par Valid descriptors
* grb::descriptors::no_operation, grb::descriptors::no_casting,
@@ -405,18 +3826,10 @@ namespace grb {
* shall not compile.
* \endparblock
*
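+ * \par Example
+ * A minimal usage sketch of the masked variant, assuming the mask directly
+ * follows \a y in the argument list, and assuming a vector \a y with a
+ * Boolean mask \a mask of equal size:
+ * \code
+ * grb::Monoid<
+ *     grb::operators::add< double >, grb::identities::zero
+ * > plusMonoid;
+ * double sum = 0.0;
+ * grb::RC rc = grb::foldl( sum, y, mask, plusMonoid );
+ * // on success: sum equals the reduction of all unmasked elements of y
+ * \endcode
+ *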
- * \parblock
* \par Performance semantics
- * Backends must specify performance semantics in the amount of work, intra-
- * process data movement, inter-process data movement, and the number of
- * user process synchronisations required. They should also specify whether
- * any system calls may be made, in particularly those related to dynamic
- * memory management. If new memory may be allocated, they must specify how
- * much.
- * \endparblock
+ * Each backend must define performance semantics for this primitive.
*
- * @see grb::foldr provides similar in-place functionality.
- * @see grb::eWiseApply provides out-of-place semantics.
+ * @see perfSemantics
*/
template<
Descriptor descr = descriptors::no_operation,
@@ -480,7 +3893,7 @@ namespace grb {
/**
* Folds a vector into a scalar, left-to-right.
*
- * Unmasked operator variant.
+ * Unmasked operator variant. See masked variant for the full documentation.
*
* \deprecated This signature is deprecated. It was implemented for reference
* (and reference_omp), but could not be implemented for BSP1D and
@@ -581,18 +3994,147 @@ namespace grb {
}
/**
- * Dot product over a given semiring.
+ * Calculates the dot product, \f$ z += (x,y) \f$, under a given additive
+ * monoid and multiplicative operator.
+ *
+ * @tparam descr The descriptor to be used. Optional; the default
+ * descriptor is #grb::descriptors::no_operation.
+ * @tparam AddMonoid The monoid used for addition during the computation of
+ * \f$ (x,y) \f$. The same monoid is used for accumulating
+ * the result into a given scalar.
+ * @tparam AnyOp A binary operator that acts as the multiplication during
+ * \f$ (x,y) \f$.
+ * @tparam OutputType The output type.
+ * @tparam InputType1 The input element type of the left-hand input vector.
+ * @tparam InputType2 The input element type of the right-hand input vector.
+ *
+ * @param[in,out] z Where to fold \f$ (x,y) \f$ into.
+ * @param[in] x The left-hand input vector.
+ * @param[in] y The right-hand input vector.
+ * @param[in] addMonoid The additive monoid under which the reduction of the
+ * results of element-wise multiplications of \a x and
+ * \a y are performed.
+ * @param[in] anyOp The multiplicative operator using which element-wise
+ * multiplications of \a x and \a y are performed. This
+ * may be any binary operator.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * \note Since this primitive allows a dot product to operate under any
+ * additive monoid and any binary operator, it follows that a dot
+ * product under any semiring can be reduced to a call to this
+ * primitive instead.
+ *
+ * @return #grb::MISMATCH When the dimensions of \a x and \a y do not match.
+ * All input data containers are left untouched if this
+ * exit code is returned; it will be as though this call
+ * was never made.
+ * @return #grb::SUCCESS On successful completion of this call.
+ *
+ * \parblock
+ * \par Valid descriptors
+ * -# grb::descriptors::no_operation
+ * -# grb::descriptors::no_casting
+ * -# grb::descriptors::dense
+ *
+ * If the dense descriptor is set, this implementation returns grb::ILLEGAL if
+ * it was detected that either \a x or \a y was sparse. In this case, it shall
+ * otherwise be as though the call to this function had not occurred (no side
+ * effects).
+ * \endparblock
+ *
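+ * \par Example
+ * A minimal usage sketch, assuming equally sized vectors \a x and \a y and
+ * using a plus monoid with the standard multiplication operator over
+ * doubles for illustration:
+ * \code
+ * grb::Monoid<
+ *     grb::operators::add< double >, grb::identities::zero
+ * > plusMonoid;
+ * double z = 0.0;
+ * grb::RC rc = grb::dot( z, x, y, plusMonoid,
+ *     grb::operators::mul< double >() );
+ * // on success: z += sum_i x_i * y_i
+ * \endcode
+ *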
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class AddMonoid, class AnyOp,
+ typename OutputType, typename InputType1, typename InputType2,
+ enum Backend backend, typename Coords
+ >
+ RC dot(
+ OutputType &z,
+ const Vector< InputType1, backend, Coords > &x,
+ const Vector< InputType2, backend, Coords > &y,
+ const AddMonoid &addMonoid = AddMonoid(),
+ const AnyOp &anyOp = AnyOp(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< AddMonoid >::value &&
+ grb::is_operator< AnyOp >::value,
+ void >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "Should not call base grb::dot (monoid-operator version)\n";
+#endif
+#ifndef NDEBUG
+ const bool should_not_call_base_dot_monOp = false;
+ assert( should_not_call_base_dot_monOp );
+#endif
+ (void) z;
+ (void) x;
+ (void) y;
+ (void) addMonoid;
+ (void) anyOp;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Calculates the dot product, \f$ z += (x,y) \f$, under a given semiring.
+ *
+ * @tparam descr The descriptor to be used. Optional; default descriptor
+ * is #grb::descriptors::no_operation.
+ * @tparam Ring The semiring type to use.
+ * @tparam OutputType The output type.
+ * @tparam InputType1 The input element type of the left-hand input vector.
+ * @tparam InputType2 The input element type of the right-hand input vector.
+ *
+ * @param[in,out] z The output element \f$ z += (x,y) \f$.
+ * @param[in] x The left-hand input vector \a x.
+ * @param[in] y The right-hand input vector \a y.
+ * @param[in] ring The semiring under which to compute the dot product
+ * \f$ (x,y) \f$. The additive monoid is used to accumulate
+ * the dot product result into \a z.
+ * @param[in] phase The #grb::Phase the call should execute. Optional; the
+ * default parameter is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS On successful completion of this call.
+ * @return #grb::MISMATCH If the dimensions of \a x and \a y do not match. All
+ * input data containers are left untouched if this exit
+ * code is returned; it will be as though this call was
+ * never made.
+ *
+ * \parblock
+ * \par Valid descriptors
+ * - grb::descriptors::no_operation
+ * - grb::descriptors::no_casting
+ * - grb::descriptors::dense
+ *
+ * If the dense descriptor is set, this implementation returns #grb::ILLEGAL if
+ * it was detected that either \a x or \a y was sparse. In this case, it shall
+ * otherwise be as though the call to this function had not occurred (no side
+ * effects).
+ * \endparblock
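+ *
+ * \par Example
+ * A minimal usage sketch, assuming equally sized vectors \a x and \a y and
+ * using the plus-times semiring over doubles for illustration:
+ * \code
+ * grb::Semiring<
+ *     grb::operators::add< double >, grb::operators::mul< double >,
+ *     grb::identities::zero, grb::identities::one
+ * > ring;
+ * double z = 0.0;
+ * grb::RC rc = grb::dot( z, x, y, ring );
+ * // on success: z += sum_i x_i * y_i
+ * \endcode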
*
- * \todo Write specification.
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
*/
template<
- Descriptor descr = descriptors::no_operation, class Ring,
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
typename IOType, typename InputType1, typename InputType2,
Backend backend, typename Coords
>
- RC dot( IOType &x,
- const Vector< InputType1, backend, Coords > &left,
- const Vector< InputType2, backend, Coords > &right,
+ RC dot(
+ IOType &z,
+ const Vector< InputType1, backend, Coords > &x,
+ const Vector< InputType2, backend, Coords > &y,
const Ring &ring = Ring(),
const Phase &phase = EXECUTE,
const typename std::enable_if<
@@ -609,9 +4151,9 @@ namespace grb {
const bool should_not_call_base_dot_semiring = false;
assert( should_not_call_base_dot_semiring );
#endif
+ (void) z;
(void) x;
- (void) left;
- (void) right;
+ (void) y;
(void) ring;
(void) phase;
return UNSUPPORTED;
diff --git a/include/graphblas/base/blas2.hpp b/include/graphblas/base/blas2.hpp
index 6b1bccf55..7f99122c5 100644
--- a/include/graphblas/base/blas2.hpp
+++ b/include/graphblas/base/blas2.hpp
@@ -18,7 +18,7 @@
/**
* @file
*
- * Defines the GraphBLAS level 2 API.
+ * Defines the ALP/GraphBLAS level-2 API
*
* @author A. N. Yzelman
* @date 30th of March 2017
@@ -39,10 +39,12 @@
#include "matrix.hpp"
#include "vector.hpp"
+
namespace grb {
/**
- * \defgroup BLAS2 The Level-2 Basic Linear Algebra Subroutines (BLAS)
+ * \defgroup BLAS2 Level-2 Primitives
+ * \ingroup GraphBLAS
*
* A collection of functions that allow GraphBLAS operators, monoids, and
* semirings work on a mix of zero-dimensional, one-dimensional, and
@@ -57,181 +59,414 @@ namespace grb {
*/
/**
- * Right-handed sparse matrix times vector multiplication, \f$ u = Av \f$.
- *
- * Let \f$ u \f$ and \f$ \mathit{mask} \f$ each be a #grb::Vector of #grb::size
- * \f$ m \f$, \f$ v \f$ be a #grb::Vector of #grb::size \f$ n \f$, and let
- * \f$ A \f$ be a #Matrix with #grb::nrows \f$ m \f$ and #grb::ncols \f$ n \f$.
- * Let furthermore \f$ z \f$ be an interal vector of size \f$ m \f$.
- * A call to this function first computes \f$ z = Av \f$ over the provided
- * \a ring. It then left-folds \f$ z \f$ into \f$ u \f$ using the provided
- * \a accumulator.
- *
- * @see Vector for an in-depth description of a GraphBLAS vector.
- * @see size for retrieving the length of a given GraphBLAS vector.
- * @see Matrix for an in-depth description of a GraphBLAS matrix.
- * @see nrows for retrieving the number of rows of a given GraphBLAS matrix.
- * @see ncols for retrieving the number of columns of a given GraphBLAS
- * vector.
- *
- * Formally, the exact operation executed is
- * \f$ u_i^\mathit{out} = u_i^\mathit{in} \bigodot z_i, \f$
- * for all \f$ i \in \{ 0, 1, \ldots, m-1 \} \f$ for which
- * \f$ \mathit{mask}_i \f$ evaluates true . If there is a nonzero at
- * \f$ z_i \f$ but no nonzero at \f$ u_i^\mathit{in} \f$ then the latter is interpreted as the additive
- * identity \f$ \mathbf{0} \f$ of the given \a ring.
- * For \f$ z \f$, we formally have:
- * \f$ z_i = \bigoplus{i=0}^{m-1} \left( A_{ij} \bigotimes v_j \right), \f$
- * where \f$ \bigodot \f$ represents the \a accumulator, \f$ \bigoplus \f$
- * represents the additive operator of the provided \a ring, and
- * \f$ \bigotimes \f$ represents the multiplicative operator of \a ring. If here
- * \f$ v_j \f$ does not exist, it is considered to be equal to the additive
- * identity of the given \a ring.
- *
- * \note The additive identity of a given \a ring is an annihilator of
- * nonzeroes from \f$ A \f$ under the multiplicative operator of \a ring;
- * that is, \f$ z_i \f$ will be \f$ \mathbf{0} \f$ always. This can, of
- * course, be exploited during sparse matrix--sparse vector (SpMSpV)
- * multiplication.
- *
- * \note A good implementation is very careful about forming \f$ z \f$
- * explicitly and, even if it is formed already, is very careful about
- * making use of \f$ z \f$. Making use of an explicit buffer will result
- * in \f$ \Theta(m) \f$ data movement and may only be warrented when
- * \f$ A \f$ has many nonzeroes per row and \f$ v \f$ is dense.
- *
- * @tparam descr Any combination of one or more #grb::descriptors. When
- * ommitted, the default #grb::descriptors:no_operation will
- * be assumed.
- * @tparam Ring The generalised semi-ring the matrix--vector multiplication
- * is to be executed under.
- * @tparam IOType The type of the elements of the output vector \a u.
+ * Right-handed in-place doubly-masked sparse matrix times vector
+ * multiplication, \f$ u = u + Av \f$.
+ *
+ * Aliases to this function exist that do not include masks:
+ * - grb::mxv( u, u_mask, A, v, semiring );
+ * - grb::mxv( u, A, v, semiring );
+ * When masks are omitted, the semantics shall be the same as though a dense
+ * Boolean vector of the appropriate size with all elements set to
+ * true was given as a mask. We thus describe the semantics of the
+ * fully masked variant only.
+ *
+ * \note If only an input mask \a v_mask is intended to be given (and no output
+ * mask \a u_mask), then \a u_mask must nonetheless be explicitly given.
+ * Passing an empty Boolean vector for \a u_mask is sufficient.
+ *
+ * Let \f$ u, \mathit{u\_mask} \f$ be vectors of size \f$ m \f$, let
+ * \f$ v, \mathit{v\_mask} \f$ be vectors of size \f$ n \f$, and let
+ * \f$ A \f$ be an \f$ m \times n \f$ matrix. Then, a call to this function
+ * computes \f$ u = u + Av \f$ but:
+ * 1. only for the elements \f$ u_i \f$ for which \f$ \mathit{u\_mask}_i \f$
+ * evaluates true ; and
+ * 2. only considering the elements \f$ v_j \f$ for which
+ *    \f$ \mathit{v\_mask}_j \f$ evaluates true , and otherwise
+ * substituting the zero element under the given semiring.
+ *
+ * When multiplying a matrix nonzero element \f$ a_{ij} \in A \f$, it shall
+ * be multiplied with the vector element \f$ v_j \f$ using the multiplicative
+ * operator of the given \a semiring.
+ *
+ * When accumulating multiple contributions of multiplications of nonzeroes on
+ * some row \f$ i \f$, the additive operator of the given \a semiring shall be
+ * used.
+ *
+ * Nonzeroes resulting from computing \f$ Av \f$ are accumulated into any pre-
+ * existing values in \f$ u \f$ by the additive operator of the given
+ * \a semiring.
+ *
+ * If elements from \f$ v \f$, \f$ A \f$, or \f$ u \f$ were missing, the zero
+ * identity of the given \a semiring is substituted.
+ *
+ * If nonzero values from \f$ A \f$ were missing, the one identity of the
+ * given \a semiring is substituted.
+ *
+ * \note A nonzero in \f$ A \f$ may not have a nonzero value in case it is
+ * declared as grb::Matrix< void > .
+ *
+ * The following template arguments \em may be explicitly given:
+ *
+ * @tparam descr Any combination of one or more #grb::descriptors. When
+ *                 omitted, the default #grb::descriptors::no_operation will
+ * be assumed.
+ * @tparam Semiring The generalised semiring the matrix--vector
+ * multiplication is to be executed under.
+ *
+ * The following template arguments will be inferred from the input arguments:
+ *
+ * @tparam IOType The type of the elements of the output vector \a u.
* @tparam InputType1 The type of the elements of the input vector \a v.
* @tparam InputType2 The type of the elements of the input matrix \a A.
- * @tparam Operator The type of the \a accumulator. Must be a GraphBLAS
- * operator; see also #grb::operators.
- * @tparam InputType3 The type of the elements of the mask vector \a mask.
- * @tparam implementation Which back-end the given vectors and matrices belong
- * to. These must all belong to the same back-end.
- *
- * @param[in,out] u The output vector. Depending on the provided
- * \a accumulator, old vector values may affect new values.
- * @param[in] mask The mask vector. The vector #grb::size must be equal to
- * that of \a u, \em or it must be equal to zero. A \a mask
- * of grb::size zero will be ignored (assumed true
- * always.
- * @param[in] accumulator The operator \f$ \bigodot \f$ in the above
- * description.
- * @param[in] A The input matrix. Its #grb::nrows must equal the
- * #grb::size of \a u.
- * @param[in] v The input vector. Its #grb::size must equal the
- * #grb::ncols of \a A.
- * @param[in] ring The semiring to perform the matrix--vector multiplication
- * under. Unless #grb::descriptors::no_casting is defined,
- * elements from \a u, \a A, and \a v will be cast to the
- * domains of the additive and multiplicative operators of
- * \a ring as they are applied during the multiplication.
- *
- * \warning Even if #grb::operators::right_assign is provided as accumulator,
- * old values of \a u may \em not be overwritten if the computation
- * ends up not writing any new values to those values. To throw away
- * old vector values use grb::descriptors::explicit_zero (for dense
- * vectors only if you wish to retain sparsity of the output vector),
- * or first simply use grb::clear on \a u.
+ * @tparam InputType3 The type of the output mask (\a u_mask) elements.
+ * @tparam InputType4 The type of the input mask (\a v_mask) elements.
+ *
+ * \internal
+ * The following template arguments will be inferred from the input arguments
+ * and generally do not concern end-users:
+ *
+ * @tparam Coords Which coordinate class is used to maintain sparsity
+ * structures.
+ * @tparam RIT The integer type used for row indices.
+ * @tparam CIT The integer type used for column indices.
+ * @tparam NIT The integer type used for nonzero indices.
+ * @tparam backend The backend implementing the SpMV multiplication. The input
+ * containers must all refer to the same backend.
+ * \endinternal
+ *
+ * The following arguments are mandatory:
+ *
+ * @param[in,out] u The output vector.
+ * @param[in] A The input matrix. Its #grb::nrows must equal the
+ * #grb::size of \a u.
+ * @param[in] v The input vector. Its #grb::size must equal the
+ * #grb::ncols of \a A.
+ * @param[in] semiring The semiring to perform the matrix--vector
+ * multiplication under. Unless
+ * #grb::descriptors::no_casting is defined, elements from
+ * \a u, \a A, and \a v will be cast to the domains of the
+ * additive and multiplicative operators of \a semiring.
+ *
+ * The vector \a v may not be the same as \a u.
+ *
+ * Instead of passing a \a semiring, users may opt to provide an additive
+ * commutative monoid and a binary multiplicative operator. In this case,
+ * \a A may not be a pattern matrix (that is, it must not be of type
+ * grb::Matrix< void > ).
+ *
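+ * \par Example
+ * For instance, a monoid--operator call may read as follows; all container
+ * and object names here are illustrative assumptions:
+ *
+ * \code
+ * grb::Monoid<
+ *     grb::operators::add< double >, grb::identities::zero
+ * > plusMonoid;
+ * grb::operators::mul< double > timesOp;
+ * // A may not be a pattern matrix in this variant
+ * grb::RC rc = grb::mxv( u, u_mask, A, v, v_mask, plusMonoid, timesOp );
+ * \endcode
+ *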
+ * The \a semiring (or the commutative monoid - binary operator pair) is
+ * optional if it is passed as a template argument instead.
+ *
+ * \note When providing a commutative monoid - binary operator pair, ALP
+ *       backends are precluded from employing distributive laws when
+ *       generating optimised code.
+ *
+ * Non-mandatory arguments are:
+ *
+ * @param[in] u_mask The output mask. The vector must be of equal size as \a u,
+ * \em or it must be empty (have size zero).
+ * @param[in] v_mask The input mask. The vector must be of equal size as \a v,
+ * \em or it must be empty (have size zero).
+ * @param[in] phase The requested phase for this primitive; see
+ *                  #grb::Phase for details.
+ *
+ * The vectors \a u_mask and \a v_mask may never be the same as \a u.
+ *
+ * An empty \a u_mask will behave semantically the same as providing no mask;
+ * i.e., as a mask that evaluates true at every position.
+ *
+ * If \a phase is not given, it will be set to the default #grb::EXECUTE.
+ *
+ * If \a phase is #grb::EXECUTE, then the capacity of \a u must be greater than
+ * or equal to the capacity required to hold all output elements of the
+ * requested computation.
*
* The above semantics may be changed by the following descriptors:
- * * #descriptors::invert_mask: \f$ u_i^\mathit{out} \f$ will be written to
- * if and only if \f$ \mathit{mask}_i \f$ evaluates false .
- * * #descriptors::transpose_matrix: \f$ A \f$ is interpreted as \f$ A^T \f$
+ * - #descriptors::transpose_matrix: \f$ A \f$ is interpreted as \f$ A^T \f$
* instead.
- * * #descriptors::structural: when evaluating \f$ \mathit{mask}_i \f$, only
- * the structure of \f$ \mathit{mask} \f$ is considered (as opposed to its
- * elements); if \f$ \mathit{mask} \f$ has a nonzero at its \f$ i \f$th
- * index, it is considered to evaluate true no matter what the
- * actual value of \f$ \mathit{mask}_i \f$ was.
- * * #descriptors::structural_complement: a combination of two descriptors:
- * #descriptors::structural and #descriptors::invert_mask (and thus
- * equivalent to structural | invert_mask ). Its net effect is if
- * \f$ \mathit{mask} \f$ does \em not have a nonzero at the \f$ i \f$th
- * index, the mask is considered to evaluate true .
- * * #descriptors::add_identity: the matrix \f$ A \f$ is instead interpreted
- * as \f$ A + \mathbf{1} \f$, where \f$ \mathbf{1} \f$ is the
- * multiplicative identity of the given ring.
- * * #descriptors::use_index: when referencing \f$ v_i \f$, if assigned, then
- * instead of using the value itself, its index \f$ i \f$ is used instead.
- * * #descriptors::in_place: the \a accumulator is ignored; the additive
- * operator of the given \a ring is used in its place. Under certain
- * conditions, an implementation can exploit this semantic to active
- * faster computations.
- * * #descriptors::explicit_zero: if \f$ \mathbf{0} \f$ would be assigned to
- * a previously unassigned index, assign \f$ \mathbf{0} \f$ explicitly to
- * that index. Here, \f$ \mathbf{0} \f$ is the additive identity of the
- * provided \a ring.
- *
- * \parblock
+ * - #descriptors::add_identity: the matrix \f$ A \f$ is instead interpreted
+ * as \f$ A + \mathbf{1} \f$, where \f$ \mathbf{1} \f$ is the one identity
+ * (i.e., multiplicative identity) of the given \a semiring.
+ * - #descriptors::invert_mask: \f$ u_i \f$ will be written to if and only if
+ * \f$ \mathit{u\_mask}_i \f$ evaluates false , and \f$ v_j \f$
+ * will be read from if and only if \f$ \mathit{v\_mask}_j \f$ evaluates
+ * false .
+ * - #descriptors::structural: when evaluating \f$ \mathit{u\_mask}_i \f$
+ *   or \f$ \mathit{v\_mask}_j \f$, only the structure of the masks is
+ *   considered, as opposed to considering their values.
+ * - #descriptors::structural_complement: a combination of two descriptors:
+ * #descriptors::structural and #descriptors::invert_mask.
+ * - #descriptors::use_index: when reading \f$ v_i \f$, then, if there is
+ * indeed a nonzero \f$ v_i \f$, use the value \f$ i \f$ instead. This
+ * casts the index from size_t to the \a InputType1 of \a v.
+ * - #descriptors::explicit_zero: if \f$ u_i \f$ was unassigned on entry and
+ * if \f$ (Av)_i \f$ is \f$ \mathbf{0} \f$, then instead of leaving
+ * \f$ u_i \f$ unassigned, it is set to \f$ \mathbf{0} \f$ explicitly.
+ * Here, \f$ \mathbf{0} \f$ is the additive identity of the provided
+ * \a semiring.
+ * - #descriptors::safe_overlap: the vectors \a u and \a v may now be the
+ *   same container. The user guarantees, however, that no race conditions
+ *   exist during the requested computation. The user may guarantee this
+ *   due to a very specific structure of \a A and \a v, or via an
+ *   intelligently constructed \a u_mask, for example.
+ *
+ * @returns #grb::SUCCESS If the computation completed successfully.
+ * @returns #grb::MISMATCH If there is at least one mismatch between vector
+ * dimensions or between vectors and the given matrix.
+ * @returns #grb::OVERLAP If two or more provided vectors refer to the same
+ * container while this was not allowed.
+ *
+ * When any of the above non-SUCCESS error codes is returned, it shall be as
+ * though the call was never made: the state of all container arguments and
+ * of the application remains unchanged, save for the returned error code.
+ *
+ * @returns grb::PANIC Indicates that the application has entered an undefined
+ * state.
+ *
+ * \note Should this error code be returned, the only sensible thing to do is
+ *       exit the application as soon as possible, while refraining from
+ *       using any other ALP primitives.
+ *
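+ * \par Example
+ * A minimal sketch of a doubly-masked call; the dimensions m and n, the
+ * population of the containers, and the plus-times semiring plusTimes over
+ * doubles are illustrative assumptions, not mandated by this specification:
+ *
+ * \code
+ * grb::Matrix< double > A( m, n );
+ * grb::Vector< double > u( m ), v( n );
+ * grb::Vector< bool > u_mask( m ), v_mask( n );
+ * // ... populate A, v, and (optionally) the masks ...
+ * grb::RC rc = grb::mxv( u, u_mask, A, v, v_mask, plusTimes );
+ * // on grb::SUCCESS, u_i += (Av)_i wherever u_mask_i evaluates true
+ * \endcode
+ *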
* \par Performance semantics
- * Performance semantics vary depending on whether a mask was provided, and on
- * whether the input vector is sparse or dense. If the input vector \f$ v \f$
- * is sparse, let \f$ J \f$ be its set of assigned indices. If a non-trivial
- * mask \f$ \mathit{mask} \f$ is given, let \f$ I \f$ be the set of indices for
- * which the corresponding \f$ \mathit{mask}_i \f$ evaluate true . Then:
- * -# For the performance guarantee on the amount of work this function
- * entails the following table applies:
- * \f$ \begin{tabular}{cccc}
- * Masked & Dense input & Sparse input \\
- * \noalign{\smallskip}
- * no & $\Theta(2\mathit{nnz}(A))$ & $\Theta(2\mathit{nnz}(A_{:,J}))$ \\
- * yes & $\Theta(2\mathit{nnz}(A_{I,:})$ & $\Theta(\min\{2\mathit{nnz}(A_{I,:}),2\mathit{nnz}(A_{:,J})\})$
- * \end{tabular}. \f$
- * -# For the amount of data movements, the following table applies:
- * \f$ \begin{tabular}{cccc}
- * Masked & Dense input & Sparse input \\
- * \noalign{\smallskip}
- * no & $\Theta(\mathit{nnz}(A)+\min\{m,n\}+m+n)$ & $\Theta(\mathit{nnz}(A_{:,J}+\min\{m,2|J|\}+|J|)+\mathcal{O}(2m)$ \\
- * yes & $\Theta(\mathit{nnz}(A_{I,:})+\min\{|I|,n\}+2|I|)+\mathcal{O}(n)$ &
- * $\Theta(\min\{\Theta(\mathit{nnz}(A_{I,:})+\min\{|I|,n\}+2|I|)+\mathcal{O}(n),\mathit{nnz}(A_{:,J}+\min\{m,|J|\}+2|J|)+\mathcal{O}(2m))$ \end{tabular}. \f$
- * -# A call to this function under no circumstance will allocate nor free
- * dynamic memory.
- * -# A call to this function under no circumstance will make system calls.
- * The above performance bounds may be changed by the following desciptors:
- * * #descriptors::invert_mask: replaces \f$ \Theta(|I|) \f$ data movement
- * costs with a \f$ \mathcal{O}(2m) \f$ cost instead, or a
- * \f$ \mathcal{O}(m) \f$ cost if #descriptors::structural was defined as
- * well (see below). In other words, implementations are not required to
- * implement inverted operations efficiently (\f$ 2\Theta(m-|I|) \f$ data
- * movements would be optimal but costs another \f$ \Theta(m) \f$ memory
- * to maintain).
- * * #descriptors::structural: removes \f$ \Theta(|I|) \f$ data movement
- * costs as the mask values need no longer be touched.
- * * #descriptors::add_identity: adds, at most, the costs of grb::foldl
- * (on vectors) to all performance metrics.
- * * #descriptors::use_index: removes \f$ \Theta(n) \f$ or
- * \f$ \Theta(|J|) \f$ data movement costs as the input vector values need
- * no longer be touched.
- * * #descriptors::in_place (see also above): turns \f$ \mathcal{O}(2m) \f$
- * data movements into \f$ \mathcal{O}(m) \f$ instead; i.e., it halves the
- * amount of data movements for writing the output.
- * * #descriptors::dense: the input, output, and mask vectors are assumed to
- * be dense. This allows the implementation to skip checks or other code
- * blocks related to handling of sparse vectors. This may result in use of
- * unitialised memory if any of the provided vectors were, in fact,
- * sparse.
- * Implementations that support multiple user processes must characterise data
- * movement between then.
- * \endparblock
- *
- * @returns grb::SUCCESS If the computation completed successfully.
- * @returns grb::MISMATCH If there is at least one mismatch between vector
- * dimensions or between vectors and the given matrix.
- * @returns grb::OVERLAP If two or more provided vectors refer to the same
- * vector.
- *
- * When a non-SUCCESS error code is returned, it shall be as though the call
- * was never made. Note that all GraphBLAS functions may additionally return
- * #grb::PANIC, which indicates the library has entered an undefined state; if
- * this error code is returned, the only sensible thing a user can do is exit,
- * or at least refrain from using any GraphBLAS functions for the remainder of
- * the application.
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Semiring,
+ typename IOType, typename InputType1, typename InputType2,
+ typename InputType3, typename InputType4,
+ typename Coords, typename RIT, typename CIT, typename NIT,
+ Backend backend
+ >
+ RC mxv(
+ Vector< IOType, backend, Coords > &u,
+ const Vector< InputType3, backend, Coords > &u_mask,
+ const Matrix< InputType2, backend, RIT, CIT, NIT > &A,
+ const Vector< InputType1, backend, Coords > &v,
+ const Vector< InputType4, backend, Coords > &v_mask,
+ const Semiring &semiring = Semiring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_semiring< Semiring >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ !grb::is_object< InputType4 >::value,
+ void >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cerr << "Selected backend does not implement mxv "
+ << "(doubly-masked, semiring)\n";
+#endif
+#ifndef NDEBUG
+ const bool selected_backend_does_not_support_doubly_masked_mxv_sr = false;
+ assert( selected_backend_does_not_support_doubly_masked_mxv_sr );
+#endif
+ (void) u;
+ (void) u_mask;
+ (void) A;
+ (void) v;
+ (void) v_mask;
+ (void) semiring;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Left-handed in-place doubly-masked sparse matrix times vector
+ * multiplication, \f$ u = u + vA \f$.
+ *
+ * A call to this function is exactly equivalent to calling
+ * - grb::mxv( u, u_mask, A, v, v_mask, semiring, phase )
+ * with the #descriptors::transpose_matrix flipped.
+ *
+ * See the documentation of #grb::mxv for the full semantics of this function.
+ * Like with #grb::mxv, aliases to this function exist that do not include
+ * masks:
+ * - grb::vxm( u, u_mask, v, A, semiring, phase );
+ * - grb::vxm( u, v, A, semiring, phase );
+ *
+ * Similarly, aliases to this function exist that take an additive commutative
+ * monoid and a multiplicative binary operator instead of a semiring.
+ *
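+ * \par Example
+ * For instance, assuming compatible containers and a semiring ring (names
+ * illustrative), the following two calls are semantically equivalent:
+ *
+ * \code
+ * grb::vxm( u, u_mask, v, v_mask, A, ring );
+ * grb::mxv< grb::descriptors::transpose_matrix >(
+ *     u, u_mask, A, v, v_mask, ring );
+ * \endcode
+ *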
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Semiring,
+ typename IOType, typename InputType1, typename InputType2,
+ typename InputType3, typename InputType4,
+ typename Coords, typename RIT, typename CIT, typename NIT,
+ enum Backend backend
+ >
+ RC vxm(
+ Vector< IOType, backend, Coords > &u,
+ const Vector< InputType3, backend, Coords > &u_mask,
+ const Vector< InputType1, backend, Coords > &v,
+ const Vector< InputType4, backend, Coords > &v_mask,
+ const Matrix< InputType2, backend, RIT, CIT, NIT > &A,
+ const Semiring &semiring = Semiring(),
+ const Phase &phase = EXECUTE,
+ typename std::enable_if<
+ grb::is_semiring< Semiring >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ !grb::is_object< InputType4 >::value &&
+ !grb::is_object< IOType >::value,
+ void >::type * = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cerr << "Selected backend does not implement doubly-masked grb::vxm\n";
+#endif
+#ifndef NDEBUG
+ const bool selected_backend_does_not_support_doubly_masked_vxm_sr = false;
+ assert( selected_backend_does_not_support_doubly_masked_vxm_sr );
+#endif
+ (void) u;
+ (void) u_mask;
+ (void) v;
+ (void) v_mask;
+ (void) A;
+ (void) semiring;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
+ /**
+ * Executes an arbitrary element-wise user-defined function \a f on all
+ * nonzero elements of a given matrix \a A.
+ *
+ * The user-defined function is passed as a lambda which can capture whatever
+ * the user would like, including one or multiple grb::Vector instances, or
+ * multiple scalars. When capturing vectors, these should also be passed as
+ * additional arguments to this function so as to make sure those vectors are
+ * synchronised for access on all row and column indices corresponding to
+ * locally stored nonzeroes of \a A.
+ *
+ * Only the elements of a single matrix may be iterated upon.
+ *
+ * \note Rationale: while it is reasonable to expect an implementation to
+ *       be able to synchronise vector elements, it may be unreasonable to
+ *       expect that two different matrices can be jointly accessed via
+ *       arbitrary lambda functions.
+ *
+ * \warning The lambda shall only be executed on the data local to the user
+ * process calling this function! This is different from the various
+ *          fold functions, or grb::dot, in that the semantics of those
+ *          functions always result in a globally synchronised result. To
+ * achieve the same effect with user-defined lambdas, the users
+ * should manually prescribe how to combine the local results into
+ * global ones, for instance, by subsequent calls to
+ * grb::collectives.
+ *
+ * \note This is an addition to the GraphBLAS. It is alike user-defined
+ * operators, monoids, and semirings, except it allows execution on
+ * arbitrarily many inputs and arbitrarily many outputs.
+ *
+ * @tparam Func the user-defined lambda function type.
+ * @tparam DataType the type of the user-supplied matrix.
+ * @tparam backend the backend type of the user-supplied vector example.
+ *
+ * @param[in] f The user-supplied lambda. This lambda should only capture
+ * and reference vectors of the same length as either the row or
+ * column dimension length of \a A. The lambda function should
+ * prescribe the operations required to execute on a given
+ * reference to a matrix nonzero of \a A (of type \a DataType) at
+ * a given index \f$ (i,j) \f$. Captured GraphBLAS vectors can
+ * access corresponding elements via Vector::operator[] or
+ * Vector::operator(). It is illegal to access any element not at
+ * position \a i if the vector length is equal to the row
+ * dimension. It is illegal to access any element not at position
+ * \a j if the vector length is equal to the column dimension.
+ *                Vectors of length equal to neither the row nor the column
+ *                dimension may \em not be referenced, or undefined
+ *                behaviour will occur. The
+ * reference to the matrix nonzero is non \a const and may thus be
+ * modified. New nonzeroes may \em not be added through this lambda
+ * functionality. The function \a f must have the following
+ * signature:
+ *                (DataType &nz, const size_t i, const size_t j) .
+ * The GraphBLAS implementation decides which nonzeroes of \a A are
+ * dereferenced, and thus also decides the values \a i and \a j the
+ * user function is evaluated on.
+ * @param[in] A The matrix the lambda is to access the elements of.
+ *
+ * The remaining arguments should enumerate all vectors the lambda is to access
+ * elements of. Each such vector must be of the same length as \a nrows(A) or
+ * \a ncols(A). If this constraint is violated, #grb::MISMATCH shall be returned.
+ * If a given vector length equals \a nrows(A), the vector shall be synchronised
+ * for access on \a i. If the vector length equals \a ncols(A), the vector shall
+ * be synchronised for access on \a j. If \a A is square, the vectors will be
+ * synchronised for access on both \a i \em and \a j.
+ *
+ * \note These vectors are passed using a variadic argument list and so may
+ * contain any number of containers of type #grb::Vector, potentially with
+ * differing nonzero types, as separate arguments.
+ *
+ * \warning Using a #grb::Vector inside a lambda passed to this function while
+ * not passing that same vector into the variadic argument list will
+ * result in undefined behaviour.
+ *
+ * \warning Due to the constraints on \a f described above, it is illegal to
+ * capture some vector \a y and have the following line in the body
+ *          of \a f: x[i] += x[i+1] . Vectors can only be
+ *          dereferenced at position \a i and \a i alone, and similarly for
+ *          access using \a j. For square matrices, however, the following
+ *          code in the body is accepted: x[i] += x[j] .
+ *
+ * @return grb::SUCCESS When the lambda is successfully executed.
+ * @return grb::MISMATCH When two or more vectors passed into the variadic
+ * argument list are not of appropriate length.
+ *
+ * \warning Captured scalars will be local to the user process executing the
+ *          lambda. To retrieve a globally reduced result, an allreduce must
+ *          explicitly be called.
+ *
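+ * \par Example
+ * An illustrative sketch that scales every nonzero of \a A by the
+ * corresponding entry of a hypothetical row-scaling vector d of length
+ * \a nrows(A); d is both captured and passed as a variadic argument so
+ * that it is synchronised for access on \a i:
+ *
+ * \code
+ * grb::RC rc = grb::eWiseLambda(
+ *     [&d]( double &nz, const size_t i, const size_t j ) {
+ *         (void) j;      // the column index is unused in this example
+ *         nz *= d[ i ];  // scale the nonzeroes of row i
+ *     }, A, d );
+ * \endcode
+ *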
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
+ template<
+ typename Func, typename DataType,
+ typename RIT, typename CIT, typename NIT,
+ Backend implementation = config::default_backend,
+ typename... Args
+ >
+ RC eWiseLambda(
+ const Func f,
+ const Matrix< DataType, implementation, RIT, CIT, NIT > &A,
+ Args...
+ ) {
+#ifdef _DEBUG
+ std::cerr << "Selected backend does not implement grb::eWiseLambda (matrices)\n";
+#endif
+#ifndef NDEBUG
+ const bool selected_backend_does_not_support_matrix_eWiseLambda = false;
+ assert( selected_backend_does_not_support_matrix_eWiseLambda );
+#endif
+ (void) f;
+ (void) A;
+ return UNSUPPORTED;
+ }
+
+ // default (non-)implementations follow:
+
+ /**
+ * Right-handed in-place masked sparse matrix--vector multiplication,
+ * \f$ u = u + Av \f$, over a given semiring.
+ *
+ * See the documentation of #grb::mxv for the full specification of this
+ * function.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
*/
template<
Descriptor descr = descriptors::no_operation,
@@ -247,7 +482,8 @@ namespace grb {
const Vector< InputType3, implementation, Coords > &mask,
const Matrix< InputType2, implementation, RIT, CIT, NIT > &A,
const Vector< InputType1, implementation, Coords > &v,
- const Ring &ring,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
typename std::enable_if<
grb::is_semiring< Ring >::value,
void >::type * = nullptr
@@ -259,20 +495,28 @@ namespace grb {
const bool backend_does_not_support_output_masked_mxv = false;
assert( backend_does_not_support_output_masked_mxv );
#endif
- (void)u;
- (void)mask;
- (void)A;
- (void)v;
- (void)ring;
+ (void) u;
+ (void) mask;
+ (void) A;
+ (void) v;
+ (void) ring;
+ (void) phase;
return UNSUPPORTED;
}
/**
- * A short-hand for an unmasked #grb::mxv.
+ * Right-handed in-place sparse matrix--vector multiplication,
+ * \f$ u = u + Av \f$, over a given semiring.
*
- * @see grb::mxv for the full documentation.
+ * See the documentation of #grb::mxv for the full specification of this
+ * function.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
*/
- template< Descriptor descr = descriptors::no_operation,
+ template<
+ Descriptor descr = descriptors::no_operation,
class Ring,
typename IOType, typename InputType1, typename InputType2,
typename Coords, typename RIT, typename CIT, typename NIT,
@@ -294,24 +538,24 @@ namespace grb {
const bool backend_does_not_support_mxv = false;
assert( backend_does_not_support_mxv );
#endif
- (void)u;
- (void)A;
- (void)v;
- (void)ring;
+ (void) u;
+ (void) A;
+ (void) v;
+ (void) ring;
return UNSUPPORTED;
}
/**
- * Left-handed sparse matrix times vector multiplication, \f$ u = vA \f$.
+ * Left-handed in-place masked sparse matrix--vector multiplication,
+ * \f$ u = u + vA \f$, over a given semiring.
*
- * If \a descr does not have #grb::descriptors::transpose_matrix defined, the
- * semantics and performance semantics of this function are exactly that of
- * grb::mxv with the #grb::descriptors::transpose_matrix set.
- * In the other case, the functional and performance semantics of this function
- * are exactly that of grb::mxv without the #grb::descriptors::transpose_matrix
- * set.
+ * See the documentation of #grb::vxm for the full specification of this
+ * function.
*
- * @see grb::mxv for the full documentation.
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
*/
template<
Descriptor descr = descriptors::no_operation,
@@ -326,7 +570,8 @@ namespace grb {
const Vector< InputType3, implementation, Coords > &mask,
const Vector< InputType1, implementation, Coords > &v,
const Matrix< InputType2, implementation, RIT, CIT, NIT > &A,
- const Ring &ring,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
typename std::enable_if<
grb::is_semiring< Ring >::value, void
>::type * = nullptr
@@ -338,18 +583,26 @@ namespace grb {
const bool selected_backend_does_not_support_output_masked_vxm = false;
assert( selected_backend_does_not_support_output_masked_vxm );
#endif
- (void)u;
- (void)mask;
- (void)v;
- (void)A;
- (void)ring;
+ (void) u;
+ (void) mask;
+ (void) v;
+ (void) A;
+ (void) ring;
+ (void) phase;
return UNSUPPORTED;
}
/**
- * A short-hand for an unmasked grb::vxm.
+ * Left-handed in-place sparse matrix--vector multiplication,
+ * \f$ u = u + vA \f$, over a given semiring.
+ *
+ * See the documentation of #grb::vxm for the full specification of this
+ * function.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
*
- * @see grb::vxm for the full documentation.
+ * @see perfSemantics
*/
template<
Descriptor descr = descriptors::no_operation,
@@ -362,7 +615,8 @@ namespace grb {
Vector< IOType, implementation, Coords > &u,
const Vector< InputType1, implementation, Coords > &v,
const Matrix< InputType2, implementation, RIT, CIT, NIT > &A,
- const Ring &ring,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
typename std::enable_if<
grb::is_semiring< Ring >::value, void
>::type * = nullptr
@@ -374,14 +628,26 @@ namespace grb {
const bool selected_backend_does_not_support_vxm = false;
assert( selected_backend_does_not_support_vxm );
#endif
- (void)u;
- (void)v;
- (void)A;
- (void)ring;
+ (void) u;
+ (void) v;
+ (void) A;
+ (void) ring;
+ (void) phase;
return UNSUPPORTED;
}
- /** TODO documentation */
+ /**
+ * Left-handed in-place doubly-masked sparse matrix--vector multiplication,
+ * \f$ u = u + vA \f$, over a given commutative additive monoid and any
+ * binary operator acting as multiplication.
+ *
+ * See the documentation of #grb::vxm for the full specification of this
+ * function.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
template<
Descriptor descr = descriptors::no_operation,
class AdditiveMonoid, class MultiplicativeOperator,
@@ -398,6 +664,7 @@ namespace grb {
const Matrix< InputType2, backend, RIT, CIT, NIT > &A,
const AdditiveMonoid &add = AdditiveMonoid(),
const MultiplicativeOperator &mul = MultiplicativeOperator(),
+ const Phase &phase = EXECUTE,
const typename std::enable_if<
grb::is_monoid< AdditiveMonoid >::value &&
grb::is_operator< MultiplicativeOperator >::value &&
@@ -416,17 +683,29 @@ namespace grb {
const bool selected_backed_does_not_support_doubly_masked_vxm = false;
assert( selected_backed_does_not_support_doubly_masked_vxm );
#endif
- (void)u;
- (void)mask;
- (void)v;
- (void)v_mask;
- (void)A;
- (void)add;
- (void)mul;
+ (void) u;
+ (void) mask;
+ (void) v;
+ (void) v_mask;
+ (void) A;
+ (void) add;
+ (void) mul;
+ (void) phase;
return UNSUPPORTED;
}
- /** TODO documentation */
+ /**
+ * Right-handed in-place doubly-masked sparse matrix--vector multiplication,
+ * \f$ u = u + Av \f$, over a given commutative additive monoid and any
+ * binary operator acting as multiplication.
+ *
+ * See the documentation of #grb::mxv for the full specification of this
+ * function.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
template<
Descriptor descr = descriptors::no_operation,
class AdditiveMonoid, class MultiplicativeOperator,
@@ -443,6 +722,7 @@ namespace grb {
const Vector< InputType4, backend, Coords > &v_mask,
const AdditiveMonoid &add = AdditiveMonoid(),
const MultiplicativeOperator &mul = MultiplicativeOperator(),
+ const Phase &phase = EXECUTE,
const typename std::enable_if<
grb::is_monoid< AdditiveMonoid >::value &&
grb::is_operator< MultiplicativeOperator >::value &&
@@ -461,17 +741,29 @@ namespace grb {
const bool selected_backed_does_not_support_doubly_masked_mxv = false;
assert( selected_backed_does_not_support_doubly_masked_mxv );
#endif
- (void)u;
- (void)mask;
- (void)A;
- (void)v;
- (void)v_mask;
- (void)add;
- (void)mul;
+ (void) u;
+ (void) mask;
+ (void) A;
+ (void) v;
+ (void) v_mask;
+ (void) add;
+ (void) mul;
+ (void) phase;
return UNSUPPORTED;
}
- /** TODO documentation */
+ /**
+ * Right-handed in-place masked sparse matrix--vector multiplication,
+ * \f$ u = u + Av \f$, over a given commutative additive monoid and any
+ * binary operator acting as multiplication.
+ *
+ * See the documentation of #grb::mxv for the full specification of this
+ * function.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
template<
Descriptor descr = descriptors::no_operation,
class AdditiveMonoid, class MultiplicativeOperator,
@@ -487,6 +779,7 @@ namespace grb {
const Vector< InputType1, backend, Coords > &v,
const AdditiveMonoid & add = AdditiveMonoid(),
const MultiplicativeOperator & mul = MultiplicativeOperator(),
+ const Phase &phase = EXECUTE,
const typename std::enable_if<
grb::is_monoid< AdditiveMonoid >::value &&
grb::is_operator< MultiplicativeOperator >::value &&
@@ -505,16 +798,28 @@ namespace grb {
const bool selected_backed_does_not_support_masked_monop_mxv = false;
assert( selected_backed_does_not_support_masked_monop_mxv );
#endif
- (void)u;
- (void)mask;
- (void)A;
- (void)v;
- (void)add;
- (void)mul;
+ (void) u;
+ (void) mask;
+ (void) A;
+ (void) v;
+ (void) add;
+ (void) mul;
+ (void) phase;
return UNSUPPORTED;
}
- /** TODO documentation */
+ /**
+ * Left-handed in-place sparse matrix--vector multiplication,
+ * \f$ u = u + vA \f$, over a given commutative additive monoid and any
+ * binary operator acting as multiplication.
+ *
+ * See the documentation of #grb::vxm for the full specification of this
+ * function.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
template<
Descriptor descr = descriptors::no_operation,
class AdditiveMonoid, class MultiplicativeOperator,
@@ -528,6 +833,7 @@ namespace grb {
const Matrix< InputType2, backend, RIT, CIT, NIT > &A,
const AdditiveMonoid &add = AdditiveMonoid(),
const MultiplicativeOperator &mul = MultiplicativeOperator(),
+ const Phase &phase = EXECUTE,
const typename std::enable_if<
grb::is_monoid< AdditiveMonoid >::value &&
grb::is_operator< MultiplicativeOperator >::value &&
@@ -545,15 +851,27 @@ namespace grb {
const bool selected_backed_does_not_support_monop_vxm = false;
assert( selected_backed_does_not_support_monop_vxm );
#endif
- (void)u;
- (void)v;
- (void)A;
- (void)add;
- (void)mul;
+ (void) u;
+ (void) v;
+ (void) A;
+ (void) add;
+ (void) mul;
+ (void) phase;
return UNSUPPORTED;
}
- /** TODO documentation */
+ /**
+ * Left-handed in-place masked sparse matrix--vector multiplication,
+ * \f$ u = u + vA \f$, over a given commutative additive monoid and any
+ * binary operator acting as multiplication.
+ *
+ * See the documentation of #grb::vxm for the full specification of this
+ * function.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
template<
Descriptor descr = descriptors::no_operation,
class AdditiveMonoid, class MultiplicativeOperator,
@@ -569,6 +887,7 @@ namespace grb {
const Matrix< InputType2, implementation, RIT, CIT, NIT > &A,
const AdditiveMonoid &add = AdditiveMonoid(),
const MultiplicativeOperator &mul = MultiplicativeOperator(),
+ const Phase &phase = EXECUTE,
typename std::enable_if<
grb::is_monoid< AdditiveMonoid >::value &&
grb::is_operator< MultiplicativeOperator >::value &&
@@ -585,16 +904,28 @@ namespace grb {
const bool selected_backed_does_not_support_masked_monop_vxm = false;
assert( selected_backed_does_not_support_masked_monop_vxm );
#endif
- (void)u;
- (void)mask;
- (void)v;
- (void)A;
- (void)add;
- (void)mul;
+ (void) u;
+ (void) mask;
+ (void) v;
+ (void) A;
+ (void) add;
+ (void) mul;
+ (void) phase;
return UNSUPPORTED;
}
- /** TODO documentation */
+ /**
+ * Right-handed in-place sparse matrix--vector multiplication,
+ * \f$ u = u + Av \f$, over a given commutative additive monoid and any
+ * binary operator acting as multiplication.
+ *
+ * See the documentation of #grb::vxm for the full specification of this
+ * function.
+ *
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ */
template<
Descriptor descr = descriptors::no_operation,
class AdditiveMonoid, class MultiplicativeOperator,
@@ -608,6 +939,7 @@ namespace grb {
const Vector< InputType1, backend, Coords > &v,
const AdditiveMonoid &add = AdditiveMonoid(),
const MultiplicativeOperator &mul = MultiplicativeOperator(),
+ const Phase &phase = EXECUTE,
const typename std::enable_if<
grb::is_monoid< AdditiveMonoid >::value &&
grb::is_operator< MultiplicativeOperator >::value &&
@@ -624,125 +956,11 @@ namespace grb {
const bool selected_backed_does_not_support_monop_mxv = false;
assert( selected_backed_does_not_support_monop_mxv );
#endif
- (void)u;
- (void)A;
- (void)v;
- (void)add;
- (void)mul;
- return UNSUPPORTED;
- }
-
- /**
- * Executes an arbitrary element-wise user-defined function \a f on all
- * nonzero elements of a given matrix \a A.
- *
- * The user-defined function is passed as a lambda which can capture whatever
- * the user would like, including one or multiple grb::Vector instances, or
- * multiple scalars. When capturing vectors, these should also be passed as a
- * additional arguments to this functions so to make sure those vectors are
- * synchronised for access on all row- and column- indices corresponding to
- * locally stored nonzeroes of \a A.
- *
- * Only the elements of a single matrix may be iterated upon.
- *
- * \note Rationale: while it is reasonable to expect an implementation be able
- * to synchronise vector elements, it may be unreasonable to expect two
- * different matrices can be jointly accessed via arbitrary lambda
- * functions.
- *
- * \warning The lambda shall only be executed on the data local to the user
- * process calling this function! This is different from the various
- * fold functions, or grb::dot, in that the semantics of those
- * functions always result in globally synchronised result. To
- * achieve the same effect with user-defined lambdas, the users
- * should manually prescribe how to combine the local results into
- * global ones, for instance, by subsequent calls to
- * grb::collectives.
- *
- * \note This is an addition to the GraphBLAS. It is alike user-defined
- * operators, monoids, and semirings, except it allows execution on
- * arbitrarily many inputs and arbitrarily many outputs.
- *
- * @tparam Func the user-defined lambda function type.
- * @tparam DataType the type of the user-supplied matrix.
- * @tparam backend the backend type of the user-supplied vector example.
- *
- * @param[in] f The user-supplied lambda. This lambda should only capture
- * and reference vectors of the same length as either the row or
- * column dimension length of \a A. The lambda function should
- * prescribe the operations required to execute on a given
- * reference to a matrix nonzero of \a A (of type \a DataType) at
- * a given index \f$ (i,j) \f$. Captured GraphBLAS vectors can
- * access corresponding elements via Vector::operator[] or
- * Vector::operator(). It is illegal to access any element not at
- * position \a i if the vector length is equal to the row
- * dimension. It is illegal to access any element not at position
- * \a j if the vector length is equal to the column dimension.
- * Vectors of length neither equal to the column or row dimension
- * may \em not be referenced or undefined behaviour will occur. The
- * reference to the matrix nonzero is non \a const and may thus be
- * modified. New nonzeroes may \em not be added through this lambda
- * functionality. The function \a f must have the following
- * signature:
- * (DataType &nz, const size_t i, const size_t j) .
- * The GraphBLAS implementation decides which nonzeroes of \a A are
- * dereferenced, and thus also decides the values \a i and \a j the
- * user function is evaluated on.
- * @param[in] A The matrix the lambda is to access the elements of.
- * @param[in] args All vectors the lambda is to access elements of. Must be of
- * the same length as \a nrows(A) or \a ncols(A). If this
- * constraint is violated, grb::MISMATCH shall be returned. If
- * the vector length equals \a nrows(A), the vector shall be
- * synchronized for access on \a i. If the vector length equals
- * \a ncols(A), the vector shall be synchronized for access on
- * \a j. If \a A is square, the vectors will be synchronised for
- * access on both \a x and \a y. This is a variadic argument
- * and can contain any number of containers of type grb::Vector,
- * passed as though they were separate arguments.
- *
- * \warning Using a grb::Vector inside a lambda passed to this function while
- * not passing that same vector into \a args, will result in undefined
- * behaviour.
- *
- * \warning Due to the constraints on \a f described above, it is illegal to
- * capture some vector \a y and have the following line in the body
- * of \a f: x[i] += x[i+1] . Vectors can only be
- * dereferenced at position \a i and \a i alone, and similarly for
- * access using \a j. For square matrices, however, the following
- * code in the body is accepted, however: x[i] += x[j] .
- *
- * @return grb::SUCCESS When the lambda is successfully executed.
- * @return grb::MISMATCH When two or more vectors passed to \a args are not of
- * appropriate length.
- *
- * \warning Captured scalars will be local to the user process executing the
- * lambda. To retrieve the global dot product, an allreduce must
- * explicitly be called.
- *
- * @see Vector::operator[]()
- * @see Vector::operator()()
- * @see Vector::lambda_reference
- */
- template<
- typename Func, typename DataType,
- typename RIT, typename CIT, typename NIT,
- Backend implementation = config::default_backend,
- typename... Args
- >
- RC eWiseLambda(
- const Func f,
- const Matrix< DataType, implementation, RIT, CIT, NIT > &A,
- Args... /*args*/
- ) {
-#ifdef _DEBUG
- std::cerr << "Selected backend does not implement grb::eWiseLambda (matrices)\n";
-#endif
-#ifndef NDEBUG
- const bool selected_backend_does_not_support_matrix_eWiseLamba = false;
- assert( selected_backend_does_not_support_matrix_eWiseLamba );
-#endif
- (void)f;
- (void)A;
+ (void) u;
+ (void) A;
+ (void) v;
+ (void) add;
+ (void) mul;
+ (void) phase;
return UNSUPPORTED;
}
diff --git a/include/graphblas/base/blas3.hpp b/include/graphblas/base/blas3.hpp
index 02965eee4..2aab1be2a 100644
--- a/include/graphblas/base/blas3.hpp
+++ b/include/graphblas/base/blas3.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Defines the ALP/GraphBLAS level-3 API
+ *
* @author A. N. Yzelman
*/
@@ -28,10 +32,12 @@
#include "matrix.hpp"
#include "vector.hpp"
+
namespace grb {
/**
- * \defgroup BLAS3 The Level-3 Basic Linear Algebra Subroutines (BLAS)
+ * \defgroup BLAS3 Level-3 Primitives
+ * \ingroup GraphBLAS
*
* A collection of functions that allow GraphBLAS semirings to work on
* one or more two-dimensional sparse containers (i.e, sparse matrices).
@@ -40,9 +46,11 @@ namespace grb {
*/
/**
- * Unmaked sparse matrix--sparse matrix multiplication (SpMSpM).
+ * Unmasked and in-place sparse matrix--sparse matrix multiplication (SpMSpM),
+ * \f$ C += AB \f$.
*
* @tparam descr The descriptors under which to perform the computation.
+ * Optional; default is #grb::descriptors::no_operation.
* @tparam OutputType The type of elements in the output matrix.
* @tparam InputType1 The type of elements in the left-hand side input
* matrix.
@@ -50,23 +58,31 @@ namespace grb {
* matrix.
* @tparam Semiring The semiring under which to perform the
* multiplication.
- * @tparam Backend The backend that should perform the computation.
- *
- * @returns SUCCESS If the computation completed as intended.
- * @returns FAILED If the call was not not preceded by one to
- * #grb::resize( C, A, B ); \em and the current capacity of
- * \a C was insufficient to store the multiplication of \a A
- * and \a B. The contents of \a C shall be undefined (which
- * is why #FAILED is returned instead of #ILLEGAL-- this
- * error has side effects).
- *
- * @param[out] C The output matrix \f$ C = AB \f$ when the function returns
- * #SUCCESS.
- * @param[in] A The left-hand side input matrix \f$ A \f$.
- * @param[in] B The left-hand side input matrix \f$ B \f$.
- *
- * @param[in] ring (Optional.) The semiring under which the computation should
- * proceed.
+ *
+ * @param[in,out] C The matrix into which the multiplication \f$ AB \f$ is
+ * accumulated.
+ * @param[in] A The left-hand side input matrix \f$ A \f$.
+ * @param[in] B The right-hand side input matrix \f$ B \f$.
+ *
+ * @param[in] ring The semiring under which the computation should
+ * proceed.
+ * @param[in] phase The #grb::Phase the primitive should be executed with. This
+ * argument is optional; its default is #grb::EXECUTE.
+ *
+ * @return #grb::SUCCESS If the computation completed as intended.
+ * @return #grb::FAILED If the capacity of \a C was insufficient to store the
+ * output of multiplying \a A and \a B. If this code is
+ * returned, \a C on output appears cleared.
+ * @return #grb::OUTOFMEM If \a phase is #grb::RESIZE and an out-of-memory
+ *                        condition arose while resizing \a C.
+ *
+ * \note This specification does not account for #grb::TRY as that phase is
+ * still experimental. See its documentation for details.
+ *
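+ * \par Example
+ * An illustrative two-phase call sequence; the dimensions and a previously
+ * constructed plus-times semiring plusTimes over doubles are assumptions
+ * made for the sake of the example:
+ *
+ * \code
+ * grb::Matrix< double > C( m, n ), A( m, k ), B( k, n );
+ * // ... populate A and B ...
+ * // first ensure sufficient capacity of C, then execute the multiplication
+ * grb::RC rc = grb::mxm( C, A, B, plusTimes, grb::RESIZE );
+ * if( rc == grb::SUCCESS ) {
+ *     rc = grb::mxm( C, A, B, plusTimes, grb::EXECUTE );
+ * }
+ * \endcode
+ *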
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
*/
template<
Descriptor descr = descriptors::no_operation,
@@ -83,63 +99,87 @@ namespace grb {
const Phase &phase = EXECUTE
) {
#ifdef _DEBUG
- std::cerr << "Selected backend does not implement grb::mxm (semiring version)\n";
+ std::cerr << "Selected backend does not implement grb::mxm "
+ << "(semiring version)\n";
#endif
#ifndef NDEBUG
const bool selected_backend_does_not_support_mxm = false;
assert( selected_backend_does_not_support_mxm );
#endif
- (void)C;
- (void)A;
- (void)B;
- (void)ring;
- (void)phase;
+ (void) C;
+ (void) A;
+ (void) B;
+ (void) ring;
+ (void) phase;
// this is the generic stub implementation
return UNSUPPORTED;
}
/**
- * Interprets three vectors x, y, and z as a series of row coordinates,
- * column coordinates, and nonzeroes, respectively, and stores the thus
- * defined nonzeroes in a given output matrix A.
+ * The #grb::zip primitive merges three vectors into a matrix.
+ *
+ * Interprets three input vectors \a x, \a y, and \a z as a series of row
+ * coordinates, column coordinates, and nonzeroes, respectively. The
+ * thus-defined nonzeroes of a matrix are then stored in a given output
+ * matrix \a A.
+ *
+ * The vectors \a x, \a y, and \a z must have equal length, as well as the same
+ * number of nonzeroes. If the vectors are sparse, all vectors must have the
+ * same sparsity structure.
*
- * If this function does not return SUCCESS, A will have been cleared.
+ * \note A variant of this function takes only \a x and \a y; in that
+ *       variant, the output matrix \a A has void element types.
*
- * A must have been pre-allocated to store the nonzero pattern the three
- * given vectors x, y, and z encode, or ILLEGAL shall be returned.
+ * If this function does not return #grb::SUCCESS, the output \a A will have
+ * no contents on function exit.
*
- * \note A call to this function hence must be preceded by a successful
- * call to grb::resize( matrix, nnz );
+ * The matrix \a A must have been pre-allocated to store the nonzero pattern
+ * that the three given vectors \a x, \a y, and \a z encode, or otherwise this
+ * function returns #grb::FAILED.
*
- * @param[out] A The output matrix
- * @param[in] x A vector of row indices.
- * @param[in] y A vector of column indices.
- * @param[in] z A vector of nonzero values.
+ * \note To ensure that the capacity of \a A is sufficient, a successful call
+ *       to #grb::resize with #grb::nnz of \a x suffices. Alternatively, and
+ *       with the same effect, a successful call to this function with
+ *       \a phase equal to #grb::RESIZE instead of #grb::EXECUTE suffices
+ *       also.
*
- * If x, y, and z are sparse, they must have the exact same sparsity
- * structure.
+ * @param[out] A The output matrix.
+ * @param[in] x A vector of row indices.
+ * @param[in] y A vector of column indices.
+ * @param[in] z A vector of nonzero values.
+ * @param[in] phase The #grb::Phase in which the primitive is to proceed.
+ * Optional; the default is #grb::EXECUTE.
*
+ * @return #grb::SUCCESS If \a A was constructed successfully.
+ * @return #grb::MISMATCH If \a y or \a z does not match the size of \a x.
+ * @return #grb::ILLEGAL If \a y or \a z do not have the same number of
+ * nonzeroes as \a x.
+ * @return #grb::ILLEGAL If \a y or \a z has a different sparsity pattern from
+ * \a x.
+ * @return #grb::FAILED If the capacity of \a A was insufficient to store the
+ * given sparsity pattern and \a phase is #grb::EXECUTE.
+ * @return #grb::OUTOFMEM If the \a phase is #grb::RESIZE and \a A could not be
+ * resized to have sufficient capacity to complete this
+ * function due to out-of-memory conditions.
+ *
+ * \parblock
* \par Descriptors
*
* None allowed.
+ * \endparblock
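+ *
+ * \par Example
+ * An illustrative sketch; the number of nonzeroes nz and the matrix
+ * dimensions are assumptions made for the sake of the example:
+ *
+ * \code
+ * grb::Vector< size_t > x( nz ), y( nz );
+ * grb::Vector< double > z( nz );
+ * grb::Matrix< double > A( m, n );
+ * // ... fill x with row indices, y with column indices, z with values ...
+ * grb::RC rc = grb::resize( A, grb::nnz( x ) );
+ * if( rc == grb::SUCCESS ) {
+ *     rc = grb::zip( A, x, y, z );
+ * }
+ * \endcode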
*
- * @returns SUCCESS If A was constructed successfully.
- * @returns MISMATCH If y or z does not match the size of x.
- * @returns ILLEGAL If y or z do not have the same number of nonzeroes
- * as x.
- * @returns ILLEGAL If y or z has a different sparsity pattern from x.
- * @returns ILLEGAL If the capacity of A was insufficient to store the
- * given sparsity pattern.
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
*
- * @see grb::resize
+ * @see perfSemantics
*/
template<
Descriptor descr = descriptors::no_operation,
- typename OutputType, typename InputType1, typename InputType2, typename InputType3,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename InputType3, typename RIT, typename CIT, typename NIT,
Backend backend, typename Coords
>
RC zip(
- Matrix< OutputType, backend > &A,
+ Matrix< OutputType, backend, RIT, CIT, NIT > &A,
const Vector< InputType1, backend, Coords > &x,
const Vector< InputType2, backend, Coords > &y,
const Vector< InputType3, backend, Coords > &z,
@@ -150,7 +190,8 @@ namespace grb {
(void) z;
(void) phase;
#ifdef _DEBUG
- std::cerr << "Selected backend does not implement grb::zip (vectors into matrices, non-void)\n";
+ std::cerr << "Selected backend does not implement grb::zip (vectors into "
+ << "matrices, non-void)\n";
#endif
#ifndef NDEBUG
const bool selected_backend_does_not_support_zip_from_vectors_to_matrix
@@ -162,16 +203,25 @@ namespace grb {
}
/**
- * Specialisation of grb::zip for void output matrices.
+ * Merges two vectors into a void matrix.
+ *
+ * This is a specialisation of #grb::zip for pattern matrices. The two input
+ * vectors \a x and \a y represent coordinates of nonzeroes to be stored in
+ * \a A.
+ *
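+ * \par Example
+ * For instance, given coordinate vectors x and y as in the non-void variant
+ * (names illustrative):
+ *
+ * \code
+ * grb::Matrix< void > P( m, n );
+ * grb::RC rc = grb::zip( P, x, y );
+ * \endcode
+ *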
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
*/
template<
Descriptor descr = descriptors::no_operation,
typename InputType1, typename InputType2, typename InputType3,
- Backend backend,
- typename Coords
+ typename RIT, typename CIT, typename NIT,
+ Backend backend, typename Coords
>
RC zip(
- Matrix< void, backend > &A,
+ Matrix< void, backend, RIT, CIT, NIT > &A,
const Vector< InputType1, backend, Coords > &x,
const Vector< InputType2, backend, Coords > &y,
const Phase &phase = EXECUTE
@@ -180,7 +230,8 @@ namespace grb {
(void) y;
(void) phase;
#ifdef _DEBUG
- std::cerr << "Selected backend does not implement grb::zip (vectors into matrices, void)\n";
+ std::cerr << "Selected backend does not implement grb::zip (vectors into "
+ << "matrices, void)\n";
#endif
#ifndef NDEBUG
const bool selected_backend_does_not_support_zip_from_vectors_to_void_matrix
diff --git a/include/graphblas/base/collectives.hpp b/include/graphblas/base/collectives.hpp
index d9af31523..a77638fed 100644
--- a/include/graphblas/base/collectives.hpp
+++ b/include/graphblas/base/collectives.hpp
@@ -15,7 +15,12 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Specifies some basic collectives which may be used within a multi-process
+ * ALP program.
+ *
* @author A. N. Yzelman & J. M. Nash
* @date 20th of February, 2017
*/
@@ -27,6 +32,7 @@
#include
#include
+
namespace grb {
/**
@@ -39,219 +45,229 @@ namespace grb {
template< enum Backend implementation >
class collectives {
- private:
- /** Disallow creating an instance. */
- collectives() {}
+ private:
- public:
- /**
- * Schedules an allreduce operation of a single object of type IOType per
- * process. The allreduce shall be complete by the end of the call. This is a
- * collective graphBLAS operation. After the collective call finishes, each
- * user process will locally have available the allreduced value.
- *
- * Since this is a collective call, there are \a P values \a inout spread over
- * all user processes. Let these values be denoted by \f$ x_s \f$, with
- * \f$ s \in \{ 0, 1, \ldots, P-1 \}, \f$ such that \f$ x_s \f$ equals the
- * argument \a inout on input at the user process with ID \a s. Let
- * \f$ \pi:\ \{ 0, 1, \ldots, P-1 \} \to \{ 0, 1, \ldots, P-1 \} \f$ be a
- * bijection, some unknown permutation of the process ID. This permutation is
- * must be fixed for any given combination of GraphBLAS implementation and value
- * \a P. Let the binary operator \a op be denoted by \f$ \odot \f$.
- *
- * This function computes \f$ \odot_{i=0}^{P-1} x_{\pi(i)} \f$ and writes the
- * exact same result to \a inout at each of the \a P user processes.
- *
- * In summary, this means 1) this operation is coherent across all processes and
- * produces bit-wise equivalent output on all user processes, and 2) the result
- * is reproducible across different runs using the same input and \a P. Yet it
- * does \em not mean that the order of addition is fixed.
- *
- * Since each user process supplies but one value, there is no difference
- * between a reduce-to-the-left versus a reduce-to-the-right (see grb::reducel
- * and grb::reducer).
- *
- * @tparam descr The GraphBLAS descriptor.
- * Default is grb::descriptors::no_operation.
- * @tparam Operator Which operator to use for reduction.
- * @tparam IOType The type of the to-be reduced value.
- *
- * @param[in,out] inout On input: the value at the calling process to be
- * reduced. On output: the reduced value.
- * @param[in] op The associative operator to reduce by.
- *
- * \note If \op is commutative, the implementation free to employ a different
- * allreduce algorithm, as long as it is documented well enough so that
- * its cost can be quantified.
- *
- * @returns grb::SUCCESS When the operation succeeds as planned.
- * @returns grb::PANIC When the communication layer unexpectedly fails. When
- * this error code is returned, the library enters an
- * undefined state.
- *
- * \parblock
- * \par Valid descriptors:
- * -# grb::descriptors::no_operation
- * -# grb::descriptors::no_casting
- * Any other descriptors will be ignored.
- * \endparblock
- *
- * \parblock
- * \par Performance semantics:
- * -# Problem size N: \f$ P * \mathit{sizeof}(\mathit{IOType}) \f$
- * -# local work: \f$ N*Operator \f$ ;
- * -# transferred bytes: \f$ N \f$ ;
- * -# BSP cost: \f$ Ng + N*Operator + l \f$;
- * \endparblock
- */
- template< Descriptor descr = descriptors::no_operation, typename Operator, typename IOType >
- static RC allreduce( IOType & inout, const Operator op = Operator() ) {
- (void)inout;
- (void)op;
- return PANIC;
- }
+ /** Disallow creating an instance. */
+ collectives() {}
- /**
- * Schedules a reduce operation of a single object of type IOType per process.
- * The reduce shall be complete by the end of the call. This is a collective
- * graphBLAS operation. The BSP costs are as for the PlatformBSP #reduce.
- *
- * Since this is a collective call, there are \a P values \a inout spread over
- * all user processes. Let these values be denoted by \f$ x_s \f$, with
- * \f$ s \in \{ 0, 1, \ldots, P-1 \}, \f$ such that \f$ x_s \f$ equals the
- * argument \a inout on input at the user process with ID \a s. Let
- * \f$ \pi:\ \{ 0, 1, \ldots, P-1 \} \to \{ 0, 1, \ldots, P-1 \} \f$ be a
- * bijection, some unknown permutation of the process ID. This permutation is
- * must be fixed for any given combination of GraphBLAS implementation and value
- * \a P. Let the binary operator \a op be denoted by \f$ \odot \f$.
- *
- * This function computes \f$ \odot_{i=0}^{P-1} x_{\pi(i)} \f$ and writes the
- * result to \a inout at the user process with ID \a root.
- *
- * In summary, this the result is reproducible across different runs using the
- * same input and \a P. Yet it does \em not mean that the order of addition is
- * fixed.
- *
- * Since each user process supplies but one value, there is no difference
- * between a reduce-to-the-left versus a reduce-to-the-right (see grb::reducel
- * and grb::reducer).
- *
- * @tparam descr The GraphBLAS descriptor.
- * Default is grb::descriptors::no_operation.
- * @tparam Operator Which operator to use for reduction.
- * @tparam IOType The type of the to-be reduced value.
- *
- * @param[in,out] inout On input: the value at the calling process to be
- * reduced. On output at process \a root: the reduced value.
- * On output as non-root processes: same value as on input.
- * @param[in] op The associative operator to reduce by.
- * @param[in] root Which process should hold the reduced value. This
- * number must be larger or equal to zero, and must be
- * strictly smaller than the number of user processes
- * \a P.
- *
- * @return SUCCESS When the function completes successfully.
- * @return ILLEGAL When root is larger or equal than \a P. When this code is
- * returned, the state of the GraphBLAS shall be as though
- * this call was never made.
- * @return PANIC When an unmitigable error within the GraphBLAS occurs.
- * Upon returning this error, the GraphBLAS enters an
- * undefined state.
- *
- * \note If \op is commutative, the implementation free to employ a different
- * allreduce algorithm, as long as it is documented well enough so that
- * its cost can be quantified.
- *
- * \parblock
- * \par Performance semantics:
- * -# Problem size N: \f$ P * \mathit{sizeof}(\mathit{IOType}) \f$
- * -# local work: \f$ N*Operator \f$ ;
- * -# transferred bytes: \f$ N \f$ ;
- * -# BSP cost: \f$ Ng + N*Operator + l \f$;
- * \endparblock
- */
- template< Descriptor descr = descriptors::no_operation, typename Operator, typename IOType >
- static RC reduce( IOType & inout, const size_t root = 0, const Operator op = Operator() ) {
- (void)inout;
- (void)op;
- (void)root;
- return PANIC;
- }
- /**
- * Schedules a broadcast operation of a single object of type IOType per
- * process. The broadcast shall be complete by the end of the call. This is
- * a collective graphBLAS operation. The BSP costs are as for the PlatformBSP
- * #broadcast.
- *
- * @tparam IOType The type of the to-be broadcast value.
- *
- * @param[in,out] inout On input at process \a root: the value to be
- * broadcast.
- * On input at non-root processes: initial values are
- * ignored.
- * On output at process \a root: the input value remains
- * unchanged.
- * On output at non-root processes: the same value held
- * at process ID \a root.
- * @param[in] root The user process which is to send out the given input
- * value \a inout so that it becomes available at all
- * \a P user processes. This value must be larger or
- * equal to zero and must be smaller than the total
- * number of user processes \a P.
- *
- * @return SUCCESS On the successful completion of this function.
- * @return ILLEGAL When \a root is larger or equal to \a P. If this code is
- * returned, it shall be as though the call to this function
- * had never occurred.
- * return PANIC When the function fails and the library enters an
- * undefined state.
- *
- * \parblock
- * \par Performance semantics: serial
- * -# Problem size N: \f$ \mathit{sizeof}(\mathit{IOType}) \f$
- * -# local work: \f$ 0 \f$ ;
- * -# transferred bytes: \f$ NP \f$ ;
- * -# BSP cost: \f$ NPg + l \f$;
- * \endparblock
- *
- * \par Performance semantics: two phase
- * -# Problem size N: \f$ \mathit{sizeof}(\mathit{IOType}) \f$
- * -# local work: \f$ 0 \f$ ;
- * -# transferred bytes: \f$ 2N \f$ ;
- * -# BSP cost: \f$ 2(Ng + l) \f$;
- * \endparblock
- *
- * \par Performance semantics: two level tree
- * -# Problem size N: \f$ \mathit{sizeof}(\mathit{IOType}) \f$
- * -# local work: \f$ 0 \f$ ;
- * -# transferred bytes: \f$ 2\sqrt{P}N \f$ ;
- * -# BSP cost: \f$ 2(\sqrt{P}Ng + l) \f$;
- * \endparblock
- */
- template< typename IOType >
- static RC broadcast( IOType &inout, const size_t root = 0 ) {
- (void)inout;
- (void)root;
- return PANIC;
- }
+ public:
- /**
- * Broadcast on an array of \a IOType.
- *
- * The above documentation applies with \a size times sizeof(IOType)
- * substituted in.
- */
- template< Descriptor descr = descriptors::no_operation, typename IOType >
- static RC broadcast( IOType * inout, const size_t size, const size_t root = 0 ) {
- (void)inout;
- (void)size;
- (void)root;
- return PANIC;
- }
+ /**
+ * Schedules an allreduce operation of a single object of type IOType per
+ * process. The allreduce shall be complete by the end of the call. This is a
+ * collective graphBLAS operation. After the collective call finishes, each
+ * user process will locally have available the allreduced value.
+ *
+ * Since this is a collective call, there are \a P values \a inout spread over
+ * all user processes. Let these values be denoted by \f$ x_s \f$, with
+ * \f$ s \in \{ 0, 1, \ldots, P-1 \}, \f$ such that \f$ x_s \f$ equals the
+ * argument \a inout on input at the user process with ID \a s. Let
+ * \f$ \pi:\ \{ 0, 1, \ldots, P-1 \} \to \{ 0, 1, \ldots, P-1 \} \f$ be a
+ * bijection, some unknown permutation of the process ID. This permutation is
+ * must be fixed for any given combination of GraphBLAS implementation and value
+ * \a P. Let the binary operator \a op be denoted by \f$ \odot \f$.
+ *
+ * This function computes \f$ \odot_{i=0}^{P-1} x_{\pi(i)} \f$ and writes the
+ * exact same result to \a inout at each of the \a P user processes.
+ *
+ * In summary, this means 1) this operation is coherent across all processes and
+ * produces bit-wise equivalent output on all user processes, and 2) the result
+ * is reproducible across different runs using the same input and \a P. Yet it
+ * does \em not mean that the order of addition is fixed.
+ *
+ * Since each user process supplies but one value, there is no difference
+ * between a reduce-to-the-left versus a reduce-to-the-right (see grb::reducel
+ * and grb::reducer).
+ *
+ * @tparam descr The GraphBLAS descriptor.
+ * Default is grb::descriptors::no_operation.
+ * @tparam Operator Which operator to use for reduction.
+ * @tparam IOType The type of the to-be reduced value.
+ *
+ * @param[in,out] inout On input: the value at the calling process to be
+ * reduced. On output: the reduced value.
+ * @param[in] op The associative operator to reduce by.
+ *
+ * \note If \a op is commutative, the implementation is free to employ a
+ * different allreduce algorithm, as long as it is documented well
+ * enough so that its cost can be quantified.
+ *
+ * @returns grb::SUCCESS When the operation succeeds as planned.
+ * @returns grb::PANIC When the communication layer unexpectedly fails. When
+ * this error code is returned, the library enters an
+ * undefined state.
+ *
+ * \parblock
+ * \par Valid descriptors:
+ * -# grb::descriptors::no_operation
+ * -# grb::descriptors::no_casting
+ * Any other descriptors will be ignored.
+ * \endparblock
+ *
+ * \parblock
+ * \par Performance semantics:
+ * -# Problem size N: \f$ P * \mathit{sizeof}(\mathit{IOType}) \f$
+ * -# local work: \f$ N*Operator \f$ ;
+ * -# transferred bytes: \f$ N \f$ ;
+ * -# BSP cost: \f$ Ng + N*Operator + l \f$;
+ * \endparblock
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename Operator,
+ typename IOType
+ >
+ static RC allreduce( IOType &inout, const Operator op = Operator() ) {
+ (void) inout;
+ (void)op;
+ return PANIC;
+ }
+
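For concreteness, a minimal usage sketch follows. It is illustrative only (not
part of this patch) and assumes a backend that actually implements the
primitive, since this base version always returns PANIC, as well as the
standard grb::operators::add operator:

    // each user process contributes one local value; after the call, all
    // processes hold the bit-wise identical global sum
    double local = 1.0; // e.g., a locally computed partial result
    grb::RC rc = grb::collectives< grb::config::default_backend >::allreduce(
        local, grb::operators::add< double >() );
    // on grb::SUCCESS, local now equals the sum over all P processes
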
+ /**
+ * Schedules a reduce operation of a single object of type IOType per process.
+ * The reduce shall be complete by the end of the call. This is a collective
+ * graphBLAS operation. The BSP costs are as for the PlatformBSP #reduce.
+ *
+ * Since this is a collective call, there are \a P values \a inout spread over
+ * all user processes. Let these values be denoted by \f$ x_s \f$, with
+ * \f$ s \in \{ 0, 1, \ldots, P-1 \}, \f$ such that \f$ x_s \f$ equals the
+ * argument \a inout on input at the user process with ID \a s. Let
+ * \f$ \pi:\ \{ 0, 1, \ldots, P-1 \} \to \{ 0, 1, \ldots, P-1 \} \f$ be a
+ * bijection, some unknown permutation of the process ID. This permutation
+ * must be fixed for any given combination of GraphBLAS implementation and value
+ * \a P. Let the binary operator \a op be denoted by \f$ \odot \f$.
+ *
+ * This function computes \f$ \odot_{i=0}^{P-1} x_{\pi(i)} \f$ and writes the
+ * result to \a inout at the user process with ID \a root.
+ *
+ * In summary, the result is reproducible across different runs using the
+ * same input and \a P. Yet it does \em not mean that the order of addition is
+ * fixed.
+ *
+ * Since each user process supplies but one value, there is no difference
+ * between a reduce-to-the-left versus a reduce-to-the-right (see grb::reducel
+ * and grb::reducer).
+ *
+ * @tparam descr The GraphBLAS descriptor.
+ * Default is grb::descriptors::no_operation.
+ * @tparam Operator Which operator to use for reduction.
+ * @tparam IOType The type of the to-be reduced value.
+ *
+ * @param[in,out] inout On input: the value at the calling process to be
+ * reduced. On output at process \a root: the reduced value.
+ * On output as non-root processes: same value as on input.
+ * @param[in] op The associative operator to reduce by.
+ * @param[in] root Which process should hold the reduced value. This
+ * number must be larger or equal to zero, and must be
+ * strictly smaller than the number of user processes
+ * \a P.
+ *
+ * @return SUCCESS When the function completes successfully.
+ * @return ILLEGAL When \a root is larger or equal to \a P. When this code is
+ * returned, the state of the GraphBLAS shall be as though
+ * this call was never made.
+ * @return PANIC When an unmitigable error within the GraphBLAS occurs.
+ * Upon returning this error, the GraphBLAS enters an
+ * undefined state.
+ *
+ * \note If \a op is commutative, the implementation is free to employ a
+ * different reduce algorithm, as long as the performance semantics
+ * are documented so that its cost can be quantified.
+ *
+ * \parblock
+ * \par Performance semantics:
+ * -# Problem size N: \f$ P * \mathit{sizeof}(\mathit{IOType}) \f$
+ * -# local work: \f$ N*Operator \f$ ;
+ * -# transferred bytes: \f$ N \f$ ;
+ * -# BSP cost: \f$ Ng + N*Operator + l \f$;
+ * \endparblock
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename Operator,
+ typename IOType
+ >
+ static RC reduce(
+ IOType &inout,
+ const size_t root = 0,
+ const Operator op = Operator()
+ ) {
+ (void) inout;
+ (void) op;
+ (void) root;
+ return PANIC;
+ }
+
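A matching sketch for reduce, under the same assumptions as the allreduce
sketch above; note that only the root process observes the reduced value:

    double local = 1.0;
    grb::RC rc = grb::collectives< grb::config::default_backend >::reduce(
        local, 0, grb::operators::add< double >() );
    // at the process with ID 0, local now holds the global sum; at all
    // other processes it is unchanged. A root argument >= P would instead
    // yield grb::ILLEGAL.
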
+ /**
+ * Schedules a broadcast operation of a single object of type IOType per
+ * process. The broadcast shall be complete by the end of the call. This is
+ * a collective graphBLAS operation. The BSP costs are as for the PlatformBSP
+ * #broadcast.
+ *
+ * @tparam IOType The type of the to-be broadcast value.
+ *
+ * @param[in,out] inout On input at process \a root: the value to be
+ * broadcast.
+ * On input at non-root processes: initial values are
+ * ignored.
+ * On output at process \a root: the input value remains
+ * unchanged.
+ * On output at non-root processes: the same value held
+ * at process ID \a root.
+ * @param[in] root The user process which is to send out the given input
+ * value \a inout so that it becomes available at all
+ * \a P user processes. This value must be larger or
+ * equal to zero and must be smaller than the total
+ * number of user processes \a P.
+ *
+ * @return SUCCESS On the successful completion of this function.
+ * @return ILLEGAL When \a root is larger or equal to \a P. If this code is
+ * returned, it shall be as though the call to this function
+ * had never occurred.
+ * @return PANIC When the function fails and the library enters an
+ * undefined state.
+ *
+ * \parblock
+ * \par Performance semantics
+ * Backends should define performance semantics in terms of work and data
+ * movement, the latter both within and between user processes. Also the
+ * number of synchronisations between user processes must be quantified.
+ *
+ * Backends furthermore must indicate whether system calls may occur during a
+ * call to this primitive, indicate whether additional dynamic memory may be
+ * allocated (and if so, when it is freed), and quantify the required work
+ * space.
+ * \endparblock
+ */
+ template< typename IOType >
+ static RC broadcast( IOType &inout, const size_t root = 0 ) {
+ (void) inout;
+ (void) root;
+ return PANIC;
+ }
+
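The single-value broadcast admits a similar sketch (same assumptions; the
variable s is assumed to hold the calling user process ID):

    size_t seed = 0;
    if( s == 0 ) { seed = 42; } // only the root value matters on input
    grb::RC rc = grb::collectives< grb::config::default_backend >::broadcast(
        seed, 0 );
    // afterwards, every user process holds seed == 42
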
+ /**
+ * Broadcast on an array of \a IOType.
+ *
+ * The above documentation applies with \a size times sizeof(IOType)
+ * substituted in.
+ */
+ template< Descriptor descr = descriptors::no_operation, typename IOType >
+ static RC broadcast(
+ IOType * inout,
+ const size_t size,
+ const size_t root = 0
+ ) {
+ (void) inout;
+ (void) size;
+ (void) root;
+ return PANIC;
+ }
}; // end class ``collectives''
} // end namespace grb
#endif // end _H_GRB_COLL_BASE
+
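The array variant of broadcast takes a pointer plus an element count instead;
a hedged sketch (assumes #include <vector>):

    std::vector< double > buffer( 1024 ); // contents meaningful at root only
    grb::RC rc = grb::collectives< grb::config::default_backend >::broadcast(
        buffer.data(), buffer.size(), 0 );
    // all user processes now hold the 1024 elements that process 0 held
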
diff --git a/include/graphblas/base/config.hpp b/include/graphblas/base/config.hpp
index 353b4ed90..f7796c852 100644
--- a/include/graphblas/base/config.hpp
+++ b/include/graphblas/base/config.hpp
@@ -15,7 +15,12 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Defines both configuration parameters effective for all backends, as
+ * well as structured ways of passing backend-specific parameters.
+ *
* @author A. N. Yzelman
* @date 8th of August, 2016
*/
@@ -41,20 +46,43 @@
#define _GRB_BACKEND reference
#endif
-/**
- * The main GraphBLAS namespace.
- *
- * All GraphBLAS functions and objects are defined within.
- */
+
namespace grb {
- /** Contains compile-time configuration constants. */
+ /**
+ * Compile-time configuration constants as well as implementation details that
+ * are derived from such settings.
+ */
namespace config {
- /** The default backend to be selected for an end user. */
+ /**
+ * \defgroup config Configuration
+ *
+ * This module collects all configuration settings.
+ */
+
+ /**
+ * \defgroup commonConfig Common configuration settings
+ * \ingroup config
+ *
+ * Configuration elements contained in this group affect all backends.
+ *
+ * @{
+ */
+
+ /**
+ * \internal
+ * The default backend to be selected for an end user.
+ * \ingroup config
+ * \endinternal
+ */
static constexpr grb::Backend default_backend = _GRB_BACKEND;
- /** The cache line size, in bytes. */
+ /**
+ * Contains information about the target architecture cache line size.
+ *
+ * \ingroup config
+ */
class CACHE_LINE_SIZE {
private:
@@ -68,15 +96,22 @@ namespace grb {
public:
/**
+ * \internal
* @return The cache line size in bytes.
* @see grb::config::CACHE_LINE_SIZE::bytes
+ * \endinternal
*/
static constexpr size_t value() {
return bytes;
}
+
};
- /** The SIMD size, in bytes. */
+ /**
+ * The SIMD size, in bytes.
+ *
+ * \ingroup config
+ */
class SIMD_SIZE {
private:
@@ -90,8 +125,10 @@ namespace grb {
public:
/**
+ * \internal
* @return The SIMD size in bytes.
* @see grb::config::SIMD_SIZE::bytes
+ * \endinternal
*/
static constexpr size_t value() {
return bytes;
@@ -99,25 +136,34 @@ namespace grb {
};
- /** How many elements of a given data type fit into a SIMD register. */
+ /**
+ * \internal
+ * How many elements of a given data type fit into a SIMD register.
+ * \ingroup config
+ * \endinternal
+ */
template< typename T >
class SIMD_BLOCKSIZE {
public:
/**
+ * \internal
* Calculates the block size this operator should use.
*
* \warning This rounds down. If instances of T are too large, this could
* result in a zero value. See #value for a correction.
+ * \endinternal
*/
static constexpr size_t unsafe_value() {
return SIMD_SIZE::value() / sizeof( T );
}
/**
+ * \internal
* The maximum of one and the number of elements that fit into a single
* cache line.
+ * \endinternal
*/
static constexpr size_t value() {
return unsafe_value() > 0 ? unsafe_value() : 1;
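The rounding rule is easiest to see with concrete numbers. The following
stand-alone restatement is not part of this patch and assumes a 32-byte SIMD
width:

    #include <cstddef>

    constexpr std::size_t simd_bytes = 32; // assumed register width
    template< typename T >
    constexpr std::size_t blocksize() {
        return simd_bytes / sizeof( T ) > 0 ? simd_bytes / sizeof( T ) : 1;
    }
    static_assert( blocksize< double >() == 4, "4 x 8 = 32 bytes" );
    struct Large { char payload[ 48 ]; };
    static_assert( blocksize< Large >() == 1, "zero rounded up to one" );
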
@@ -126,63 +172,81 @@ namespace grb {
};
/**
+ * \internal
* How many hardware threads the operating system exposes.
*
* \warning On contemporary x86-based hardware, the reported number by
* value() will include that of each hyper-thread. This number
* thus does not necessarily equal the number of cores available.
+ *
+ * \ingroup config
+ * \endinternal
*/
class HARDWARE_THREADS {
public:
/**
- * Returns the number of online hardware threads as reported by the OS.
+ * \internal
+ * Returns the number of online hardware threads as reported by the
+ * operating system.
*
* \warning This is a UNIX system call.
*
* @returns The number of hardware threads currently online. The return
* type is specified by the UNIX standard.
+ * \endinternal
*/
static long value() {
return sysconf( _SC_NPROCESSORS_ONLN );
}
- };
+ };
- /** Benchmarking defaults. */
+ /**
+ * Benchmarking default configuration parameters.
+ *
+ * \ingroup config
+ */
class BENCHMARKING {
public:
- /** The default number of inner repititions. */
+ /** @returns The default number of inner repetitions. */
static constexpr size_t inner() {
return 1;
}
- /** The default number of outer repititions. */
+ /** @returns The default number of outer repetitions. */
static constexpr size_t outer() {
return 10;
}
};
- /** Memory defaults. */
+ /**
+ * Memory configuration parameters.
+ *
+ * \ingroup config
+ */
class MEMORY {
public:
- /** The private L1 data cache size, in bytes. */
+ /** @returns the private L1 data cache size, in bytes. */
static constexpr size_t l1_cache_size() {
return 32768;
}
- /** What is considered a lot of memory, in 2-log of bytes. */
+ /**
+ * @returns What is considered a lot of memory, in 2-log of bytes.
+ */
static constexpr size_t big_memory() {
return 31;
} // 2GB
/**
+ * \internal
* The memory speed under random accesses of 8-byte words.
*
* @returns The requested speed in MiB/s/process.
@@ -196,12 +260,18 @@ namespace grb {
* much between architectures. Nevertheless, for best results, these
* numbers are best set to benchmarked values on the deployment
* hardware.
+ *
+ * @note Preliminary experiments have not resulted in a decisive gain from
+ * using this parameter, and hence it is currently not used by any
+ * backend.
+ * \endinternal
*/
static constexpr double random_access_memspeed() {
return 147.298;
}
/**
+ * \internal
* The memory speed under a limited number of streams of uncached data.
*
* @returns The requested speed in MiB/s/process.
@@ -215,15 +285,22 @@ namespace grb {
* much between architectures. Nevertheless, for best results, these
* numbers are best set to benchmarked values on the deployment
* hardware.
+ *
+ * @note Preliminary experiments have not resulted in a decisive gain from
+ * using this parameter, and hence it is currently not used by any
+ * backend.
+ * \endinternal
*/
static constexpr double stream_memspeed() {
return 1931.264;
}
/**
+ * \internal
* Prints memory usage info to stdout, but only for big memory allocations.
*
* @returns true if and only if this function printed information to stdout.
+ * \endinternal
*/
static bool report(
const std::string prefix, const std::string action,
@@ -268,13 +345,82 @@ namespace grb {
};
/**
- * Configuration parameters that may depend on the backend.
+ * Collects a series of implementation choices corresponding to some given
+ * \a backend.
+ *
+ * These implementation choices are useful for \em compositional backends;
+ * i.e., backends that rely on a nested sub-backend for functionality. To
+ * facilitate composability, backends are required to provide the functions
+ * specified herein.
+ *
+ * \note An example are the #grb::BSP1D and #grb::hybrid backends, that both
+ * share the exact same code, relying on either the #grb::reference or
+ * the #grb::reference_omp backend, respectively.
+ *
+ * \note The user documentation does not list all required fields; for a
+ * complete overview, see the developer documentation instead.
+ *
+ * The default class declaration is left empty to ensure that no backend
+ * implicitly relies on global defaults. Every backend therefore must
+ * specialise this class and implement the specified functions.
*
- * Empty by default so to ensure no-one implicitly relies on implicit
- * defaults.
+ * \warning Portable ALP user code does not rely on the implementation details
+ * gathered in this class.
+ *
+ * \note For properties of a backend that may (also) affect ALP user code,
+ * see #grb::Properties.
+ *
+ * The user documentation only documents the settings that could be useful to
+ * modify.
+ *
+ * \warning Modifying the documented functions should be done with care.
+ *
+ * \warning Any such modification typically requires rebuilding the ALP
+ * library itself.
+ *
+ * \note For viewing all implementation choices, please see the developer
+ * documentation.
+ *
+ * \ingroup config
*/
- template< grb::Backend implementation = default_backend >
- class IMPLEMENTATION {};
+ template< grb::Backend backend = default_backend >
+ class IMPLEMENTATION {
+#ifdef __DOXYGEN__
+ public:
+
+ /**
+ * Defines how private memory regions are allocated.
+ *
+ * @returns how a memory region that will not be accessed by threads other
+ * than the allocating thread, should be allocated.
+ */
+ static constexpr ALLOC_MODE defaultAllocMode();
+
+ /**
+ * Defines how shared memory regions are allocated.
+ *
+ * @returns how a memory region that may be accessed by threads other than
+ * the allocating thread, should be allocated.
+ */
+ static constexpr ALLOC_MODE sharedAllocMode();
+
+ /**
+ * \internal
+ * @returns whether the selected backend implements vectors as having fixed
+ * capacities. This is \em not a configuration choice for most backends,
+ * but rather a fixed consequence of design choices.
+ *
+ * \note The only legal fixed capacity a functional ALP/GraphBLAS backend
+ * may provide is one that is equal to its size.
+ *
+ * \note A backend backed by a sparse accumulator (SPA) will typically have
+ * fixed vector capacities, whereas one based on sets or other types
+ * of tree structures will typically have dynamic vector capacities.
+ * \endinternal
+ */
+ static constexpr bool fixedVectorCapacities();
+#endif
+ };
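As a hedged illustration of this compositional contract, a backend
specialisation could take the following shape; the member return values here
are invented for the example, and the ALLOC_MODE enum values are assumed:

    namespace grb { namespace config {
        template<>
        class IMPLEMENTATION< reference > {
            public:
                static constexpr ALLOC_MODE defaultAllocMode() {
                    return ALLOC_MODE::ALIGNED; // enum value assumed
                }
                static constexpr ALLOC_MODE sharedAllocMode() {
                    return ALLOC_MODE::INTERLEAVED; // enum value assumed
                }
                static constexpr bool fixedVectorCapacities() {
                    return true; // e.g., for a SPA-backed backend
                }
        };
    } }
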
/**
* What data type should be used to store row indices.
@@ -284,6 +430,8 @@ namespace grb {
*
* \note The data type for indices of general arrays is not configurable. This
* set of implementations use size_t for those.
+ *
+ * \ingroup config
*/
typedef unsigned int RowIndexType;
@@ -295,6 +443,8 @@ namespace grb {
*
* \note The data type for indices of general arrays is not configurable. This
* set of implementations use size_t for those.
+ *
+ * \ingroup config
*/
typedef unsigned int ColIndexType;
@@ -306,6 +456,8 @@ namespace grb {
*
* \note The data type for indices of general arrays is not configurable. This
* set of implementations use size_t for those.
+ *
+ * \ingroup config
*/
typedef size_t NonzeroIndexType;
@@ -317,6 +469,8 @@ namespace grb {
*
* \note The data type for indices of general arrays is not configurable. This
* set of implementations use size_t for those.
+ *
+ * \ingroup config
*/
typedef unsigned int VectorIndexType;
diff --git a/include/graphblas/base/exec.hpp b/include/graphblas/base/exec.hpp
index 19e800e80..fefb10132 100644
--- a/include/graphblas/base/exec.hpp
+++ b/include/graphblas/base/exec.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Specifies the #grb::Launcher functionalities.
+ *
* @author A. N. Yzelman
* @date 17th of April, 2017
*/
@@ -28,27 +32,31 @@
#include
#include
+
#ifndef _GRB_NO_STDIO
-#include
+ #include
#endif
+
namespace grb {
/**
- * The various ways in which the #Launcher can be used
- * to execute a GraphBLAS program.
+ * The various ways in which the #grb::Launcher can be used to execute an
+ * ALP program.
*
* \warning An implementation may require different linker commands
- * when using different modes. This is OK, since a call to
- * the #Launcher is required to be quite different
- * depending on which mode is used. The portability is in
- * the GraphBLAS program being launched-- that one should
- * never change depending on whichever mode it is used.
+ * when using different modes.
+ *
+ * \warning Depending on the mode given to #grb::Launcher, the parameters
+ * required for the exec function may differ.
+ *
+ * \note However, the ALP program is unaware of which mode the launcher
+ * employs, and will not have to change.
*/
enum EXEC_MODE {
/**
- * Automatic mode. The #Launcher can spawn user processes
+ * Automatic mode. The #grb::Launcher can spawn user processes
* which will execute a given program.
*/
AUTOMATIC = 0,
@@ -56,176 +64,268 @@ namespace grb {
/**
* Manual mode. The user controls \a nprocs user processes
* which together should execute a given program, by, for
- * example, using the #Launcher.
+ * example, using the #grb::Launcher.
*/
MANUAL,
/**
* When running from an MPI program. The user controls
* \a nprocs MPI programs, which, together, should execute
- * a given GraphBLAS program.
+ * a given ALP program.
*/
FROM_MPI
};
/**
- * Allows an auxiliary program to run any GraphBLAS program. Input data may be
- * passed through a user-defined type. Output data will be retrieved via the
- * same type. For implementations that support multiple user processes, the
- * caller may explicitly set the process ID and total number of user processes.
+ * A group of user processes that together execute ALP programs.
*
- * The intended use is to `just call' grb::exec which should, in its most
- * trivial form, compile regardless of which backend is selected.
+ * Allows an application to run any ALP program. Input data may be passed
+ * through a user-defined type. Output data will be retrieved via the same
+ * type.
*
- * @tparam mode Which #EXEC_MODE the Launcher should adhere to.
- * @tparam implementation Which GraphBLAS implementation is to be used.
+ * For backends that support multiple user processes, the caller may
+ * explicitly set the process ID and total number of user processes.
+ *
+ * The intended use is to `just call' the exec function, which should be
+ * accepted by any backend.
+ *
+ * @tparam mode Which #EXEC_MODE the Launcher should adhere to.
+ * @tparam backend Which backend is to be used.
*/
- template< enum EXEC_MODE mode, enum Backend implementation >
+ template< enum EXEC_MODE mode, enum Backend backend >
class Launcher {
public :
/**
- * Constructs a new Launcher. This constructor is a collective
- * call; all \a nprocs processes that form a single Launcher
- * group must make a call to this constructor at roughly the
- * same time. There is an implementation-defined time-out for
- * the creation of a Launcher group.
- *
- * @param[in] process_id The user process ID of the calling process.
- * The value must be larger or equal to 0. This
- * value must be strictly smaller than \a nprocs.
- * This value must be unique to the calling
- * process within this collective call across
- * \em all \a nprocs user processes. This number
- * \em must be strictly smaller than \a nprocs.
- * Optional: the default is 0.
- * @param[in] nprocs The total number of user processes making a
- * collective call to this function. Optional: the
- * default is 1.
- * @param[in] hostname The hostname of one of the user processes.
- * Optional: the default is `localhost'.
- * @param[in] port A free port number at \a hostname. This port
- * will be used for TCP connections to \a hostname
- * if and only if \a nprocs is larger than one.
- * Optional: the default value is `0'.
- *
- * @throws invalid_argument If #nprocs is zero.
- * @throws invalid_argument If #process_id is greater than or
- * equal to \a nprocs.
- *
- * \note An implementation may define further constraints on
- * the input arguments, such as, obviously, on \a hostname
- * and \a port, but also on \a nprocs and, as a result, on
- * \a process_id.
- */
- Launcher( const size_t process_id = 0, // user process ID
- const size_t nprocs = 1, // total number of user processes
- const std::string hostname = "localhost", // one of the user process hostnames
- const std::string port = "0" // a free port at hostname
- ) { // standard does not specify any constrants on hostname and port
- // so accept (and ignore) anything
- (void)hostname; (void)port;
+ * Constructs a new #grb::Launcher. This constructor is a collective call;
+ * all \a nprocs processes that form a single launcher group must make a
+ * simultaneous call to this constructor.
+ *
+ * There is an implementation-defined time-out for the creation of a launcher
+ * group.
+ *
+ * @param[in] process_id The user process ID of the calling process.
+ * The value must be larger or equal to 0. This
+ * value must be strictly smaller than \a nprocs.
+ * This value must be unique to the calling
+ * process within this collective call across
+ * \em all \a nprocs user processes. This number
+ * \em must be strictly smaller than \a nprocs.
+ * Optional: the default is 0.
+ * @param[in] nprocs The total number of user processes making a
+ * collective call to this function. Optional: the
+ * default is 1.
+ * @param[in] hostname The hostname of one of the user processes.
+ * Optional: the default is `localhost'.
+ * @param[in] port A free port number at \a hostname. This port
+ * will be used for TCP connections to \a hostname
+ * if and only if \a nprocs is larger than one.
+ * Optional: the default value is `0'.
+ *
+ * @throws invalid_argument If \a nprocs is zero.
+ * @throws invalid_argument If \a process_id is greater than or equal to
+ * \a nprocs.
+ *
+ * \note An implementation or backend may define further constraints on the
+ * input arguments, such as, obviously, on \a hostname and \a port, but
+ * also on \a nprocs and, as a result, on \a process_id.
+ *
+ * \note The most obvious is that backends supporting only one user process
+ * must not accept \a nprocs larger than 1.
+ *
+ * All aforementioned default values shall always be legal.
+ */
+ Launcher(
+ const size_t process_id = 0,
+ const size_t nprocs = 1,
+ const std::string hostname = "localhost",
+ const std::string port = "0"
+ ) {
+ // spec does not specify any constraints on hostname and port
+ // so accept (and ignore) anything
+ (void) hostname; (void) port;
#ifndef _GRB_NO_EXCEPTIONS
// sanity checks on process_id and nprocs
- if( nprocs == 0 ) { throw std::invalid_argument( "Total number of user "
- "processes must be "
- "strictly larger than "
- "zero." ); }
- if( process_id >= nprocs ) {
- throw std::invalid_argument( "Process ID must be strictly smaller than "
- "total number of user processes." );
- }
+ if( nprocs == 0 ) {
+ throw std::invalid_argument( "Total number of user processes must be "
+ "strictly larger than zero." );
+ }
+ if( process_id >= nprocs ) {
+ throw std::invalid_argument( "Process ID must be strictly smaller than "
+ "total number of user processes." );
+ }
#endif
-} // namespace grb
+ }
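For example, a manual-mode launcher group over P processes could be set up as
follows; the hostname and port are placeholders, s denotes the calling process
ID, and the backend template argument is assumed to default to the
compile-time selected backend:

    // executed by each of the P cooperating processes, with 0 <= s < P
    grb::Launcher< grb::MANUAL > launcher( s, P, "node0", "7777" );
    // with nprocs == 0 or s >= P, std::invalid_argument would be thrown
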
-/**
- * Executes the given GraphBLAS program. This function, depending on whether
- * GraphBLAS is compiled in automatic or in manual mode, will either
- * \em spawn the maximum number of available user processes or will connect
- * exactly \a nprocs existing processes, respectively, to execute the given
- * \a grb_program.
- *
- * This is a collective function call.
- *
- * @tparam T The type of the data to pass to the GraphBLAS program.
- * @tparam U The type of the output data to pass back to the user.
- *
- * @param[in] grb_program User GraphBLAS program to be executed.
- * @param[in] data_in Input data of user-defined type \a T.
- * When in automatic mode, the data will only be
- * available at user process 0 only. When in
- * manual mode, the data will be available to
- * this user process (with the below given
- * \a process_id) only.
- * @param[out] data_out Output data of user-defined type \a U. The output
- * data should be available at user process with ID
- * zero.
- * @param[in] broadcast Whether the input should be broadcast from user
- * process 0 to all other user processes. Optional;
- * the default value is \a false.
- *
- * @return SUCCESS If the execution proceeded as intended.
- * @return PANIC If an unrecoverable error was encountered while trying to
- * execute the given GraphBLAS program.
- *
- * \warning An implementation can define further constraints on the validity
- * of input arguments. The most obvious is that implementations
- * supporting only one user process will not accept \a nprocs larger
- * than 1.
- *
- * All aforementioned default values shall always be legal.
- */
-template< typename T, typename U >
-RC exec( void ( *grb_program )( const T &, U & ), // user GraphBLAS program
- const T & data_in,
- U & data_out, // input & output data
- const bool broadcast = false ) const {
- (void)grb_program;
- (void)data_in;
- (void)data_out;
- (void)broadcast;
- // stub implementation, should be overridden by specialised implementation,
- // so return error code
- return PANIC;
-}
+ /**
+ * Executes a given ALP program using the user processes encapsulated by this
+ * launcher group.
+ *
+ * Calling this function, depending on whether the automatic or manual/MPI
+ * mode was selected, will either \em spawn the maximum number of available
+ * user processes and \em then execute the given program, \em or it will
+ * employ the given processes that are managed by the user application and
+ * used to construct this launcher instance to execute the given
+ * \a alp_program.
+ *
+ * This is a collective function call-- all processes in the launcher group
+ * must make a simultaneous call to this function and must do so using
+ * consistent arguments.
+ *
+ * @tparam T The type of the data to pass to the ALP program as input.
+ * @tparam U The type of the output data to pass back to the caller.
+ *
+ * @param[in] alp_program The user program to be executed.
+ * @param[in] data_in Input data of user-defined type \a T.
+ *
+ * When in automatic mode and \a broadcast is false, the data will
+ * only be available at the user process with ID 0. When in automatic mode
+ * and \a broadcast is true, the data will be available at all user
+ * processes. When in manual mode, the data will be available to this user
+ * process only, with "this process" corresponding to the process that calls
+ * this function.
+ *
+ * @param[out] data_out Output data of user-defined type \a U. The output
+ * data should be available at user process with ID
+ * zero.
+ * @param[in] broadcast Whether the input should be broadcast from user
+ * process 0 to all other user processes. Optional;
+ * the default value is \a false.
+ *
+ * @return #grb::SUCCESS If the execution proceeded as intended.
+ * @return #grb::PANIC If an unrecoverable error was encountered while
+ * attempting to execute, attempting to terminate, or
+ * while executing, the given program.
+ *
+ * \warning Even if #grb::SUCCESS is returned, an algorithm may fail to
+ * achieve its intended result-- for example, an iterative solver
+ * may fail to converge. A good programming pattern ensures that
+ * \a U either a) is an error code for the algorithm used (e.g.,
+ * #grb::RC), or b) contains such an error code.
+ */
+ template< typename T, typename U >
+ RC exec(
+ void ( *alp_program )( const T &, U & ),
+ const T &data_in,
+ U &data_out,
+ const bool broadcast = false
+ ) const {
+ (void) alp_program;
+ (void) data_in;
+ (void) data_out;
+ (void) broadcast;
+ // stub implementation, should be overridden by specialised backend,
+ // so return error code
+ return PANIC;
+ }
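A hedged end-to-end sketch of the intended use follows; the program and its
input type are invented, but the pattern of returning an error code through
\a U matches the advice above:

    struct Input { size_t n; };

    void myAlpProgram( const Input &in, grb::RC &out ) {
        grb::Vector< double > x( in.n ); // any ALP computation
        out = grb::set( x, 1.0 );
    }

    int main() {
        grb::Launcher< grb::AUTOMATIC > launcher; // default backend assumed
        Input in; in.n = 1000;
        grb::RC out = grb::PANIC;
        if( launcher.exec( &myAlpProgram, in, out, true ) != grb::SUCCESS ) {
            return 1; // the launch itself failed
        }
        return out == grb::SUCCESS ? 0 : 2; // the algorithm's own status
    }
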
-/**
- * Variable size version of the above function.
- *
- * @param[in] broadcast Whether the input should be broadcast from user
- * process 0 to all other user processes. Optional;
- * the default value is \a false. This will let user
- * processes with ID larger than zero allocate
- * \a in_size bytes of memory into which the data at
- * process 0 will be copied.
- *
- * \todo more documentation
- */
-template< typename U >
-RC exec( void ( *grb_program )( const void *, const size_t, U & ), const void * data_in, const size_t in_size, U & data_out, const bool broadcast = false ) const {
- (void)grb_program;
- (void)data_in;
- (void)in_size;
- (void)data_out;
- (void)broadcast;
- return PANIC;
-}
+ /**
+ * Executes a given ALP program using the user processes encapsulated by this
+ * launcher group.
+ *
+ * In this variant of exec, \a data_in is of a variable byte size,
+ * instead of a fixed POD type. If \a broadcast is true and the
+ * launcher is instantiated using the #grb::AUTOMATIC mode, all bytes are
+ * broadcast to all user processes.
+ *
+ * @param[in] alp_program The user program to be executed.
+ * @param[in] data_in Pointer to raw input byte data.
+ * @param[in] in_size The number of bytes the input data consists of.
+ * @param[out] data_out Output data of user-defined type \a U. The output
+ * data should be available at user process with ID
+ * zero.
+ * @param[in] broadcast Whether the input should be broadcast from user
+ * process 0 to all other user processes. Optional;
+ * the default value is \a false.
+ *
+ * @return #grb::SUCCESS If the execution proceeded as intended.
+ * @return #grb::PANIC If an unrecoverable error was encountered while
+ * attempting to execute, attempting to terminate, or
+ * while executing, the given program.
+ *
+ * For more details, see the other version of this function.
+ */
+ template< typename U >
+ RC exec(
+ void ( *alp_program )( const void *, const size_t, U & ),
+ const void * data_in,
+ const size_t in_size,
+ U &data_out,
+ const bool broadcast = false
+ ) const {
+ (void) alp_program;
+ (void) data_in;
+ (void) in_size;
+ (void) data_out;
+ (void) broadcast;
+ return PANIC;
+ }
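The byte-sized variant is called analogously; a short sketch, where
myByteProgram is a hypothetical function with signature
void( const void *, const size_t, grb::RC & ):

    const char config_blob[] = "alpha=0.85"; // raw input bytes
    grb::RC out = grb::PANIC;
    grb::RC rc = launcher.exec(
        &myByteProgram, config_blob, sizeof( config_blob ), out, true );
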
-/**
- * Releases all GraphBLAS resources. After a call to this function, no
- * GraphBLAS library functions may be called any longer.
- *
- * @return SUCCESS A call to this function may never fail.
- */
-static RC finalize() {
- return PANIC;
-}
-}
-; // end class `Launcher'
+ /**
+ * Releases all ALP resources.
+ *
+ * After a call to this function, no further ALP programs may be launched
+ * using the #grb::Launcher or #grb::Benchmarker. The use of #grb::init and
+ * #grb::finalize will likewise no longer be accepted.
+ *
+ * \warning #grb::init and #grb::finalize are deprecated.
+ *
+ * \internal
+ * \todo Remove the above comments once #grb::init and #grb::finalize are
+ * moved to an internal namespace.
+ * \endinternal
+ *
+ * After a call to this function, the only way to once again run ALP programs
+ * is to use the #grb::Launcher from a new process.
+ *
+ * \warning Therefore, use this function with care and preferably only just
+ * before exiting the process.
+ *
+ * A well-behaving program calls this function, or
+ * #grb::Benchmarker::finalize, exactly once before its process terminates,
+ * or just after the guaranteed last invocation of an ALP program.
+ *
+ * @return #grb::SUCCESS The resources have successfully and permanently been
+ * released.
+ * @return #grb::PANIC An unrecoverable error has been encountered and the
+ * user program is encouraged to exit as quickly as
+ * possible. The state of the ALP library has become
+ * undefined and should no longer be used.
+ *
+ * \note In the terminology of the Message Passing Interface (MPI), this
+ * function is the ALP equivalent of MPI_Finalize().
+ *
+ * \note In #grb::AUTOMATIC mode when using a parallel backend that uses MPI
+ * to auto-parallelise the ALP computations, MPI is never explicitly
+ * exposed to the user application. This use case necessitates the
+ * specification of this function.
+ *
+ * \note Thus, and in particular, an ALP program launched in #grb::AUTOMATIC
+ * mode while using the #grb::BSP1D or the #grb::hybrid backends with
+ * ALP compiled using LPF that in turn is configured to use an
+ * MPI-based engine, should make sure to call this function before
+ * program exit.
+ *
+ * \note An application that launches ALP programs in #grb::FROM_MPI mode
+ * must still call this function, even though a proper such application
+ * makes its own call to MPI_Finalize(). This does \em not
+ * induce improper behaviour since calling this function using a
+ * launcher instance in #grb::FROM_MPI mode translates, from an MPI
+ * perspective, to a no-op.
+ *
+ * \internal This is the base implementation that should be specialised by
+ * each backend separately.
+ */
+ static RC finalize() {
+ return PANIC;
+ }
+
+ }; // end class `Launcher'
} // end namespace ``grb''
#endif // end _H_GRB_EXEC_BASE
+
diff --git a/include/graphblas/base/init.hpp b/include/graphblas/base/init.hpp
index ec6ca2529..285f9962b 100644
--- a/include/graphblas/base/init.hpp
+++ b/include/graphblas/base/init.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Specifies the #grb::init and #grb::finalize functionalities.
+ *
* @author A. N. Yzelman
* @date 24th of January, 2017
*/
@@ -33,75 +37,85 @@ namespace grb {
/**
* Initialises the calling user process.
*
- * \deprecated Please use grb::Launcher instead. This primitive will be
+ * \deprecated Please use #grb::Launcher instead. This primitive will be
 * removed from version 1.0 onwards.
*
+ * @tparam backend Which GraphBLAS backend this call to init initialises.
+ *
+ * By default, the backend that is selected by the user at compile-time is
+ * used. If no backend was selected, #grb::reference is assumed.
+ *
+ * @param[in] s The ID of this user process.
+ * @param[in] P The total number of user processes.
+ *
* If the backend supports multiple user processes, the user can invoke this
* function with \a P equal to one or higher; if the backend supports only a
* single user process, then \a P must equal one.
+ *
* The value for the user process ID \a s must be larger or equal to zero and
* must be strictly smaller than \a P. If \a P > 1, each user process must
* call this function collectively, each user process should pass the same
* value for \a P, and each user process should pass a unique value for \a s
* amongst all \a P collective calls made.
*
+ * @param[in] implementation_data Any implementation-defined data structure
+ * required for successful completion of this
+ * call.
+ *
* An implementation may define that additional data is required for a call to
* this function to complete successfully. Such data may be passed via the
* final argument to this function, \a implementation_data.
*
* If the implementation does not support multiple user processes, then a
- * value for \a implementation_data shall not be required. In parcticular, a
+ * value for \a implementation_data shall not be required. In particular, a
* call to this function with an empty parameter list shall then be legal
* and infer the following default arguments: zero for \a s, one for \a P,
* and \a NULL for \a implementation_data. When such an implementation is
- * requested to initialise multiple user processes, the grb::UNSUPPORTED
- * error code shall be returned.
- *
- * A call to this function must be matched with a call to grb::finalize().
- * After a successful call to this function, a new call to grb::init() without
- * first calling grb::finalize() shall incur undefined behaviour. The
- * construction of GraphBLAS containers without a preceding successful call
- * to grb::init() will result in invalid GraphBLAS objects. Any valid
- * GraphBLAS containers will become invalid after a call to grb::finalize().
- * Any use of GraphBLAS functions on invalid containers will result in
- * undefined behaviour.
- *
- * @tparam backend Which GraphBLAS backend this call to init initialises.
+ * requested to initialise multiple user processes, then #grb::UNSUPPORTED
+ * shall be returned.
*
- * @param[in] s The ID of this user process.
- * @param[in] P The total number of user processes.
- * @param[in] implementation_data Any implementation-defined data structure
- * required for successful completion of this
- * call.
+ * A call to this function must be matched with a call to #grb::finalize.
+ * After a successful call to this function, a new call to #grb::init without
+ * first calling #grb::finalize shall incur undefined behaviour. The
+ * construction of ALP/GraphBLAS containers without a preceding successful call
+ * to #grb::init will result in undefined behaviour. Any valid GraphBLAS
+ * containers will become invalid after a call to #grb::finalize.
*
+ * \internal
* \note For a pure MPI implementation, for instance, \a implementation_data
* may be a pointer to the MPI communicator corresponding to these user
* processes.
*
- * \note The implementations based on PlatformBSP require direct passing of
- * the \a bsp_t corresponding to the BSP context of the user processes;
- * this is legal since the PlatformBSP specification defines the
- * \a bsp_t type as a void pointer.
+ * \note The implementations based on LPF require direct passing of the
+ * \a lpf_t corresponding to the BSP context of the user processes;
+ * this is legal since the LPF defines the \a lpf_t type as a void
+ * pointer.
+ * \endinternal
*
* @return SUCCESS If the initialisation was successful.
* @return UNSUPPORTED When the implementation does not support multiple
- * user processes (\a P larger than 1). After a call to
- * this function exits with this error code the library
- * state shall be as though the call never were made.
- * @return PANIC If this function fails, the state of this GraphBLAS
- * implementation becomes undefined.
+ * user processes while the given \a P was larger than 1.
+ * @return PANIC If returned, the state of the ALP library becomes
+ * undefined.
+ *
+ * After a call to this function that exits with a non-SUCCESS and non-PANIC
+ * error code, the program shall behave as though the call were never made.
*
* \note There is no argument checking. If \a s is larger or equal to \a P,
* undefined behaviour occurs. If \a implementation_data was invalid
* or corrupted, undefined behaviour occurs.
*
+ * \internal
+ * \todo Define #grb::ILLEGAL to be returned if \f$ s \geq P \f$.
+ * \endinternal
+ *
* \par Performance semantics
- * None. Implementations are encouraged to specify the complexity of
- * their implementation of this function in terms of \a P.
+ * Implementations and backends must specify the complexity of this
+ * function in terms of \a P.
*
* \note Compared to the GraphBLAS C specification, this function lacks a
* choice whether to execute in `blocking' or `non-blocking' mode. With
- * ALP/GraphBLAS, the backend controls whether execution proceeds in a
+ * ALP, the selected backend controls whether execution proceeds in a
* non-blocking manner or not. Thus selecting a blocking backend for
* compilation results in the application of blocking semantics, while
* selecting a non-blocking backend results in the application of non-
@@ -110,16 +124,22 @@ namespace grb {
* valid implementation of a non-blocking mode. Therefore, this
* specification will still yield a valid C API implementation when
* properly wrapping around a blocking ALP/GraphBLAS backend.
- * \note This specification allows for grb::init() to be called multiple
- * times from the same process and the same thread. The parameters \a s
- * and \a P (and \a implementation_data) may differ each time. Each
- * (repeated) call must of course meet all the above requirements.
+ * \note This specification allows for #grb::init to be called multiple times
+ * from the same process and the same thread. The parameters \a s and
+ * \a P (and \a implementation_data) may differ each time. Each
+ * (repeated) call must of course continue to meet all the above
+ * requirements.
* \note The GraphBLAS C API does not have the notion of user processes. We
* believe this notion is necessary to properly integrate into parallel
* frameworks, and also to affect proper and efficient parallel I/O.
*
* \warning This primitive has been deprecated since version 0.5. Please update
- * your code to use the grb::Launcher instead.
+ * your code to use the #grb::Launcher instead.
+ *
+ * \internal The implementation will be retained after deprecation has been
+ * pushed through, as the #grb::Launcher depends on it. However, the
+ * #grb::init and #grb::finalize must then be moved into the
+ * #grb::internal namespace.
*/
template< enum Backend backend = config::default_backend >
RC init( const size_t s, const size_t P, void * const implementation_data ) {
@@ -132,42 +152,42 @@ namespace grb {
/**
* Initialises the calling user process.
*
- * \deprecated Please use grb::Launcher instead. This primitive will be
+ * \deprecated Please use #grb::Launcher instead. This primitive will be
 * removed from version 1.0 onwards.
*
* This variant takes no input arguments. It will assume a single user process
* exists; i.e., the call is equivalent to one to #grb::init with \a s zero
- * and \a P one.
+ * and \a P one (and \a implementation_data NULL ).
*
* @tparam backend The backend implementation to initialise.
*
* @return SUCCESS If the initialisation was successful.
- * @return PANIC If this function fails, the state of this GraphBLAS
- * implementation becomes undefined.
+ * @return PANIC If returned, the state of the ALP library becomes
+ * undefined.
*
* \warning This primitive has been deprecated since version 0.5. Please update
- * your code to use the grb::Launcher instead.
+ * your code to use the #grb::Launcher instead.
*/
template< enum Backend backend = config::default_backend >
RC init() {
- return grb::init< backend >( 0, 1, NULL );
+ return grb::init< backend >( 0, 1, nullptr );
}
/**
- * Finalises an ALP/GraphBLAS context opened by the last call to grb::init().
+ * Finalises an ALP/GraphBLAS context opened by the last call to #grb::init.
*
- * \deprecated Please use grb::Launcher instead. This primitive will be
+ * \deprecated Please use #grb::Launcher instead. This primitive will be
 * removed from version 1.0 onwards.
*
* This function must be called collectively and must follow a call to
- * grb::init(). After successful execution of this function, a new call
- * to grb::init() may be made.
+ * #grb::init. After successful execution of this function, a new call to
+ * #grb::init may be made. (This function is re-entrant.)
*
* After a call to this function, any ALP/GraphBLAS objects that remain in
* scope become invalid.
*
* \warning Invalid ALP/GraphBLAS containers will remain invalid no matter if a
- * next call to grb::init() is made.
+ * next call to #grb::init is made.
*
* @tparam backend Which ALP/GraphBLAS backend to finalise.
*
@@ -176,15 +196,15 @@ namespace grb {
* implementation becomes undefined. This means none of its
* functions should be called during the remainder program
* execution; in particular this means a new call to
- * grb::init() will not remedy the situaiton.
+ * #grb::init will not remedy the situation.
*
* \par Performance semantics
* None. Implementations are encouraged to specify the complexity of
* their implementation of this function in terms of the parameter
- * \a P the matching call to grb::init() was called with.
+ * \a P the matching call to #grb::init was called with.
*
* \warning This primitive has been deprecated since version 0.5. Please update
- * your code to use the grb::Launcher instead.
+ * your code to use the #grb::Launcher instead.
*/
template< enum Backend backend = config::default_backend >
RC finalize() {
diff --git a/include/graphblas/base/internalops.hpp b/include/graphblas/base/internalops.hpp
index 1ec5ce508..668534da9 100644
--- a/include/graphblas/base/internalops.hpp
+++ b/include/graphblas/base/internalops.hpp
@@ -54,85 +54,94 @@ namespace grb {
class argmin {
static_assert( std::is_integral< IType >::value,
- "Argmin operator may only be constructed using integral index "
- "types." );
-
- public:
- /** Alias to the left-hand input data type. */
- typedef std::pair< IType, VType > left_type;
-
- /** Alias to the right-hand input data type. */
- typedef std::pair< IType, VType > right_type;
-
- /** Alias to the output data type. */
- typedef std::pair< IType, VType > result_type;
-
- /** Whether this operator has an inplace foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an inplace foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = true;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = true;
-
- /**
- * Out-of-place application of the operator.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- */
- static void apply(
- const left_type * __restrict__ const a,
- const right_type * __restrict__ const b,
- result_type * __restrict__ const c
- ) {
- if( a->second < b->second ) {
- *c = *a;
- } else {
- *c = *b;
+ "Argmin operator may only be constructed using integral index types." );
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef std::pair< IType, VType > left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef std::pair< IType, VType > right_type;
+
+ /** Alias to the output data type. */
+ typedef std::pair< IType, VType > result_type;
+
+ /** Whether this operator has an inplace foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an inplace foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of the operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ if( a->second < b->second ) {
+ *c = *a;
+ } else {
+ *c = *b;
+ }
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ if( a->second < c->second ) {
+ c->first = a->first;
+ c->second = a->second;
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ if( b->second <= c->second ) {
+ c->first = b->first;
+ c->second = b->second;
+ }
}
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- if( a->second < c->second ) {
- c->first = a->first;
- c->second = a->second;
- }
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
- if( b->second <= c->second ) {
- c->first = b->first;
- c->second = b->second;
- }
- }
+
};
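+
+ /*
+  * A minimal usage sketch of the above operator; instantiating it with
+  * IType = size_t and VType = double is an illustrative assumption:
+  *
+  *   std::pair< size_t, double > a = { 3, 1.5 }, b = { 7, 0.5 }, c;
+  *   argmin< size_t, double >::apply( &a, &b, &c );
+  *   // c now equals { 7, 0.5 }: the entry with the smaller value wins
+  *
+  * Note the tie-breaking behaviour: apply and foldr compare using <, while
+  * foldl compares using <=; in all three cases a tie hence resolves to the
+  * right-hand operand.
+  */
+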
/**
@@ -148,85 +157,92 @@ namespace grb {
class argmax {
static_assert( std::is_integral< IType >::value,
- "Argmin operator may only be constructed using integral index "
- "types." );
-
- public:
- /** Alias to the left-hand input data type. */
- typedef std::pair< IType, VType > left_type;
-
- /** Alias to the right-hand input data type. */
- typedef std::pair< IType, VType > right_type;
-
- /** Alias to the output data type. */
- typedef std::pair< IType, VType > result_type;
-
- /** Whether this operator has an inplace foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an inplace foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = true;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = true;
-
- /**
- * Out-of-place application of the operator.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- */
- static void apply(
- const left_type * __restrict__ const a,
- const right_type * __restrict__ const b,
- result_type * __restrict__ const c
- ) {
- if( a->second > b->second ) {
- *c = *a;
- } else {
- *c = *b;
+ "Argmin operator may only be constructed using integral index types." );
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef std::pair< IType, VType > left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef std::pair< IType, VType > right_type;
+
+ /** Alias to the output data type. */
+ typedef std::pair< IType, VType > result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of the operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ if( a->second > b->second ) {
+ *c = *a;
+ } else {
+ *c = *b;
+ }
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ if( a->second > c->second ) {
+ c->first = a->first;
+ c->second = a->second;
+ }
}
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- if( a->second > c->second ) {
- c->first = a->first;
- c->second = a->second;
- }
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
- if( b->second >= c->second ) {
- c->first = b->first;
- c->second = b->second;
- }
- }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ if( b->second >= c->second ) {
+ c->first = b->first;
+ c->second = b->second;
+ }
+ }
+
};
/**
@@ -256,73 +272,88 @@ namespace grb {
* @tparam IN2 The right-hand input data type.
* @tparam OUT The output data type.
*/
- template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
class left_assign {
- public:
- /** Alias to the left-hand input data type. */
- typedef IN1 left_type;
-
- /** Alias to the right-hand input data type. */
- typedef IN2 right_type;
-
- /** Alias to the output data type. */
- typedef OUT result_type;
-
- /** Whether this operator has an inplace foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an inplace foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = true;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = false;
-
- /**
- * Out-of-place application of the addition c = a.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- */
- static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
- (void)b;
- *c = static_cast< result_type >( *a );
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- *c = static_cast< result_type >( *a );
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
- (void)b;
- (void)c;
- }
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = false;
+
+ /**
+ * Out-of-place application of the assignment c = a.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ (void) b;
+ *c = static_cast< result_type >( *a );
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ *c = static_cast< result_type >( *a );
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ (void) b;
+ (void) c;
+ }
+
};
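+
+ /*
+  * A minimal usage sketch, under the illustrative assumption that this
+  * operator is instantiated as left_assign< int, int, int >:
+  *
+  *   int a = 1, b = 2, c = 0;
+  *   left_assign< int, int, int >::apply( &a, &b, &c ); // c == 1
+  *
+  * The operator thus acts as a "first"-style projection: apply and foldr
+  * overwrite the output with the left-hand argument, while foldl reduces
+  * to a no-op since c = op( c, b ) = c already holds.
+  */
+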
/**
@@ -352,73 +383,90 @@ namespace grb {
* @tparam IN2 The right-hand input data type.
* @tparam OUT The output data type.
*/
- template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
class right_assign {
- public:
- /** Alias to the left-hand input data type. */
- typedef IN1 left_type;
-
- /** Alias to the right-hand input data type. */
- typedef IN2 right_type;
-
- /** Alias to the output data type. */
- typedef OUT result_type;
-
- /** Whether this operator has an inplace foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an inplace foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = true;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = false;
-
- /**
- * Out-of-place application of the addition c = a.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- */
- static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
- (void)a;
- *c = *b;
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- (void)a;
- (void)c;
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
- *c = static_cast< result_type >( *b );
- }
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = false;
+
+ /**
+ * Out-of-place application of the assignment c = b.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ (void) a;
+ *c = *b;
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ (void) a;
+ (void) c;
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ *c = static_cast< result_type >( *b );
+ }
+
};
/**
@@ -429,77 +477,94 @@ namespace grb {
*
* If \f$ x \f$ does not evaluate true the operator shall have no effect.
*/
- template< typename D1, typename D2, typename D3, enum Backend implementation = config::default_backend >
+ template<
+ typename D1, typename D2, typename D3,
+ enum Backend implementation = config::default_backend
+ >
class left_assign_if {
- public:
- /** Alias to the left-hand input data type. */
- typedef D1 left_type;
-
- /** Alias to the right-hand input data type. */
- typedef D2 right_type;
-
- /** Alias to the output data type. */
- typedef D3 result_type;
-
- /** Whether this operator has an inplace foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an inplace foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = true;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = true;
-
- /**
- * Out-of-place application of the addition c = a.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- */
- static void apply( const D1 * __restrict__ const a, const D2 * __restrict__ const b, D3 * __restrict__ const c ) {
- if( static_cast< const bool >( *b ) ) {
- *c = *a;
- }
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const D1 * __restrict__ const a, D3 * __restrict__ const c ) {
- if( static_cast< const bool >( *c ) ) {
- *c = static_cast< D3 >( *a );
- }
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( D3 * __restrict__ const c, const D2 * __restrict__ const b ) {
- if( static_cast< bool >( *b ) ) {
- *c = static_cast< D3 >( static_cast< D1 >( *c ) );
- }
- }
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef D1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef D2 right_type;
+
+ /** Alias to the output data type. */
+ typedef D3 result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of the conditional assignment c = a, which
+ * is performed only if b evaluates true.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ */
+ static void apply(
+ const D1 * __restrict__ const a,
+ const D2 * __restrict__ const b,
+ D3 * __restrict__ const c
+ ) {
+ if( static_cast< const bool >( *b ) ) {
+ *c = *a;
+ }
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const D1 * __restrict__ const a,
+ D3 * __restrict__ const c
+ ) {
+ if( static_cast< const bool >( *c ) ) {
+ *c = static_cast< D3 >( *a );
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ D3 * __restrict__ const c,
+ const D2 * __restrict__ const b
+ ) {
+ if( static_cast< bool >( *b ) ) {
+ *c = static_cast< D3 >( static_cast< D1 >( *c ) );
+ }
+ }
+
};
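+
+ /*
+  * A minimal usage sketch of the above masked assignment, under the
+  * illustrative instantiation left_assign_if< double, bool, double >:
+  *
+  *   double a = 3.5, c = 0.0;
+  *   bool mask = true;
+  *   left_assign_if< double, bool, double >::apply( &a, &mask, &c );
+  *   // c == 3.5; had mask been false, c would have been left untouched
+  */
+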
/**
@@ -510,77 +575,94 @@ namespace grb {
*
* If \f$ x \f$ does not evaluate true the operator shall have no effect.
*/
- template< typename D1, typename D2, typename D3, enum Backend implementation = config::default_backend >
+ template<
+ typename D1, typename D2, typename D3,
+ enum Backend implementation = config::default_backend
+ >
class right_assign_if {
- public:
- /** Alias to the left-hand input data type. */
- typedef D1 left_type;
-
- /** Alias to the right-hand input data type. */
- typedef D2 right_type;
-
- /** Alias to the output data type. */
- typedef D3 result_type;
-
- /** Whether this operator has an inplace foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an inplace foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = true;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = true;
-
- /**
- * Out-of-place application of the addition c = a.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- */
- static void apply( const D1 * __restrict__ const a, const D2 * __restrict__ const b, D3 * __restrict__ const c ) {
- if( static_cast< const bool >( *a ) ) {
- *c = *b;
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef D1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef D2 right_type;
+
+ /** Alias to the output data type. */
+ typedef D3 result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of the conditional assignment c = b, which
+ * is performed only if a evaluates true.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ */
+ static void apply(
+ const D1 * __restrict__ const a,
+ const D2 * __restrict__ const b,
+ D3 * __restrict__ const c
+ ) {
+ if( static_cast< const bool >( *a ) ) {
+ *c = *b;
+ }
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const D1 * __restrict__ const a,
+ D3 * __restrict__ const c
+ ) {
+ if( static_cast< const bool >( *a ) ) {
+ *c = static_cast< D3 >( static_cast< D2 >( *c ) );
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ D3 * __restrict__ const c,
+ const D2 * __restrict__ const b
+ ) {
+ if( static_cast< bool >( *c ) ) {
+ *c = static_cast< D3 >( *b );
+ }
}
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const D1 * __restrict__ const a, D3 * __restrict__ const c ) {
- if( static_cast< const bool >( *a ) ) {
- *c = static_cast< D3 >( static_cast< D2 >( *c ) );
- }
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( D3 * __restrict__ const c, const D2 * __restrict__ const b ) {
- if( static_cast< bool >( *c ) ) {
- *c = static_cast< D3 >( *b );
- }
- }
+
};
/**
@@ -603,92 +685,102 @@ namespace grb {
* @tparam OUT The output data type.
*/
// [Example Base Operator Implementation]
- template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
class add {
- public:
- /** Alias to the left-hand input data type. */
- typedef IN1 left_type;
-
- /** Alias to the right-hand input data type. */
- typedef IN2 right_type;
-
- /** Alias to the output data type. */
- typedef OUT result_type;
-
- /** Whether this operator has an inplace foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an inplace foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = true;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = true;
-
- /**
- * Out-of-place application of the addition c = a + b.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- *
- * \warning Passing invalid pointers will result in UB.
- */
- static void apply( const left_type * __restrict__ const a,
- const right_type * __restrict__ const b,
- result_type * __restrict__ const c
- ) {
- GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // this is a (too) broad suppression--
- // see internal issue 306 for rationale
- *c = *a + *b;
- GRB_UTIL_RESTORE_WARNINGS
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- *
- * \warning Passing invalid pointers will result in UB.
- */
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // this is a (too) broad suppression--
- // see internal issue 306 for rationale
- *c += *a;
- GRB_UTIL_RESTORE_WARNINGS
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- *
- * \warning Passing invalid pointers will result in UB.
- */
- static void foldl(
- result_type * __restrict__ const c,
- const right_type * __restrict__ const b
- ) {
- GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // this is a (too) broad suppression--
- // see internal issue 306 for rationale
- *c += *b;
- GRB_UTIL_RESTORE_WARNINGS
- }
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of the addition c = a + b.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * \warning Passing invalid pointers will result in UB.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // this is a (too) broad suppression--
+ // see internal issue 306 for rationale
+ *c = *a + *b;
+ GRB_UTIL_RESTORE_WARNINGS
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ *
+ * \warning Passing invalid pointers will result in UB.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // this is a (too) broad suppression--
+ // see internal issue 306 for rationale
+ *c += *a;
+ GRB_UTIL_RESTORE_WARNINGS
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ *
+ * \warning Passing invalid pointers will result in UB.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // this is a (too) broad suppression--
+ // see internal issue 306 for rationale
+ *c += *b;
+ GRB_UTIL_RESTORE_WARNINGS
+ }
+
};
// [Example Base Operator Implementation]
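+
+ /*
+  * A minimal usage sketch of the example operator above, under the
+  * illustrative instantiation add< int, int, int >:
+  *
+  *   int a = 1, b = 2, c = 0;
+  *   add< int, int, int >::apply( &a, &b, &c ); // c == 3
+  *   add< int, int, int >::foldr( &a, &c );     // c == 4
+  *   add< int, int, int >::foldl( &c, &b );     // c == 6
+  *
+  * The __restrict__ qualifiers require that the pointer arguments of a
+  * single call do not alias; the in-place folds take the output as one of
+  * the two operands by construction, so only two distinct addresses are
+  * ever involved.
+  */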
@@ -707,1131 +799,1898 @@ namespace grb {
* explicit definition as a GraphBLAS operator with the #is_associative and
* #is_commutative fields, and others, set as required.
*
- * @tparam IN1 The left-hand input data type.
- * @tparam IN2 The right-hand input data type.
- * @tparam OUT The output data type.
+ * @tparam IN1 The left-hand input data type.
+ * @tparam IN2 The right-hand input data type.
+ * @tparam OUT The output data type.
+ */
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
+ class mul {
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of the multiplication c = a * b.
+ *
+ * @param[in] a Pointer to the left-hand side input. Must be initialised.
+ * @param[in] b Pointer to the right-hand side input. Must be initialised.
+ * @param[out] c Pointer to where to compute the output.
+ *
+ * \warning All pointers must be valid or UB occurs.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // this is a (too) broad suppression--
+ // see internal issue 306 for rationale
+ *c = *a * *b;
+ GRB_UTIL_RESTORE_WARNINGS
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ *c *= *a;
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ *c *= *b;
+ }
+
+ };
+
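+ /*
+  * A minimal mixed-domain usage sketch; mul< int, double, double > is an
+  * illustrative assumption:
+  *
+  *   int a = 3;
+  *   double b = 0.5, c;
+  *   mul< int, double, double >::apply( &a, &b, &c ); // c == 1.5
+  *
+  * The left_type, right_type, and result_type aliases expose exactly these
+  * three template domains to generic code.
+  */
+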
+ /**
+ * Standard max operator.
+ *
+ * Assumes native availability of < on the given data types, or assumes
+ * the relevant operators are properly overloaded.
+ *
+ * Non-standard or non-matching data types, or non-standard (overloaded) <
+ * operators, should be used with caution and may necessitate an explicit
+ * definition as a GraphBLAS operator with the #is_associative and
+ * #is_commutative fields, and others, set as required.
+ *
+ * @tparam IN1 The left-hand input data type.
+ * @tparam IN2 The right-hand input data type.
+ * @tparam OUT The output data type.
+ */
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
+ class max {
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of the max operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c = \max\{a,b\} \f$.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ if( *a < *b ) {
+ *c = static_cast< OUT >( *b );
+ } else {
+ *c = static_cast< OUT >( *a );
+ }
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ if( *a > *c ) {
+ *c = *a;
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ if( *b > *c ) {
+ *c = *b;
+ }
+ }
+
+ };
+
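+ /*
+  * A minimal sketch of an in-place reduction over the above operator,
+  * under the illustrative instantiation max< double, double, double >:
+  *
+  *   double data[ 3 ] = { 1.0, 4.0, 2.0 };
+  *   double running = data[ 0 ];
+  *   for( size_t i = 1; i < 3; ++i ) {
+  *       max< double, double, double >::foldl( &running, &data[ i ] );
+  *   }
+  *   // running == 4.0
+  */
+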
+ /**
+ * Standard min operator.
+ *
+ * Assumes native availability of > on the given data types, or assumes
+ * the relevant operators are properly overloaded.
+ *
+ * Non-standard or non-matching data types, or non-standard (overloaded) >
+ * operators, should be used with caution and may necessitate an explicit
+ * definition as a GraphBLAS operator with the #is_associative and
+ * #is_commutative fields, and others, set as required.
+ *
+ * @tparam IN1 The left-hand input data type.
+ * @tparam IN2 The right-hand input data type.
+ * @tparam OUT The output data type.
+ */
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
+ class min {
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of the min operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c = \min\{a,b\} \f$.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ if( *a > *b ) {
+ *c = static_cast< OUT >( *b );
+ } else {
+ *c = static_cast< OUT >( *a );
+ }
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ if( *a < *c ) {
+ *c = *a;
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ if( *b < *c ) {
+ *c = *b;
+ }
+ }
+
+ };
+
+ /**
+ * Standard numerical subtraction operator.
+ *
+ * Assumes native availability of - on the given data types, or assumes
+ * that the relevant operators are properly overloaded.
+ *
+ * Non-standard or non-matching data types, or non-standard (overloaded) -
+ * operators, should be used with caution and may necessitate an explicit
+ * definition as a GraphBLAS operator with the #is_associative and
+ * #is_commutative fields, and others, set as required.
+ *
+ * @tparam IN1 The left-hand input data type.
+ * @tparam IN2 The right-hand input data type.
+ * @tparam OUT The output data type.
+ */
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
+ class substract {
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = false;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = false;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c = a - b \f$.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ *c = *a - *b;
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ *c = *a - *c;
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ *c -= *b;
+ }
+
+ };
+
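+ /*
+  * Because subtraction is not commutative, the fold direction matters. A
+  * minimal sketch, under the illustrative instantiation
+  * substract< int, int, int >:
+  *
+  *   int a = 10, c = 3;
+  *   substract< int, int, int >::foldr( &a, &c ); // c == a - c == 7
+  *
+  *   int d = 10, b = 3;
+  *   substract< int, int, int >::foldl( &d, &b ); // d == d - b == 7
+  *
+  * foldr hence takes its first argument as the minuend, while foldl takes
+  * the in-place argument as the minuend.
+  */
+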
+ /**
+ * Standard numerical division operator.
+ *
+ * Assumes native availability of / on the given data types, or assumes
+ * that the relevant operators are properly overloaded.
+ *
+ * Non-standard or non-matching data types, or non-standard (overloaded) /
+ * operators, should be used with caution and may necessitate an explicit
+ * definition as a GraphBLAS operator with the #is_associative and
+ * #is_commutative fields, and others, set as required.
+ *
+ * @tparam IN1 The left-hand input data type.
+ * @tparam IN2 The right-hand input data type.
+ * @tparam OUT The output data type.
+ */
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
+ class divide {
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = false;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = false;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c = a/b \f$.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ *c = *a / *b;
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ *c = *a / *c;
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ *c /= *b;
+ }
+
+ };
+
+ /**
+ * Non-standard numerical division where the inputs are switched.
+ *
+ * I.e., if the left input is \f$ l \f$ and the right input is \f$ r \f$,
+ * then this operator computes \f$ r / l \f$.
+ *
+ * Assumes native availability of / on the given data types, or assumes
+ * that the relevant operators are properly overloaded.
+ *
+ * Non-standard or non-matching data types, or non-standard (overloaded) /
+ * operators, should be used with caution and may necessitate an explicit
+ * definition as a GraphBLAS operator with the #is_associative and
+ * #is_commutative fields, and others, set as required.
+ *
+ * @tparam IN1 The left-hand input data type.
+ * @tparam IN2 The right-hand input data type.
+ * @tparam OUT The output data type.
+ */
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
+ class divide_reverse {
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = false;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = false;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c = b/a \f$.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ *c = *b / *a;
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ *c /= *a;
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ *c = *b / *c;
+ }
+
+ };
+
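+ /*
+  * A minimal sketch contrasting the reversed fold directions, under the
+  * illustrative instantiation divide_reverse< double, double, double >:
+  *
+  *   double a = 2.0, c = 8.0;
+  *   divide_reverse< double, double, double >::foldr( &a, &c );
+  *   // c == op( a, c ) == c / a == 4.0
+  *
+  *   double d = 2.0, b = 8.0;
+  *   divide_reverse< double, double, double >::foldl( &d, &b );
+  *   // d == op( d, b ) == b / d == 4.0
+  */
+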
+ /**
+ * The equals operator.
+ *
+ * Assumes that the == operator for the given input types is defined.
+ */
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
+ class equal {
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c \f$ will be set to
+ * static_cast< OUT >( true ) if \f$ a == b \f$, and to
+ * static_cast< OUT >( false ) otherwise.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ if( *a == *b ) {
+ *c = static_cast< OUT >( true );
+ } else {
+ *c = static_cast< OUT >( false );
+ }
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ if( *a == *c ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ if( *b == *c ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
+ }
+
+ };
+
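+ /*
+  * The output domain need not be bool. A minimal sketch, under the
+  * illustrative instantiation equal< double, double, int >:
+  *
+  *   double a = 1.0, b = 1.0;
+  *   int c;
+  *   equal< double, double, int >::apply( &a, &b, &c ); // c == 1
+  *
+  * The result is static_cast< OUT >( true ) or static_cast< OUT >( false ),
+  * i.e., 1 or 0 for integral output types.
+  */
+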
+ /**
+ * Standard not-equals operator.
+ *
+ * Assumes that the != operator is defined on the given input types.
+ */
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
+ class not_equal {
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c \f$ will be set to
+ * static_cast< OUT >( true ) if \f$ a \neq b \f$, and to
+ * static_cast< OUT >( false ) otherwise.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // this is a (too) broad suppression--
+ // see internal issue 306 for rationale
+ if( *a != *b ) {
+ *c = static_cast< OUT >( true );
+ } else {
+ *c = static_cast< OUT >( false );
+ }
+ GRB_UTIL_RESTORE_WARNINGS
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ if( *a != *c ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ if( *b != *c ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
+ }
+
+ };
+
+ /**
+ * A non-standard operator that returns any input that evaluates to
+ * true when cast to a bool, \em or, in case no input
+ * evaluates true, returns any input.
+ *
+ * In case the input and output types are bool, this operator
+ * corresponds to the classical logical or.
+ */
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
+ class any_or {
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c \f$ will be set to \f$ a \f$ if
+ * \f$ a \f$ evaluates true, to \f$ b \f$ if only \f$ b \f$ evaluates
+ * true, and to \f$ a \f$ otherwise.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ if( *a ) {
+ *c = static_cast< OUT >( *a );
+ } else if( *b ) {
+ *c = static_cast< OUT >( *b );
+ } else {
+ assert( !( *a ) );
+ *c = static_cast< OUT >( *a );
+ }
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ if( *a ) {
+ *c = static_cast< result_type >( *a );
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ if( *b ) {
+ *c = static_cast< result_type >( *b );
+ }
+ }
+
+ };
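+
+ /*
+  * Illustrative sketch with hypothetical operands: any_or propagates the
+  * first input that evaluates true, so on non-bool types it keeps the
+  * original value instead of collapsing it to a boolean:
+  *
+  *   int a = 0, b = 7, c;
+  *   grb::operators::any_or< int, int, int >::apply( &a, &b, &c );
+  *   // c now holds 7, the only input that evaluates true
+  */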
+
+ /**
+ * The logical-or operator, \f$ x \lor y \f$.
+ *
+ * Assumes that the || operator is defined on the given input types.
+ */
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
+ class logical_or {
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c = a \lor b \f$.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ if( *a || *b ) {
+ *c = static_cast< OUT >( true );
+ } else {
+ *c = static_cast< OUT >( false );
+ }
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ if( *a || *c ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ if( *b || *c ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
+ }
+
+ };
+
+ /**
+ * The logical-and operator, \f$ x \land y \f$.
+ *
+ * Assumes that the && operator is defined for the given input types.
+ */
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
+ class logical_and {
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c = a \land b \f$.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ if( *a && *b ) {
+ *c = static_cast< OUT >( true );
+ } else {
+ *c = static_cast< OUT >( false );
+ }
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ if( *a && *c ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // this is a (too) broad suppression--
+ // see docs/Suppressions.md
+ if( *b && *c ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
+ GRB_UTIL_RESTORE_WARNINGS
+ }
+
+ };
+
+ /**
+ * Absolute difference operator, \f$ |x-y| \f$.
+ *
+ * Assumes that the - and < operators are defined for the given input
+ * types.
+ */
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
+ class abs_diff {
+
+ public:
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = false;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c = |a-b| \f$.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ if( *a < *b ) {
+ *c = static_cast< OUT >( *b - *a );
+ } else {
+ *c = static_cast< OUT >( *a - *b );
+ }
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ if( *a < *c ) {
+ *c -= *a;
+ } else {
+ *c = static_cast< OUT >( *a - *c );
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ if( *b < *c ) {
+ *c -= *b;
+ } else {
+ *c = static_cast< OUT >( *b - *c );
+ }
+ }
+
+ };
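+
+ /*
+  * Illustrative sketch with hypothetical operands: the in-place foldl
+  * overwrites its first argument with the absolute difference:
+  *
+  *   double c = 2.5;
+  *   const double b = 4.0;
+  *   grb::operators::abs_diff< double, double, double >::foldl( &c, &b );
+  *   // c now holds 1.5, i.e., |2.5 - 4.0|
+  */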
+
+ /**
+ * The ReLU operator as commonly used in machine learning, here
+ * interpreted as a binary operator.
+ *
+ * The inputs to this binary function are assumed to be the threshold value
+ * and the input signal.
+ *
+ * ReLU is in fact functionally equal to #grb::operators::max.
+ */
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
+ class relu {
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = true;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = true;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c = ReLU\{a,b\} = \begin{cases}
+ * a \text{, if } a>b \\
+ * b \text{, otherwise}
+ * \end{cases}\f$.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ if( *a < *b ) {
+ *c = static_cast< OUT >( *b );
+ } else {
+ *c = static_cast< OUT >( *a );
+ }
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ if( *a > *c ) {
+ *c = *a;
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ if( *b > *c ) {
+ *c = *b;
+ }
+ }
+
+ };
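+
+ /*
+  * Illustrative sketch with hypothetical operands: with the threshold as
+  * left input, apply computes the classical ReLU of the signal:
+  *
+  *   double threshold = 0.0, signal = -3.1, out;
+  *   grb::operators::relu< double, double, double >::apply(
+  *       &threshold, &signal, &out );
+  *   // out now holds 0.0, i.e., max( 0.0, -3.1 )
+  */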
+
+ /**
+ * Square difference operator: \f$ (x-y)^2 \f$.
+ *
+ * Assumes that the - and * operators are defined on the given input types.
+ */
+ template<
+ typename D1, typename D2, typename D3,
+ enum Backend implementation = config::default_backend
+ >
+ class square_diff {
+
+ public:
+
+ typedef D1 left_type;
+ typedef D2 right_type;
+ typedef D3 result_type;
+
+ static constexpr bool has_foldl = true;
+ static constexpr bool has_foldr = true;
+ static constexpr bool is_associative = false;
+ static constexpr bool is_commutative = true;
+
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ *c = ( *a - *b ) * ( *a - *b );
+ }
+
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ *c = ( *a - *c ) * ( *a - *c );
+ }
+
+ static void foldl(
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ *c = ( *c - *b ) * ( *c - *b );
+ }
+
+ };
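+
+ /*
+  * Illustrative sketch with hypothetical operands. Note that, unlike the
+  * operators above, this foldl takes the read-only operand first:
+  *
+  *   double b = 1.0, c = 4.0;
+  *   grb::operators::square_diff< double, double, double >::foldl( &b, &c );
+  *   // c now holds 9.0, i.e., ( 4.0 - 1.0 ) squared
+  */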
+
+ /**
+ * Zips two inputs into a pair.
+ *
+ * @tparam IN1 Left operand type.
+ * @tparam IN2 Right operand type.
+ *
+ * The result type is fixed at std::pair< IN1, IN2 >.
+ *
+ * May be used together with argmin and other operators defined on pairs.
+ */
+ template<
+ typename IN1, typename IN2,
+ enum Backend implementation = config::default_backend
+ >
+ class zip {
+
+ public:
+
+ typedef IN1 left_type;
+ typedef IN2 right_type;
+ typedef std::pair< IN1, IN2 > result_type;
+
+ static constexpr bool has_foldl = false;
+ static constexpr bool has_foldr = false;
+ static constexpr bool is_associative = false;
+ static constexpr bool is_commutative = false;
+
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ *c = std::make_pair( *a, *b );
+ }
+
+ };
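+
+ /*
+  * Illustrative sketch with hypothetical operands: zip packs an index and
+  * a value into a single pair element, e.g. as input to reductions over
+  * pairs such as argmin:
+  *
+  *   size_t i = 3; double v = 1.5;
+  *   std::pair< size_t, double > p;
+  *   grb::operators::zip< size_t, double >::apply( &i, &v, &p );
+  *   // p now holds ( 3, 1.5 )
+  */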
+
+ /**
+ * Whether the first elements of two given pairs compare equal.
+ */
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
+ class equal_first {
+
+ public:
+
+ typedef IN1 left_type;
+ typedef IN2 right_type;
+ typedef OUT result_type;
+
+ static constexpr bool has_foldl = false;
+ static constexpr bool has_foldr = false;
+ static constexpr bool is_associative = false;
+ static constexpr bool is_commutative = false;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c \f$ will be set to
+ * static_cast< OUT >( true ) if the first elements of \a a and \a b
+ * compare equal, and to static_cast< OUT >( false ) otherwise.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ if( a->first == b->first ) {
+ *c = static_cast< OUT >( true );
+ } else {
+ *c = static_cast< OUT >( false );
+ }
+ }
+
+ };
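+
+ /*
+  * Illustrative sketch with hypothetical operands: equal_first compares
+  * pair keys only, ignoring the values:
+  *
+  *   std::pair< size_t, double > a( 3, 1.5 ), b( 3, 2.5 );
+  *   bool c;
+  *   grb::operators::equal_first<
+  *       std::pair< size_t, double >, std::pair< size_t, double >, bool
+  *   >::apply( &a, &b, &c );
+  *   // c now holds true: the keys match even though the values differ
+  */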
+
+ /**
+ * The less-than operator.
+ *
+ * Assumes that the < operator for the given input types is defined.
*/
template<
typename IN1, typename IN2, typename OUT,
enum Backend implementation = config::default_backend
>
- class mul {
-
- public:
-
- /** Alias to the left-hand input data type. */
- typedef IN1 left_type;
-
- /** Alias to the right-hand input data type. */
- typedef IN2 right_type;
-
- /** Alias to the output data type. */
- typedef OUT result_type;
-
- /** Whether this operator has an in-place foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an in-place foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = true;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = true;
-
- /**
- * Out-of-place application of the multiplication c = a * b.
- *
- * @param[in] a Pointer to the left-hand side input. Must be initialised.
- * @param[in] b Pointer to the right-hand side input. Must be initialised.
- * @param[out] c Pointer to where to compute the output.
- *
- * \warning All pointers must be valid or UB occurs.
- */
- static void apply(
+ class lt {
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = false;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = false;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c \f$ will be set to
+ * static_cast< OUT >( true ) if \f$ a < b \f$, and to
+ * static_cast< OUT >( false ) otherwise.
+ */
+ static void apply(
const left_type * __restrict__ const a,
const right_type * __restrict__ const b,
result_type * __restrict__ const c
- ) {
- GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // this is a (too) broad suppression--
- // see internal issue 306 for rationale
- *c = *a * *b;
- GRB_UTIL_RESTORE_WARNINGS
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- *c *= *a;
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
- *c *= *b;
- }
- };
+ ) {
+ if( *a < *b ) {
+ *c = static_cast< OUT >( true );
+ } else {
+ *c = static_cast< OUT >( false );
+ }
+ }
- /**
- * Standard max operator.
- *
- * Assumes native availability of < on the given data types, or assumes
- * the relevant operators are properly overloaded.
- *
- * Non-standard or non-matching data types, or non-standard (overloaded) <
- * operators, should be used with caution and may necessitate an explicit
- * definition as a GraphBLAS operator with the #is_associative and
- * #is_commutative fields, and others, set as required.
- *
- * @tparam IN1 The left-hand input data type.
- * @tparam IN2 The right-hand input data type.
- * @tparam OUT The output data type.
- */
- template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
- class max {
- public:
- /** Alias to the left-hand input data type. */
- typedef IN1 left_type;
-
- /** Alias to the right-hand input data type. */
- typedef IN2 right_type;
-
- /** Alias to the output data type. */
- typedef OUT result_type;
-
- /** Whether this operator has an in-place foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an in-place foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = true;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = true;
-
- /**
- * Out-of-place application of the max operator.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- *
- * At the end of the operation, \f$ c = \max\{a,b\} \f$.
- */
- static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
- if( *a < *b ) {
- *c = static_cast< OUT >( *b );
- } else {
- *c = static_cast< OUT >( *a );
- }
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- if( *a > *c ) {
- *c = *a;
- }
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
- if( *b > *c ) {
- *c = *b;
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ if( *a < *c ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ if( *b < *c ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
}
- }
+
};
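+
+ /*
+  * Illustrative sketch with hypothetical operands: lt casts the result of
+  * the comparison to the requested output type:
+  *
+  *   double a = 1.0, b = 2.0;
+  *   int c;
+  *   grb::operators::lt< double, double, int >::apply( &a, &b, &c );
+  *   // c now holds 1, since a < b
+  */
+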
/**
- * Standard min operator.
- *
- * Assumes native availability of > on the given data types, or assumes
- * the relevant operators are properly overloaded.
- *
- * Non-standard or non-matching data types, or non-standard (overloaded) >
- * operators, should be used with caution and may necessitate an explicit
- * definition as a GraphBLAS operator with the #is_associative and
- * #is_commutative fields, and others, set as required.
+ * The greater-than operator.
*
- * @tparam IN1 The left-hand input data type.
- * @tparam IN2 The right-hand input data type.
- * @tparam OUT The output data type.
+ * Assumes that the > operator for the given input types is defined.
*/
- template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
- class min {
- public:
- /** Alias to the left-hand input data type. */
- typedef IN1 left_type;
-
- /** Alias to the right-hand input data type. */
- typedef IN2 right_type;
-
- /** Alias to the output data type. */
- typedef OUT result_type;
-
- /** Whether this operator has an in-place foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an in-place foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = true;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = true;
-
- /**
- * Out-of-place application of the min operator.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- *
- * At the end of the operation, \f$ c = \min\{a,b\} \f$.
- */
- static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
- if( *a > *b ) {
- *c = static_cast< OUT >( *b );
- } else {
- *c = static_cast< OUT >( *a );
- }
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- if( *a < *c ) {
- *c = *a;
- }
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
- if( *b < *c ) {
- *c = *b;
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
+ class gt {
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = false;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = false;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c \f$ will be set to
+ * static_cast< OUT >( true ) if \f$ a > b \f$, and to
+ * static_cast< OUT >( false ) otherwise.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ if( *a > *b ) {
+ *c = static_cast< OUT >( true );
+ } else {
+ *c = static_cast< OUT >( false );
+ }
}
- }
- };
-
- /** \todo add documentation */
- template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
- class substract {
- public:
- /** Alias to the left-hand input data type. */
- typedef IN1 left_type;
-
- /** Alias to the right-hand input data type. */
- typedef IN2 right_type;
-
- /** Alias to the output data type. */
- typedef OUT result_type;
-
- /** Whether this operator has an in-place foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an in-place foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = false;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = false;
-
- /**
- * Out-of-place application of this operator.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- *
- * At the end of the operation, \f$ c = \min\{a,b\} \f$.
- */
- static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
- *c = *a - *b;
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- *c = *a - *c;
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
- *c -= *b;
- }
- };
-
- /** \todo add documentation */
- template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
- class divide {
- public:
- /** Alias to the left-hand input data type. */
- typedef IN1 left_type;
-
- /** Alias to the right-hand input data type. */
- typedef IN2 right_type;
-
- /** Alias to the output data type. */
- typedef OUT result_type;
-
- /** Whether this operator has an in-place foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an in-place foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = false;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = false;
-
- /**
- * Out-of-place application of this operator.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- *
- * At the end of the operation, \f$ c = a/b \f$.
- */
- static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
- *c = *a / *b;
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- *c = *a / *c;
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
- *c /= *b;
- }
- };
-
- /** \todo add documentation */
- template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
- class divide_reverse {
- public:
- /** Alias to the left-hand input data type. */
- typedef IN1 left_type;
-
- /** Alias to the right-hand input data type. */
- typedef IN2 right_type;
-
- /** Alias to the output data type. */
- typedef OUT result_type;
-
- /** Whether this operator has an in-place foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an in-place foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = false;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = false;
-
- /**
- * Out-of-place application of this operator.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- *
- * At the end of the operation, \f$ c = b/a \f$.
- */
- static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
- *c = *b / *a;
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- *c /= *a;
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
- *c = *b / *c;
- }
- };
-
- /** \todo add documentation */
- template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
- class equal {
- public:
- /** Alias to the left-hand input data type. */
- typedef IN1 left_type;
-
- /** Alias to the right-hand input data type. */
- typedef IN2 right_type;
-
- /** Alias to the output data type. */
- typedef OUT result_type;
-
- /** Whether this operator has an in-place foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an in-place foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = true;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = true;
-
- /**
- * Out-of-place application of this operator.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- *
- * At the end of the operation, \f$ c = \min\{a,b\} \f$.
- */
- static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
- if( *a == *b ) {
- *c = static_cast< OUT >( true );
- } else {
- *c = static_cast< OUT >( false );
- }
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- if( *a == *c ) {
- *c = static_cast< result_type >( true );
- } else {
- *c = static_cast< result_type >( false );
- }
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
- if( *b == *c ) {
- *c = static_cast< result_type >( true );
- } else {
- *c = static_cast< result_type >( false );
- }
- }
- };
-
- /** \todo add documentation */
- template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
- class not_equal {
- public:
- /** Alias to the left-hand input data type. */
- typedef IN1 left_type;
-
- /** Alias to the right-hand input data type. */
- typedef IN2 right_type;
-
- /** Alias to the output data type. */
- typedef OUT result_type;
-
- /** Whether this operator has an in-place foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an in-place foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = true;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = true;
-
- /**
- * Out-of-place application of this operator.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- *
- * At the end of the operation, \f$ c = \min\{a,b\} \f$.
- */
- static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
- GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // this is a (too) broad suppression--
- // see internal issue 306 for rationale
- if( *a != *b ) {
- *c = static_cast< OUT >( true );
- } else {
- *c = static_cast< OUT >( false );
- }
- GRB_UTIL_RESTORE_WARNINGS
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- if( *a != *c ) {
- *c = static_cast< result_type >( true );
- } else {
- *c = static_cast< result_type >( false );
- }
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
- if( *b != *c ) {
- *c = static_cast< result_type >( true );
- } else {
- *c = static_cast< result_type >( false );
- }
- }
- };
- /** \todo add documentation */
- template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
- class any_or {
- public:
- /** Alias to the left-hand input data type. */
- typedef IN1 left_type;
-
- /** Alias to the right-hand input data type. */
- typedef IN2 right_type;
-
- /** Alias to the output data type. */
- typedef OUT result_type;
-
- /** Whether this operator has an in-place foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an in-place foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = true;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = true;
-
- /**
- * Out-of-place application of this operator.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- *
- * At the end of the operation, \f$ c = \min\{a,b\} \f$.
- */
- static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
- if( *a ) {
- *c = static_cast< OUT >( *a );
- } else if( *b ) {
- *c = static_cast< OUT >( *b );
- } else {
- assert( ! ( *a ) );
- *c = static_cast< OUT >( *a );
- }
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- if( *a ) {
- *c = static_cast< result_type >( *a );
- }
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
- if( *b ) {
- *c = static_cast< result_type >( *b );
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ if( *a > *c ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
}
- }
- };
- /** \todo add documentation */
- template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
- class logical_or {
- public:
- /** Alias to the left-hand input data type. */
- typedef IN1 left_type;
-
- /** Alias to the right-hand input data type. */
- typedef IN2 right_type;
-
- /** Alias to the output data type. */
- typedef OUT result_type;
-
- /** Whether this operator has an in-place foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an in-place foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = true;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = true;
-
- /**
- * Out-of-place application of this operator.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- *
- * At the end of the operation, \f$ c = \min\{a,b\} \f$.
- */
- static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
- printf( "Hello from mul\n" );
- if( *a || *b ) {
- *c = static_cast< OUT >( true );
- } else {
- *c = static_cast< OUT >( false );
- }
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- if( *a || *c ) {
- *c = static_cast< result_type >( true );
- } else {
- *c = static_cast< result_type >( false );
- }
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
- if( *b || *c ) {
- *c = static_cast< result_type >( true );
- } else {
- *c = static_cast< result_type >( false );
- }
- }
- };
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ if( *b > *c ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
+ }
- /** \todo add documentation */
- template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
- class logical_and {
- public:
- /** Alias to the left-hand input data type. */
- typedef IN1 left_type;
-
- /** Alias to the right-hand input data type. */
- typedef IN2 right_type;
-
- /** Alias to the output data type. */
- typedef OUT result_type;
-
- /** Whether this operator has an in-place foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an in-place foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = true;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = true;
-
- /**
- * Out-of-place application of this operator.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- *
- * At the end of the operation, \f$ c = \min\{a,b\} \f$.
- */
- static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
- if( *a && *b ) {
- *c = static_cast< OUT >( true );
- } else {
- *c = static_cast< OUT >( false );
- }
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- if( *a && *c ) {
- *c = static_cast< result_type >( true );
- } else {
- *c = static_cast< result_type >( false );
- }
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
- if( *b && *c ) {
- *c = static_cast< result_type >( true );
- } else {
- *c = static_cast< result_type >( false );
- }
- }
};
- /** \todo add documentation */
- template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
- class abs_diff {
+ /**
+ * The less-than-or-equal operator.
+ *
+ * Assumes that the <= operator for the given input types is defined.
+ */
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
+ class leq {
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = false;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = false;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c \f$ will be set to
+ * static_cast< OUT >( true ) if \f$ a \leq b \f$, and to
+ * static_cast< OUT >( false ) otherwise.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ if( *a <= *b ) {
+ *c = static_cast< OUT >( true );
+ } else {
+ *c = static_cast< OUT >( false );
+ }
+ }
- public:
- /** Alias to the left-hand input data type. */
- typedef IN1 left_type;
-
- /** Alias to the right-hand input data type. */
- typedef IN2 right_type;
-
- /** Alias to the output data type. */
- typedef OUT result_type;
-
- /** Whether this operator has an in-place foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an in-place foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = false;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = true;
-
- /**
- * Out-of-place application of this operator.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- *
- * At the end of the operation, \f$ c = \min\{a,b\} \f$.
- */
- static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
- if( *a < *b ) {
- *c = static_cast< OUT >( *b - *a );
- } else {
- *c = static_cast< OUT >( *a - *b );
- }
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- if( *a < *c ) {
- *c -= *a;
- } else {
- *c = static_cast< OUT >( *a - *c );
- }
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
- if( *b < *c ) {
- *c -= *b;
- } else {
- *c = static_cast< OUT >( *b - *c );
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ if( *a <= *c ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
}
- }
- };
- /** \todo add documentation */
- template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
- class relu {
- public:
- /** Alias to the left-hand input data type. */
- typedef IN1 left_type;
-
- /** Alias to the right-hand input data type. */
- typedef IN2 right_type;
-
- /** Alias to the output data type. */
- typedef OUT result_type;
-
- /** Whether this operator has an in-place foldl. */
- static constexpr bool has_foldl = true;
-
- /** Whether this operator has an in-place foldr. */
- static constexpr bool has_foldr = true;
-
- /**
- * Whether this operator is \em mathematically associative; that is,
- * associative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_associative = true;
-
- /**
- * Whether this operator is \em mathematically commutative; that is,
- * commutative when assuming equivalent data types for \a IN1, \a IN2,
- * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
- */
- static constexpr bool is_commutative = true;
-
- /**
- * Out-of-place application of this operator.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- *
- * At the end of the operation, \f$ c = ReLU\{a,b\} = \begin{cases}
- * a \text{, if } a>b \\
- * b \text{, otherwise}
- * \end{cases}\f$.
- */
- static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
- if( *a < *b ) {
- *c = static_cast< OUT >( *b );
- } else {
- *c = static_cast< OUT >( *a );
- }
- }
-
- /**
- * In-place left-to-right folding.
- *
- * @param[in] a Pointer to the left-hand side input data.
- * @param[in,out] c Pointer to the right-hand side input data. This also
- * dubs as the output memory area.
- */
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- if( *a > *c ) {
- *c = *a;
- }
- }
-
- /**
- * In-place right-to-left folding.
- *
- * @param[in,out] c Pointer to the left-hand side input data. This also
- * dubs as the output memory area.
- * @param[in] b Pointer to the right-hand side input data.
- */
- static void foldl( result_type * __restrict__ const c, const right_type * __restrict__ const b ) {
- if( *b > *c ) {
- *c = *b;
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ if( *c <= *b ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
}
- }
- };
- template< typename D1, typename D2, typename D3, enum Backend implementation = config::default_backend >
- class square_diff {
- public:
- typedef D1 left_type;
- typedef D2 right_type;
- typedef D3 result_type;
-
- static constexpr bool has_foldl = true;
- static constexpr bool has_foldr = true;
- static constexpr bool is_associative = false;
- static constexpr bool is_commutative = true;
-
- static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
- *c = ( *a - *b ) * ( *a - *b );
- }
-
- static void foldr( const left_type * __restrict__ const a, result_type * __restrict__ const c ) {
- *c = ( *a - *c ) * ( *a - *c );
- }
-
- static void foldl( const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
- *c = ( *c - *b ) * ( *c - *b );
- }
};
/**
- * left operand of type IN1,
- * right operand of type IN2
- * result of type std::pair< IN1, IN2 >
+ * The greater-than-or-equal operator.
*
- * for use together with argmin
+ * Assumes that the >= operator for the given input types is defined.
*/
- template< typename IN1, typename IN2, enum Backend implementation = config::default_backend >
- class zip {
- public:
- typedef IN1 left_type;
- typedef IN2 right_type;
- typedef std::pair< IN1, IN2 > result_type;
-
- static constexpr bool has_foldl = false;
- static constexpr bool has_foldr = false;
- static constexpr bool is_associative = false;
- static constexpr bool is_commutative = false;
-
- static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
- *c = std::make_pair( *a, *b );
- }
- };
+ template<
+ typename IN1, typename IN2, typename OUT,
+ enum Backend implementation = config::default_backend
+ >
+ class geq {
+
+ public:
+
+ /** Alias to the left-hand input data type. */
+ typedef IN1 left_type;
+
+ /** Alias to the right-hand input data type. */
+ typedef IN2 right_type;
+
+ /** Alias to the output data type. */
+ typedef OUT result_type;
+
+ /** Whether this operator has an in-place foldl. */
+ static constexpr bool has_foldl = true;
+
+ /** Whether this operator has an in-place foldr. */
+ static constexpr bool has_foldr = true;
+
+ /**
+ * Whether this operator is \em mathematically associative; that is,
+ * associative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_associative = false;
+
+ /**
+ * Whether this operator is \em mathematically commutative; that is,
+ * commutative when assuming equivalent data types for \a IN1, \a IN2,
+ * and \a OUT, as well as assuming exact arithmetic, no overflows, etc.
+ */
+ static constexpr bool is_commutative = false;
+
+ /**
+ * Out-of-place application of this operator.
+ *
+ * @param[in] a The left-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[in] b The right-hand side input. Must be pre-allocated and
+ * initialised.
+ * @param[out] c The output. Must be pre-allocated.
+ *
+ * At the end of the operation, \f$ c \f$ will be set to
+ * static_cast< OUT >( true ) if \f$ a \geq b \f$, and to
+ * static_cast< OUT >( false ) otherwise.
+ */
+ static void apply(
+ const left_type * __restrict__ const a,
+ const right_type * __restrict__ const b,
+ result_type * __restrict__ const c
+ ) {
+ if( *a >= *b ) {
+ *c = static_cast< OUT >( true );
+ } else {
+ *c = static_cast< OUT >( false );
+ }
+ }
+
+ /**
+ * In-place left-to-right folding.
+ *
+ * @param[in] a Pointer to the left-hand side input data.
+ * @param[in,out] c Pointer to the right-hand side input data. This also
+ * doubles as the output memory area.
+ */
+ static void foldr(
+ const left_type * __restrict__ const a,
+ result_type * __restrict__ const c
+ ) {
+ if( *a >= *c ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
+ }
+
+ /**
+ * In-place right-to-left folding.
+ *
+ * @param[in,out] c Pointer to the left-hand side input data. This also
+ * doubles as the output memory area.
+ * @param[in] b Pointer to the right-hand side input data.
+ */
+ static void foldl(
+ result_type * __restrict__ const c,
+ const right_type * __restrict__ const b
+ ) {
+ if( *c >= *b ) {
+ *c = static_cast< result_type >( true );
+ } else {
+ *c = static_cast< result_type >( false );
+ }
+ }
- /**
- * compares the first argument of a pair
- */
- template< typename IN1, typename IN2, typename OUT, enum Backend implementation = config::default_backend >
- class equal_first {
- public:
- typedef IN1 left_type;
-
- typedef IN2 right_type;
-
- typedef OUT result_type;
-
- static constexpr bool has_foldl = false;
- static constexpr bool has_foldr = false;
- static constexpr bool is_associative = false;
- static constexpr bool is_commutative = false;
-
- /**
- * Out-of-place application of this operator.
- *
- * @param[in] a The left-hand side input. Must be pre-allocated and initialised.
- * @param[in] b The right-hand side input. Must be pre-allocated and initialised.
- * @param[out] c The output. Must be pre-allocated.
- *
- * At the end of the operation, \f$ c = a->first == b->first \f$.
- */
- static void apply( const left_type * __restrict__ const a, const right_type * __restrict__ const b, result_type * __restrict__ const c ) {
- if( a->first == b->first ) {
- *c = static_cast< OUT >( true );
- } else {
- *c = static_cast< OUT >( false );
- }
- }
};
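
For reference, the semantics the new geq operator encodes can be exercised in isolation. The following is a minimal standalone sketch, not the library class: geq_sketch and the main() harness are hypothetical, and it assumes the documented fold convention that foldr computes \f$ c = (a \geq c) \f$ while foldl computes \f$ c = (c \geq b) \f$.

	#include <cassert>

	// Hypothetical stand-in mirroring the geq semantics added above.
	template< typename IN1, typename IN2, typename OUT >
	struct geq_sketch {
		static void apply( const IN1 &a, const IN2 &b, OUT &c ) {
			c = static_cast< OUT >( a >= b );
		}
		static void foldr( const IN1 &a, OUT &c ) {
			// c doubles as the right operand: c := ( a >= c )
			c = static_cast< OUT >( a >= static_cast< IN2 >( c ) );
		}
		static void foldl( OUT &c, const IN2 &b ) {
			// c doubles as the left operand: c := ( c >= b )
			c = static_cast< OUT >( static_cast< IN1 >( c ) >= b );
		}
	};

	int main() {
		bool c = false;
		geq_sketch< double, double, bool >::apply( 3.0, 2.0, c );
		assert( c );                                          // 3.0 >= 2.0
		geq_sketch< double, double, bool >::foldl( c, 2.0 );
		assert( !c );                                         // 1.0 >= 2.0 fails
		return 0;
	}

Because geq is neither associative nor commutative, folding through it is order-sensitive, which is why both trait flags above are set to false.
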
/**
@@ -1847,67 +2706,75 @@ namespace grb {
template< typename OP, enum Backend implementation = config::default_backend >
class OperatorBase {
- protected:
- /** The block size that should be used during map-like operations. */
- static constexpr size_t blocksize = grb::utils::static_min( grb::config::SIMD_BLOCKSIZE< typename OP::left_type >::value(),
- grb::utils::static_min( grb::config::SIMD_BLOCKSIZE< typename OP::right_type >::value(), grb::config::SIMD_BLOCKSIZE< typename OP::result_type >::value() ) );
-
- /** The left-hand side input domain. */
- typedef typename OP::left_type D1;
-
- /** The right-hand side input domain. */
- typedef typename OP::right_type D2;
-
- /** The output domain. */
- typedef typename OP::result_type D3;
-
- public:
- /** @return Whether this operator is mathematically associative. */
- static constexpr bool is_associative() {
- return OP::is_associative;
- }
-
- /** @return Whether this operator is mathematically commutative. */
- static constexpr bool is_commutative() {
- return OP::is_commutative;
- }
-
- /**
- * Straightforward application of this operator. Computes \f$ x \odot y \f$
- * and stores the result in \a z.
- *
- * @tparam InputType1 The type of the input parameter \a x.
- * @tparam InputType2 The type of the input parameter \a y.
- * @tparam OutputType The type of the output parameter \a z.
- *
- * \warning If \a InputType1 does not match \a D! \em or \a InputType2 does
- * not match \a D2 \em or \a OutputType does not match \a D3, then
- * the input will be cast into temporary variables of the correct
- * types, while the output will be cast from a temporary variable,
- *
- * \note Best performance is thus only guaranteed when all domains match.
- *
- * @param[in] x The left-hand side input.
- * @param[in] y The right-hand side input.
- * @param[out] z The output element.
- */
- template< typename InputType1, typename InputType2, typename OutputType >
- static void apply( const InputType1 & x, const InputType2 & y, OutputType & z ) {
- const D1 a = static_cast< D1 >( x );
- const D2 b = static_cast< D2 >( y );
- D3 temp;
- OP::apply( &a, &b, &temp );
- z = static_cast< OutputType >( temp );
- }
-
- /**
- * This is the high-performance version of apply() in the sense that no
- * casting is required. This version will be automatically caled whenever
- * possible.
- */
- static void apply( const D1 & x, const D2 & y, D3 & out ) {
- OP::apply( &x, &y, &out );
- }
+ protected:
+
+ /** The block size that should be used during map-like operations. */
+ static constexpr size_t blocksize = grb::utils::static_min(
+ grb::config::SIMD_BLOCKSIZE< typename OP::left_type >::value(),
+ grb::utils::static_min(
+ grb::config::SIMD_BLOCKSIZE< typename OP::right_type >::value(),
+ grb::config::SIMD_BLOCKSIZE< typename OP::result_type >::value()
+ )
+ );
+
+ /** The left-hand side input domain. */
+ typedef typename OP::left_type D1;
+
+ /** The right-hand side input domain. */
+ typedef typename OP::right_type D2;
+
+ /** The output domain. */
+ typedef typename OP::result_type D3;
+
+ public:
+
+ /** @return Whether this operator is mathematically associative. */
+ static constexpr bool is_associative() {
+ return OP::is_associative;
+ }
+
+ /** @return Whether this operator is mathematically commutative. */
+ static constexpr bool is_commutative() {
+ return OP::is_commutative;
+ }
+
+ /**
+ * Straightforward application of this operator. Computes \f$ x \odot y \f$
+ * and stores the result in \a z.
+ *
+ * @tparam InputType1 The type of the input parameter \a x.
+ * @tparam InputType2 The type of the input parameter \a y.
+ * @tparam OutputType The type of the output parameter \a z.
+ *
+ * \warning If \a InputType1 does not match \a D1 \em or \a InputType2 does
+ * not match \a D2 \em or \a OutputType does not match \a D3, then
+ * the input will be cast into temporary variables of the correct
+ * types, while the output will be cast from a temporary variable.
+ *
+ * \note Best performance is thus only guaranteed when all domains match.
+ *
+ * @param[in] x The left-hand side input.
+ * @param[in] y The right-hand side input.
+ * @param[out] z The output element.
+ */
+ template< typename InputType1, typename InputType2, typename OutputType >
+ static void apply( const InputType1 & x, const InputType2 & y, OutputType & z ) {
+ const D1 a = static_cast< D1 >( x );
+ const D2 b = static_cast< D2 >( y );
+ D3 temp;
+ OP::apply( &a, &b, &temp );
+ z = static_cast< OutputType >( temp );
+ }
+
+ /**
+ * This is the high-performance version of apply() in the sense that no
+ * casting is required. This version will be automatically called whenever
+ * possible.
+ */
+ static void apply( const D1 & x, const D2 & y, D3 & out ) {
+ OP::apply( &x, &y, &out );
+ }
+
};
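
The two apply overloads are easiest to see side by side in a self-contained sketch; add_raw and base_sketch below are illustrative names, not library types. The non-template overload is an exact match whenever the arguments are already in the operator's domains and thus wins overload resolution, skipping all temporaries.

	#include <cassert>

	// Illustrative raw operator with fixed domains: int x int -> long.
	struct add_raw {
		typedef int  left_type;
		typedef int  right_type;
		typedef long result_type;
		static void apply( const int *a, const int *b, long *c ) {
			*c = static_cast< long >( *a ) + *b;
		}
	};

	template< typename OP >
	struct base_sketch {
		typedef typename OP::left_type   D1;
		typedef typename OP::right_type  D2;
		typedef typename OP::result_type D3;

		// mismatched domains: cast in, apply, cast out
		template< typename In1, typename In2, typename Out >
		static void apply( const In1 &x, const In2 &y, Out &z ) {
			const D1 a = static_cast< D1 >( x );
			const D2 b = static_cast< D2 >( y );
			D3 tmp;
			OP::apply( &a, &b, &tmp );
			z = static_cast< Out >( tmp );
		}

		// exact domains: forwards pointers directly, no temporaries
		static void apply( const D1 &x, const D2 &y, D3 &z ) {
			OP::apply( &x, &y, &z );
		}
	};

	int main() {
		double z = 0.0;
		base_sketch< add_raw >::apply( 1.5, 2.5, z ); // casts truncate: 1 + 2
		assert( z == 3.0 );
		long w = 0;
		base_sketch< add_raw >::apply( 1, 2, w );     // exact-domain fast path
		assert( w == 3 );
		return 0;
	}

The silent truncation in the first call (1.5 becomes 1) is exactly the casting behaviour the \warning blocks above caution against.
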
/**
@@ -1936,156 +2803,170 @@ namespace grb {
* @see Operator for full details.
* @see OperatorBase for additional functions exposed to the final operator.
*/
- template< typename OP, typename guard = void, enum Backend implementation = config::default_backend >
+ template<
+ typename OP, typename guard = void,
+ enum Backend implementation = config::default_backend
+ >
class OperatorFR : public OperatorBase< OP > {
- public:
- /**
- * Emulated in-place application of this operator on two data elements.
- *
- * Computes \f$ x \odot y \f$ and writes the result into \f$ y \f$.
- *
- * We wish to call this in-place variant internally for brevity. However,
- * if \a OP has no in-place variant, then we must cache the previous
- * value of the output element or otherwise we will breach the
- * __restrict__ contract of OP::apply.
- * The caller must ensure the appropriate domains and casting behaviour
- * is applicable. Note that a user is never to call these functions
- * explicitly.
- *
- * @tparam InputType The type of the parameter \a x.
- * @tparam IOType The type of the parameter \a y.
- *
- * \warning Additional casting and use of temporary variables may occur
- * when \a InputType does not match \a D1 \em or \a IOType
- * does not match \a D3.
- *
- * \note This implementation relies on apply().
- *
- * @param[in] x The value that is to be applied to \a y.
- * @param[in,out] y The value \a x is to be applied against.
- */
- template< typename InputType, typename IOType >
- static void foldr( const InputType & x, IOType & y ) {
- typedef typename OperatorBase< OP >::D2 D2;
- const D2 cache = static_cast< D2 >( y );
- OperatorBase< OP >::apply( x, cache, y );
- }
-
- /**
- * Out-of-place element-wise foldr function. Calculates
- * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
- * \f$ x_i \odot z_i \f$ and stores the result into
- * \f$ z_i \f$.
- *
- * @tparam InputType The type of elements in \a x.
- * @tparam IOType The type of elements in \a z.
- *
- * @param x The left-hand side input data.
- * @param z Where \a x shall be mapped into.
- * @param n How many data elements \a x and \a z contain.
- *
- * This version requires three buffers, streams \a x once,
- * and streams \a z twice (once for reading, once for
- * writing.
- */
- template< typename InputType, typename IOType >
- static void eWiseFoldrAA( const InputType * __restrict__ const x, IOType * __restrict__ const z, const size_t n ) {
- // local buffers
- typedef typename OperatorBase< OP >::D1 D1;
- typedef typename OperatorBase< OP >::D2 D2;
- typedef typename OperatorBase< OP >::D3 D3;
- D1 left_buffer[ OperatorBase< OP >::blocksize ];
- D2 right_buffer[ OperatorBase< OP >::blocksize ];
- D3 result_buffer[ OperatorBase< OP >::blocksize ];
-
- // blockwise application
- size_t i = 0;
- while( i + OperatorBase< OP >::blocksize <= n ) {
- // load into buffers
- for( size_t b = 0; b < OperatorBase< OP >::blocksize; ++i, ++b ) {
- left_buffer[ b ] = static_cast< D1 >( x[ i ] );
- right_buffer[ b ] = static_cast< D2 >( z[ i ] );
- }
-
- // rewind source and output
- i -= OperatorBase< OP >::blocksize;
-
- // operate within buffer
- for( size_t b = 0; b < OperatorBase< OP >::blocksize; ++b ) {
- OP::apply( &( left_buffer[ b ] ), &( right_buffer[ b ] ), &( result_buffer[ b ] ) );
- }
-
- // write back result
- for( size_t b = 0; b < OperatorBase< OP >::blocksize; ++i, ++b ) {
- z[ i ] = static_cast< IOType >( result_buffer[ b ] );
- }
- }
-
- // direct application for remainder
- for( ; i < n; ++i ) {
- left_buffer[ 0 ] = static_cast< D1 >( x[ i ] );
- right_buffer[ 0 ] = static_cast< D2 >( z[ i ] );
- OP::apply( left_buffer, right_buffer, result_buffer );
- z[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
- }
- }
-
- /**
- * Out-of-place element-wise foldr function. Calculates
- * \f$ \forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
- * \f$ x \odot z_i \f$ and stores the result into
- * \f$ z_i \f$.
- *
- * @tparam InputType The type of elements in \a x.
- * @tparam IOType The type of elements in \a z.
- *
- * @param x The left-hand side input value.
- * @param z Where \a x shall be mapped into.
- * @param n How many data elements \a z contains.
- *
- * This version requires two buffers and streams \a z
- * twice (once for reading, once for writing).
- */
- template< typename InputType, typename IOType >
- static void eWiseFoldrSA( const InputType x, IOType * __restrict__ const z, const size_t n ) {
- // local buffers
- typedef typename OperatorBase< OP >::D1 D1;
- typedef typename OperatorBase< OP >::D2 D2;
- typedef typename OperatorBase< OP >::D3 D3;
- const D1 left_buffer = x; // this is actually mandatory in case x is a temporary
- D2 right_buffer[ OperatorBase< OP >::blocksize ];
- D3 result_buffer[ OperatorBase< OP >::blocksize ];
+ public:
+ /**
+ * Emulated in-place application of this operator on two data elements.
+ *
+ * Computes \f$ x \odot y \f$ and writes the result into \f$ y \f$.
+ *
+ * We wish to call this in-place variant internally for brevity. However,
+ * if \a OP has no in-place variant, then we must cache the previous
+ * value of the output element or otherwise we will breach the
+ * __restrict__ contract of OP::apply.
+ * The caller must ensure the appropriate domains and casting behaviour
+ * is applicable. Note that a user is never to call these functions
+ * explicitly.
+ *
+ * @tparam InputType The type of the parameter \a x.
+ * @tparam IOType The type of the parameter \a y.
+ *
+ * \warning Additional casting and use of temporary variables may occur
+ * when \a InputType does not match \a D1 \em or \a IOType
+ * does not match \a D3.
+ *
+ * \note This implementation relies on apply().
+ *
+ * @param[in] x The value that is to be applied to \a y.
+ * @param[in,out] y The value \a x is to be applied against.
+ */
+ template< typename InputType, typename IOType >
+ static void foldr( const InputType & x, IOType & y ) {
+ typedef typename OperatorBase< OP >::D2 D2;
+ const D2 cache = static_cast< D2 >( y );
+ OperatorBase< OP >::apply( x, cache, y );
+ }
- // blockwise application
- size_t i = 0;
- while( i + OperatorBase< OP >::blocksize <= n ) {
- // load into buffers
- for( size_t b = 0; b < OperatorBase< OP >::blocksize; ++i, ++b ) {
- right_buffer[ b ] = static_cast< D2 >( z[ i ] );
+ /**
+ * Out-of-place element-wise foldr function. Calculates
+ * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
+ * \f$ x_i \odot z_i \f$ and stores the result into
+ * \f$ z_i \f$.
+ *
+ * @tparam InputType The type of elements in \a x.
+ * @tparam IOType The type of elements in \a z.
+ *
+ * @param x The left-hand side input data.
+ * @param z Where \a x shall be mapped into.
+ * @param n How many data elements \a x and \a z contain.
+ *
+ * This version requires three buffers, streams \a x once,
+ * and streams \a z twice (once for reading, once for
+ * writing).
+ */
+ template< typename InputType, typename IOType >
+ static void eWiseFoldrAA(
+ const InputType * __restrict__ const x,
+ IOType * __restrict__ const z,
+ const size_t n
+ ) {
+ // local buffers
+ typedef typename OperatorBase< OP >::D1 D1;
+ typedef typename OperatorBase< OP >::D2 D2;
+ typedef typename OperatorBase< OP >::D3 D3;
+ D1 left_buffer[ OperatorBase< OP >::blocksize ];
+ D2 right_buffer[ OperatorBase< OP >::blocksize ];
+ D3 result_buffer[ OperatorBase< OP >::blocksize ];
+
+ // blockwise application
+ size_t i = 0;
+ while( i + OperatorBase< OP >::blocksize <= n ) {
+ // load into buffers
+ for( size_t b = 0; b < OperatorBase< OP >::blocksize; ++i, ++b ) {
+ left_buffer[ b ] = static_cast< D1 >( x[ i ] );
+ right_buffer[ b ] = static_cast< D2 >( z[ i ] );
+ }
+
+ // rewind source and output
+ i -= OperatorBase< OP >::blocksize;
+
+ // operate within buffer
+ for( size_t b = 0; b < OperatorBase< OP >::blocksize; ++b ) {
+ OP::apply( &( left_buffer[ b ] ), &( right_buffer[ b ] ),
+ &( result_buffer[ b ] ) );
+ }
+
+ // write back result
+ for( size_t b = 0; b < OperatorBase< OP >::blocksize; ++i, ++b ) {
+ z[ i ] = static_cast< IOType >( result_buffer[ b ] );
+ }
}
- // rewind source and output
- i -= OperatorBase< OP >::blocksize;
+ // direct application for remainder
+ for( ; i < n; ++i ) {
+ left_buffer[ 0 ] = static_cast< D1 >( x[ i ] );
+ right_buffer[ 0 ] = static_cast< D2 >( z[ i ] );
+ OP::apply( left_buffer, right_buffer, result_buffer );
+ z[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
+ }
+ }
- // operate within buffer
- for( size_t b = 0; b < OperatorBase< OP >::blocksize; ++b ) {
- OP::apply( &left_buffer, &( right_buffer[ b ] ), &( result_buffer[ b ] ) );
+ /**
+ * Out-of-place element-wise foldr function. Calculates
+ * \f$ \forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
+ * \f$ x \odot z_i \f$ and stores the result into
+ * \f$ z_i \f$.
+ *
+ * @tparam InputType The type of elements in \a x.
+ * @tparam IOType The type of elements in \a z.
+ *
+ * @param x The left-hand side input value.
+ * @param z Where \a x shall be mapped into.
+ * @param n How many data elements \a z contains.
+ *
+ * This version requires two buffers and streams \a z
+ * twice (once for reading, once for writing).
+ */
+ template< typename InputType, typename IOType >
+ static void eWiseFoldrSA(
+ const InputType x, IOType * __restrict__ const z,
+ const size_t n
+ ) {
+ // local buffers
+ typedef typename OperatorBase< OP >::D1 D1;
+ typedef typename OperatorBase< OP >::D2 D2;
+ typedef typename OperatorBase< OP >::D3 D3;
+ const D1 left_buffer = x; // copying is mandatory in case x is a temporary
+ D2 right_buffer[ OperatorBase< OP >::blocksize ];
+ D3 result_buffer[ OperatorBase< OP >::blocksize ];
+
+ // blockwise application
+ size_t i = 0;
+ while( i + OperatorBase< OP >::blocksize <= n ) {
+ // load into buffers
+ for( size_t b = 0; b < OperatorBase< OP >::blocksize; ++i, ++b ) {
+ right_buffer[ b ] = static_cast< D2 >( z[ i ] );
+ }
+
+ // rewind source and output
+ i -= OperatorBase< OP >::blocksize;
+
+ // operate within buffer
+ for( size_t b = 0; b < OperatorBase< OP >::blocksize; ++b ) {
+ OP::apply( &left_buffer, &( right_buffer[ b ] ),
+ &( result_buffer[ b ] ) );
+ }
+
+ // write back result
+ for( size_t b = 0; b < OperatorBase< OP >::blocksize; ++i, ++b ) {
+ z[ i ] = static_cast< IOType >( result_buffer[ b ] );
+ }
}
- // write back result
- for( size_t b = 0; b < OperatorBase< OP >::blocksize; ++i, ++b ) {
- z[ i ] = static_cast< IOType >( result_buffer[ b ] );
+ // direct application for remainder
+ for( ; i < n; ++i ) {
+ right_buffer[ 0 ] = static_cast< D2 >( z[ i ] );
+ OP::apply( &left_buffer, right_buffer, result_buffer );
+ z[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
}
}
- // direct application for remainder
- for( ; i < n; ++i ) {
- right_buffer[ 0 ] = static_cast< D2 >( z[ i ] );
- OP::apply( &left_buffer, right_buffer, result_buffer );
- z[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
- }
- }
};
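
The cache in the emulated foldr above exists to honour the __restrict__ qualifiers on OP::apply: passing &y as both the right-hand input and the output would alias two __restrict__ pointers, which is undefined. A condensed sketch of the same pattern follows, with the hypothetical minus_raw standing in for an operator without a native foldr:

	#include <cassert>

	struct minus_raw {
		typedef double left_type;
		typedef double right_type;
		typedef double result_type;
		static void apply(
			const double * __restrict__ a,
			const double * __restrict__ b,
			double * __restrict__ c
		) {
			*c = *a - *b;
		}
	};

	// Emulated in-place foldr: y := x - y, without aliasing OP::apply arguments.
	template< typename OP >
	void foldr_emulated(
		const typename OP::left_type &x, typename OP::result_type &y
	) {
		const typename OP::right_type cache = y; // snapshot the right operand
		OP::apply( &x, &cache, &y );             // three distinct addresses
	}

	int main() {
		double y = 3.0;
		foldr_emulated< minus_raw >( 10.0, y );
		assert( y == 7.0 );
		return 0;
	}

The snapshot costs one copy per element, which is why the specialisations below prefer a native in-place fold whenever the operator provides one.
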
/**
@@ -2105,223 +2986,244 @@ namespace grb {
* @see OperatorBase for additional functions exposed to the final operator.
*/
template< typename OP >
- class OperatorFR< OP, typename std::enable_if< OP::has_foldr && std::is_same< typename OP::right_type, typename OP::result_type >::value >::type > : public OperatorBase< OP > {
-
- private:
- typedef typename OperatorBase< OP >::D1 D1;
- typedef typename OperatorBase< OP >::D3 D3;
- static constexpr size_t blocksize = OperatorBase< OP >::blocksize;
-
- public:
- /**
- * In-place application of this operator on two data elements.
- *
- * Computes \f$ x \odot y \f$ and writes the result into \f$ y \f$.
- *
- * \note This variant is only called when the underlying raw operator
- * supports in-place operations.
- *
- * The caller must ensure the appropriate domains and casting behaviour
- * is applicable. Note that a user is never to call these functions
- * explicitly.
- *
- * @param[in] x The value that is to be applied to \a y.
- * @param[in,out] y The value \a x is to be applied against.
- */
- static void foldr( const D1 & x, D3 & y ) {
- OP::foldr( &x, &y );
- }
-
- /**
- * In-place element-wise foldr function. Calculates
- * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
- * \f$ x \odot z_i \f$ and stores the result into \f$ z_i \f$.
- *
- * @tparam InputType The type of \a x.
- * @tparam IOType The type of elements in \a z.
- *
- * @param[in] x The left-hand side input value.
- * @param[in,out] z Where \a x shall be mapped into.
- * @param[in] n How many data elements \a z contains.
- *
- * This implementation requires one buffers only. It streams \a z twice,
- * once for reading, once for writing. This function should vectorise.
- */
- template< typename InputType, typename IOType >
- static void eWiseFoldrSA( const InputType x, IOType * __restrict__ const z, const size_t n ) {
- // local buffers
- const D1 left_buffer = static_cast< D1 >( x );
- D3 result_buffer[ blocksize ];
-
- // blockwise application
- size_t i = 0;
- while( i + blocksize <= n ) {
- // load into buffers
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- result_buffer[ b ] = static_cast< D3 >( z[ i ] );
- }
-
- // rewind source and output
- i -= blocksize;
-
- // operate within buffer
- for( size_t b = 0; b < blocksize; ++b ) {
- OP::foldr( &left_buffer, &( result_buffer[ b ] ) );
- }
-
- // write back result
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- z[ i ] = static_cast< IOType >( result_buffer[ b ] );
- }
- }
-
- // direct application for remainder
- for( ; i < n; ++i ) {
- result_buffer[ 0 ] = static_cast< D3 >( z[ i ] );
- OP::foldr( &left_buffer, result_buffer );
- z[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
- }
- }
-
- /**
- * In-place element-wise foldr function. Calculates
- * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
- * \f$ x_i \odot z_i \f$ and stores the result into \f$ z_i \f$.
- *
- * @tparam InputType The type of elements in \a x.
- * @tparam IOType The type of elements in \a z.
- *
- * @param[in] x The left-hand side input data.
- * @param[in,out] z Where \a x shall be mapped into.
- * @param[in] n How many data elements \a x and \a z contain.
- *
- * This implementation requires two buffers only. It streams \a x once,
- * while streaming \a z twice (once for reading, once for writing). This
- * function should vectorise.
- */
- template< typename InputType, typename IOType >
- static void eWiseFoldrAA( const InputType * __restrict__ const x, IOType * __restrict__ const z, const size_t n ) {
- // local buffers
- D1 left_buffer[ blocksize ];
- D3 result_buffer[ blocksize ];
-
- // blockwise application
- size_t i = 0;
- while( i + blocksize <= n ) {
- // load into buffers
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- left_buffer[ b ] = static_cast< D1 >( x[ i ] );
- result_buffer[ b ] = static_cast< D3 >( z[ i ] );
- }
-
- // rewind source and output
- i -= blocksize;
-
- // operate within buffer
- for( size_t b = 0; b < blocksize; ++b ) {
- OP::foldr( &( left_buffer[ b ] ), &( result_buffer[ b ] ) );
- }
-
- // write back result
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- z[ i ] = static_cast< IOType >( result_buffer[ b ] );
- }
- }
-
- // direct application for remainder
- for( ; i < n; ++i ) {
- left_buffer[ 0 ] = static_cast< D1 >( x[ i ] );
- result_buffer[ 0 ] = static_cast< D3 >( z[ i ] );
- OP::foldr( left_buffer, result_buffer );
- z[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
- }
- }
-
- /**
- * In-place element-wise apply function. Calculates
- * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
- * \f$ z_i = x_i \odot y_i \f$.
- *
- * @tparam InputType1 The type of elements in \a x.
- * @tparam InputType2 The type of elements in \a y.
- * @tparam OutputType The type of elements in \a z.
- *
- * If \a InputType2 and \a D3 are not the same, then the existing data in
- * \a y is cast to \a D3 prior to application of this in-place operator.
- * If \a InputType1 and \a D1 are not the same, then the existing data in
- * \a x are cast to \a D1 prior to application of this in-place operator.
- * If \a OutputType and \a D3 are not the same, then the results of
- * applying this operator are cast to \a OutputType prior to writing back
- * the results.
- *
- * \warning The first casting behaviour may not be what you want. The two
- * other casting behaviours are allowed by the GraphBLAS unless
- * the grb::descriptor::no_casting is given.
- *
- * \note By default, this GraphBLAS implementation will only use this
- * code when \a D2 matches \a D3 and OP::has_foldr is \a true.
- *
- * This implementation relies on an in-place foldr().
- *
- * @param[in] x The left-hand side input data. The memory range starting
- * at \a x and ending at \a x + n (exclusive) may not
- * overlap with the memory area starting at \a z and ending
- * at \a z + n (exclusive).
- * @param[in] y The right-hand side input data. The memory range starting
- * at \a y and ending at \a y + n (exclusive) may not
- * overlap with the memory area starting at \a z and ending
- * at \a z + n.
- * @param[out] z Where the map of \a x into \a y must be stored. This
- * pointer is restricted in the sense that its memory may
- * never overlap with those pointed to by \a x or \y, as
- * detailed above.
- * @param[in] n How many data elements \a x, \a y, and \a z contain.
- */
- template< typename InputType1, typename InputType2, typename OutputType >
- static void eWiseApply( const InputType1 * x, const InputType2 * y, OutputType * __restrict__ z, const size_t n ) {
-#ifdef _DEBUG
-#ifdef D_GRB_NO_STDIO
- std::cout << "In OperatorFR::eWiseApply\n";
-#endif
-#endif
- // NOTE: this variant is only active when the computation can be done using two buffers only
+ class OperatorFR<
+ OP,
+ typename std::enable_if<
+ OP::has_foldr &&
+ std::is_same< typename OP::right_type, typename OP::result_type >::value
+ >::type
+ > : public OperatorBase< OP > {
- // local buffers
- D1 left_buffer[ blocksize ];
- D3 result_buffer[ blocksize ];
+ private:
- // blockwise application
- size_t i = 0;
- while( i + blocksize <= n ) {
+ typedef typename OperatorBase< OP >::D1 D1;
+ typedef typename OperatorBase< OP >::D3 D3;
+ static constexpr size_t blocksize = OperatorBase< OP >::blocksize;
+
+ public:
+
+ /**
+ * In-place application of this operator on two data elements.
+ *
+ * Computes \f$ x \odot y \f$ and writes the result into \f$ y \f$.
+ *
+ * \note This variant is only called when the underlying raw operator
+ * supports in-place operations.
+ *
+ * The caller must ensure the appropriate domains and casting behaviour
+ * is applicable. Note that a user is never to call these functions
+ * explicitly.
+ *
+ * @param[in] x The value that is to be applied to \a y.
+ * @param[in,out] y The value \a x is to be applied against.
+ */
+ static void foldr( const D1 & x, D3 & y ) {
+ OP::foldr( &x, &y );
+ }
- // load into buffers
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- left_buffer[ b ] = static_cast< D1 >( x[ i ] );
- result_buffer[ b ] = static_cast< D3 >( y[ i ] );
+ /**
+ * In-place element-wise foldr function. Calculates
+ * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
+ * \f$ x \odot z_i \f$ and stores the result into \f$ z_i \f$.
+ *
+ * @tparam InputType The type of \a x.
+ * @tparam IOType The type of elements in \a z.
+ *
+ * @param[in] x The left-hand side input value.
+ * @param[in,out] z Where \a x shall be mapped into.
+ * @param[in] n How many data elements \a z contains.
+ *
+ * This implementation requires one buffer only. It streams \a z twice,
+ * once for reading, once for writing. This function should vectorise.
+ */
+ template< typename InputType, typename IOType >
+ static void eWiseFoldrSA(
+ const InputType x, IOType * __restrict__ const z,
+ const size_t n
+ ) {
+ // local buffers
+ const D1 left_buffer = static_cast< D1 >( x );
+ D3 result_buffer[ blocksize ];
+
+ // blockwise application
+ size_t i = 0;
+ while( i + blocksize <= n ) {
+ // load into buffers
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ result_buffer[ b ] = static_cast< D3 >( z[ i ] );
+ }
+
+ // rewind source and output
+ i -= blocksize;
+
+ // operate within buffer
+ for( size_t b = 0; b < blocksize; ++b ) {
+ OP::foldr( &left_buffer, &( result_buffer[ b ] ) );
+ }
+
+ // write back result
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ z[ i ] = static_cast< IOType >( result_buffer[ b ] );
+ }
}
- // rewind source and output
- i -= blocksize;
+ // direct application for remainder
+ for( ; i < n; ++i ) {
+ result_buffer[ 0 ] = static_cast< D3 >( z[ i ] );
+ OP::foldr( &left_buffer, result_buffer );
+ z[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
+ }
+ }
- // operate within buffer
- for( size_t b = 0; b < blocksize; ++b ) {
- OP::foldr( &( left_buffer[ b ] ), &( result_buffer[ b ] ) );
+ /**
+ * In-place element-wise foldr function. Calculates
+ * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
+ * \f$ x_i \odot z_i \f$ and stores the result into \f$ z_i \f$.
+ *
+ * @tparam InputType The type of elements in \a x.
+ * @tparam IOType The type of elements in \a z.
+ *
+ * @param[in] x The left-hand side input data.
+ * @param[in,out] z Where \a x shall be mapped into.
+ * @param[in] n How many data elements \a x and \a z contain.
+ *
+ * This implementation requires two buffers only. It streams \a x once,
+ * while streaming \a z twice (once for reading, once for writing). This
+ * function should vectorise.
+ */
+ template< typename InputType, typename IOType >
+ static void eWiseFoldrAA(
+ const InputType * __restrict__ const x,
+ IOType * __restrict__ const z,
+ const size_t n
+ ) {
+ // local buffers
+ D1 left_buffer[ blocksize ];
+ D3 result_buffer[ blocksize ];
+
+ // blockwise application
+ size_t i = 0;
+ while( i + blocksize <= n ) {
+ // load into buffers
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ left_buffer[ b ] = static_cast< D1 >( x[ i ] );
+ result_buffer[ b ] = static_cast< D3 >( z[ i ] );
+ }
+
+ // rewind source and output
+ i -= blocksize;
+
+ // operate within buffer
+ for( size_t b = 0; b < blocksize; ++b ) {
+ OP::foldr( &( left_buffer[ b ] ), &( result_buffer[ b ] ) );
+ }
+
+ // write back result
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ z[ i ] = static_cast< IOType >( result_buffer[ b ] );
+ }
}
- // write back result
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- z[ i ] = static_cast< OutputType >( result_buffer[ b ] );
+ // direct application for remainder
+ for( ; i < n; ++i ) {
+ left_buffer[ 0 ] = static_cast< D1 >( x[ i ] );
+ result_buffer[ 0 ] = static_cast< D3 >( z[ i ] );
+ OP::foldr( left_buffer, result_buffer );
+ z[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
}
}
- // direct application for remainder
- for( ; i < n; ++i ) {
- left_buffer[ 0 ] = static_cast< typename OP::left_type >( x[ i ] );
- result_buffer[ 0 ] = static_cast< typename OP::result_type >( y[ i ] );
- OP::foldr( left_buffer, result_buffer );
- z[ i ] = static_cast< OutputType >( result_buffer[ 0 ] );
+ /**
+ * In-place element-wise apply function. Calculates
+ * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
+ * \f$ z_i = x_i \odot y_i \f$.
+ *
+ * @tparam InputType1 The type of elements in \a x.
+ * @tparam InputType2 The type of elements in \a y.
+ * @tparam OutputType The type of elements in \a z.
+ *
+ * If \a InputType2 and \a D3 are not the same, then the existing data in
+ * \a y is cast to \a D3 prior to application of this in-place operator.
+ * If \a InputType1 and \a D1 are not the same, then the existing data in
+ * \a x are cast to \a D1 prior to application of this in-place operator.
+ * If \a OutputType and \a D3 are not the same, then the results of
+ * applying this operator are cast to \a OutputType prior to writing back
+ * the results.
+ *
+ * \warning The first casting behaviour may not be what you want. The two
+ * other casting behaviours are allowed by the GraphBLAS unless
+ * the grb::descriptor::no_casting is given.
+ *
+ * \note By default, this GraphBLAS implementation will only use this
+ * code when \a D2 matches \a D3 and OP::has_foldr is \a true.
+ *
+ * This implementation relies on an in-place foldr().
+ *
+ * @param[in] x The left-hand side input data. The memory range starting
+ * at \a x and ending at \a x + n (exclusive) may not
+ * overlap with the memory area starting at \a z and ending
+ * at \a z + n (exclusive).
+ * @param[in] y The right-hand side input data. The memory range starting
+ * at \a y and ending at \a y + n (exclusive) may not
+ * overlap with the memory area starting at \a z and ending
+ * at \a z + n.
+ * @param[out] z Where the map of \a x into \a y must be stored. This
+ * pointer is restricted in the sense that its memory may
+ * never overlap with those pointed to by \a x or \a y, as
+ * detailed above.
+ * @param[in] n How many data elements \a x, \a y, and \a z contain.
+ */
+ template< typename InputType1, typename InputType2, typename OutputType >
+ static void eWiseApply(
+ const InputType1 * x,
+ const InputType2 * y,
+ OutputType * __restrict__ z,
+ const size_t n
+ ) {
+#ifdef _DEBUG
+ #ifndef D_GRB_NO_STDIO
+ std::cout << "In OperatorFR::eWiseApply\n";
+ #endif
+#endif
+ // NOTE: this variant is only active when the computation can be done using two buffers only
+
+ // local buffers
+ D1 left_buffer[ blocksize ];
+ D3 result_buffer[ blocksize ];
+
+ // blockwise application
+ size_t i = 0;
+ while( i + blocksize <= n ) {
+
+ // load into buffers
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ left_buffer[ b ] = static_cast< D1 >( x[ i ] );
+ result_buffer[ b ] = static_cast< D3 >( y[ i ] );
+ }
+
+ // rewind source and output
+ i -= blocksize;
+
+ // operate within buffer
+ for( size_t b = 0; b < blocksize; ++b ) {
+ OP::foldr( &( left_buffer[ b ] ), &( result_buffer[ b ] ) );
+ }
+
+ // write back result
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ z[ i ] = static_cast< OutputType >( result_buffer[ b ] );
+ }
+ }
+
+ // direct application for remainder
+ for( ; i < n; ++i ) {
+ left_buffer[ 0 ] = static_cast< typename OP::left_type >( x[ i ] );
+ result_buffer[ 0 ] = static_cast< typename OP::result_type >( y[ i ] );
+ OP::foldr( left_buffer, result_buffer );
+ z[ i ] = static_cast< OutputType >( result_buffer[ 0 ] );
+ }
}
- }
+
};
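
All element-wise routines in this hierarchy share one blockwise skeleton; the following compact standalone rendition uses an illustrative blocksize of 4 and a max fold rather than the SIMD-derived constant and a library operator:

	#include <cassert>
	#include <cstddef>

	static constexpr std::size_t blocksize = 4; // illustrative only

	// z[i] := max( x[i], z[i] ), blockwise with a scalar remainder loop
	void eWiseFoldrAA_sketch(
		const double * __restrict__ const x,
		double * __restrict__ const z,
		const std::size_t n
	) {
		double left[ blocksize ], result[ blocksize ];
		std::size_t i = 0;
		while( i + blocksize <= n ) {
			for( std::size_t b = 0; b < blocksize; ++i, ++b ) { // load
				left[ b ] = x[ i ];
				result[ b ] = z[ i ];
			}
			i -= blocksize;                                     // rewind
			for( std::size_t b = 0; b < blocksize; ++b ) {      // operate
				if( left[ b ] > result[ b ] ) { result[ b ] = left[ b ]; }
			}
			for( std::size_t b = 0; b < blocksize; ++i, ++b ) { // write back
				z[ i ] = result[ b ];
			}
		}
		for( ; i < n; ++i ) {                                   // remainder
			if( x[ i ] > z[ i ] ) { z[ i ] = x[ i ]; }
		}
	}

	int main() {
		const double x[ 6 ] = { 1, 5, 2, 7, 0, 9 };
		double z[ 6 ] = { 4, 4, 4, 4, 4, 4 };
		eWiseFoldrAA_sketch( x, z, 6 );
		assert( z[ 0 ] == 4 && z[ 1 ] == 5 && z[ 3 ] == 7 && z[ 5 ] == 9 );
		return 0;
	}

Rewinding i after the load pass lets the same index drive both the load and write-back loops, so only the fixed-size buffers need stay cache-resident while the inner operate loop remains free of loads and stores that could inhibit vectorisation.
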
/**
@@ -2350,156 +3252,170 @@ namespace grb {
* @see OperatorBase for additional functions exposed to the resulting
* operator.
*/
- template< typename OP, typename guard = void, enum Backend implementation = config::default_backend >
+ template<
+ typename OP, typename guard = void,
+ enum Backend implementation = config::default_backend
+ >
class OperatorFL : public OperatorFR< OP > {
- private:
- public:
- typedef typename OperatorBase< OP >::D1 D1;
- typedef typename OperatorBase< OP >::D2 D2;
- typedef typename OperatorBase< OP >::D3 D3;
- static constexpr size_t blocksize = OperatorBase< OP >::blocksize;
-
- /**
- * Emulated in-place application of this operator on two data elements.
- *
- * Computes \f$ x \odot y \f$ and writes the result into \f$ x \f$.
- *
- * We wish to call this in-place variant internally for brevity. However,
- * if \a OP has no in-place variant, then we must cache the previous
- * value of the output element or otherwise we will breach the
- * __restrict__ contract of OP::apply.
- * The caller must ensure the appropriate domains and casting behaviour
- * is applicable. Note that a user is never to call these functions
- * explicitly.
- *
- * @tparam InputType The type of the parameter \a x.
- * @tparam IOType The type of the parameter \a y.
- *
- * \warning Additional casting and use of temporary variables may occur
- * when \a InputType does not match \a D2 \em or \a IOType
- * does not match \a D3.
- *
- * \note This implementation relies on apply().
- *
- * @param[in,out] x The value \a y is to be applied against.
- * @param[in] y The value that is to be applied to \a x.
- */
- template< typename InputType, typename IOType >
- static void foldl( IOType & x, const InputType & y ) {
- const D1 cache = static_cast< D1 >( x );
- OperatorBase< OP >::apply( cache, y, x );
- }
-
- /**
- * Out-of-place element-wise foldl function. Calculates
- * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
- * \f$ x_i \odot y \f$ and stores the result into \f$ x_i \f$.
- *
- * @tparam IOType The type of elements in \a x.
- * @tparam InputType The type of \a y.
- *
- * @param[in, out] x At function entry, the left-hand side input data.
- * At function exit, the output data as defined above.
- * @param[in] y The right-hand side input value.
- * @param[in] n How many data elements \a x contains.
- *
- * This version requires two buffers and streams \a x twice (once for
- * reading, once for writing). This function should vectorise its
- * out-of-place operations.
- */
- template< typename IOType, typename InputType >
- static void eWiseFoldlAS( IOType * __restrict__ const x, const InputType y, const size_t n ) {
- // local buffers
- D1 left_buffer[ blocksize ];
- const D2 right_buffer = y;
- D3 result_buffer[ blocksize ];
-
- // blockwise application
- size_t i = 0;
- while( i + blocksize <= n ) {
- // load into buffers
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- left_buffer[ b ] = static_cast< D1 >( x[ i ] );
- }
-
- // rewind source and output
- i -= blocksize;
-
- // operate within buffer
- for( size_t b = 0; b < blocksize; ++b ) {
- OP::apply( &( left_buffer[ b ] ), &right_buffer, &( result_buffer[ b ] ) );
- }
-
- // write back result
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- x[ i ] = static_cast< IOType >( result_buffer[ b ] );
- }
- }
-
- // direct application for remainder
- for( ; i < n; ++i ) {
- left_buffer[ 0 ] = static_cast< D1 >( x[ i ] );
- OP::apply( left_buffer, &right_buffer, result_buffer );
- x[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
- }
- }
-
- /**
- * Out-of-place element-wise foldl function. Calculates
- * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
- * \f$ x_i \odot y_i \f$ and stores the result into \f$ x_i \f$.
- *
- * @tparam IOType The type of elements in \a x.
- * @tparam InputType The type of elements in \a y.
- *
- * @param[in, out] x At function entry, the left-hand side input data.
- * At function exit, the output data as defined above.
- * @param[in] y The right-hand side input.
- * @param[in] n How many data elements \a x and \a y contain.
- *
- * This version requires three buffers, streams \a y once, and streams
- * \a x twice (once for reading, once for writing). This function should
- * vectorise its out-of-place operations.
- */
- template< typename IOType, typename InputType >
- static void eWiseFoldlAA( IOType * __restrict__ const x, const InputType * __restrict__ const y, const size_t n ) {
- // local buffers
- D1 left_buffer[ blocksize ];
- D2 right_buffer[ blocksize ];
- D3 result_buffer[ blocksize ];
-
- // blockwise application
- size_t i = 0;
- while( i + blocksize <= n ) {
- // load into buffers
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- left_buffer[ b ] = static_cast< D1 >( x[ i ] );
- right_buffer[ b ] = static_cast< D2 >( y[ i ] );
- }
-
- // rewind source and output
- i -= blocksize;
-
- // operate within buffer
- for( size_t b = 0; b < blocksize; ++b ) {
- OP::apply( &( left_buffer[ b ] ), &( right_buffer[ b ] ), &( result_buffer[ b ] ) );
- }
-
- // write back result
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- x[ i ] = static_cast< IOType >( result_buffer[ b ] );
- }
- }
-
- // direct application for remainder
- for( ; i < n; ++i ) {
- left_buffer[ 0 ] = static_cast< D1 >( x[ i ] );
- right_buffer[ 0 ] = static_cast< D2 >( y[ i ] );
- OP::apply( left_buffer, right_buffer, result_buffer );
- x[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
- }
- }
+ public:
+
+ typedef typename OperatorBase< OP >::D1 D1;
+ typedef typename OperatorBase< OP >::D2 D2;
+ typedef typename OperatorBase< OP >::D3 D3;
+ static constexpr size_t blocksize = OperatorBase< OP >::blocksize;
+
+ /**
+ * Emulated in-place application of this operator on two data elements.
+ *
+ * Computes \f$ x \odot y \f$ and writes the result into \f$ x \f$.
+ *
+ * We wish to call this in-place variant internally for brevity. However,
+ * if \a OP has no in-place variant, then we must cache the previous
+ * value of the output element, as otherwise we would breach the
+ * __restrict__ contract of OP::apply.
+ * The caller must ensure the appropriate domains and casting behaviour
+ * is applicable. Note that a user is never to call these functions
+ * explicitly.
+ *
+ * @tparam IOType The type of the parameter \a x.
+ * @tparam InputType The type of the parameter \a y.
+ *
+ * \warning Additional casting and use of temporary variables may occur
+ * when \a InputType does not match \a D2 \em or \a IOType
+ * does not match \a D3.
+ *
+ * \note This implementation relies on apply().
+ *
+ * @param[in,out] x The value \a y is to be applied against.
+ * @param[in] y The value that is to be applied to \a x.
+ */
+ template< typename InputType, typename IOType >
+ static void foldl( IOType &x, const InputType &y ) {
+ const D1 cache = static_cast< D1 >( x );
+ OperatorBase< OP >::apply( cache, y, x );
+ }
+
+ /**
+ * Out-of-place element-wise foldl function. Calculates
+ * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
+ * \f$ x_i \odot y \f$ and stores the result into \f$ x_i \f$.
+ *
+ * @tparam IOType The type of elements in \a x.
+ * @tparam InputType The type of \a y.
+ *
+ * @param[in, out] x At function entry, the left-hand side input data.
+ * At function exit, the output data as defined above.
+ * @param[in] y The right-hand side input value.
+ * @param[in] n How many data elements \a x contains.
+ *
+ * This version requires two buffers and streams \a x twice (once for
+ * reading, once for writing). This function should vectorise its
+ * out-of-place operations.
+ */
+ template< typename IOType, typename InputType >
+ static void eWiseFoldlAS(
+ IOType * __restrict__ const x,
+ const InputType y,
+ const size_t n
+ ) {
+ // local buffers
+ D1 left_buffer[ blocksize ];
+ const D2 right_buffer = y;
+ D3 result_buffer[ blocksize ];
+
+ // blockwise application
+ size_t i = 0;
+ while( i + blocksize <= n ) {
+ // load into buffers
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ left_buffer[ b ] = static_cast< D1 >( x[ i ] );
+ }
+
+ // rewind source and output
+ i -= blocksize;
+
+ // operate within buffer
+ for( size_t b = 0; b < blocksize; ++b ) {
+ OP::apply( &( left_buffer[ b ] ), &right_buffer,
+ &( result_buffer[ b ] ) );
+ }
+
+ // write back result
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ x[ i ] = static_cast< IOType >( result_buffer[ b ] );
+ }
+ }
+
+ // direct application for remainder
+ for( ; i < n; ++i ) {
+ left_buffer[ 0 ] = static_cast< D1 >( x[ i ] );
+ OP::apply( left_buffer, &right_buffer, result_buffer );
+ x[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
+ }
+ }
+
+ /**
+ * Out-of-place element-wise foldl function. Calculates
+ * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
+ * \f$ x_i \odot y_i \f$ and stores the result into \f$ x_i \f$.
+ *
+ * @tparam IOType The type of elements in \a x.
+ * @tparam InputType The type of elements in \a y.
+ *
+ * @param[in, out] x At function entry, the left-hand side input data.
+ * At function exit, the output data as defined above.
+ * @param[in] y The right-hand side input.
+ * @param[in] n How many data elements \a x and \a y contain.
+ *
+ * This version requires three buffers, streams \a y once, and streams
+ * \a x twice (once for reading, once for writing). This function should
+ * vectorise its out-of-place operations.
+ */
+ template< typename IOType, typename InputType >
+ static void eWiseFoldlAA(
+ IOType * __restrict__ const x,
+ const InputType * __restrict__ const y,
+ const size_t n
+ ) {
+ // local buffers
+ D1 left_buffer[ blocksize ];
+ D2 right_buffer[ blocksize ];
+ D3 result_buffer[ blocksize ];
+
+ // blockwise application
+ size_t i = 0;
+ while( i + blocksize <= n ) {
+ // load into buffers
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ left_buffer[ b ] = static_cast< D1 >( x[ i ] );
+ right_buffer[ b ] = static_cast< D2 >( y[ i ] );
+ }
+
+ // rewind source and output
+ i -= blocksize;
+
+ // operate within buffer
+ for( size_t b = 0; b < blocksize; ++b ) {
+ OP::apply( &( left_buffer[ b ] ), &( right_buffer[ b ] ),
+ &( result_buffer[ b ] ) );
+ }
+
+ // write back result
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ x[ i ] = static_cast< IOType >( result_buffer[ b ] );
+ }
+ }
+
+ // direct application for remainder
+ for( ; i < n; ++i ) {
+ left_buffer[ 0 ] = static_cast< D1 >( x[ i ] );
+ right_buffer[ 0 ] = static_cast< D2 >( y[ i ] );
+ OP::apply( left_buffer, right_buffer, result_buffer );
+ x[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
+ }
+ }
+
};
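
For a non-commutative operator the two fold directions genuinely differ; a small sketch with an illustrative subtraction contrasts foldl, which stores \f$ x \odot y \f$ into \a x, with foldr, which stores \f$ x \odot y \f$ into \a y. In each direction the operand that doubles as output is cached first, mirroring the emulation above.

	#include <cassert>

	// x := x - y (emulated foldl: the left operand doubles as output)
	void foldl_minus( double &x, const double y ) {
		const double cache = x;
		x = cache - y;
	}

	// y := x - y (emulated foldr: the right operand doubles as output)
	void foldr_minus( const double x, double &y ) {
		const double cache = y;
		y = x - cache;
	}

	int main() {
		double a = 10.0, b = 3.0;
		foldl_minus( a, 3.0 );  // a = 10 - 3 = 7
		assert( a == 7.0 );
		foldr_minus( 10.0, b ); // b = 10 - 3 = 7
		assert( b == 7.0 );
		return 0;
	}
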
/**
@@ -2517,139 +3433,154 @@ namespace grb {
* operator.
*/
template< typename OP >
- class OperatorFL< OP, typename std::enable_if< OP::has_foldl && std::is_same< typename OP::left_type, typename OP::result_type >::value >::type > : public OperatorFR< OP > {
-
- private:
- public:
- typedef typename OperatorBase< OP >::D2 D2;
- typedef typename OperatorBase< OP >::D3 D3;
- static constexpr size_t blocksize = OperatorBase< OP >::blocksize;
-
- /**
- * In-place application of this operator on two data elements.
- *
- * Computes \f$ x \odot y \f$ and writes the result into \f$ x \f$.
- *
- * \note This variant is only called when the underlying raw operator
- * supports in-place operations.
- *
- * The caller must ensure the appropriate domains and casting behaviour
- * is applicable. Note that a user is never to call these functions
- * explicitly.
- *
- * @param[in,out] x The value \a y is to be applied against.
- * @param[in] y The value that is to be applied to \a x.
- */
- static void foldl( D3 & x, const D2 & y ) {
- OP::foldl( &x, &y );
- }
-
- /**
- * In-place element-wise foldl function. Calculates
- * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
- * \f$ x_i \odot y_i \f$ and stores the result into \f$ x_i \f$.
- *
- * @tparam IOType The type of elements in \a x.
- * @tparam InputType The type of elements in \a y.
- *
- * @param[in,out] x At function extry: the left-hand side input data.
- * At function exit: the result data.
- * @param[in] y The right-hand side input data.
- * @param[in] n How many data elements \a x and \a y contain.
- *
- * This implementation requires two buffers only. It streams \a y once,
- * while streaming \a x twice (once for reading, once for writing). This
- * function should vectorise.
- */
- template< typename InputType, typename IOType >
- static void eWiseFoldlAA( IOType * __restrict__ const x, const InputType * __restrict__ const y, const size_t n ) {
- // local buffers
- D2 right_buffer[ blocksize ];
- D3 result_buffer[ blocksize ];
-
- // blockwise application
- size_t i = 0;
- while( i + blocksize <= n ) {
- // load into buffers
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- right_buffer[ b ] = static_cast< D2 >( y[ i ] );
- result_buffer[ b ] = static_cast< D3 >( x[ i ] );
- }
-
- // rewind source and output
- i -= blocksize;
-
- // operate within buffer
- for( size_t b = 0; b < blocksize; ++b ) {
- OP::foldl( &( result_buffer[ b ] ), &( right_buffer[ b ] ) );
- }
-
- // write back result
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- x[ i ] = static_cast< IOType >( result_buffer[ b ] );
- }
- }
-
- // direct application for remainder
- for( ; i < n; ++i ) {
- right_buffer[ 0 ] = static_cast< D2 >( y[ i ] );
- result_buffer[ 0 ] = static_cast< D3 >( x[ i ] );
- OP::foldl( result_buffer, right_buffer );
- x[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
- }
- }
-
- /**
- * In-place element-wise foldl function. Calculates
- * \f$ \forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
- * \f$ x_i \odot y \f$ and stores the result into \f$ x_i \f$.
- *
- * @tparam IOType The type of elements in \a x.
- * @tparam InputType The type of \a y.
- *
- * @param[in,out] x At function extry: the left-hand side input data.
- * At function exit: the result data.
- * @param[in] y The right-hand side input value.
- * @param[in] n How many data elements \a x contains.
- *
- * This implementation requires one buffers only. It streams \a x twice
- * (once for reading, once for writing). This function should vectorise.
- */
- template< typename InputType, typename IOType >
- static void eWiseFoldlAS( IOType * __restrict__ const x, const InputType y, const size_t n ) {
- // local buffers
- const D2 right_buffer = static_cast< D2 >( y );
- D3 result_buffer[ blocksize ];
-
- // blockwise application
- size_t i = 0;
- while( i + blocksize <= n ) {
- // load into buffers
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- result_buffer[ b ] = static_cast< D3 >( x[ i ] );
- }
-
- // rewind source and output
- i -= blocksize;
-
- // operate within buffer
- for( size_t b = 0; b < blocksize; ++b ) {
- OP::foldl( &( result_buffer[ b ] ), &right_buffer );
- }
-
- // write back result
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- x[ i ] = static_cast< IOType >( result_buffer[ b ] );
- }
- }
-
- // direct application for remainder
- for( ; i < n; ++i ) {
- result_buffer[ 0 ] = static_cast< D3 >( x[ i ] );
- OP::foldl( result_buffer, &right_buffer );
- x[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
- }
- }
+ class OperatorFL<
+ OP,
+ typename std::enable_if<
+ OP::has_foldl &&
+ std::is_same< typename OP::left_type, typename OP::result_type >::value
+ >::type
+ > : public OperatorFR< OP > {
+
+ public:
+
+ typedef typename OperatorBase< OP >::D2 D2;
+ typedef typename OperatorBase< OP >::D3 D3;
+ static constexpr size_t blocksize = OperatorBase< OP >::blocksize;
+
+ /**
+ * In-place application of this operator on two data elements.
+ *
+ * Computes \f$ x \odot y \f$ and writes the result into \f$ x \f$.
+ *
+ * \note This variant is only called when the underlying raw operator
+ * supports in-place operations.
+ *
+ * The caller must ensure the appropriate domains and casting behaviour
+ * is applicable. Note that a user is never to call these functions
+ * explicitly.
+ *
+ * @param[in,out] x The value \a y is to be applied against.
+ * @param[in] y The value that is to be applied to \a x.
+ */
+ static void foldl( D3 &x, const D2 &y ) {
+ OP::foldl( &x, &y );
+ }
+
+ /**
+ * In-place element-wise foldl function. Calculates
+ * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
+ * \f$ x_i \odot y_i \f$ and stores the result into \f$ x_i \f$.
+ *
+ * @tparam IOType The type of elements in \a x.
+ * @tparam InputType The type of elements in \a y.
+ *
+ * @param[in,out] x At function entry: the left-hand side input data.
+ * At function exit: the result data.
+ * @param[in] y The right-hand side input data.
+ * @param[in] n How many data elements \a x and \a y contain.
+ *
+ * This implementation requires two buffers only. It streams \a y once,
+ * while streaming \a x twice (once for reading, once for writing). This
+ * function should vectorise.
+ */
+ template< typename InputType, typename IOType >
+ static void eWiseFoldlAA(
+ IOType * __restrict__ const x,
+ const InputType * __restrict__ const y,
+ const size_t n
+ ) {
+ // local buffers
+ D2 right_buffer[ blocksize ];
+ D3 result_buffer[ blocksize ];
+
+ // blockwise application
+ size_t i = 0;
+ while( i + blocksize <= n ) {
+ // load into buffers
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ right_buffer[ b ] = static_cast< D2 >( y[ i ] );
+ result_buffer[ b ] = static_cast< D3 >( x[ i ] );
+ }
+
+ // rewind source and output
+ i -= blocksize;
+
+ // operate within buffer
+ for( size_t b = 0; b < blocksize; ++b ) {
+ OP::foldl( &( result_buffer[ b ] ), &( right_buffer[ b ] ) );
+ }
+
+ // write back result
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ x[ i ] = static_cast< IOType >( result_buffer[ b ] );
+ }
+ }
+
+ // direct application for remainder
+ for( ; i < n; ++i ) {
+ right_buffer[ 0 ] = static_cast< D2 >( y[ i ] );
+ result_buffer[ 0 ] = static_cast< D3 >( x[ i ] );
+ OP::foldl( result_buffer, right_buffer );
+ x[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
+ }
+ }
+
+ /**
+ * In-place element-wise foldl function. Calculates
+ * \f$ \forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
+ * \f$ x_i \odot y \f$ and stores the result into \f$ x_i \f$.
+ *
+ * @tparam IOType The type of elements in \a x.
+ * @tparam InputType The type of \a y.
+ *
+ * @param[in,out] x At function entry: the left-hand side input data.
+ * At function exit: the result data.
+ * @param[in] y The right-hand side input value.
+ * @param[in] n How many data elements \a x contains.
+ *
+ * This implementation requires one buffer only. It streams \a x twice
+ * (once for reading, once for writing). This function should vectorise.
+ */
+ template< typename InputType, typename IOType >
+ static void eWiseFoldlAS(
+ IOType * __restrict__ const x,
+ const InputType y,
+ const size_t n
+ ) {
+ // local buffers
+ const D2 right_buffer = static_cast< D2 >( y );
+ D3 result_buffer[ blocksize ];
+
+ // blockwise application
+ size_t i = 0;
+ while( i + blocksize <= n ) {
+ // load into buffers
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ result_buffer[ b ] = static_cast< D3 >( x[ i ] );
+ }
+
+ // rewind source and output
+ i -= blocksize;
+
+ // operate within buffer
+ for( size_t b = 0; b < blocksize; ++b ) {
+ OP::foldl( &( result_buffer[ b ] ), &right_buffer );
+ }
+
+ // write back result
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ x[ i ] = static_cast< IOType >( result_buffer[ b ] );
+ }
+ }
+
+ // direct application for remainder
+ for( ; i < n; ++i ) {
+ result_buffer[ 0 ] = static_cast< D3 >( x[ i ] );
+ OP::foldl( result_buffer, &right_buffer );
+ x[ i ] = static_cast< IOType >( result_buffer[ 0 ] );
+ }
+ }
+
};
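
The in-place variants above are selected purely via SFINAE on the operator's traits. A minimal standalone sketch of that dispatch follows; the trait names mirror this patch, while FL, inplace_op, and outofplace_op are illustrative:

	#include <type_traits>

	struct inplace_op {
		typedef int left_type;
		typedef int right_type;
		typedef int result_type;
		static constexpr bool has_foldl = true;
		static void foldl( int *c, const int *b ) { *c += *b; }
	};

	struct outofplace_op {
		typedef int  left_type;
		typedef int  right_type;
		typedef long result_type; // left_type != result_type: must emulate
		static constexpr bool has_foldl = true;
		static void apply( const int *a, const int *b, long *c ) { *c = *a + *b; }
	};

	// primary template: fall back to emulating foldl through apply
	template< typename OP, typename guard = void >
	struct FL {
		static constexpr bool in_place = false;
	};

	// picked only when a native, domain-matched foldl exists
	template< typename OP >
	struct FL< OP, typename std::enable_if<
		OP::has_foldl &&
		std::is_same< typename OP::left_type, typename OP::result_type >::value
	>::type > {
		static constexpr bool in_place = true;
	};

	int main() {
		static_assert( FL< inplace_op >::in_place, "native foldl expected" );
		static_assert( !FL< outofplace_op >::in_place, "emulation expected" );
		return 0;
	}

When the condition fails, std::enable_if has no ::type member, substitution into the specialisation fails silently, and the primary template (the emulating base) is used instead.
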
/**
@@ -2677,7 +3608,10 @@ namespace grb {
* @see OperatorBase for additional functions exposed to the resulting
* operator.
*/
- template< typename OP, typename guard = void, enum Backend implementation = config::default_backend >
+ template<
+ typename OP, typename guard = void,
+ enum Backend implementation = config::default_backend
+ >
class OperatorNoFR : public OperatorFL< OP > {};
/**
@@ -2699,102 +3633,115 @@ namespace grb {
* operator.
*/
template< typename OP >
- class OperatorNoFR< OP, typename std::enable_if< OP::has_foldl && ! ( OP::has_foldr ) && std::is_same< typename OP::left_type, typename OP::result_type >::value >::type > :
- public OperatorFL< OP > {
-
- private:
- public:
- typedef typename OperatorBase< OP >::D2 D2;
- typedef typename OperatorBase< OP >::D3 D3;
- static constexpr size_t blocksize = OperatorBase< OP >::blocksize;
-
- /**
- * In-place element-wise apply function. Calculates
- * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
- * \f$ z_i = x_i \odot y_i \f$.
- *
- * @tparam InputType1 The type of elements in \a x.
- * @tparam InputType2 The type of elements in \a y.
- * @tparam OutputType The type of elements in \a z.
- *
- * If the \a InputType1 and \a D3 are not the same, then the existing data
- * in \a x is cast to \a D3 prior to application of this operator.
- * If \a InputType2 and \a D2 are not the same, then the existing data in
- * \a y is cast to \a D2 prior to application of this operator.
- * If \a OutputType and \a D3 are not the same, then the result of
- * applications of this operator are cast to \a OutputType prior to
- * writing it back to \a z.
- *
- * \warning The first casting behaviour may not be what you want. The two
- * other casting behaviours are allowed by the GraphBLAS unless
- * the grb::descriptor::no_casting is given.
- *
- * \note By default, this GraphBLAS implementation will only use this
- * code when \a D1 matches \a D3 and OP::has_foldr is \a true.
- * However, this implementation will never be enabled if \a D2
- * equals \a D3 and OP::has_foldl is \a true.
- *
- * This implementation relies on an in-place foldl().
- *
- * @param[in] x The left-hand side input data. The memory range starting
- * at \a x and ending at \a x + n (exclusive) may not
- * overlap with the memory area starting at \a z and ending
- * at \a z + n (exclusive).
- * @param[in] y The right-hand side input data. The memory range starting
- * at \a y and ending at \a y + n (exclusive) may not
- * overlap with the memory area starting at \a z and ending
- * at \a z + n.
- * @param[out] z Where the map of \a x into \a y must be stored. This
- * pointer is restricted in the sense that its memory may
- * never overlap with those pointed to by \a x or \y, as
- * detailed above.
- * @param[in] n How many data elements \a x, \a y, and \a z contain.
- */
- template< typename InputType1, typename InputType2, typename OutputType >
- static void eWiseApply( const InputType1 * x, const InputType2 * y, OutputType * __restrict__ z, const size_t n ) {
-#ifdef _DEBUG
-#ifdef D_GRB_NO_STDIO
- std::cout << "In OperatorNoFR::eWiseApply\n";
-#endif
-#endif
- // NOTE: this variant is only active when the computation can be done using two buffers only
-
- // local buffers
- D2 right_buffer[ blocksize ];
- D3 result_buffer[ blocksize ];
-
- // blockwise application
- size_t i = 0;
- while( i + blocksize <= n ) {
-
- // load into buffers
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- right_buffer[ b ] = static_cast< D2 >( y[ i ] );
- result_buffer[ b ] = static_cast< D3 >( x[ i ] );
- }
+ class OperatorNoFR<
+ OP,
+ typename std::enable_if<
+ OP::has_foldl &&
+ !(OP::has_foldr) &&
+ std::is_same< typename OP::left_type, typename OP::result_type >::value
+ >::type
+ > : public OperatorFL< OP > {
- // rewind source and output
- i -= blocksize;
+ public:
- // operate within buffer
- for( size_t b = 0; b < blocksize; ++b ) {
- OP::foldl( &( result_buffer[ b ] ), &( right_buffer[ b ] ) );
+ typedef typename OperatorBase< OP >::D2 D2;
+ typedef typename OperatorBase< OP >::D3 D3;
+ static constexpr size_t blocksize = OperatorBase< OP >::blocksize;
+
+ /**
+ * In-place element-wise apply function. Calculates
+ * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
+ * \f$ z_i = x_i \odot y_i \f$.
+ *
+ * @tparam InputType1 The type of elements in \a x.
+ * @tparam InputType2 The type of elements in \a y.
+ * @tparam OutputType The type of elements in \a z.
+ *
+ * If the \a InputType1 and \a D3 are not the same, then the existing data
+ * in \a x is cast to \a D3 prior to application of this operator.
+ * If \a InputType2 and \a D2 are not the same, then the existing data in
+ * \a y is cast to \a D2 prior to application of this operator.
+ * If \a OutputType and \a D3 are not the same, then the result of
+ * applications of this operator are cast to \a OutputType prior to
+ * writing it back to \a z.
+ *
+ * \warning The first casting behaviour may not be what you want. The two
+ * other casting behaviours are allowed by the GraphBLAS unless
+ * the grb::descriptor::no_casting is given.
+ *
+	 * \note By default, this GraphBLAS implementation will only use this
+	 *       code when \a D1 matches \a D3 and OP::has_foldl is \a true.
+	 *       However, this implementation will never be enabled if \a D2
+	 *       equals \a D3 and OP::has_foldr is \a true.
+ *
+ * This implementation relies on an in-place foldl().
+ *
+ * @param[in] x The left-hand side input data. The memory range starting
+ * at \a x and ending at \a x + n (exclusive) may not
+ * overlap with the memory area starting at \a z and ending
+ * at \a z + n (exclusive).
+ * @param[in] y The right-hand side input data. The memory range starting
+ * at \a y and ending at \a y + n (exclusive) may not
+ * overlap with the memory area starting at \a z and ending
+	 *              at \a z + n (exclusive).
+ * @param[out] z Where the map of \a x into \a y must be stored. This
+ * pointer is restricted in the sense that its memory may
+	 *               never overlap with those pointed to by \a x or \a y, as
+ * detailed above.
+ * @param[in] n How many data elements \a x, \a y, and \a z contain.
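+	 *
+	 * \par Example
+	 * An illustrative sketch only; \a MyAdd is a hypothetical operator type
+	 * that matches this specialisation (foldl available, no foldr, and left
+	 * domain equal to the result domain) and adds two doubles:
+	 * \code
+	 * const double x[ 2 ] = { 1.0, 2.0 };
+	 * const double y[ 2 ] = { 3.0, 4.0 };
+	 * double z[ 2 ];
+	 * OperatorNoFR< MyAdd >::eWiseApply( x, y, z, 2 );
+	 * // z now reads { 4.0, 6.0 }
+	 * \endcode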
+ */
+ template< typename InputType1, typename InputType2, typename OutputType >
+ static void eWiseApply(
+ const InputType1 * x,
+ const InputType2 * y,
+ OutputType * __restrict__ z,
+ const size_t n
+ ) {
+#ifdef _DEBUG
+ #ifndef D_GRB_NO_STDIO
+ std::cout << "In OperatorNoFR::eWiseApply\n";
+ #endif
+#endif
+ // NOTE: this variant is only active when the computation can be done
+ // using two buffers only
+
+ // local buffers
+ D2 right_buffer[ blocksize ];
+ D3 result_buffer[ blocksize ];
+
+ // blockwise application
+ size_t i = 0;
+ while( i + blocksize <= n ) {
+
+ // load into buffers
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ right_buffer[ b ] = static_cast< D2 >( y[ i ] );
+ result_buffer[ b ] = static_cast< D3 >( x[ i ] );
+ }
+
+ // rewind source and output
+ i -= blocksize;
+
+ // operate within buffer
+ for( size_t b = 0; b < blocksize; ++b ) {
+ OP::foldl( &( result_buffer[ b ] ), &( right_buffer[ b ] ) );
+ }
+
+ // write back result
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ z[ i ] = static_cast< OutputType >( result_buffer[ b ] );
+ }
}
- // write back result
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- z[ i ] = static_cast< OutputType >( result_buffer[ b ] );
+ // direct application for remainder
+ for( ; i < n; ++i ) {
+ right_buffer[ 0 ] = static_cast< D2 >( y[ i ] );
+ result_buffer[ 0 ] = static_cast< D3 >( x[ i ] );
+ OP::foldl( result_buffer, right_buffer );
+ z[ i ] = static_cast< OutputType >( result_buffer[ 0 ] );
}
}
- // direct application for remainder
- for( ; i < n; ++i ) {
- right_buffer[ 0 ] = static_cast< D2 >( y[ i ] );
- result_buffer[ 0 ] = static_cast< D3 >( x[ i ] );
- OP::foldl( result_buffer, right_buffer );
- z[ i ] = static_cast< OutputType >( result_buffer[ 0 ] );
- }
- }
};
/**
@@ -2832,7 +3779,10 @@ namespace grb {
* @see OperatorBase for additional functions exposed to the resulting
* operator.
*/
- template< typename OP, typename guard = void, enum Backend implementation = config::default_backend >
+ template<
+ typename OP, typename guard = void,
+ enum Backend implementation = config::default_backend
+ >
class OperatorNoFRFL : public OperatorNoFR< OP > {};
/**
@@ -2862,105 +3812,119 @@ namespace grb {
*/
template< typename OP >
class OperatorNoFRFL< OP,
- typename std::enable_if< ( ! ( OP::has_foldl ) || ! ( std::is_same< typename OP::left_type, typename OP::result_type >::value ) ) &&
- ( ! ( OP::has_foldr ) || ! ( std::is_same< typename OP::right_type, typename OP::result_type >::value ) ) >::type > : public OperatorNoFR< OP > {
-
- private:
- public:
- typedef typename OperatorBase< OP >::D1 D1;
- typedef typename OperatorBase< OP >::D2 D2;
- typedef typename OperatorBase< OP >::D3 D3;
- static constexpr size_t blocksize = OperatorBase< OP >::blocksize;
-
- /** \anchor OperatorNoFRFLeWiseApply
- *
- * Standard out-of-place element-wise apply function. Calculates
- * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
- * \f$ z_i = x_i \odot y_i \f$.
- *
- * This is the non-public variant that operates on raw arrays.
- *
- * @tparam InputType1 The type of elements in \a x.
- * @tparam InputType2 The type of elements in \a y.
- * @tparam OutputType The type of elements in \a z.
- *
- * If \a InputType1 and \a D1 are not the same, then the existing data in
- * \a x will be cast to \a D1 prior to application of this operator.
- * If \a InputType2 and \a D2 are not the same, then the existing data in
- * \a y will be cast to \a D2 prior to application of this operator.
- * If \a OutputType and \a D3 are not the same, then the results of
- * applications of this operator are cast to \a OutputType prior to
- * writing them back to \a z.
- *
- * \note The GraphBLAS can explicitly control all \em three of this
- * casting behaviours via grb::descriptors::no_casting.
- *
- * \warning With the in-place variants of this code, unwanted behaviour
- * cannot be prevented by use of grb::descriptors::no_casting.
- * Therefore the current implementation only calls the in-place
- * variants when \a D1 equals \a D3 (for foldl-based in-place),
- * or when \a D2 equals \a D3 (for foldr-based ones).
- *
- * @param[in] x The left-hand side input data. The memory range starting
- * at \a x and ending at \a x + n (exclusive) may not
- * overlap with the memory area starting at \a z and ending
- * at \a z + n (exclusive).
- * @param[in] y The right-hand side input data. The memory range starting
- * at \a y and ending at \a y + n (exclusive) may not
- * overlap with the memory area starting at \a z and ending
- * at \a z + n.
- * @param[out] z Where the map of \a x into \a y must be stored. This
- * pointer is restricted in the sense that its memory may
- * never overlap with those pointed to by \a x or \y, as
- * detailed above.
- * @param[in] n How many data elements \a x, \a y, and \a z contain.
- */
- template< typename InputType1, typename InputType2, typename OutputType >
- static void eWiseApply( const InputType1 * x, const InputType2 * y, OutputType * __restrict__ z, const size_t n ) {
+ typename std::enable_if< (
+ !(OP::has_foldl) ||
+ !(std::is_same< typename OP::left_type, typename OP::result_type >::value)
+ ) && (
+ !(OP::has_foldr) ||
+ !(std::is_same< typename OP::right_type, typename OP::result_type >::value)
+ )
+ >::type
+ > : public OperatorNoFR< OP > {
+
+ public:
+
+ typedef typename OperatorBase< OP >::D1 D1;
+ typedef typename OperatorBase< OP >::D2 D2;
+ typedef typename OperatorBase< OP >::D3 D3;
+ static constexpr size_t blocksize = OperatorBase< OP >::blocksize;
+
+ /** \anchor OperatorNoFRFLeWiseApply
+ *
+ * Standard out-of-place element-wise apply function. Calculates
+ * \f$\forall\ i \in \{ 0, 1, \ldots, n - 1 \}, \f$
+ * \f$ z_i = x_i \odot y_i \f$.
+ *
+ * This is the non-public variant that operates on raw arrays.
+ *
+ * @tparam InputType1 The type of elements in \a x.
+ * @tparam InputType2 The type of elements in \a y.
+ * @tparam OutputType The type of elements in \a z.
+ *
+ * If \a InputType1 and \a D1 are not the same, then the existing data in
+ * \a x will be cast to \a D1 prior to application of this operator.
+ * If \a InputType2 and \a D2 are not the same, then the existing data in
+ * \a y will be cast to \a D2 prior to application of this operator.
+ * If \a OutputType and \a D3 are not the same, then the results of
+ * applications of this operator are cast to \a OutputType prior to
+ * writing them back to \a z.
+ *
+ * \note The GraphBLAS can explicitly control all \em three of this
+ * casting behaviours via grb::descriptors::no_casting.
+ *
+ * \warning With the in-place variants of this code, unwanted behaviour
+ * cannot be prevented by use of grb::descriptors::no_casting.
+ * Therefore the current implementation only calls the in-place
+ * variants when \a D1 equals \a D3 (for foldl-based in-place),
+ * or when \a D2 equals \a D3 (for foldr-based ones).
+ *
+ * @param[in] x The left-hand side input data. The memory range starting
+ * at \a x and ending at \a x + n (exclusive) may not
+ * overlap with the memory area starting at \a z and ending
+ * at \a z + n (exclusive).
+ * @param[in] y The right-hand side input data. The memory range starting
+ * at \a y and ending at \a y + n (exclusive) may not
+ * overlap with the memory area starting at \a z and ending
+	 *              at \a z + n (exclusive).
+ * @param[out] z Where the map of \a x into \a y must be stored. This
+ * pointer is restricted in the sense that its memory may
+	 *               never overlap with those pointed to by \a x or \a y, as
+ * detailed above.
+ * @param[in] n How many data elements \a x, \a y, and \a z contain.
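+	 *
+	 * \par Example
+	 * An illustrative sketch only; \a MyOp is a hypothetical operator type
+	 * that matches this specialisation, i.e., one for which neither
+	 * in-place variant applies:
+	 * \code
+	 * const int x[ 2 ] = { 1, 2 };
+	 * const int y[ 2 ] = { 3, 4 };
+	 * double z[ 2 ];
+	 * OperatorNoFRFL< MyOp >::eWiseApply( x, y, z, 2 );
+	 * \endcode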
+ */
+ template< typename InputType1, typename InputType2, typename OutputType >
+ static void eWiseApply(
+ const InputType1 * x,
+ const InputType2 * y,
+ OutputType * __restrict__ z,
+ const size_t n
+ ) {
#ifdef _DEBUG
-#ifdef D_GRB_NO_STDIO
- std::cout << "In OperatorNoFRFL::eWiseApply\n";
-#endif
+ #ifndef D_GRB_NO_STDIO
+ std::cout << "In OperatorNoFRFL::eWiseApply\n";
+ #endif
#endif
- // NOTE: this variant is only active when the computation can NOT be done using two buffers only
-
- // local buffers
- D1 left_buffer[ blocksize ];
- D2 right_buffer[ blocksize ];
- D3 result_buffer[ blocksize ];
-
- // blockwise application
- size_t i = 0;
- while( i + blocksize <= n ) {
-
- // load into buffers
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- left_buffer[ b ] = static_cast< D1 >( x[ i ] );
- right_buffer[ b ] = static_cast< D2 >( y[ i ] );
- }
-
- // rewind source and output
- i -= blocksize;
-
- // operate within buffer
- for( size_t b = 0; b < blocksize; ++b ) {
- OP::apply( &( left_buffer[ b ] ), &( right_buffer[ b ] ), &( result_buffer[ b ] ) );
+				// NOTE: this variant is only active when the computation can NOT
+				// be done using two buffers only
+
+ // local buffers
+ D1 left_buffer[ blocksize ];
+ D2 right_buffer[ blocksize ];
+ D3 result_buffer[ blocksize ];
+
+ // blockwise application
+ size_t i = 0;
+ while( i + blocksize <= n ) {
+
+ // load into buffers
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ left_buffer[ b ] = static_cast< D1 >( x[ i ] );
+ right_buffer[ b ] = static_cast< D2 >( y[ i ] );
+ }
+
+ // rewind source and output
+ i -= blocksize;
+
+ // operate within buffer
+ for( size_t b = 0; b < blocksize; ++b ) {
+ OP::apply( &( left_buffer[ b ] ), &( right_buffer[ b ] ),
+ &( result_buffer[ b ] ) );
+ }
+
+ // write back result
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ z[ i ] = static_cast< OutputType >( result_buffer[ b ] );
+ }
}
- // write back result
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- z[ i ] = static_cast< OutputType >( result_buffer[ b ] );
+ // direct application for remainder
+ for( ; i < n; ++i ) {
+ left_buffer[ 0 ] = static_cast< D1 >( x[ i ] );
+ right_buffer[ 0 ] = static_cast< D2 >( y[ i ] );
+ OP::apply( left_buffer, right_buffer, result_buffer );
+ z[ i ] = static_cast< OutputType >( result_buffer[ 0 ] );
}
}
- // direct application for remainder
- for( ; i < n; ++i ) {
- left_buffer[ 0 ] = static_cast< D1 >( x[ i ] );
- right_buffer[ 0 ] = static_cast< D2 >( y[ i ] );
- OP::apply( left_buffer, right_buffer, result_buffer );
- z[ i ] = static_cast< OutputType >( result_buffer[ 0 ] );
- }
- }
};
/**
@@ -3085,108 +4049,117 @@ namespace grb {
* \snippet ops.hpp Operator Type Traits
* \endparblock
*/
- template< typename OP, enum Backend implementation = config::default_backend >
+ template<
+ typename OP,
+ enum Backend implementation = config::default_backend
+ >
class Operator : public OperatorNoFRFL< OP > {
- private:
- public:
- /** The maximum block size when vectorising this operation. */
- static constexpr size_t blocksize = OperatorBase< OP >::blocksize;
-
- /** The left-hand side input domain of this operator. */
- typedef typename OperatorBase< OP >::D1 D1;
-
- /** The right-hand side input domain of this operator. */
- typedef typename OperatorBase< OP >::D2 D2;
-
- /** The output domain of this operator. */
- typedef typename OperatorBase< OP >::D3 D3;
-
- /**
- * Reduces a vector of type \a InputType into a value in \a IOType
- * by repeated application of this operator. The \a IOType is cast
- * into \a D3 prior reduction. The \a InputType is cast into \a D1
- * during reduction. The final result is cast to IOType after
- * reduction. The reduction happens `right-to-left'.
- *
- * This implementation relies on the \a foldr, whether it be an
- * true in-place or emulated version.
- *
- * @param[in,out] out On input, the initial value to be used for
- * reduction. On output, all elements of \a x
- * have been applied to \a out.
- * @param[in] x A vector of size \a n with elements of type \a left_type.
- * @param[in] n A positive integer (can be 0).
- */
- template< typename IOType, typename InputType >
- static void foldrArray( const InputType * __restrict__ const x, IOType & out, const size_t n ) {
- // prepare scalar buffer
- D3 reduced = static_cast< D3 >( out );
- // prepare vectorisation buffer
- D1 left_buffer[ blocksize ];
- // blockwise application
- size_t i = n - 1;
- while( i - blocksize + 1 < n ) {
- // load into buffer
- for( size_t b = 0; b < blocksize; --i, ++b ) {
- left_buffer[ b ] = static_cast< D1 >( x[ i ] );
- }
- // do reduce
- for( size_t b = 0; b < blocksize; ++b ) {
- OP::foldr( &( left_buffer[ b ] ), &reduced );
- }
- }
- // direct application for remainder
- for( ; i < n; --i ) {
- left_buffer[ 0 ] = static_cast< D1 >( x[ i ] );
- OP::foldr( left_buffer, &reduced );
- }
- // write out
- out = static_cast< IOType >( reduced );
- }
-
- /**
- * Reduces a vector of type \a InputType into a value in \a IOType
- * by repeated application of this operator. The \a IOType is cast
- * into \a D3 prior reduction. The \a InputType is cast into \a D2
- * during reduction. The final result is cast to IOType after
- * reduction. The reduction happens `left-to-right'.
- *
- * This implementation relies on the \a foldr, whether it be an
- * true in-place or emulated version.
- *
- * @param[in,out] out On input, the initial value to be used for
- * reduction. On output, all elements of \a x
- * have been applied to \a out.
- * @param[in] x A vector of size \a n with elements of type \a left_type.
- * @param[in] n A positive integer (can be 0).
- */
- template< typename IOType, typename InputType >
- static void foldlArray( IOType & out, const InputType * __restrict__ const x, const size_t n ) {
- // prepare scalar buffer
- D3 reduced = static_cast< D3 >( out );
- // prepare vectorisation buffer
- D2 right_buffer[ blocksize ];
- // blockwise application
- size_t i = 0;
- while( i + blocksize <= n ) {
- // load into buffer
- for( size_t b = 0; b < blocksize; ++i, ++b ) {
- right_buffer[ b ] = static_cast< D2 >( x[ i ] );
- }
- // do reduce
- for( size_t b = 0; b < blocksize; ++b ) {
- OP::foldl( &reduced, &( right_buffer[ b ] ) );
- }
- }
- // direct application for remainder
- for( ; i < n; ++i ) {
- right_buffer[ 0 ] = static_cast< D2 >( x[ i ] );
- OP::foldl( &reduced, right_buffer );
- }
- // write out
- out = static_cast< IOType >( reduced );
- }
+ public:
+
+ /** The maximum block size when vectorising this operation. */
+ static constexpr size_t blocksize = OperatorBase< OP >::blocksize;
+
+ /** The left-hand side input domain of this operator. */
+ typedef typename OperatorBase< OP >::D1 D1;
+
+ /** The right-hand side input domain of this operator. */
+ typedef typename OperatorBase< OP >::D2 D2;
+
+ /** The output domain of this operator. */
+ typedef typename OperatorBase< OP >::D3 D3;
+
+ /**
+ * Reduces a vector of type \a InputType into a value in \a IOType
+ * by repeated application of this operator. The \a IOType is cast
+		 * into \a D3 prior to reduction. The \a InputType is cast into \a D1
+		 * during reduction. The final result is cast to \a IOType after
+ * reduction. The reduction happens `right-to-left'.
+ *
+		 * This implementation relies on \a foldr, whether it be a true
+		 * in-place or an emulated version.
+ *
+ * @param[in,out] out On input, the initial value to be used for
+ * reduction. On output, all elements of \a x
+ * have been applied to \a out.
+		 * @param[in] x A vector of size \a n with elements of type \a InputType.
+		 * @param[in] n A nonnegative integer (may be 0).
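+		 *
+		 * \par Example
+		 * An illustrative sketch only; \a MyAdd is a hypothetical operator
+		 * type that adds two doubles:
+		 * \code
+		 * const double x[ 4 ] = { 1.0, 2.0, 3.0, 4.0 };
+		 * double out = 0.0;
+		 * Operator< MyAdd >::foldrArray( x, out, 4 );
+		 * // out now reads 10.0
+		 * \endcode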
+ */
+ template< typename IOType, typename InputType >
+ static void foldrArray(
+ const InputType * __restrict__ const x,
+ IOType &out,
+ const size_t n
+ ) {
+ // prepare scalar buffer
+ D3 reduced = static_cast< D3 >( out );
+ // prepare vectorisation buffer
+ D1 left_buffer[ blocksize ];
+ // blockwise application
+ size_t i = n - 1;
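+				// note: i and both loop guards below rely on unsigned wrap-around.
+				// If n is zero, or once fewer than blocksize elements remain,
+				// i - blocksize + 1 wraps to a value >= n and the blockwise loop
+				// exits; likewise, the remainder loop ends once i wraps past zero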
+ while( i - blocksize + 1 < n ) {
+ // load into buffer
+ for( size_t b = 0; b < blocksize; --i, ++b ) {
+ left_buffer[ b ] = static_cast< D1 >( x[ i ] );
+ }
+ // do reduce
+ for( size_t b = 0; b < blocksize; ++b ) {
+ OP::foldr( &( left_buffer[ b ] ), &reduced );
+ }
+ }
+ // direct application for remainder
+ for( ; i < n; --i ) {
+ left_buffer[ 0 ] = static_cast< D1 >( x[ i ] );
+ OP::foldr( left_buffer, &reduced );
+ }
+ // write out
+ out = static_cast< IOType >( reduced );
+ }
+
+ /**
+ * Reduces a vector of type \a InputType into a value in \a IOType
+ * by repeated application of this operator. The \a IOType is cast
+		 * into \a D3 prior to reduction. The \a InputType is cast into \a D2
+		 * during reduction. The final result is cast to \a IOType after
+ * reduction. The reduction happens `left-to-right'.
+ *
+		 * This implementation relies on \a foldl, whether it be a true
+		 * in-place or an emulated version.
+ *
+ * @param[in,out] out On input, the initial value to be used for
+ * reduction. On output, all elements of \a x
+ * have been applied to \a out.
+		 * @param[in] x A vector of size \a n with elements of type \a InputType.
+		 * @param[in] n A nonnegative integer (may be 0).
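+		 *
+		 * \par Example
+		 * An illustrative sketch only; \a MyAdd is a hypothetical operator
+		 * type that adds two doubles:
+		 * \code
+		 * const double x[ 4 ] = { 1.0, 2.0, 3.0, 4.0 };
+		 * double out = 0.0;
+		 * Operator< MyAdd >::foldlArray( out, x, 4 );
+		 * // out now reads 10.0
+		 * \endcode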
+ */
+ template< typename IOType, typename InputType >
+ static void foldlArray(
+ IOType &out, const InputType * __restrict__ const x, const size_t n
+ ) {
+ // prepare scalar buffer
+ D3 reduced = static_cast< D3 >( out );
+ // prepare vectorisation buffer
+ D2 right_buffer[ blocksize ];
+ // blockwise application
+ size_t i = 0;
+ while( i + blocksize <= n ) {
+ // load into buffer
+ for( size_t b = 0; b < blocksize; ++i, ++b ) {
+ right_buffer[ b ] = static_cast< D2 >( x[ i ] );
+ }
+ // do reduce
+ for( size_t b = 0; b < blocksize; ++b ) {
+ OP::foldl( &reduced, &( right_buffer[ b ] ) );
+ }
+ }
+ // direct application for remainder
+ for( ; i < n; ++i ) {
+ right_buffer[ 0 ] = static_cast< D2 >( x[ i ] );
+ OP::foldl( &reduced, right_buffer );
+ }
+ // write out
+ out = static_cast< IOType >( reduced );
+ }
};
} // namespace internal
diff --git a/include/graphblas/base/io.hpp b/include/graphblas/base/io.hpp
index 4eb1a80fd..c0ed7e1cc 100644
--- a/include/graphblas/base/io.hpp
+++ b/include/graphblas/base/io.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Specifies all I/O primitives for use with ALP/GraphBLAS containers.
+ *
* @author A. N. Yzelman
* @date 21st of February, 2017
*/
@@ -41,7 +45,8 @@
namespace grb {
/**
- * \defgroup IO Data Ingestion and Extraction.
+ * \defgroup IO Data Ingestion and Extraction
+ * \ingroup GraphBLAS
*
* Provides functions for putting user data into opaque ALP/GraphBLAS
* containers, provides functions for extracting data from such containers,
@@ -58,7 +63,7 @@ namespace grb {
* Sometimes it is desired to have direct access to ALP/GraphBLAS memory
* area, and to have that memory available even after the ALP/GraphBLAS context
* has been destroyed. This functionality is provided by the concept of
- * pinned containers such as provided by #PinnedVector.
+ * pinned containers such as provided by #grb::PinnedVector.
*
* Containers may be instantiated with default or given requested capacities.
* Implementations may reserve a higher capacity, but must allocate at least
@@ -400,6 +405,7 @@ namespace grb {
* A call to this function shall always succeed and shall never throw
* exceptions.
*
+ * \parblock
* \par Performance semantics.
* A call to this function:
* -# completes in \f$ \Theta(1) \f$ work.
@@ -546,21 +552,8 @@ namespace grb {
*
* @return grb::SUCCESS This function cannot fail.
*
- * \parblock
- * \par Performance semantics.
- * The backend must:
- * -# define cost in terms of work
- * -# define intra-process data movement costs
- * -# define inter-process data movement costs
- * -# define inter-process synchronisation requirements
- * -# define memory storage requirements and may define
- * this in terms of \a new_nz.
- * -# define whether system calls may be made and in particular whether
- * dynamic memory management may occor.
- * \endparblock
- *
- * \warning Calling clear shall not clear any dynamically allocated
- * memory associated with \a x.
+	 * \warning Calling clear need not free any dynamically allocated memory
+	 *          associated with \a x; indeed, none of the present backends free it.
*
* \note Even #grb::resize may or may not free dynamically allocated memory
* associated with \a x-- depending on the memory usage semantics defined
@@ -568,6 +561,14 @@ namespace grb {
*
* \note Only the destruction of \a x would ensure all corresponding memory is
* freed, for all backends.
+ *
+ * \parblock
+ * \par Performance semantics.
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ * \endparblock
+ *
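+	 * \par Example
+	 * A minimal usage sketch:
+	 * \code
+	 * grb::Vector< double > x( 10 );
+	 * grb::setElement( x, 3.14, 5 ); // x now holds one nonzero
+	 * grb::clear( x );               // nnz( x ) is zero again; size stays 10
+	 * \endcode
+	 *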
*/
template< typename DataType, Backend backend, typename Coords >
RC clear( Vector< DataType, backend, Coords > &x ) noexcept {
@@ -595,20 +596,7 @@ namespace grb {
* dimensions (i.e., row and column sizes) as well as the nonzero capacity
* remains unchanged.
*
- * @return grb::SUCCESS This function cannot fail.
- *
- * \parblock
- * \par Performance semantics.
- * The backend must:
- * -# define cost in terms of work
- * -# define intra-process data movement costs
- * -# define inter-process data movement costs
- * -# define inter-process synchronisation requirements
- * -# define memory storage requirements and may define
- * this in terms of \a new_nz.
- * -# define whether system calls may be made and in particular whether
- * dynamic memory management may occor.
- * \endparblock
+ * @return #grb::SUCCESS This function cannot fail.
*
* \warning Calling clear may not clear any dynamically allocated
* memory associated with \a A.
@@ -619,6 +607,13 @@ namespace grb {
*
* \note Only the destruction of \a A would ensure all corresponding memory is
* freed, for all backends.
+ *
+ * \parblock
+ * \par Performance semantics.
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ * \endparblock
*/
template<
typename InputType, Backend backend,
@@ -691,27 +686,24 @@ namespace grb {
*
* \parblock
* \par Performance semantics.
- * The backend must:
- * -# define cost in terms of work
- * -# define intra-process data movement costs
- * -# define inter-process data movement costs
- * -# define inter-process synchronisation requirements
- * -# define memory storage requirements and may define
- * this in terms of \a new_nz.
- * -# define whether system calls may be made and in particular whether
- * dynamic memory management may occor.
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
* \endparblock
*
- * \warning For most implementations, this function will indeed imply system
- * calls, as well as \f$ \Theta( \mathit{new\_nz} ) \f$ work and data
- * movement costs. It is thus to be considered an expensive function,
- * and should be used sparingly and only when absolutely necessary.
+ * \warning For most implementations, this function will imply system calls, as
+ * well as \f$ \Theta( \mathit{new\_nz} ) \f$ work and data movement
+ * costs. It is thus to be considered an expensive function, and
+ * should be used sparingly and only when absolutely necessary.
*/
template<
typename InputType,
Backend backend, typename Coords
>
- RC resize( Vector< InputType, backend, Coords > &x, const size_t new_nz ) noexcept {
+ RC resize(
+ Vector< InputType, backend, Coords > &x,
+ const size_t new_nz
+ ) noexcept {
#ifndef NDEBUG
const bool should_not_call_base_vector_resize = false;
#endif
@@ -777,12 +769,9 @@ namespace grb {
*
* \parblock
* \par Performance semantics.
- * -# the backend must define cost in terms of work
- * -# the backend must define intra-process data movement costs
- * -# the backend must define inter-process data movement costs
- * -# the backend must define memory storage requirements and may define
- * this in terms of \a new_nz.
- * -# the backend must define whether system calls may be made.
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
* \endparblock
*
* \warning For useful backends, this function will indeed imply system calls
@@ -827,9 +816,9 @@ namespace grb {
* to the size of \a x.
* @param[in] val The value to set each element of \a x to.
* @param[in] phase Which #grb::Phase the operation is requested. Optional;
- * the default is #grb::Phase::EXECUTE.
+ * the default is #grb::EXECUTE.
*
- * In #grb::Phase::RESIZE mode:
+ * In #grb::RESIZE mode:
*
* @returns #grb::OUTOFMEM When \a x could not be resized to hold the
* requested output, and the current capacity was
@@ -837,17 +826,17 @@ namespace grb {
* @returns #grb::SUCCESS When the capacity of \a x was resized to guarantee
* the output of this operation can be contained.
*
- * In #grb::Phase::EXECUTE mode:
+ * In #grb::EXECUTE mode:
*
* @returns #grb::FAILED When \a x did not have sufficient capacity. The
* vector \a x on exit shall be cleared.
* @returns #grb::SUCCESS When the call completes successfully.
*
- * In #grb::Phase::TRY mode (experimental and may not be supported):
+ * In #grb::TRY mode (experimental and may not be supported):
*
* @returns #grb::FAILED When \a x did not have sufficient capacity. The
* vector \a x on exit will have contents defined as
- * described for #grb::Phase::TRY.
+ * described for #grb::TRY.
* @returns #grb::SUCCESS When the call completes successfully.
*
* When \a descr includes grb::descriptors::no_casting and if \a T does not
@@ -855,12 +844,9 @@ namespace grb {
*
* \parblock
* \par Performance semantics
- * A backend must define, for each phase:
- * -# cost in terms of work
- * -# intra-process data movement costs
- * -# inter-process data movement costs
- * -# memory storage requirements and may define this in terms of \a new_nz.
- * -# whether system calls may be made.
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
* \endparblock
*/
template<
@@ -909,14 +895,14 @@ namespace grb {
* evaluated depends on the given \a desc.
* @param[in] val The value to set elements of \a x to.
* @param[in] phase Which #grb::Phase the operation is requested. Optional;
- * the default is #grb::Phase::EXECUTE.
+ * the default is #grb::EXECUTE.
*
* \warning An empty \a mask, meaning #grb::size( \a mask ) is zero, shall
* be interpreted as though no mask argument was given. In particular,
* any descriptors pertaining to the interpretation of \a mask shall
* be ignored.
*
- * In #grb::Phase::RESIZE mode:
+ * In #grb::RESIZE mode:
*
* @returns #grb::OUTOFMEM When \a x could not be resized to hold the
* requested output, and the current capacity was
@@ -924,17 +910,17 @@ namespace grb {
* @returns #grb::SUCCESS When the capacity of \a x was resized to guarantee
* the output of this operation can be contained.
*
- * In #grb::Phase::EXECUTE mode:
+ * In #grb::EXECUTE mode:
*
* @returns #grb::FAILED When \a x did not have sufficient capacity. The
* vector \a x on exit shall be cleared.
* @returns #grb::SUCCESS When the call completes successfully.
*
- * In #grb::Phase::TRY mode (experimental and may not be supported):
+ * In #grb::TRY mode (experimental and may not be supported):
*
* @returns #grb::FAILED When \a x did not have sufficient capacity. The
* vector \a x on exit will have contents defined as
- * described for #grb::Phase::TRY.
+ * described for #grb::TRY.
* @returns #grb::SUCCESS When the call completes successfully.
*
* When \a descr includes grb::descriptors::no_casting and if \a T does not
@@ -942,13 +928,9 @@ namespace grb {
*
* \parblock
* \par Performance semantics
- * A backend must define, for each phase:
- * -# cost in terms of work;
- * -# intra-process data movement costs;
- * -# inter-process data movement costs;
- * -# inter-process synchronisation costs;
- * -# memory storage requirements and may define this in terms of \a new_nz;
- * -# whether system calls may be made.
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
* \endparblock
*/
template<
@@ -956,7 +938,8 @@ namespace grb {
typename DataType, typename MaskType, typename T,
Backend backend, typename Coords
>
- RC set( Vector< DataType, reference, Coords > &x,
+ RC set(
+		Vector< DataType, backend, Coords > &x,
const Vector< MaskType, backend, Coords > &mask,
const T val,
const Phase &phase = EXECUTE,
@@ -979,14 +962,6 @@ namespace grb {
* Sets the content of a given vector \a x to be equal to that of another given
* vector \a y.
*
- * Unmasked variant.
- *
- * \parblock
- * \par Accepted descriptors
- * -# grb::descriptors::no_operation
- * -# grb::descriptors::no_casting
- * \endparblock
- *
* @tparam descr The descriptor of the operation.
* @tparam OutputType The type of each element in the output vector.
* @tparam InputType The type of each element in the input vector.
@@ -996,34 +971,32 @@ namespace grb {
*
* The vector \a x may not be the same as \a y.
*
+ * @param[in] phase Which #grb::Phase the operation is requested. Optional;
+ * the default is #grb::EXECUTE.
+ *
+ * \parblock
+ * \par Accepted descriptors
+ * -# grb::descriptors::no_operation
+ * -# grb::descriptors::no_casting
+ *
* When \a descr includes grb::descriptors::no_casting and if \a InputType
* does not match \a OutputType, the code shall not compile.
+ * \endparblock
*
* \parblock
* \par Performance semantics
- * A call to this function
- * -# consists of \f$ \Theta(n) \f$ work;
- * -# moves \f$ \Theta(n) \f$ bytes of memory;
- * -# does not allocate nor free any dynamic memory;
- * -# shall not make any system calls.
- * \endparblock
- *
- * @see grb::foldl.
- * @see grb::foldr.
- * @see grb::operators::left_assign.
- * @see grb::operators::right_assign.
- * @see grb::setElement.
+ * Each backend must define performance semantics for this primitive.
*
- * \todo Revise specification regarding recent changes on phases, performance
- * semantics, and capacities.
+ * @see perfSemantics
+ * \endparblock
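+	 *
+	 * \par Example
+	 * A sketch that copies one vector into another:
+	 * \code
+	 * grb::Vector< double > x( 10 ), y( 10 );
+	 * grb::setElement( y, 2.5, 7 );
+	 * grb::set( x, y ); // x now holds the same single nonzero as y
+	 * \endcode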
*/
template<
Descriptor descr = descriptors::no_operation,
typename OutputType, typename InputType,
Backend backend, typename Coords
>
- RC set( Vector<
- OutputType, backend, Coords > &x,
+ RC set(
+ Vector< OutputType, backend, Coords > &x,
const Vector< InputType, backend, Coords > &y,
const Phase &phase = EXECUTE
) {
@@ -1041,50 +1014,50 @@ namespace grb {
* Sets the content of a given vector \a x to be equal to that of
* another given vector \a y.
*
- * Masked variant.
+	 * If for an entry with index \a i the corresponding \a mask entry
+	 * evaluates false, then that entry shall not be copied into \a x.
*
* The vector \a x may not equal \a y.
*
- * @tparam descr The descriptor of the operation.
+ * @tparam descr The descriptor of the operation. Optional; default
+ * value is #grb::descriptors::no_operation.
* @tparam OutputType The type of each element in the output vector.
* @tparam MaskType The type of each element in the mask vector.
* @tparam InputType The type of each element in the input vector.
*
- * \parblock
- * \par Accepted descriptors
- * -# grb::descriptors::no_operation
- * -# grb::descriptors::no_casting
- * -# grb::descriptors::invert_mask
- * -# grb::descriptors::structural_mask
- * \endparblock
- *
* @param[in,out] x The vector to be set.
* @param[in] mask The output mask.
- * @param[in] y The source vector.
+	 * @param[in]     y    The source vector. May not equal \a x.
+ * @param[in] phase Which #grb::Phase the operation is requested. Optional;
+ * the default is #grb::EXECUTE.
*
- * When \a descr includes grb::descriptors::no_casting and if \a InputType
+ * \parblock
+ * \par Accepted descriptors
+ * - #grb::descriptors::no_operation,
+ * - #grb::descriptors::no_casting,
+ * - #grb::descriptors::dense,
+ * - #grb::descriptors::invert_mask,
+ * - #grb::descriptors::structural, and
+ * - #grb::descriptors::structural_complement.
+ *
+ * When \a descr includes #grb::descriptors::no_casting and if \a InputType
* does not match \a OutputType, the code shall not compile.
+ * \endparblock
*
* \parblock
* \par Performance semantics
- * A call to this function
- * -# consists of \f$ \Theta( \min\{ nnz( mask ), nnz( y ) \} ) \f$ work;
- * -# moves \f$ \Theta( \min\{ nnz( mask ), nnz( y ) \} ) \f$ bytes of memory;
- * -# does not allocate nor free any dynamic memory;
- * -# shall not make any system calls.
- * If grb::descriptors::invert_mask is given, then \f$ nnz( mask ) \f$ in the
- * above shall be considered equal to \f$ nnz( y ) \f$.
- * \endparblock
+ * Each backend must define performance semantics for this primitive.
*
- * \todo Revise specification regarding recent changes on phases, performance
- * semantics, and capacities.
+ * @see perfSemantics
+ * \endparblock
*/
template<
Descriptor descr = descriptors::no_operation,
typename OutputType, typename MaskType, typename InputType,
Backend backend, typename Coords
>
- RC set( Vector< OutputType, backend, Coords > &x,
+ RC set(
+ Vector< OutputType, backend, Coords > &x,
const Vector< MaskType, backend, Coords > &mask,
const Vector< InputType, backend, Coords > &y,
const Phase &phase = EXECUTE,
@@ -1115,45 +1088,45 @@ namespace grb {
* The parameter \a i may not be greater or equal than the size of \a x.
*
* @tparam descr The descriptor to be used during evaluation of this
- * function.
+ * function. Optional; the default descriptor is
+ * #grb::descriptors::no_operation.
* @tparam DataType The type of the elements of \a x.
* @tparam T The type of the value to be set.
*
* @param[in,out] x The vector to be modified.
* @param[in] val The value \f$ x_i \f$ should read after function exit.
* @param[in] i The index of the element of \a x to set.
+ * @param[in] phase Which #grb::Phase the operation is requested. Optional;
+ * the default is #grb::EXECUTE.
*
- * @return grb::SUCCESS Upon successful execution of this operation.
- * @return grb::MISMATCH If \a i is greater or equal than the dimension of
- * \a x.
+ * @return #grb::SUCCESS Upon successful execution of this operation.
+ * @return #grb::MISMATCH If \a i is greater or equal than the dimension of
+ * \a x.
*
* \parblock
* \par Accepted descriptors
- * -# grb::descriptors::no_operation
- * -# grb::descriptors::no_casting
+ * - #grb::descriptors::no_operation,
+ * - #grb::descriptors::no_casting,
+ * - #grb::descriptors::dense.
* \endparblock
*
- * When \a descr includes grb::descriptors::no_casting and if \a T does not
+ * When \a descr includes #grb::descriptors::no_casting and if \a T does not
* match \a DataType, the code shall not compile.
*
* \parblock
* \par Performance semantics
- * A call to this function
- * -# consists of \f$ \Theta(1) \f$ work;
- * -# moves \f$ \Theta(1) \f$ bytes of memory;
- * -# does not allocate nor free any dynamic memory;
- * -# shall not make any system calls.
- * \endparblock
+ * Each backend must define performance semantics for this primitive.
*
- * \todo Revise specification regarding recent changes on phases, performance
- * semantics, and capacities.
+ * @see perfSemantics
+ * \endparblock
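+	 *
+	 * \par Example
+	 * A minimal usage sketch:
+	 * \code
+	 * grb::Vector< double > x( 10 );
+	 * grb::RC rc = grb::setElement( x, 3.14, 7 );
+	 * // on success, x now reads 3.14 at index 7; other entries are untouched
+	 * \endcode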
*/
template<
Descriptor descr = descriptors::no_operation,
typename DataType, typename T,
Backend backend, typename Coords
>
- RC setElement( Vector< DataType, backend, Coords > &x,
+ RC setElement(
+ Vector< DataType, backend, Coords > &x,
const T val,
const size_t i,
const Phase &phase = EXECUTE,
@@ -1234,56 +1207,36 @@ namespace grb {
* of elements. Any pre-existing nonzeroes that do not overlap with any nonzero
* between \a ind_start and \a ind_end will remain unchanged.
*
+ * @return #grb::SUCCESS When ingestion has completed successfully.
+	 * @return #grb::ILLEGAL When a nonzero has an index larger than
+	 *                       #grb::size( \a x ).
+ * @return #grb::PANIC If an unmitigable error has occured during ingestion.
+ *
* \parblock
- * \par Performance semantics:
- * A call to this function
- * -# comprises \f$ \mathcal{O}( n ) \f$ work where \a n is the number of
- * elements pointed to by the given iterator pairs. This work may be
- * distributed over multiple user processes.
- * -# results in at most \f$ n \mathit{sizeof}( T ) +
- * n \mathit{sizeof}( U ) +
- * n \mathit{sizeof}( \mathit{InputType} ) +
- * 2 n \mathit{sizeof}( \mathit{bool} ) \f$
- * bytes of data movement, where \a T and \a U are the underlying data
- * types of the input iterators. These costs may be distributed over
- * multiple user processes.
- * -# inter-process communication costs are \f$ \mathcal{O}(n) g + l \f$.
- * -# if the capacity of this vector is not large enough to hold \a n
- * elements, a call to this function may allocate
- * \f$ \mathcal{O}( n ) \f$
- * new bytes of memory which \em may be distributed over multiple user
- * processes.
- * -# if the capacity of this vector is not large enough to hold \a n
- * elements, a call to this function may result in system calls at any of
- * the user processes.
- * -# If the IOMode is sequential, then the work and data movement costs are
- * incurred per user process and will not be distributed. In this
- * case the inter-process communication costs will, however, be zero.
- * -# if the IOMode is parallel, then a good implementation under a uniformly
- * randomly distributed input incurs an inter-process communication cost
- * of expected value \f$ n/p g + l \f$. The best-case inter-process cost
- * is \f$ (p-1)g + l \f$.
- * \endparblock
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
*
- * @returns grb::SUCCESS When ingestion has completed successfully.
- * @returns grb::ILLEGAL When a nonzero has an index larger than grb::size(x).
- * @returns grb::PANIC If an unmitigable error has occured during ingestion.
+ * @see perfSemantics
+ * \endparblock
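+	 *
+	 * \par Example
+	 * A sketch that ingests three nonzeroes with unique indices:
+	 * \code
+	 * std::vector< size_t > indices{ 0, 3, 7 };
+	 * std::vector< double > values{ 1.0, 2.0, 3.0 };
+	 * grb::Vector< double > x( 10 );
+	 * grb::RC rc = grb::buildVectorUnique( x,
+	 *     indices.begin(), indices.end(),
+	 *     values.begin(), values.end(),
+	 *     grb::SEQUENTIAL );
+	 * \endcode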
*/
- template< Descriptor descr = descriptors::no_operation,
+ template<
+ Descriptor descr = descriptors::no_operation,
typename InputType,
class Merger = operators::right_assign< InputType >,
typename fwd_iterator1, typename fwd_iterator2,
Backend backend, typename Coords
>
- RC buildVectorUnique( Vector< InputType, backend, Coords > &x,
+ RC buildVectorUnique(
+ Vector< InputType, backend, Coords > &x,
fwd_iterator1 ind_start, const fwd_iterator1 ind_end,
fwd_iterator2 val_start, const fwd_iterator2 val_end,
const IOMode mode
) {
- return buildVector< descr | descriptors::no_duplicates >( x,
+ return buildVector< descr | descriptors::no_duplicates >(
+ x,
ind_start, ind_end,
val_start, val_end,
- mode );
+ mode
+ );
}
/**
@@ -1291,8 +1244,15 @@ namespace grb {
*
* Invalidates any prior existing content. Disallows different nonzeroes
* to have the same row and column coordinates; input must consist out of
- * unique triples. See #buildMatrix for an alternate function that does
- * not have these restrictions-- at the cost of lower performance.
+ * unique triples.
+ *
+ * \internal
+ * See #buildMatrix for an alternate function that does not have these
+ * restrictions-- at the cost of lower performance.
+ *
+ * \todo Re-enable the above for public documentation once the non-unique
+ * buildMatrix variant has been implemented.
+ * \endinternal
*
* \warning Calling this function with duplicate input coordinates will
* lead to undefined behaviour.
@@ -1306,58 +1266,49 @@ namespace grb {
* @tparam fwd_iterator3 The type of the nonzero value iterator.
* @tparam length_type The type of the number of elements in each iterator.
*
- * The iterators will only be used to read from, never to assign to.
+ * @param[out] A Where to store the given nonzeroes.
*
* @param[in] I A forward iterator to \a cap row indices.
* @param[in] J A forward iterator to \a cap column indices.
* @param[in] V A forward iterator to \a cap nonzero values.
- * @param[in] nz The number of items pointed to by \a I, \a J, \em and \a V.
- *
- * @return grb::MISMATCH -# when an element from \a I dereferences to a value
- * larger than the row dimension of this matrix, or
- * -# when an element from \a J dereferences to a value
- * larger than the column dimension of this matrix.
- * When this error code is returned the state of this
- * container will be as though this function was never
- * called; however, the given forward iterators may
- * have been copied and the copied iterators may have
- * incurred multiple increments and dereferences.
- * @return grb::OVERFLW When the internal data type used for storing the
- * number of nonzeroes is not large enough to store
- * the number of nonzeroes the user wants to assign.
- * When this error code is returned the state of this
- * container will be as though this function was never
- * called; however, the given forward iterators may
- * have been copied and the copied iterators may have
- * incurred multiple increments and dereferences.
- * @return grb::SUCCESS When the function completes successfully.
*
- * \parblock
- * \par Performance semantics.
- * -# This function contains
- * \f$ \Theta(\mathit{nz})+\mathcal{O}(m+n)) \f$ amount of work.
- * -# This function may dynamically allocate
- * \f$ \Theta(\mathit{nz})+\mathcal{O}(m+n)) \f$ bytes of memory.
- * -# A call to this function will use \f$ \mathcal{O}(m+n) \f$ bytes
- * of memory beyond the memory in use at the function call entry.
- * -# This function will copy each input forward iterator at most
- * \em once; the three input iterators \a I, \a J, and \a V thus
- * may have exactly one copyeach, meaning that all input may be
- * traversed only once.
- * -# Each of the at most three iterator copies will be incremented
- * at most \f$ \mathit{nz} \f$ times.
- * -# Each position of the each of the at most three iterator copies
- * will be dereferenced exactly once.
- * -# This function moves
- * \f$ \Theta(\mathit{nz})+\mathcal{O}(m+n)) \f$ bytes of data.
- * -# This function will likely make system calls.
- * \endparblock
+ * @param[in] I_end A forward iterator in end position relative to \a I.
+ * @param[in] J_end A forward iterator in end position relative to \a J.
+ * @param[in] V_end A forward iterator in end position relative to \a V.
+ *
+ * The iterators will only be used to read from, never to assign to.
+ *
+ * @param[in] mode Whether the input should happen in #grb::SEQUENTIAL or in
+ * the #grb::PARALLEL mode.
+ *
+ * In the below, let \a nz denote the number of items pointed to by the
+ * iterator pair \a I, \a I_end. This number should match the number of
+ * elements in \a J, \a J_end \em and \a V, \a V_end.
+ *
+ * @return #grb::SUCCESS When the function completes successfully.
+ * @return #grb::MISMATCH When an element from \a I dereferences to a value
+ * larger than the row dimension of this matrix, or
+ * when an element from \a J dereferences to a value
+ * larger than the column dimension of this matrix.
+ * When this error code is returned the state of this
+ * container will be as though this function was never
+ * called; however, the given forward iterators may
+ * have been copied and the copied iterators may have
+ * incurred multiple increments and dereferences.
+ * @return #grb::OVERFLW When the internal data type used for storing the
+ * number of nonzeroes is not large enough to store
+ * the number of nonzeroes the user wants to assign.
+ * When this error code is returned the state of this
+ * container will be as though this function was never
+ * called; however, the given forward iterators may
+ * have been copied and the copied iterators may have
+ * incurred multiple increments and dereferences.
*
* \warning This is an expensive function. Use sparingly and only when
* absolutely necessary.
*
* \note Streaming input can be implemented by supplying buffered
- * iterators to this GraphBLAS implementation.
+ * iterators to ALP.
*
* \note The functionality herein described is exactly that of buildMatrix,
* though with stricter input requirements. These requirements allow
@@ -1366,6 +1317,13 @@ namespace grb {
* \note No masked version of this variant is provided. The use of masks in
* matrix construction is costly and the user is referred to the
* costly buildMatrix() function instead.
+ *
+ * \parblock
+ * \par Performance semantics.
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ * \endparblock
*/
template<
Descriptor descr = descriptors::no_operation,
@@ -1484,6 +1442,8 @@ namespace grb {
* \a end.
* @param[in] start Iterator pointing to the first nonzero to be added.
* @param[in] end Iterator pointing past the last nonzero to be added.
+ * @param[in] mode Whether the input should happen in #grb::SEQUENTIAL or in
+ * the #grb::PARALLEL mode.
*/
template<
Descriptor descr = descriptors::no_operation,
diff --git a/include/graphblas/base/matrix.hpp b/include/graphblas/base/matrix.hpp
index 2744a7434..343729d17 100644
--- a/include/graphblas/base/matrix.hpp
+++ b/include/graphblas/base/matrix.hpp
@@ -15,9 +15,13 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Specifies the ALP/GraphBLAS matrix container.
+ *
* @author A. N. Yzelman
- * @date 10th of August
+ * @date 10th of August, 2016
*/
#ifndef _H_GRB_MATRIX_BASE
@@ -190,15 +194,9 @@ namespace grb {
*
* \parblock
* \par Performance semantics.
- * Implementations must define cost semantics across the following
- * dimensions:
- * -# work;
- * -# intra-process data movement;
- * -# inter-process data movement;
- * -# inter-process synchronisations;
- * -# memory usage; and
- * -# whether system calls, in particular dynamic memory management calls,
- * could occur.
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
* \endparblock
*
* \warning Avoid the use of this constructor within performance critical
@@ -251,25 +249,21 @@ namespace grb {
*
* \parblock
* \par Performance semantics.
- * Implementations must define cost semantics across the following
- * dimensions:
- * -# work;
- * -# intra-process data movement;
- * -# inter-process data movement;
- * -# inter-process synchronisations;
- * -# memory usage; and
- * -# whether system calls, in particular dynamic memory management calls,
- * could occur.
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
* \endparblock
*
* \warning Avoid the use of this constructor within performance critical
* code sections.
*/
- Matrix( const Matrix<
- D, implementation,
- RowIndexType, ColIndexType, NonzeroIndexType > &other
+ Matrix(
+ const Matrix<
+ D, implementation,
+ RowIndexType, ColIndexType, NonzeroIndexType
+ > &other
) {
- (void)other;
+ (void) other;
}
/**
@@ -281,18 +275,13 @@ namespace grb {
*
* \parblock
* \par Performance semantics.
- * This constructor:
- * -# entails \f$ \Theta(1) \f$ amount of work;
- * -# moves \f$ \Theta(1) \f$ bytes of data within its user process;
- * -# moves \f$ 0 \f$ bytes of data between user processes;
- * -# shall \em not require synchronisations between user processes;
- * -# inherit the memory usage of \a other;
- * -# will \em not make system calls and in particular will not free
- * nor allocate dynamic memory.
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
* \endparblock
*/
Matrix( self_type &&other ) {
- (void)other;
+ (void) other;
}
/**
@@ -306,16 +295,9 @@ namespace grb {
*
* \parblock
* \par Performance semantics.
- * This constructor:
- * -# entails \f$ \Theta(1) \f$ amount of work;
- * -# moves \f$ \Theta(1) \f$ bytes of data within its user process;
- * -# moves \f$ 0 \f$ bytes of data between user processes;
- * -# shall \em not require synchronisations between user processes;
- * -# inherit the memory usage of \a other;
- * -# will \em not make system calls and in particular will not free
- * nor allocate dynamic memory.
- *
- * Additionally, the backend-specific cost of the matrix destructor apply.
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
* \endparblock
*/
self_type& operator=( self_type &&other ) noexcept {
@@ -328,15 +310,9 @@ namespace grb {
*
* \parblock
* \par Performance semantics.
- * Beckends must define cost semantics across the following
- * dimensions:
- * -# work;
- * -# intra-process data movement;
- * -# inter-process data movement;
- * -# inter-process synchronisations;
- * -# memory usage; and
- * -# whether system calls, in particular dynamic memory management calls,
- * could occur.
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
* \endparblock
*
* \warning Avoid calling destructors from within performance critical
@@ -358,15 +334,9 @@ namespace grb {
*
* \parblock
* \par Performance semantics.
- * Beckends must define cost semantics across the following
- * dimensions:
- * -# work;
- * -# intra-process data movement;
- * -# inter-process data movement;
- * -# inter-process synchronisations;
- * -# memory usage; and
- * -# whether system calls, in particular dynamic memory management calls,
- * could occur.
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
* \endparblock
*
* \note This function may make use of a const_iterator that is buffered,
@@ -394,15 +364,9 @@ namespace grb {
*
* \parblock
* \par Performance semantics.
- * Beckends must define cost semantics across the following
- * dimensions:
- * -# work;
- * -# intra-process data movement;
- * -# inter-process data movement;
- * -# inter-process synchronisations;
- * -# memory usage; and
- * -# whether system calls, in particular dynamic memory management calls,
- * could occur.
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
* \endparblock
*
* \note Even if cbegin() returns a buffered const_iterator that may require
diff --git a/include/graphblas/base/pinnedvector.hpp b/include/graphblas/base/pinnedvector.hpp
index 724e5712f..d13ffa022 100644
--- a/include/graphblas/base/pinnedvector.hpp
+++ b/include/graphblas/base/pinnedvector.hpp
@@ -18,7 +18,7 @@
/**
* @file
*
- * Contains the API for the PinnedVector class.
+ * Contains the specification for #grb::PinnedVector.
*
* @author A. N. Yzelman
*/
@@ -36,41 +36,40 @@
namespace grb {
- /** \addtogroup IO
+ /**
+ * Provides a mechanism to access ALP containers from outside of an ALP
+ * context.
*
- * Provides a mechanism to access GraphBLAS containers from outside of any
- * GraphBLAS context.
- *
- * An instance of \a PinnedVector caches a container's data and returns it
+ * An instance of #grb::PinnedVector caches a container's data and returns it
* to the user. The user can refer to the returned data until such time the
- * \a PinnedVector's instance is destroyed, regardless of whether a call to
+ * instance of #grb::PinnedVector is destroyed, regardless of whether a call to
* #grb::finalize occurs, and regardless whether the ALP/GraphBLAS program
* executed through the #grb::Launcher had already returned.
*
* The original container may not be modified or any derived instance of
* \a PinnedVector shall become invalid.
*
- * \note It would be strange if a GraphBLAS container a pinned vector is
+ * \note It would be strange if an ALP/GraphBLAS container a pinned vector is
* derived from persists-- pinned vectors are designed to be used
* precisely when the original container no longer is in scope.
* Therefore this last remark on invalidation should not matter.
*
- * The PinnedVector abstracts a container over nonzeroes. A nonzero is a pair
- * of indices and values. One may query for the number of nonzeroes and use
- * 1. #getNonzeroValue to retrieve a nonzero value, or
- * 2. #getNonzeroIndex to retrieve a nonzero index.
+ * The #grb::PinnedVector abstracts a read-only container of nonzeroes. A
+	 * nonzero is a pair of an index and a value. One may query for the number of
+ * nonzeroes and use
+ * 1. getNonzeroValue to retrieve a nonzero value, or
+ * 2. getNonzeroIndex to retrieve a nonzero index.
*
- * An instance of the PinnedVector cannot modify the underlying nonzero
- * structure nor its values.
+ * An instance of #grb::PinnedVector cannot modify the underlying nonzero
+ * structure nor can it modify its values.
*
* \note A performant implementation in fact does \em not copy the container
- * data, but provides a mechanism to access the underlying GraphBLAS
- * memory whenever it is possible to do so. This memory should remain
- * valid even after a call to grb::finalize() is made, and for as long
- * as the \a PinnedVector instance remains valid.
+ * data, but provides a mechanism to access the underlying ALP memory
+ * whenever it is possible to do so. This memory should remain valid
+ * even after a call to #grb::Launcher::exec has completed, and for as
+ * long as the #grb::PinnedVector instance remains valid.
*
- * \note Some implementations may not retain a raw vector. In this case, a
- * copy is unavoidable.
+ * \ingroup IO
*/
template< typename IOType, enum Backend implementation >
class PinnedVector {
@@ -78,8 +77,7 @@ namespace grb {
private :
/**
- * \internal Dummy false bool with a descriptive name for assertion
- * failures.
+ * Dummy false bool with a descriptive name for assertion failures.
*/
static const constexpr bool
function_was_not_implemented_in_the_selected_backend = false;
@@ -88,25 +86,27 @@ namespace grb {
public :
/**
- * Pins a given \a vector to a single memory pointer. The pointer
- * shall remain valid as long as the lifetime of this instance.
- * The given \a vector must be in unpinned state or an exception
- * will be thrown.
- * Pinning may or may not require a memory copy, depending on the
- * GraphBLAS implementation. If it does not, then destroying this
- * instance or calling #free on this vector may or may not result
- * in memory deallocation, depending on whether the underlying
- * vector still exists or not.
+ * Pins the contents of a given \a vector.
+ *
+ * A successfully constructed #grb::PinnedVector shall remain valid until it
+ * is destroyed, regardless of whether the ALP context in which the original
+ * \a vector appears has been destroyed.
+ *
+ * Pinning may or may not require a memory copy, depending on the ALP
+ * implementation and backend. If it does not, then destroying this
+			 * instance \em may result in memory deallocation. Deallocation \em must
+			 * only occur if this pinned vector happens to be the last remaining
+			 * reference to the original \a vector.
*
- * If one user process calls this constructor, \em all user
- * processes must do so-- this is a collective call. All member
- * functions as well as the default destructor are \em not
- * collective.
+ * If one user process calls this constructor, \em all user processes must do
+ * so and with the same arguments-- this is a collective call.
+ *
+ * All member functions of this class are \em not collective.
*
* @param[in] vector The vector to pin the memory of.
- * @param[in] mode The grb::IOMode.
+ * @param[in] mode The #grb::IOMode.
*
- * The \a mode argument is \em optional; its default is PARALLEL.
+ * The \a mode argument is \em optional. The default is #grb::PARALLEL.
*
* \parblock
* \par Performance semantics (#IOMode::SEQUENTIAL):
@@ -135,8 +135,8 @@ namespace grb {
const Vector< IOType, implementation, Coord > &vector,
const IOMode mode
) {
- (void)vector;
- (void)mode;
+ (void) vector;
+ (void) mode;
assert( function_was_not_implemented_in_the_selected_backend );
}
@@ -155,6 +155,8 @@ namespace grb {
}
/**
+ * Destroys a #grb::PinnedVector instance.
+ *
* Destroying a pinned vector will only remove the underlying vector data if
* and only if:
* 1) the original grb::Vector has been destroyed;
@@ -216,7 +218,7 @@ namespace grb {
* optional.
*
* A nonzero is a tuple of an index and nonzero value. A pinned vector holds
- * #nonzeroes() nonzeroes. Therefore, \a k must be less than #nonzeroes().
+ * #nonzeroes nonzeroes. Therefore, \a k must be less than #nonzeroes.
*
* @return The requested value.
*
@@ -231,7 +233,7 @@ namespace grb {
inline OutputType getNonzeroValue(
const size_t k, const OutputType one = OutputType()
) const noexcept {
- (void)k;
+ (void) k;
assert( function_was_not_implemented_in_the_selected_backend );
return one;
}
@@ -249,14 +251,13 @@ namespace grb {
* specification of #getNonzeroValue.
*
* \note By providing this variant, implementations may avoid the
- * requirement thatensure that that \a IOType must be default-
- * constructable.
+ * requirement that \a IOType must be default-constructable.
*/
inline IOType getNonzeroValue(
const size_t k
) const noexcept {
IOType ret;
- (void)k;
+ (void) k;
assert( function_was_not_implemented_in_the_selected_backend );
return ret;
}
@@ -267,7 +268,7 @@ namespace grb {
* @param[in] k The nonzero ID to return the index of.
*
* A nonzero is a tuple of an index and nonzero value. A pinned vector holds
- * #nonzeroes() nonzeroes. Therefore, \a k must be less than #nonzeroes().
+ * #nonzeroes nonzeroes. Therefore, \a k must be less than #nonzeroes.
*
* @return The requested index.
*
@@ -281,7 +282,7 @@ namespace grb {
inline size_t getNonzeroIndex(
const size_t k
) const noexcept {
- (void)k;
+ (void) k;
assert( function_was_not_implemented_in_the_selected_backend );
return std::numeric_limits< size_t >::max();
}
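To make the above specification concrete, the following is a minimal usage sketch (not part of this diff). It assumes the automatic launcher mode, a default-constructible and move-assignable #grb::PinnedVector, and illustrative names and values: an ALP program pins one of its vectors, and the caller reads out the nonzeroes after the program has returned.

	#include <iostream>
	#include <graphblas.hpp>

	void alpProgram( const size_t &n, grb::PinnedVector< double > &out ) {
		grb::Vector< double > x( n );
		(void) grb::setElement( x, 3.14, n / 2 );
		// pinning is collective; the pinned data stays valid after exec returns
		out = grb::PinnedVector< double >( x, grb::PARALLEL );
	}

	int main() {
		grb::Launcher< grb::AUTOMATIC > launcher;
		grb::PinnedVector< double > pinned;
		const size_t n = 10;
		if( launcher.exec( &alpProgram, n, pinned, true ) != grb::SUCCESS ) {
			return 1;
		}
		for( size_t k = 0; k < pinned.nonzeroes(); ++k ) {
			std::cout << pinned.getNonzeroIndex( k ) << ": "
				<< pinned.getNonzeroValue( k ) << "\n";
		}
		return 0;
	}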
diff --git a/include/graphblas/base/properties.hpp b/include/graphblas/base/properties.hpp
index a1b497031..01a649203 100644
--- a/include/graphblas/base/properties.hpp
+++ b/include/graphblas/base/properties.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Provides a mechanism for inspecting properties of various backends
+ *
* @author A. N. Yzelman
* @date 5th of May 2017
*/
@@ -29,40 +33,71 @@
namespace grb {
/**
- * Collection of various properties on the given GraphBLAS backend.
+ * Collection of various properties on the given ALP/GraphBLAS \a backend.
*
- * @tparam implementation The implementation of which to access its properties.
+ * @tparam backend The backend of which to access its properties.
*
* The properties collected here are meant to be compile-time constants that
- * provide insight in what features the back-end supports.
+	 * provide insight into what features the given \a backend supports. ALP user
+	 * code may rely on the properties specified herein. All ALP backends must
+	 * define all properties specified here.
+ *
+	 * The default template class shall be empty in order to ensure that
+	 * implementing backends specialise this class, while also making sure that
+	 * no backend may accidentally, implicitly, and erroneously propagate global
+	 * defaults.
+ *
+ * \ingroup backends
*/
- template< enum Backend implementation >
+ template< enum Backend backend >
class Properties {
+#ifdef __DOXYGEN__
+
public:
/**
- * Whether a non-GraphBLAS object captured by a lambda-function and passed to
- * grb::eWiseLambda can be written to.
- *
- * If the implementation backend is fully Single Program, Multiple Data
- * (SPMD), then this is expected to be legal and result in user-process local
- * updates. This function would thus return \a true.
+			 * Whether a scalar non-ALP/GraphBLAS object may be captured and written
+			 * to by a lambda function that is passed to #grb::eWiseLambda.
*
- * If the implementaiton backend is parallel but supports only a single user
- * processes, i.e., for a \em data-centric backend, writing to a shared
- * object results in race conditions and thus is technically impossible. This
- * function would thus return \a false.
+			 * Typically, if the \a backend is shared-memory parallel, this property
+			 * would read false. Purely Single Program, Multiple Data (SPMD) backends
+			 * over distributed memory, as well as simple sequential backends, would
+			 * have this property read true.
*
- * @return A boolean \a true if and only if capturing a non-GraphBLAS object
- * inside a lambda-function for write access, and passing it to
- * grb::eWiseLambda would yield valid user process local results. If
- * not, \a false is returned instead.
+			 * Notably, hybrid SPMD + OpenMP backends (e.g., #grb::hybrid) are not pure
+			 * SPMD and as such would have this property read false.
*
* @see grb::eWiseLambda()
*/
- static constexpr bool writableCaptured = true;
+ static constexpr const bool writableCaptured = true;
+ /**
+ * Whether the given \a backend supports blocking execution or is, instead,
+ * non-blocking.
+ *
+ * In blocking execution mode, any ALP/GraphBLAS primitive, when it returns,
+ * is guaranteed to have completed the requested computation.
+ *
+			 * If a given \a backend has this property set to true, then the
+			 * #isNonblockingExecution property must read false, and vice versa.
+ */
+ static constexpr const bool isBlockingExecution = true;
+
+ /**
+ * Whether the given \a backend is non-blocking or is, instead, blocking.
+ *
+ * In non-blocking execution mode, any ALP/GraphBLAS primitive, on return,
+ * \em may in fact \em not have completed the requested computation.
+ *
+ * Non-blocking execution thus allows for the lazy evaluation of an ALP
+ * code, which, in turn, allows for cross-primitive optimisations to be
+ * automatically applied.
+ *
+			 * If a given \a backend has this property set to true, then the
+			 * #isBlockingExecution property must read false, and vice versa.
+ */
+ static constexpr const bool isNonblockingExecution = !isBlockingExecution;
+#endif
};
} // namespace grb
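As a usage sketch (not part of this diff), user code may inspect these properties at compile time. Here, grb::config::default_backend is assumed to name the compiled default backend, and that backend is assumed to implement the properties specified above:

	#include <graphblas.hpp>

	template< grb::Backend backend = grb::config::default_backend >
	void checkExecutionMode() {
		// exactly one of the two execution-mode properties may hold
		static_assert( grb::Properties< backend >::isBlockingExecution !=
				grb::Properties< backend >::isNonblockingExecution,
			"a backend must be either blocking or non-blocking" );
		if( grb::Properties< backend >::writableCaptured ) {
			// captured scalars may be written to from within a grb::eWiseLambda
		}
	}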
diff --git a/include/graphblas/base/spmd.hpp b/include/graphblas/base/spmd.hpp
index 49b95b90d..1955c8199 100644
--- a/include/graphblas/base/spmd.hpp
+++ b/include/graphblas/base/spmd.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Exposes facilities for direct SPMD programming
+ *
* @author A. N. Yzelman
* @date 28th of April, 2017
*/
@@ -32,52 +36,67 @@
#include "config.hpp"
+
namespace grb {
- /** \todo documentation */
+ /**
+	 * For backends that support multiple user processes, this class defines some
+	 * basic primitives to support SPMD programming.
+	 *
+	 * All backends must implement this interface, including backends that do not
+	 * support multiple user processes. The interface defined herein hence admits
+	 * trivial implementations for single-user-process backends.
+ */
template< Backend implementation >
class spmd {
- public:
-
- /** @return The number of user processes in this GraphBLAS run. */
- static inline size_t nprocs() noexcept {
- return 0;
- }
-
- /** @return The user process ID. */
- static inline size_t pid() noexcept {
- return SIZE_MAX;
- }
-
- /**
- * Calls a PlatformBSP \a bsp_sync.
- *
- * @param[in] msgs_in The maximum number of messages to be received across
- * \em all user processes. Default is zero.
- * @param[in] msgs_out The maximum number of messages to be sent across
- * \em all user processes. Default is zero.
- *
- * If both \a msgs_in and \a msgs_out are zero, the values will be
- * automatically inferred. This requires a second call to the PlatformBSP
- * \a bsp_sync primitive, thus increasing the latency by at least \f$ l \f$.
- *
- * If the values for \a msgs_in or \a msgs_out are underestimated, undefined
- * behaviour will occur. If this is not the case but one or more are instead
- * \a over estimated, this call will succeed as normal.
- *
- * @return grb::SUCCESS When all queued communication is executed succesfully.
- * @return grb::PANIC When an unrecoverable error occurs. When this value is
- * returned, the library enters an undefined state.
- */
- static enum RC sync( const size_t msgs_in = 0, const size_t msgs_out = 0 ) noexcept {
- (void)msgs_in;
- (void)msgs_out;
- return PANIC;
- }
+ public:
+
+ /** @return The number of user processes in this ALP run. */
+ static inline size_t nprocs() noexcept {
+ return 0;
+ }
+
+ /** @return The ID of this user process. */
+ static inline size_t pid() noexcept {
+ return SIZE_MAX;
+ }
+
+ /**
+ * \internal
+ * Provides functionalities similar to the LPF primitive \a lpf_sync,
+			 * Provides functionality similar to the LPF primitive \a lpf_sync,
+ *
+ * @param[in] msgs_in The maximum number of messages to be received across
+ * \em all user processes. Default is zero.
+ * @param[in] msgs_out The maximum number of messages to be sent across
+ * \em all user processes. Default is zero.
+ *
+ * If both \a msgs_in and \a msgs_out are zero, the values will be
+			 * automatically inferred. This requires a second call to the underlying
+			 * \a lpf_sync primitive, thus increasing the latency by at least \f$ l \f$.
+ *
+ * If the values for \a msgs_in or \a msgs_out are underestimated, undefined
+ * behaviour will occur. If this is not the case but one or more are instead
+ * \a over estimated, this call will succeed as normal.
+ *
+			 * @return grb::SUCCESS When all queued communication is executed successfully.
+ * @return grb::PANIC When an unrecoverable error occurs. When this value is
+ * returned, the library enters an undefined state.
+ *
+			 * \todo If this API is to be exposed, a mechanism for initiating
+			 *       messages should be exposed as well.
+ * \endinternal
+ */
+ static enum RC sync( const size_t msgs_in = 0, const size_t msgs_out = 0 ) noexcept {
+ (void) msgs_in;
+ (void) msgs_out;
+ return PANIC;
+ }
}; // end class ``spmd''
} // namespace grb
#endif // end _H_GRB_BASE_SPMD
+
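A minimal sketch (not part of this diff) of this interface in use from within an ALP program; grb::config::default_backend is assumed to name the compiled default backend:

	#include <iostream>
	#include <graphblas.hpp>

	void helloSPMD() {
		const size_t P = grb::spmd< grb::config::default_backend >::nprocs();
		const size_t s = grb::spmd< grb::config::default_backend >::pid();
		// single-user-process backends trivially report P = 1 and s = 0
		std::cout << "User process " << s << " out of " << P << "\n";
	}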
diff --git a/include/graphblas/base/vector.hpp b/include/graphblas/base/vector.hpp
index 3d3e2c2e5..c00ca6e53 100644
--- a/include/graphblas/base/vector.hpp
+++ b/include/graphblas/base/vector.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Specifies the ALP/GraphBLAS vector container.
+ *
* @author A. N. Yzelman
* @date 10th of August, 2016
*/
@@ -33,6 +37,7 @@
#include
#include
+
namespace grb {
/**
@@ -143,14 +148,14 @@ namespace grb {
public :
/** Standard equals operator. */
- bool operator==( const const_iterator & other ) const {
- (void)other;
+ bool operator==( const const_iterator &other ) const {
+ (void) other;
return false;
}
/** @returns The negation of operator==(). */
- bool operator!=( const const_iterator & other ) const {
- (void)other;
+ bool operator!=( const const_iterator &other ) const {
+ (void) other;
return true;
}
@@ -219,8 +224,8 @@ namespace grb {
* code sections.
*/
Vector( const size_t n, const size_t nz ) {
- (void)n;
- (void)nz;
+ (void) n;
+ (void) nz;
}
/**
@@ -228,7 +233,7 @@ namespace grb {
* above where \a nz is to taken equal to \a n.
*/
Vector( const size_t n ) {
- (void)n;
+ (void) n;
}
/**
@@ -252,7 +257,7 @@ namespace grb {
* \endparblock
*/
Vector( Vector< D, implementation, C > &&x ) noexcept {
- (void)x;
+ (void) x;
}
/**
@@ -270,8 +275,10 @@ namespace grb {
* -# this move assignment moves \f$ \Theta(1) \f$ data only.
* \endparblock
*/
- Vector< D, implementation, C >& operator=( Vector< D, implementation, C > &&x ) noexcept {
- (void)x;
+ Vector< D, implementation, C >& operator=(
+ Vector< D, implementation, C > &&x
+ ) noexcept {
+ (void) x;
return *this;
}
@@ -330,7 +337,10 @@ namespace grb {
* hence possibly causing its implicitly called constructor to
* allocate dynamic memory.
*/
- const_iterator cbegin() const {}
+ const_iterator cbegin() const {
+ const_iterator ret;
+ return ret;
+ }
/**
* Same as cbegin().
@@ -338,7 +348,11 @@ namespace grb {
* is no overloaded version of this function that returns a non-const
* iterator.
*/
- const_iterator begin() const {}
+ const_iterator begin() const {
+ const_iterator ret;
+ return ret;
+ }
+
//@}
//@{
@@ -363,7 +377,10 @@ namespace grb {
* specification disallows the same to happen for the construction of
* an iterator in end position.
*/
- const_iterator cend() const {}
+ const_iterator cend() const {
+ const_iterator ret;
+ return ret;
+ }
/**
* Same as cend().
@@ -371,7 +388,10 @@ namespace grb {
* is no overloaded version of this function that returns a non-const
* iterator.
*/
- const_iterator end() const {}
+ const_iterator end() const {
+ const_iterator ret;
+ return ret;
+ }
//@}
/**
@@ -457,12 +477,20 @@ namespace grb {
* @see grb::buildVector for the GraphBLAS standard dispatcher to this
* function.
*/
- template< Descriptor descr = descriptors::no_operation, class Accum = typename operators::right_assign< D, D, D >, typename fwd_iterator = const D * __restrict__ >
- RC build( const Accum & accum, const fwd_iterator start, const fwd_iterator end, fwd_iterator npos ) {
- (void)accum;
- (void)start;
- (void)end;
- (void)npos;
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Accum = typename operators::right_assign< D, D, D >,
+ typename fwd_iterator = const D * __restrict__
+ >
+ RC build(
+ const Accum &accum,
+ const fwd_iterator start, const fwd_iterator end,
+ fwd_iterator npos
+ ) {
+ (void) accum;
+ (void) start;
+ (void) end;
+ (void) npos;
return PANIC;
}
@@ -564,18 +592,25 @@ namespace grb {
* @see grb::buildVector for the GraphBLAS standard dispatcher to this
* function.
*/
- template< Descriptor descr = descriptors::no_operation,
+ template<
+ Descriptor descr = descriptors::no_operation,
class Accum = operators::right_assign< D, D, D >,
typename ind_iterator = const size_t * __restrict__,
typename nnz_iterator = const D * __restrict__,
- class Dup = operators::right_assign< D, D, D > >
- RC build( const Accum & accum, const ind_iterator ind_start, const ind_iterator ind_end, const nnz_iterator nnz_start, const nnz_iterator nnz_end, const Dup & dup = Dup() ) {
- (void)accum;
- (void)ind_start;
- (void)ind_end;
- (void)nnz_start;
- (void)nnz_end;
- (void)dup;
+ class Dup = operators::right_assign< D, D, D >
+ >
+ RC build(
+ const Accum &accum,
+ const ind_iterator ind_start, const ind_iterator ind_end,
+ const nnz_iterator nnz_start, const nnz_iterator nnz_end,
+ const Dup &dup = Dup()
+ ) {
+ (void) accum;
+ (void) ind_start;
+ (void) ind_end;
+ (void) nnz_start;
+ (void) nnz_end;
+ (void) dup;
return PANIC;
}
@@ -682,26 +717,30 @@ namespace grb {
* @see grb::buildVector for the GraphBLAS standard dispatcher to this
* function.
*/
- template< Descriptor descr = descriptors::no_operation,
+ template<
+ Descriptor descr = descriptors::no_operation,
typename mask_type,
class Accum,
typename ind_iterator = const size_t * __restrict__,
typename nnz_iterator = const D * __restrict__,
- class Dup = operators::right_assign< D, typename nnz_iterator::value_type, D > >
- RC build( const Vector< mask_type, implementation, C > mask,
- const Accum & accum,
+ class Dup = operators::right_assign< D, typename nnz_iterator::value_type, D >
+ >
+ RC build(
+ const Vector< mask_type, implementation, C > &mask,
+ const Accum &accum,
const ind_iterator ind_start,
const ind_iterator ind_end,
const nnz_iterator nnz_start,
const nnz_iterator nnz_end,
- const Dup & dup = Dup() ) {
- (void)mask;
- (void)accum;
- (void)ind_start;
- (void)ind_end;
- (void)nnz_start;
- (void)nnz_end;
- (void)dup;
+ const Dup &dup = Dup()
+ ) {
+ (void) mask;
+ (void) accum;
+ (void) ind_start;
+ (void) ind_end;
+ (void) nnz_start;
+ (void) nnz_end;
+ (void) dup;
return PANIC;
}
@@ -730,8 +769,8 @@ namespace grb {
* \endparblock
*/
template< typename T >
- RC size( T & size ) const {
- (void)size;
+ RC size( T &size ) const {
+ (void) size;
return PANIC;
}
@@ -760,8 +799,8 @@ namespace grb {
* \endparblock
*/
template< typename T >
- RC nnz( T & nnz ) const {
- (void)nnz;
+ RC nnz( T &nnz ) const {
+ (void) nnz;
return PANIC;
}
@@ -825,22 +864,36 @@ namespace grb {
* #lambda_reference.
*/
template< class Monoid >
- lambda_reference operator()( const size_t i, const Monoid & monoid = Monoid() ) {
- (void)i;
- (void)monoid;
+ lambda_reference operator()(
+ const size_t i, const Monoid &monoid = Monoid()
+ ) {
+ (void) i;
+ (void) monoid;
return PANIC;
}
/**
- * Returns a lambda reference to an element of this vector. The user
- * ensures that the requested reference only corresponds to a pre-existing
- * nonzero in this vector, or undefined behaviour will occur .
+ * Returns a lambda reference to an element of this vector.
+ *
+ * \warning This functionality may only be used within the body of a lambda
+ * function that is passed into #grb::eWiseLambda.
+ *
+ * The user must ensure that the requested reference only corresponds to a
+ * pre-existing nonzero in this vector.
+ *
+ * \warning Requesting a nonzero entry at a coordinate where no nonzero
+ * exists results in undefined behaviour.
*
* A lambda reference to an element of this vector is only valid when used
* inside a lambda function evaluated via grb::eWiseLambda. The lambda
- * function is called for specific indices only-- that is, the GraphBLAS
- * implementation decides at which elements to dereference this container.
- * Outside this scope the returned reference incurs undefined behaviour.
+ * function is called for specific indices only-- that is, ALP/GraphBLAS
+ * decides at which elements to dereference this container.
+ *
+ * If such a lambda function dereferences multiple vectors, then the sparsity
+ * structure of the first vector passed as an argument to #grb::eWiseLambda
+ * after the lambda function defines at which indices the vectors will be
+ * referenced. The user must ensure that all vectors dereferenced indeed have
+ * nonzeroes at every location this "leading vector" has a nonzero.
*
* \warning In particular, for the given index \a i by the lambda function,
* it shall be \em illegal to refer to indices relative to that
@@ -848,59 +901,32 @@ namespace grb {
* cetera.
*
* \note As a consequence, this function cannot be used to perform stencil
- * or halo based operations.
- *
- * If a previously non-existing entry of the vector is requested, undefined
- * behaviour will occur. Functions that are defined to work with references
- * of this kind, such as grb::eWiseLambda, define exactly which elements are
- * dereferenced.
- *
- * \warning In parallel contexts the use of a returned lambda reference
- * outside the context of an eWiseLambda will incur at least one of
- * the following ill effects: it may
- * -# fail outright,
- * -# work on stale data,
- * -# work on incorrect data, or
- * -# incur high communication costs to guarantee correctness.
- * In short, such usage causes undefined behaviour. Implementers are
- * \em not advised to provide GAS-like functionality through this
- * interface, as it invites bad programming practices and bad
- * algorithm design decisions. This operator is instead intended to
- * provide for generic BLAS1-type operations only.
+ * or halo type operations.
*
- * \note For I/O, use the iterator retrieved via cbegin() instead of
- * relying on a lambda_reference.
- *
- * @param[in] i Which element to return a lambda reference of.
- * @param[in] ring Under which generalised semiring to interpret the
- * requested \f$ i \f$th element of this vector.
+ * \note For I/O purposes, use the iterator retrieved via cbegin()
+ * instead of relying on a lambda_reference.
*
- * \note The \a ring is required to be able to interpret a sparse vector. A
- * user who is sure this vector is dense, or otherwise is able to
- * ensure that the a lambda_reference will only be requested at
- * elements where nonzeroes already exists, may refer to
- * Vector::operator[],
+ * @param[in] i Which element to return a lambda reference of.
*
* @return A lambda reference to the element \a i of this vector.
*
* \par Example.
- * See grb::eWiseLambda() for a practical and useful example.
- *
- * \warning There is no similar concept in the official GraphBLAS specs.
+ * See #grb::eWiseLambda for a practical and useful example.
*
* @see lambda_reference For more details on the returned reference type.
- * @see grb::eWiseLambda For one legal way in which to use the returned
- * #lambda_reference.
+ * @see #grb::eWiseLambda For one way to use the returned #lambda_reference.
*/
lambda_reference operator[]( const size_t i ) {
- (void)i;
- #ifndef _GRB_NO_EXCEPTIONS
- throw std::runtime_error( "Requesting lambda reference of unimplemented "
- "Vector backend." );
- #endif
+ (void) i;
+ #ifndef _GRB_NO_EXCEPTIONS
+ throw std::runtime_error(
+ "Requesting lambda reference of unimplemented Vector backend."
+ );
+ #endif
}
-}
-;
-}
+ };
+
+} // end namespace ``grb''
#endif // _H_GRB_VECTOR_BASE
+
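The leading-vector rule documented above may be illustrated with a short sketch (not part of this diff; the helper name is hypothetical and both vectors are assumed dense, so that every visited index has a nonzero in both):

	#include <graphblas.hpp>

	grb::RC axpyLike( grb::Vector< double > &x, grb::Vector< double > &y ) {
		// x is the leading vector: it defines which indices i are visited
		return grb::eWiseLambda( [&x,&y]( const size_t i ) {
				// operator[] is only legal inside an eWiseLambda body
				x[ i ] += 2.0 * y[ i ];
			}, x, y );
	}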
diff --git a/include/graphblas/benchmark.hpp b/include/graphblas/benchmark.hpp
index b0187a0b9..ccace7979 100644
--- a/include/graphblas/benchmark.hpp
+++ b/include/graphblas/benchmark.hpp
@@ -28,13 +28,19 @@
// include specialisations
#ifdef _GRB_WITH_REFERENCE
-#include "graphblas/reference/benchmark.hpp"
+ #include "graphblas/reference/benchmark.hpp"
+#endif
+#ifdef _GRB_WITH_HYPERDAGS
+ #include "graphblas/hyperdags/benchmark.hpp"
+#endif
+#ifdef _GRB_WITH_NONBLOCKING
+ #include "graphblas/nonblocking/benchmark.hpp"
#endif
#ifdef _GRB_WITH_BANSHEE
-#include "graphblas/banshee/benchmark.hpp"
+ #include "graphblas/banshee/benchmark.hpp"
#endif
#ifdef _GRB_WITH_LPF
-#include "graphblas/bsp1d/benchmark.hpp"
+ #include "graphblas/bsp1d/benchmark.hpp"
#endif
#ifdef _GRB_BACKEND
@@ -45,3 +51,4 @@ namespace grb {
#endif
#endif // end ``_H_GRB_BENCH''
+
diff --git a/include/graphblas/blas0.hpp b/include/graphblas/blas0.hpp
index b0967a322..751b2cf14 100644
--- a/include/graphblas/blas0.hpp
+++ b/include/graphblas/blas0.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Defines the ALP/GraphBLAS level-0 API
+ *
* @author A. N. Yzelman
* @date 5th of December 2016
*/
@@ -55,10 +59,12 @@
"************************************************************************" \
"**********************\n" );
+
namespace grb {
/**
- * \defgroup BLAS0 The Level-0 Basic Linear Algebra Subroutines (BLAS)
+ * \defgroup BLAS0 Level-0 Primitives
+ * \ingroup GraphBLAS
*
* A collection of functions that let GraphBLAS operators work on
* zero-dimensional containers, i.e., on scalars.
@@ -165,11 +171,13 @@ namespace grb {
* @see grb::operators::internal::Operator for a discussion on when foldr and
* foldl successfully generate in-place code.
*/
- template< Descriptor descr = descriptors::no_operation,
+ template<
+ Descriptor descr = descriptors::no_operation,
class OP,
typename InputType1, typename InputType2, typename OutputType
>
- static enum RC apply( OutputType &out,
+ static enum RC apply(
+ OutputType &out,
const InputType1 &x,
const InputType2 &y,
const OP &op = OP(),
@@ -178,7 +186,7 @@ namespace grb {
!grb::is_object< InputType1 >::value &&
!grb::is_object< InputType2 >::value &&
!grb::is_object< OutputType >::value,
- void >::type * = NULL
+ void >::type * = nullptr
) {
// static sanity check
NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) || (
@@ -272,15 +280,26 @@ namespace grb {
* @see grb::operators::internal Operator for a discussion on fold-right
* capable operators and on stateful operators.
*/
- template< Descriptor descr = descriptors::no_operation, class OP, typename InputType, typename IOType >
- static RC foldr( const InputType & x,
- IOType & y,
- const OP & op = OP(),
- const typename std::enable_if< grb::is_operator< OP >::value && ! grb::is_object< InputType >::value && ! grb::is_object< IOType >::value, void >::type * = NULL ) {
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP, typename InputType, typename IOType
+ >
+ static RC foldr(
+ const InputType &x,
+ IOType &y,
+ const OP &op = OP(),
+ const typename std::enable_if<
+ grb::is_operator< OP >::value &&
+ !grb::is_object< InputType >::value &&
+ !grb::is_object< IOType >::value, void
+ >::type * = nullptr
+ ) {
// static sanity check
- NO_CAST_ASSERT( ( ! ( descr & descriptors::no_casting ) ||
- ( std::is_same< InputType, typename OP::D1 >::value && std::is_same< IOType, typename OP::D2 >::value && std::is_same< IOType, typename OP::D3 >::value ) ),
- "grb::foldr (BLAS level 0)",
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || (
+ std::is_same< InputType, typename OP::D1 >::value &&
+ std::is_same< IOType, typename OP::D2 >::value &&
+ std::is_same< IOType, typename OP::D3 >::value
+ ) ), "grb::foldr (BLAS level 0)",
"Argument value types do not match operator domains while no_casting "
"descriptor was set" );
@@ -364,8 +383,13 @@ namespace grb {
* @see grb::operators::internal Operator for a discussion on fold-right
* capable operators and on stateful operators.
*/
- template< Descriptor descr = descriptors::no_operation, class OP, typename InputType, typename IOType >
- static RC foldl( IOType &x,
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP,
+ typename InputType, typename IOType
+ >
+ static RC foldl(
+ IOType &x,
const InputType &y,
const OP &op = OP(),
const typename std::enable_if< grb::is_operator< OP >::value &&
@@ -410,46 +434,89 @@ namespace grb {
* @tparam Enabled Controls, through SFINAE, whether the use of the
* #use_index descriptor is allowed at all.
*/
- template< grb::Descriptor descr, typename OutputType, typename D, typename Enabled = void >
+ template<
+ grb::Descriptor descr,
+ typename OutputType, typename D,
+ typename Enabled = void
+ >
class ValueOrIndex;
/* Version where use_index is allowed. */
template< grb::Descriptor descr, typename OutputType, typename D >
- class ValueOrIndex< descr, OutputType, D, typename std::enable_if< std::is_arithmetic< OutputType >::value && ! std::is_same< D, void >::value >::type > {
- private:
- static constexpr const bool use_index = descr & grb::descriptors::use_index;
- static_assert( use_index || std::is_convertible< D, OutputType >::value, "Cannot convert to the requested output type" );
-
- public:
- static OutputType getFromArray( const D * __restrict__ const x, const std::function< size_t( size_t ) > & src_local_to_global, const size_t index ) noexcept {
- if( use_index ) {
- return static_cast< OutputType >( src_local_to_global( index ) );
- } else {
- return static_cast< OutputType >( x[ index ] );
+ class ValueOrIndex<
+ descr,
+ OutputType, D,
+ typename std::enable_if<
+ std::is_arithmetic< OutputType >::value &&
+ !std::is_same< D, void >::value
+ >::type
+ > {
+
+ private:
+
+ static constexpr const bool use_index = descr & grb::descriptors::use_index;
+
+ static_assert( use_index || std::is_convertible< D, OutputType >::value,
+ "Cannot convert to the requested output type" );
+
+
+ public:
+
+ static OutputType getFromArray(
+ const D * __restrict__ const x,
+ const std::function< size_t( size_t ) > &src_local_to_global,
+ const size_t index
+ ) noexcept {
+ if( use_index ) {
+ return static_cast< OutputType >( src_local_to_global( index ) );
+ } else {
+ return static_cast< OutputType >( x[ index ] );
+ }
}
- }
- static OutputType getFromScalar( const D &x, const size_t index ) noexcept {
- if( use_index ) {
- return static_cast< OutputType >( index );
- } else {
- return static_cast< OutputType >( x );
+
+ static OutputType getFromScalar( const D &x, const size_t index ) noexcept {
+ if( use_index ) {
+ return static_cast< OutputType >( index );
+ } else {
+ return static_cast< OutputType >( x );
+ }
}
- }
+
};
/* Version where use_index is not allowed. */
template< grb::Descriptor descr, typename OutputType, typename D >
- class ValueOrIndex< descr, OutputType, D, typename std::enable_if< ! std::is_arithmetic< OutputType >::value && ! std::is_same< OutputType, void >::value >::type > {
- static_assert( ! ( descr & descriptors::use_index ), "use_index descriptor given while output type is not numeric" );
- static_assert( std::is_convertible< D, OutputType >::value, "Cannot convert input to the given output type" );
-
- public:
- static OutputType getFromArray( const D * __restrict__ const x, const std::function< size_t( size_t ) > &, const size_t index ) noexcept {
- return static_cast< OutputType >( x[ index ] );
- }
- static OutputType getFromScalar( const D &x, const size_t ) noexcept {
- return static_cast< OutputType >( x );
- }
+ class ValueOrIndex<
+ descr,
+ OutputType, D,
+ typename std::enable_if<
+ !std::is_arithmetic< OutputType >::value &&
+ !std::is_same< OutputType, void >::value
+ >::type
+ > {
+
+ static_assert( !(descr & descriptors::use_index),
+ "use_index descriptor given while output type is not numeric" );
+
+ static_assert( std::is_convertible< D, OutputType >::value,
+ "Cannot convert input to the given output type" );
+
+ public:
+
+ static OutputType getFromArray(
+ const D * __restrict__ const x,
+ const std::function< size_t( size_t ) > &,
+ const size_t index
+ ) noexcept {
+ return static_cast< OutputType >( x[ index ] );
+ }
+
+ static OutputType getFromScalar(
+ const D &x, const size_t
+ ) noexcept {
+ return static_cast< OutputType >( x );
+ }
+
};
/**
@@ -472,32 +539,69 @@ namespace grb {
* operator version is used instead.
*/
- template< bool identity_left, typename OutputType, typename InputType, template< typename > class Identity, typename Enabled = void >
+ template<
+ bool identity_left,
+ typename OutputType, typename InputType,
+ template< typename > class Identity,
+ typename Enabled = void
+ >
class CopyOrApplyWithIdentity;
/* The cast-and-assign version */
- template< bool identity_left, typename OutputType, typename InputType, template< typename > class Identity >
- class CopyOrApplyWithIdentity< identity_left, OutputType, InputType, Identity, typename std::enable_if< std::is_convertible< InputType, OutputType >::value >::type > {
- public:
- template< typename Operator >
- static void set( OutputType & out, const InputType & in, const Operator & ) {
- out = static_cast< OutputType >( in );
- }
+ template<
+ bool identity_left,
+ typename OutputType, typename InputType,
+ template< typename > class Identity
+ >
+ class CopyOrApplyWithIdentity<
+ identity_left,
+ OutputType, InputType,
+ Identity,
+ typename std::enable_if<
+ std::is_convertible< InputType, OutputType >::value
+ >::type
+ > {
+
+ public:
+
+ template< typename Operator >
+ static void set( OutputType &out, const InputType &in, const Operator & ) {
+ out = static_cast< OutputType >( in );
+ }
+
};
/* The operator with identity version */
- template< bool identity_left, typename OutputType, typename InputType, template< typename > class Identity >
- class CopyOrApplyWithIdentity< identity_left, OutputType, InputType, Identity, typename std::enable_if< ! std::is_convertible< InputType, OutputType >::value >::type > {
- public:
- template< typename Operator >
- static void set( OutputType & out, const InputType & in, const Operator & op ) {
- const auto identity = identity_left ? Identity< typename Operator::D1 >::value() : Identity< typename Operator::D2 >::value();
- if( identity_left ) {
- (void)grb::apply( out, identity, in, op );
- } else {
- (void)grb::apply( out, in, identity, op );
+ template<
+ bool identity_left,
+ typename OutputType, typename InputType,
+ template< typename > class Identity
+ >
+ class CopyOrApplyWithIdentity<
+ identity_left,
+ OutputType, InputType,
+ Identity,
+ typename std::enable_if<
+ !std::is_convertible< InputType, OutputType >::value
+ >::type
+ > {
+
+ public:
+
+ template< typename Operator >
+ static void set(
+ OutputType &out, const InputType &in, const Operator &op
+ ) {
+ const auto identity = identity_left ?
+ Identity< typename Operator::D1 >::value() :
+ Identity< typename Operator::D2 >::value();
+ if( identity_left ) {
+ (void) grb::apply( out, identity, in, op );
+ } else {
+ (void) grb::apply( out, in, identity, op );
+ }
}
- }
+
};
} // namespace internal
@@ -507,3 +611,4 @@ namespace grb {
#undef NO_CAST_ASSERT
#endif // end ``_H_GRB_BLAS0''
+
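A small sketch (not part of this diff) of the level-0 primitives acting on scalars; the values are illustrative, and grb::operators::add and grb::operators::mul are assumed available from the standard ALP operator set:

	#include <cassert>
	#include <graphblas.hpp>

	void scalarExample() {
		double out = 0.0;
		const double a = 1.5, b = 2.5;
		// out = a + b
		grb::RC rc = grb::apply( out, a, b, grb::operators::add< double >() );
		// out = out * a, folded in-place from the left
		if( rc == grb::SUCCESS ) {
			rc = grb::foldl( out, a, grb::operators::mul< double >() );
		}
		assert( rc == grb::SUCCESS );
		assert( out == 6.0 ); // (1.5 + 2.5) * 1.5, exact in IEEE arithmetic
	}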
diff --git a/include/graphblas/blas1.hpp b/include/graphblas/blas1.hpp
index 9b796bee7..e28c9e8ad 100644
--- a/include/graphblas/blas1.hpp
+++ b/include/graphblas/blas1.hpp
@@ -28,6 +28,12 @@
#ifdef _GRB_WITH_REFERENCE
#include <graphblas/reference/blas1.hpp>
#endif
+#ifdef _GRB_WITH_HYPERDAGS
+ #include <graphblas/hyperdags/blas1.hpp>
+#endif
+#ifdef _GRB_WITH_NONBLOCKING
+ #include "graphblas/nonblocking/blas1.hpp"
+#endif
#ifdef _GRB_WITH_BANSHEE
#include <graphblas/banshee/blas1.hpp>
#endif
diff --git a/include/graphblas/blas2.hpp b/include/graphblas/blas2.hpp
index e44d311a1..2a0b1338e 100644
--- a/include/graphblas/blas2.hpp
+++ b/include/graphblas/blas2.hpp
@@ -33,6 +33,12 @@
#ifdef _GRB_WITH_REFERENCE
#include <graphblas/reference/blas2.hpp>
#endif
+#ifdef _GRB_WITH_HYPERDAGS
+ #include <graphblas/hyperdags/blas2.hpp>
+#endif
+#ifdef _GRB_WITH_NONBLOCKING
+ #include "graphblas/nonblocking/blas2.hpp"
+#endif
#ifdef _GRB_WITH_BANSHEE
#include <graphblas/banshee/blas2.hpp>
#endif
diff --git a/include/graphblas/blas3.hpp b/include/graphblas/blas3.hpp
index 3b485851f..6ed90264b 100644
--- a/include/graphblas/blas3.hpp
+++ b/include/graphblas/blas3.hpp
@@ -28,10 +28,17 @@
// now include all specialisations contained in the backend directories:
#ifdef _GRB_WITH_REFERENCE
-#include <graphblas/reference/blas3.hpp>
+ #include <graphblas/reference/blas3.hpp>
+#endif
+#ifdef _GRB_WITH_HYPERDAGS
+ #include <graphblas/hyperdags/blas3.hpp>
+#endif
+#ifdef _GRB_WITH_NONBLOCKING
+ #include "graphblas/nonblocking/blas3.hpp"
#endif
#ifdef _GRB_WITH_LPF
-#include <graphblas/bsp1d/blas3.hpp>
+ #include <graphblas/bsp1d/blas3.hpp>
#endif
#endif // end _H_GRB_BLAS3
+
diff --git a/include/graphblas/bsp/collectives.hpp b/include/graphblas/bsp/collectives.hpp
index 6c1e28db3..098f7f738 100644
--- a/include/graphblas/bsp/collectives.hpp
+++ b/include/graphblas/bsp/collectives.hpp
@@ -98,14 +98,16 @@ namespace grb {
* This function may place an alloc of \f$ P\mathit{sizeof}(IOType) \f$ bytes
* if the internal buffer was not sufficiently large.
*/
- template< Descriptor descr = descriptors::no_operation, typename Operator, typename IOType >
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename Operator, typename IOType
+ >
static RC allreduce( IOType &inout, const Operator &op = Operator() ) {
// this is the serial algorithm only
// TODO internal issue #19
#ifdef _DEBUG
- std::cout << "Entered grb::collectives< BSP1D >::allreduce with "
- "inout = "
- << inout << " and op = " << &op << std::endl;
+ std::cout << "Entered grb::collectives< BSP1D >::allreduce with inout = "
+ << inout << " and op = " << &op << std::endl;
#endif
// static sanity check
@@ -303,6 +305,17 @@ namespace grb {
* On output at non-root processes: the value at root.
*
* \parblock
+ * \par Performance semantics: common
+		 * Whether system calls will happen depends on the LPF engine compiled
+		 * with, as does whether buffer space proportional to the payload size is
+		 * required. In principle, when using a fabric like InfiniBand together
+		 * with the LPF ibverbs engine, the intended IB zero-copy behaviour is
+		 * attained.
+		 *
+		 * None of the below variants, in any backend, shall result in dynamic
+		 * memory allocations.
+ * \endparblock
+ *
+ * \parblock
* \par Performance semantics: serial
* -# Problem size N: \f$ \mathit{sizeof}(\mathit{IOType}) \f$
* -# local work: \f$ 0 \f$ ;
@@ -310,20 +323,21 @@ namespace grb {
* -# BSP cost: \f$ NPg + l \f$;
* \endparblock
*
- * \par Performance semantics: two hase
+ * \parblock
+ * \par Performance semantics: two phase
* -# Problem size N: \f$ \mathit{sizeof}(\mathit{IOType}) \f$
* -# local work: \f$ 0 \f$ ;
* -# transferred bytes: \f$ 2N \f$ ;
* -# BSP cost: \f$ 2(Ng + l) \f$;
* \endparblock
*
+ * \parblock
* \par Performance semantics: two level tree
* -# Problem size N: \f$ \mathit{sizeof}(\mathit{IOType}) \f$
* -# local work: \f$ 0 \f$ ;
* -# transferred bytes: \f$ 2\sqrt{P}N \f$ ;
* -# BSP cost: \f$ 2(\sqrt{P}Ng + l) \f$;
* \endparblock
- *
*/
template< typename IOType >
static RC broadcast( IOType & inout, const lpf_pid_t root = 0 ) {
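A sketch (not part of this diff) of these collectives in use, assuming an LPF-enabled build and the BSP1D backend:

	#include <graphblas.hpp>

	grb::RC reduceThenShare( double &value ) {
		// after this call, every user process holds the global sum
		grb::RC rc = grb::collectives< grb::BSP1D >::allreduce(
			value, grb::operators::add< double >() );
		if( rc != grb::SUCCESS ) { return rc; }
		// every user process now receives the value held at the root (PID 0)
		return grb::collectives< grb::BSP1D >::broadcast( value, 0 );
	}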
diff --git a/include/graphblas/bsp/config.hpp b/include/graphblas/bsp/config.hpp
index 907dd659e..5672673e8 100644
--- a/include/graphblas/bsp/config.hpp
+++ b/include/graphblas/bsp/config.hpp
@@ -27,30 +27,38 @@
#include
+
namespace grb {
+
namespace config {
/**
* Lightweight Parallel Foundations defaults.
*/
class LPF {
- public:
- /**
- * Return the default number of memory registrations used by GraphBLAS.
- */
- static constexpr size_t regs() {
- return 500;
- }
-
- /**
- * Return the default maximum h relation expressed in the number of messages
- * (instead of bytes) used by GraphBLAS.
- */
- static constexpr size_t maxh() {
- return 200;
- }
+
+ public:
+
+ /**
+ * Return the default number of memory registrations used by GraphBLAS.
+ */
+ static constexpr size_t regs() {
+ return 500;
+ }
+
+ /**
+ * Return the default maximum h relation expressed in the number of messages
+ * (instead of bytes) used by GraphBLAS.
+ */
+ static constexpr size_t maxh() {
+ return 200;
+ }
+
};
+
} // namespace config
+
} // namespace grb
#endif
+
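These defaults are compile-time constants and may be read directly; a trivial sketch (not part of this diff):

	#include <cstddef>
	#include <graphblas/bsp/config.hpp>

	constexpr std::size_t default_regs = grb::config::LPF::regs(); // 500
	constexpr std::size_t default_maxh = grb::config::LPF::maxh(); // 200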
diff --git a/include/graphblas/bsp1d/benchmark.hpp b/include/graphblas/bsp1d/benchmark.hpp
index c88e83f67..2fdb91ac5 100644
--- a/include/graphblas/bsp1d/benchmark.hpp
+++ b/include/graphblas/bsp1d/benchmark.hpp
@@ -31,6 +31,7 @@
#include "exec.hpp"
+
namespace grb {
namespace internal {
@@ -332,15 +333,23 @@ namespace grb {
public:
- Benchmarker( const MPI_Comm comm = MPI_COMM_WORLD ) : Launcher< FROM_MPI, BSP1D >( comm ) {}
+ Benchmarker( const MPI_Comm comm = MPI_COMM_WORLD ) :
+ Launcher< FROM_MPI, BSP1D >( comm )
+ {}
template< typename U >
- RC exec( void ( *grb_program )( const void *, const size_t, U & ),
+ RC exec(
+ void ( *grb_program )( const void *, const size_t, U & ),
const void * data_in, const size_t in_size,
U &data_out,
const size_t inner, const size_t outer,
const bool broadcast = false
) const {
+ // check arguments
+ if( in_size > 0 && data_in == nullptr ) {
+ return ILLEGAL;
+ }
+
// prepare packed input
struct internal::packedBenchmarkerInput input;
input.blob = data_in;
@@ -354,7 +363,8 @@ namespace grb {
lpf_args_t args;
fargs[ 0 ] = reinterpret_cast< lpf_func_t >( benchmark< U > );
fargs[ 1 ] = reinterpret_cast< lpf_func_t >( grb_program );
- args = { &input, sizeof( struct internal::packedBenchmarkerInput ),
+ args = {
+ &input, sizeof( struct internal::packedBenchmarkerInput ),
&data_out, sizeof( U ),
fargs, 2
};
@@ -373,8 +383,9 @@ namespace grb {
}
template< typename T, typename U >
- RC exec( void ( *grb_program )( const T &, U & ), // user GraphBLAS program
- const T & data_in, U &data_out, // input & output data
+ RC exec(
+ void ( *grb_program )( const T &, U & ), // user program
+ const T &data_in, U &data_out, // input & output data
const size_t inner, const size_t outer,
const bool broadcast = false
) {
@@ -420,7 +431,8 @@ namespace grb {
public:
- Benchmarker( const size_t process_id = 0, // user process ID
+ Benchmarker(
+ const size_t process_id = 0, // user process ID
const size_t nprocs = 1, // total number of user processes
const std::string hostname = "localhost", // one of the process' hostnames
const std::string port = "0", // a free port at hostname
@@ -430,12 +442,18 @@ namespace grb {
) {}
template< typename U >
- enum RC exec( void ( *grb_program )( const void *, const size_t, U & ),
+ enum RC exec(
+ void ( *grb_program )( const void *, const size_t, U & ),
const void * data_in, const size_t in_size,
U &data_out,
const size_t inner, const size_t outer,
const bool broadcast = false
) const {
+ // check input arguments
+ if( in_size > 0 && data_in == nullptr ) {
+ return ILLEGAL;
+ }
+
// prepare packed input
struct internal::packedBenchmarkerInput input;
input.blob = data_in;
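A sketch (not part of this diff) of the typed benchmarking interface in automatic mode; Input, Output, and myProgram are hypothetical:

	#include <graphblas.hpp>
	#include <graphblas/benchmark.hpp>

	struct Input { size_t n; };
	struct Output { grb::RC rc; };

	void myProgram( const Input &in, Output &out ) {
		grb::Vector< double > x( in.n );
		out.rc = grb::set( x, 1.0 );
	}

	int main() {
		grb::Benchmarker< grb::AUTOMATIC > bench;
		Input in; in.n = 1000;
		Output out;
		// each measurement times 10 inner calls; statistics are over 5 outer runs
		if( bench.exec( &myProgram, in, out, 10, 5, true ) != grb::SUCCESS ) {
			return 1;
		}
		return out.rc == grb::SUCCESS ? 0 : 2;
	}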
diff --git a/include/graphblas/bsp1d/blas1.hpp b/include/graphblas/bsp1d/blas1.hpp
index 51d25a96e..7455a4679 100644
--- a/include/graphblas/bsp1d/blas1.hpp
+++ b/include/graphblas/bsp1d/blas1.hpp
@@ -289,6 +289,52 @@ namespace grb {
return foldl< descr >( x, y, empty_mask, monoid );
}
+ /** No implementation notes. */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Operator,
+ typename IOType, typename Coords, typename InputType
+ >
+ RC foldr(
+ const InputType &alpha,
+ Vector< IOType, BSP1D, Coords > &y,
+ const Operator &op,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< InputType >::value &&
+ grb::is_operator< Operator >::value, void
+ >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+			std::is_same< InputType, typename Operator::D1 >::value ), "grb::foldr",
+			"called with an input scalar value type that does not match the first "
+			"domain of the given operator" );
+		NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+			std::is_same< IOType, typename Operator::D2 >::value ), "grb::foldr",
+			"called with an I/O value type that does not match the second domain of "
+			"the given operator" );
+		NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+			std::is_same< IOType, typename Operator::D3 >::value ), "grb::foldr",
+			"called with an I/O value type that does not match the third domain of "
+			"the given operator" );
+
+ // dynamic checks
+ const size_t n = size( y );
+ if( (descr & descriptors::dense) ) {
+ if( nnz( y ) < n ) {
+ return ILLEGAL;
+ }
+ }
+
+ // nonzero structure remains unchanged, so just dispatch
+ RC ret = foldr< descr >( alpha, internal::getLocal( y ), op, phase );
+ assert( ret == SUCCESS );
+ if( ret != SUCCESS ) {
+ ret = PANIC;
+ }
+ return ret;
+ }
+
/** \internal No implementation notes. */
template<
Descriptor descr = descriptors::no_operation, class Monoid,
@@ -329,6 +375,7 @@ namespace grb {
) {
return SUCCESS;
}
+
// simply delegate to reference implementation will yield correct result
RC ret = foldr< descr >( alpha, internal::getLocal( y ), monoid, phase );
if( !config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() ) {
@@ -347,7 +394,7 @@ namespace grb {
ret == FAILED
) {
const RC subrc = internal::updateNnz( y );
- if( subrc != SUCCESS ) { ret = FAILED; }
+ if( subrc != SUCCESS ) { ret = PANIC; }
}
}
@@ -373,7 +420,7 @@ namespace grb {
NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
std::is_same< IOType, typename Operator::D2 >::value ), "grb::foldr",
"called with an I/O value type that does not match the second domain of "
- "the given operator " );
+ "the given operator" );
NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
std::is_same< InputType, typename Operator::D1 >::value ), "grb::foldr",
"called with an input vector value type that does not match the first "
@@ -429,7 +476,7 @@ namespace grb {
NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
std::is_same< IOType, typename Operator::D1 >::value ), "grb::foldl",
"called with an I/O value type that does not match the first domain of "
- "the given operator " );
+ "the given operator" );
NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
std::is_same< InputType, typename Operator::D2 >::value ), "grb::foldl",
"called with an input vector value type that does not match the second "
@@ -440,9 +487,10 @@ namespace grb {
"the given operator" );
// dynamic checks
- if( nnz( x ) < size( x ) ) {
- // note: this illegal no matter whether the dense descriptor is given
- return ILLEGAL;
+ if( descr & descriptors::dense ) {
+ if( nnz( x ) < size( x ) ) {
+ return ILLEGAL;
+ }
}
// nonzero structure remains unchanged, so just dispatch
@@ -456,7 +504,8 @@ namespace grb {
/** No implementation notes. */
template<
- Descriptor descr = descriptors::no_operation, class Monoid,
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
typename IOType, typename Coords, typename InputType
>
RC foldl(
@@ -520,6 +569,166 @@ namespace grb {
return ret;
}
+ /** No implementation notes. */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Operator,
+ typename IOType, typename MaskType,
+ typename Coords, typename InputType
+ >
+ RC foldl(
+ Vector< IOType, BSP1D, Coords > &x,
+ Vector< MaskType, BSP1D, Coords > &mask,
+ const InputType &beta,
+ const Operator &op,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< InputType >::value &&
+ grb::is_operator< Operator >::value, void
+ >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< IOType, typename Operator::D1 >::value ), "grb::foldl",
+ "called with an I/O value type that does not match the first domain of "
+ "the given operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType, typename Operator::D2 >::value ), "grb::foldl",
+			"called with an input scalar value type that does not match the second "
+ "domain of the given operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< IOType, typename Operator::D3 >::value ), "grb::foldl",
+ "called with an I/O value type that does not match the third domain of "
+ "the given operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< MaskType, bool >::value ), "grb::foldl",
+ "called with a mask value type that is not Boolean" );
+
+ // check trivial dispatch
+ if( size( mask ) == 0 ) {
+ return foldl< descr >( x, beta, op, phase );
+ }
+
+ // dynamic checks
+ const size_t n = size( x );
+ if( size( mask ) != n ) {
+ return MISMATCH;
+ }
+ if( (descr & descriptors::dense) ) {
+ if( nnz( x ) < n ) {
+ return ILLEGAL;
+ }
+ if( nnz( mask ) < n ) {
+ return ILLEGAL;
+ }
+ }
+
+ // nonzero structure remains unchanged, so just dispatch
+ RC ret = foldl< descr >( internal::getLocal( x ), internal::getLocal( mask ),
+ beta, op, phase );
+ assert( ret == SUCCESS );
+ if( ret != SUCCESS ) {
+ ret = PANIC;
+ }
+ return ret;
+ }
+
+ /** No implementation notes. */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename IOType, typename MaskType,
+ typename Coords, typename InputType
+ >
+ RC foldl(
+ Vector< IOType, BSP1D, Coords > &x,
+ Vector< MaskType, BSP1D, Coords > &mask,
+ const InputType &beta,
+ const Monoid &monoid,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< InputType >::value &&
+ grb::is_monoid< Monoid >::value, void
+ >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< IOType, typename Monoid::D1 >::value ), "grb::foldl",
+ "called with an I/O value type that does not match the first domain of "
+ "the given monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType, typename Monoid::D2 >::value ), "grb::foldl",
+			"called with an input scalar value type that does not match the second "
+ "domain of the given monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< IOType, typename Monoid::D3 >::value ), "grb::foldl",
+ "called with an I/O value type that does not match the third domain of "
+ "the given monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< MaskType, bool >::value ), "grb::foldl",
+ "called with a mask value type that is not Boolean" );
+
+ // check trivial dispatch
+ if( size( mask ) == 0 ) {
+ return foldl< descr >( x, beta, monoid, phase );
+ }
+
+ // dynamic checks
+ const size_t n = size( x );
+ if( size( mask ) != n ) {
+ return MISMATCH;
+ }
+ if( descr & descriptors::dense ) {
+ if( nnz( x ) < n ) {
+ return ILLEGAL;
+ }
+ if( nnz( mask ) < n ) {
+ return ILLEGAL;
+ }
+ }
+
+ // check for trivial resize
+ if( config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() &&
+ phase == RESIZE
+ ) {
+ return SUCCESS;
+ }
+
+ // delegate
+ RC ret = foldl< descr >( internal::getLocal( x ), internal::getLocal( mask ),
+ beta, monoid, phase );
+ if( !config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() ) {
+ if( collectives< BSP1D >::allreduce(
+ ret, grb::operators::any_or< RC >()
+ ) != SUCCESS ) {
+ return PANIC;
+ }
+ }
+
+ // handle try and execute
+ if( phase != RESIZE ) {
+ assert( phase == EXECUTE || phase == TRY );
+ if( ret == SUCCESS ) {
+ if( nnz( mask ) == n &&
+ (descr & descriptors::structural) &&
+ !(descr & descriptors::invert_mask)
+ ) {
+ internal::setDense( x );
+ } else if( nnz( mask ) == 0 && (descr & descriptors::invert_mask) ) {
+ internal::setDense( x );
+ } else {
+ const RC subrc = internal::updateNnz( x );
+ if( subrc != SUCCESS ) { ret = PANIC; }
+ }
+ } else if( ret == FAILED ) {
+ assert( phase == TRY );
+ const RC subrc = internal::updateNnz( x );
+ if( subrc != SUCCESS ) { ret = PANIC; }
+ }
+ }
+
+ // done
+ return ret;
+ }
+
/**
* \internal Number of nonzeroes in \a x cannot change, hence no
* synchronisation required.
@@ -542,7 +751,7 @@ namespace grb {
NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
std::is_same< IOType, typename Operator::D1 >::value ), "grb::foldl",
"called with an I/O value type that does not match the first domain of "
- "the given operator " );
+ "the given operator" );
NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
std::is_same< InputType, typename Operator::D2 >::value ), "grb::foldl",
"called with an input vector value type that does not match the second "
@@ -657,57 +866,57 @@ namespace grb {
return ret;
}
- /** \internal No communication necessary, output is guaranteed dense. */
+		/** \internal No implementation notes. */
template<
- Descriptor descr = descriptors::no_operation,
- class Operator,
- typename OutputType, typename InputType1, typename InputType2,
+ Descriptor descr = descriptors::no_operation, class OP,
+ typename IOType, typename MaskType, typename InputType,
typename Coords
>
- RC eWiseApply(
- Vector< OutputType, BSP1D, Coords > &z,
- const Vector< InputType1, BSP1D, Coords > &x,
- const InputType2 beta,
- const Operator &op,
+ RC foldl(
+ Vector< IOType, BSP1D, Coords > &x,
+ const Vector< MaskType, BSP1D, Coords > &m,
+ const Vector< InputType, BSP1D, Coords > &y,
+ const OP &op = OP(),
const Phase &phase = EXECUTE,
- const typename std::enable_if< !grb::is_object< OutputType >::value &&
- !grb::is_object< InputType1 >::value &&
- !grb::is_object< InputType2 >::value &&
- grb::is_operator< Operator >::value, void
- >::type * const = nullptr
+ const typename std::enable_if< grb::is_operator< OP >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType >::value, void
+ >::type * = nullptr
) {
-#ifdef _DEBUG
- std::cerr << "In BSP1D unmasked eWiseApply (operator-based), "
- "[T1]<-[T2]<-T3\n";
-#endif
-
- // static checks
+ // static sanity checks
NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
- std::is_same< InputType1, typename Operator::D1 >::value ),
- "grb::eWiseApply",
- "called with a left-hand input vector value type that does not match the "
- "first domain of the given operator " );
+ std::is_same< typename OP::D1, IOType >::value ),
+ "grb::foldl",
+ "called with a vector x of a type that does not match the first domain "
+ "of the given operator" );
NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
- std::is_same< InputType2, typename Operator::D2 >::value ),
- "grb::eWiseApply",
- "called with a right-hand input vector value type that does not match the second "
- "domain of the given operator" );
+ std::is_same< typename OP::D2, InputType >::value ),
+ "grb::foldl",
+ "called on a vector y of a type that does not match the second domain "
+ "of the given operator" );
NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
- std::is_same< OutputType, typename Operator::D3 >::value ),
- "grb::eWiseApply",
- "called with an output value type that does not match the third domain of "
- "the given operator" );
+ std::is_same< typename OP::D3, IOType >::value ),
+ "grb::foldl",
+ "called on a vector x of a type that does not match the third domain "
+ "of the given operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ),
+ "grb::foldl",
+			"called with a mask that does not have boolean entries" );
- // dynamic checks
- const size_t n = size( z );
- if( size( x ) != n ) {
- return MISMATCH;
+ // catch empty mask
+ if( size( m ) == 0 ) {
+ return foldl< descr >( x, y, op, phase );
}
- if( nnz( x ) < n ) {
- return ILLEGAL;
+
+ // dynamic sanity checks
+ const size_t n = size( x );
+ if( n != size( y ) || n != size( m ) ) {
+ return MISMATCH;
}
- // catch trivial resize
+ // handle trivial resize phase
if( config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() &&
phase == RESIZE
) {
@@ -715,8 +924,12 @@ namespace grb {
}
// delegate
- RC ret = eWiseApply< descr >( internal::getLocal( z ),
- internal::getLocal( x ), beta, op, phase );
+ RC ret = foldl< descr >(
+ internal::getLocal( x ), internal::getLocal( m ),
+ internal::getLocal( y ),
+ op, phase
+ );
+
if( !config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() ) {
if( collectives< BSP1D >::allreduce(
ret, grb::operators::any_or< RC >()
@@ -725,20 +938,12 @@ namespace grb {
}
}
- // handle try and execute
- if( phase == TRY ) {
- if( ret == SUCCESS || ret == FAILED ) {
- const RC subrc = internal::updateNnz( z );
- if( subrc != SUCCESS ) {
- if( ret == SUCCESS ) { ret = subrc; }
- else { ret = PANIC; }
- }
- }
- } else if( phase == EXECUTE ) {
+ // handle try and execute phases
+ if( phase != RESIZE ) {
if( ret == SUCCESS ) {
- internal::setDense( z );
+ ret = internal::updateNnz( x );
} else if( ret == FAILED ) {
- const RC subrc = internal::updateNnz( z );
+ const RC subrc = internal::updateNnz( x );
if( subrc != SUCCESS ) { ret = PANIC; }
}
}
@@ -747,7 +952,473 @@ namespace grb {
return ret;
}
- /** \internal No communication necessary, output is guaranteed dense. */
+ /** \internal No implementation notes. */
+ template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename IOType, typename MaskType, typename InputType,
+ typename Coords
+ >
+ RC foldl(
+ Vector< IOType, BSP1D, Coords > &x,
+ const Vector< MaskType, BSP1D, Coords > &m,
+ const Vector< InputType, BSP1D, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< grb::is_monoid< Monoid >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType >::value, void
+ >::type * = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D1, IOType >::value ),
+ "grb::foldl",
+ "called with a vector x of a type that does not match the first domain "
+ "of the given operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D2, InputType >::value ),
+ "grb::foldl",
+ "called on a vector y of a type that does not match the second domain "
+ "of the given operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D3, IOType >::value ),
+ "grb::foldl",
+ "called on a vector x of a type that does not match the third domain "
+ "of the given operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ),
+ "grb::foldl",
+ "called with a mask that does not have boolean entries" );
+
+ // catch empty mask
+ if( size( m ) == 0 ) {
+ return foldl< descr >( x, y, monoid, phase );
+ }
+
+ // dynamic sanity checks
+ const size_t n = size( x );
+ if( n != size( y ) || n != size( m ) ) {
+ return MISMATCH;
+ }
+
+ // handle trivial resize phase
+ if( config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() &&
+ phase == RESIZE
+ ) {
+ return SUCCESS;
+ }
+
+ // delegate
+ RC ret = foldl< descr >(
+ internal::getLocal( x ), internal::getLocal( m ),
+ internal::getLocal( y ),
+ monoid, phase
+ );
+
+ if( !config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() ) {
+ if( collectives< BSP1D >::allreduce(
+ ret, grb::operators::any_or< RC >()
+ ) != SUCCESS ) {
+ return PANIC;
+ }
+ }
+
+ // handle try and execute phases
+ if( phase != RESIZE ) {
+ if( ret == SUCCESS ) {
+ ret = internal::updateNnz( x );
+ } else if( ret == FAILED ) {
+ const RC subrc = internal::updateNnz( x );
+ if( subrc != SUCCESS ) { ret = PANIC; }
+ }
+ }
+
+ // done
+ return ret;
+ }
+
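+ /*
+  * Usage sketch for the masked foldl variants above (illustrative only;
+  * assumes double-valued vectors and the standard plus monoid; the names
+  * x, m, y, and n are placeholders):
+  *
+  *   grb::Vector< double > x( n ), y( n );
+  *   grb::Vector< bool > m( n );
+  *   grb::Monoid<
+  *     grb::operators::add< double >, grb::identities::zero
+  *   > plusMonoid;
+  *   // x[ i ] = x[ i ] + y[ i ] wherever m[ i ] evaluates true
+  *   grb::RC rc = grb::foldl( x, m, y, plusMonoid );
+  */
+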
+ /** \internal No communication necessary, output is guaranteed dense. */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Operator,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, BSP1D, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Operator &op,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< Operator >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cerr << "In BSP1D unmasked eWiseApply (operator-based), "
+ "[T1]<-T2<-T3\n";
+#endif
+ // static checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType1, typename Operator::D1 >::value ),
+ "grb::eWiseApply",
+ "called with a left-hand input scalar type that does not match the "
+ "first domain of the given operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType2, typename Operator::D2 >::value ),
+ "grb::eWiseApply",
+ "called with a right-hand input scalar type that does not match the second "
+ "domain of the given operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< OutputType, typename Operator::D3 >::value ),
+ "grb::eWiseApply",
+ "called with an output value type that does not match the third domain of "
+ "the given operator" );
+
+ // dynamic checks
+ const size_t n = size( z );
+ if( (descr & descriptors::dense) && nnz( z ) != n ) {
+ return ILLEGAL;
+ }
+ if( capacity( z ) < n && phase == EXECUTE ) {
+ return FAILED;
+ }
+
+ // catch trivial resize
+ if( config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() &&
+ phase == RESIZE
+ ) {
+ return SUCCESS;
+ }
+
+ // delegate to set
+ OutputType temp;
+ RC ret = apply< descr >( temp, alpha, beta, op );
+ ret = ret ? ret : set< descr >( z, temp, phase );
+
+ // done
+ return ret;
+ }
+
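+ /*
+  * Usage sketch for the scalar-scalar eWiseApply above (illustrative only;
+  * assumes a double-valued output vector z and the standard multiplication
+  * operator):
+  *
+  *   grb::operators::mul< double > times;
+  *   // sets every entry of z to 3.0 * 1.5
+  *   grb::RC rc = grb::eWiseApply( z, 3.0, 1.5, times );
+  */
+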
+ /** \internal Delegates to masked set. */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Operator,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, BSP1D, Coords > &z,
+ const Vector< MaskType, BSP1D, Coords > &mask,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Operator &op,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< Operator >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cerr << "In BSP1D masked eWiseApply (operator-based), "
+ "[T1]<-T2<-T3\n";
+#endif
+ // static checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType1, typename Operator::D1 >::value ),
+ "grb::eWiseApply",
+ "called with a left-hand input scalar type that does not match the "
+ "first domain of the given operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType2, typename Operator::D2 >::value ),
+ "grb::eWiseApply",
+ "called with a right-hand input scalar type that does not match the second "
+ "domain of the given operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< OutputType, typename Operator::D3 >::value ),
+ "grb::eWiseApply",
+ "called with an output value type that does not match the third domain of "
+ "the given operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ), "grb::eWiseApply",
+ "called with an output mask value type that is not bool" );
+
+ // dynamic checks
+ const size_t n = size( z );
+ if( (descr & descriptors::dense) && nnz( mask ) != n ) {
+ return ILLEGAL;
+ }
+ if( (descr & descriptors::dense) && nnz( z ) != n ) {
+ return ILLEGAL;
+ }
+ if( size( mask ) != n ) {
+ return MISMATCH;
+ }
+ if( capacity( z ) < n && phase == EXECUTE ) {
+ return FAILED;
+ }
+
+ // catch trivial resize
+ if( config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() &&
+ phase == RESIZE
+ ) {
+ return SUCCESS;
+ }
+
+ // delegate to set
+ OutputType temp;
+ RC ret = apply< descr >( temp, alpha, beta, op );
+ ret = ret ? ret : set< descr >( z, mask, temp, phase );
+
+ // done
+ return ret;
+ }
+
+ /** \internal No communication necessary, output is guaranteed dense. */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename OutputType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, BSP1D, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Monoid &monoid,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cerr << "In BSP1D unmasked eWiseApply (monoid-based), "
+ "[T1]<-T2<-T3\n";
+#endif
+ // static checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType1, typename Monoid::D1 >::value ),
+ "grb::eWiseApply",
+ "called with a left-hand input scalar type that does not match the "
+ "first domain of the given monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType2, typename Monoid::D2 >::value ),
+ "grb::eWiseApply",
+ "called with a right-hand input scalar type that does not match the second "
+ "domain of the given monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< OutputType, typename Monoid::D3 >::value ),
+ "grb::eWiseApply",
+ "called with an output value type that does not match the third domain of "
+ "the given monoid" );
+
+ // dynamic checks
+ const size_t n = size( z );
+ if( (descr & descriptors::dense) && nnz( z ) != n ) {
+ return ILLEGAL;
+ }
+ if( capacity( z ) < n && phase == EXECUTE ) {
+ return FAILED;
+ }
+
+ // catch trivial resize
+ if( config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() &&
+ phase == RESIZE
+ ) {
+ return SUCCESS;
+ }
+
+ // delegate to set
+ OutputType temp;
+ RC ret = apply< descr >( temp, alpha, beta, monoid.getOperator() );
+ ret = ret ? ret : set< descr >( z, temp, phase );
+
+ // done
+ return ret;
+ }
+
+ /** \internal Delegates to masked set. */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, BSP1D, Coords > &z,
+ const Vector< MaskType, BSP1D, Coords > &mask,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Monoid &monoid,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cerr << "In BSP1D masked eWiseApply (monoid-based), "
+ "[T1]<-T2<-T3\n";
+#endif
+ // static checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType1, typename Monoid::D1 >::value ),
+ "grb::eWiseApply",
+ "called with a left-hand input scalar type that does not match the "
+ "first domain of the given monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType2, typename Monoid::D2 >::value ),
+ "grb::eWiseApply",
+ "called with a right-hand input scalar type that does not match the second "
+ "domain of the given monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< OutputType, typename Monoid::D3 >::value ),
+ "grb::eWiseApply",
+ "called with an output value type that does not match the third domain of "
+ "the given monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ), "grb::eWiseApply",
+ "called with an output mask value type that is not bool" );
+
+ // dynamic checks
+ const size_t n = size( z );
+ if( (descr & descriptors::dense) && nnz( mask ) != n ) {
+ return ILLEGAL;
+ }
+ if( (descr & descriptors::dense) && nnz( z ) != n ) {
+ return ILLEGAL;
+ }
+ if( size( mask ) != n ) {
+ return MISMATCH;
+ }
+ if( capacity( z ) < n && phase == EXECUTE ) {
+ return FAILED;
+ }
+
+ // catch trivial resize
+ if( config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() &&
+ phase == RESIZE
+ ) {
+ return SUCCESS;
+ }
+
+ // delegate to set
+ OutputType temp;
+ RC ret = apply< descr >( temp, alpha, beta, monoid.getOperator() );
+ ret = ret ? ret : set< descr >( z, mask, temp, phase );
+
+ // done
+ return ret;
+ }
+
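+ /*
+  * Usage sketch for the masked scalar-scalar eWiseApply variants above
+  * (illustrative only; z, mask, and plusMonoid are placeholders):
+  *
+  *   // writes 3.0 + 1.5 to z wherever mask evaluates true
+  *   grb::RC rc = grb::eWiseApply( z, mask, 3.0, 1.5, plusMonoid );
+  */
+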
+ /** \internal No communication necessary, output is guaranteed dense. */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Operator,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, BSP1D, Coords > &z,
+ const Vector< InputType1, BSP1D, Coords > &x,
+ const InputType2 beta,
+ const Operator &op,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< Operator >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cerr << "In BSP1D unmasked eWiseApply (operator-based), "
+ "[T1]<-[T2]<-T3\n";
+#endif
+
+ // static checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType1, typename Operator::D1 >::value ),
+ "grb::eWiseApply",
+ "called with a left-hand input vector value type that does not match the "
+ "first domain of the given operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType2, typename Operator::D2 >::value ),
+ "grb::eWiseApply",
+ "called with a right-hand input vector value type that does not match the second "
+ "domain of the given operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< OutputType, typename Operator::D3 >::value ),
+ "grb::eWiseApply",
+ "called with an output value type that does not match the third domain of "
+ "the given operator" );
+
+ // dynamic checks
+ const size_t n = size( z );
+ if( size( x ) != n ) {
+ return MISMATCH;
+ }
+ if( nnz( x ) < n ) {
+ return ILLEGAL;
+ }
+
+ // catch trivial resize
+ if( config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() &&
+ phase == RESIZE
+ ) {
+ return SUCCESS;
+ }
+
+ // delegate
+ RC ret = eWiseApply< descr >( internal::getLocal( z ),
+ internal::getLocal( x ), beta, op, phase );
+ if( !config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() ) {
+ if( collectives< BSP1D >::allreduce(
+ ret, grb::operators::any_or< RC >()
+ ) != SUCCESS ) {
+ return PANIC;
+ }
+ }
+
+ // handle try and execute
+ if( phase == TRY ) {
+ if( ret == SUCCESS || ret == FAILED ) {
+ const RC subrc = internal::updateNnz( z );
+ if( subrc != SUCCESS ) {
+ if( ret == SUCCESS ) { ret = subrc; }
+ else { ret = PANIC; }
+ }
+ }
+ } else if( phase == EXECUTE ) {
+ if( ret == SUCCESS ) {
+ internal::setDense( z );
+ } else if( ret == FAILED ) {
+ const RC subrc = internal::updateNnz( z );
+ if( subrc != SUCCESS ) { ret = PANIC; }
+ }
+ }
+
+ // done
+ return ret;
+ }
+
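+ /*
+  * Usage sketch for the above, including the two-phase idiom this backend
+  * supports (illustrative only; z, x, and plus are placeholders):
+  *
+  *   grb::RC rc = grb::eWiseApply( z, x, 2.0, plus, grb::RESIZE );
+  *   if( rc == grb::SUCCESS ) {
+  *     rc = grb::eWiseApply( z, x, 2.0, plus, grb::EXECUTE );
+  *   }
+  */
+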
+ /** \internal No communication necessary, output is guaranteed dense. */
template<
Descriptor descr = descriptors::no_operation,
class Operator,
@@ -760,22 +1431,22 @@ namespace grb {
const Vector< InputType2, BSP1D, Coords > &y,
const Operator &op,
const Phase &phase = EXECUTE,
- const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
!grb::is_object< InputType1 >::value &&
!grb::is_object< InputType2 >::value &&
grb::is_operator< Operator >::value,
void >::type * const = nullptr
) {
#ifdef _DEBUG
- std::cerr << "In BSP1D unmasked eWiseApply (operator-based), "
- "[T1]<-T2<-[T3]\n";
+ std::cerr << "In BSP1D unmasked eWiseApply (operator-based), T1]<-T2<-[T3]\n";
#endif
// static checks
NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
std::is_same< InputType1, typename Operator::D1 >::value ),
"grb::eWiseApply",
"called with a left-hand input vector value type that does not match the "
- "first domain of the given operator " );
+ "first domain of the given operator" );
NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
std::is_same< InputType2, typename Operator::D2 >::value ),
"grb::eWiseApply",
@@ -852,7 +1523,8 @@ namespace grb {
const Vector< InputType2, BSP1D, Coords > &y,
const Operator &op,
const Phase &phase = EXECUTE,
- const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
!grb::is_object< InputType1 >::value &&
!grb::is_object< InputType2 >::value &&
grb::is_operator< Operator >::value,
@@ -867,7 +1539,7 @@ namespace grb {
std::is_same< InputType1, typename Operator::D1 >::value ),
"grb::eWiseApply",
"called with a left-hand input vector value type that does not match the "
- "first domain of the given operator " );
+ "first domain of the given operator" );
NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
std::is_same< InputType2, typename Operator::D2 >::value ),
"grb::eWiseApply",
@@ -966,7 +1638,8 @@ namespace grb {
const Vector< InputType2, BSP1D, Coords > &y,
const Operator &op,
const Phase &phase = EXECUTE,
- const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
!grb::is_object< MaskType >::value &&
!grb::is_object< InputType1 >::value &&
!grb::is_object< InputType2 >::value &&
@@ -982,7 +1655,7 @@ namespace grb {
std::is_same< InputType1, typename Operator::D1 >::value ),
"grb::eWiseApply",
"called with a left-hand input vector value type that does not match the "
- "first domain of the given operator " );
+ "first domain of the given operator" );
NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
std::is_same< InputType2, typename Operator::D2 >::value ),
"grb::eWiseApply",
@@ -1066,7 +1739,8 @@ namespace grb {
const InputType2 beta,
const Operator &op,
const Phase &phase = EXECUTE,
- const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
!grb::is_object< MaskType >::value &&
!grb::is_object< InputType1 >::value &&
!grb::is_object< InputType2 >::value &&
@@ -1082,7 +1756,7 @@ namespace grb {
std::is_same< InputType1, typename Operator::D1 >::value ),
"grb::eWiseApply",
"called with a left-hand input vector value type that does not match the "
- "first domain of the given operator " );
+ "first domain of the given operator" );
NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
std::is_same< InputType2, typename Operator::D2 >::value ),
"grb::eWiseApply",
@@ -1167,7 +1841,8 @@ namespace grb {
const Vector< InputType2, BSP1D, Coords > &y,
const Operator &op,
const Phase &phase = EXECUTE,
- const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
!grb::is_object< MaskType >::value &&
!grb::is_object< InputType1 >::value &&
!grb::is_object< InputType2 >::value &&
@@ -1183,7 +1858,7 @@ namespace grb {
std::is_same< InputType1, typename Operator::D1 >::value ),
"grb::eWiseApply",
"called with a left-hand input vector value type that does not match the "
- "first domain of the given operator " );
+ "first domain of the given operator" );
NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
std::is_same< InputType2, typename Operator::D2 >::value ),
"grb::eWiseApply",
@@ -1276,7 +1951,8 @@ namespace grb {
const InputType2 beta,
const Monoid &monoid,
const Phase &phase = EXECUTE,
- const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
!grb::is_object< InputType1 >::value &&
!grb::is_object< InputType2 >::value &&
grb::is_monoid< Monoid >::value,
@@ -1305,7 +1981,9 @@ namespace grb {
// check if can delegate to dense variant
const size_t n = size( z );
- if( (descr & descriptors::dense) || nnz( x ) == n ) {
+ if( (descr & descriptors::dense) || (
+ nnz( x ) == n && nnz( z ) == n
+ ) ) {
return eWiseApply< descr | descriptors::dense >(
z, x, beta, monoid.getOperator(), phase
);
@@ -1316,7 +1994,7 @@ namespace grb {
return MISMATCH;
}
if( descr & descriptors::dense ) {
- if( nnz( x ) < n ) {
+ if( nnz( x ) < n || nnz( z ) < n ) {
return ILLEGAL;
}
}
@@ -1365,7 +2043,8 @@ namespace grb {
const Vector< InputType2, BSP1D, Coords > &y,
const Monoid &monoid,
const Phase &phase = EXECUTE,
- const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
!grb::is_object< InputType1 >::value &&
!grb::is_object< InputType2 >::value &&
grb::is_monoid< Monoid >::value,
@@ -1394,7 +2073,9 @@ namespace grb {
// check if can delegate to dense variant
const size_t n = size( z );
- if( (descr & descriptors::dense) || nnz( y ) == n ) {
+ if( (descr & descriptors::dense) || (
+ nnz( y ) == n && nnz( z ) == n
+ ) ) {
return eWiseApply< descr | descriptors::dense >(
z, alpha, y, monoid.getOperator(), phase
);
@@ -1405,7 +2086,7 @@ namespace grb {
return MISMATCH;
}
if( descr & descriptors::dense ) {
- if( nnz( y ) < n ) {
+ if( nnz( y ) < n || nnz( z ) < n ) {
return ILLEGAL;
}
}
@@ -1456,7 +2137,8 @@ namespace grb {
const Vector< InputType2, BSP1D, Coords > &y,
const Monoid &monoid,
const Phase &phase = EXECUTE,
- const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
!grb::is_object< InputType1 >::value &&
!grb::is_object< InputType2 >::value &&
grb::is_monoid< Monoid >::value,
@@ -1485,7 +2167,9 @@ namespace grb {
// check if we can delegate to dense variant
const size_t n = size( z );
- if( (descr & descriptors::dense) || (nnz( x ) == n && nnz( y ) == n) ) {
+ if( (descr & descriptors::dense) || (
+ nnz( x ) == n && nnz( y ) == n && nnz( z ) == n
+ ) ) {
return eWiseApply< descr | descriptors::dense >(
z, x, y, monoid.getOperator(), phase
);
@@ -1556,7 +2240,8 @@ namespace grb {
const Vector< InputType2, BSP1D, Coords > &y,
const Monoid &monoid,
const Phase &phase = EXECUTE,
- const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
!grb::is_object< MaskType >::value &&
!grb::is_object< InputType1 >::value &&
!grb::is_object< InputType2 >::value &&
@@ -1601,7 +2286,7 @@ namespace grb {
return MISMATCH;
}
if( descr & descriptors::dense ) {
- if( nnz( y ) < n || nnz( mask ) < n ) {
+ if( nnz( y ) < n || nnz( mask ) < n || nnz( z ) < n ) {
return ILLEGAL;
}
}
@@ -1656,7 +2341,8 @@ namespace grb {
const InputType2 beta,
const Monoid &monoid,
const Phase &phase = EXECUTE,
- const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
!grb::is_object< MaskType >::value &&
!grb::is_object< InputType1 >::value &&
!grb::is_object< InputType2 >::value &&
@@ -1707,6 +2393,9 @@ namespace grb {
if( nnz( x ) < n ) {
return ILLEGAL;
}
+ if( nnz( z ) < n ) {
+ return ILLEGAL;
+ }
}
// handle trivial resize phase
@@ -1759,7 +2448,8 @@ namespace grb {
const Vector< InputType2, BSP1D, Coords > &y,
const Monoid &monoid,
const Phase &phase = EXECUTE,
- const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
!grb::is_object< MaskType >::value &&
!grb::is_object< InputType1 >::value &&
!grb::is_object< InputType2 >::value &&
@@ -1807,7 +2497,7 @@ namespace grb {
return MISMATCH;
}
if( descr & descriptors::dense ) {
- if( nnz( x ) < n || nnz( y ) < n ) {
+ if( nnz( x ) < n || nnz( y ) < n || nnz( z ) < n ) {
return ILLEGAL;
}
if( nnz( mask ) < n ) {
@@ -2220,94 +2910,406 @@ namespace grb {
);
}
- /**
- * \internal Does not require communication.
- *
- * \warning This function has been deprecated since version 0.5. If required,
- * consider instead a sequence of grb::foldl using the additive
- * monoid, followed by a call to grb::eWiseMul.
- */
+ /**
+ * \internal Does not require communication.
+ *
+ * \warning This function has been deprecated since version 0.5. If required,
+ * consider instead a sequence of grb::foldl using the additive
+ * monoid, followed by a call to grb::eWiseMul.
+ */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename InputType3,
+ typename OutputType, typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, BSP1D, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Vector< InputType3, BSP1D, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+ const size_t n = grb::size( z );
+ if( n != grb::size( y ) ) {
+ return MISMATCH;
+ }
+ if( config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() && phase == RESIZE ) {
+ return SUCCESS;
+ }
+ if( phase == RESIZE ) {
+ return resize( z, n );
+ }
+
+ assert( phase == EXECUTE );
+ internal::setDense( z );
+ return grb::eWiseMulAdd< descr >(
+ internal::getLocal( z ), alpha, beta, internal::getLocal( y ), ring
+ );
+ }
+
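+ /*
+  * Sketch of the replacement the above deprecation note suggests
+  * (illustrative only; assumes a grb::Semiring instance named ring, whose
+  * additive monoid is retrieved via its getAdditiveMonoid() member):
+  *
+  *   // replaces z += alpha * beta + y (elementwise), per the note above
+  *   grb::RC rc = grb::foldl( z, y, ring.getAdditiveMonoid() );
+  *   if( rc == grb::SUCCESS ) {
+  *     rc = grb::eWiseMul( z, alpha, beta, ring );
+  *   }
+  */
+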
+ /**
+ * \internal Does not require communication.
+ *
+ * \warning This function has been deprecated since version 0.5. If required,
+ * consider instead a sequence of grb::foldl using the additive
+ * monoid, followed by a call to grb::eWiseMul.
+ */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename InputType3,
+ typename OutputType, typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, BSP1D, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const InputType3 gamma,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ if( config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() && phase == RESIZE ) {
+ return SUCCESS;
+ }
+ if( phase == RESIZE ) {
+ return resize( z, size( z ) );
+ }
+ assert( phase == EXECUTE );
+ internal::setDense( z );
+ return grb::eWiseMulAdd< descr >( internal::getLocal( z ), alpha, beta,
+ gamma, ring );
+ }
+
+ /** \internal Requires syncing of output nonzero count. */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, BSP1D, Coords > &z,
+ const Vector< InputType1, BSP1D, Coords > &x,
+ const Vector< InputType2, BSP1D, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMul",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMul",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ),
+ "grb::eWiseMul",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+
+ // dynamic checks
+ const size_t n = grb::size( z );
+ if( n != grb::size( x ) ) {
+ return MISMATCH;
+ }
+ if( n != grb::size( y ) ) {
+ return MISMATCH;
+ }
+ if( descr & descriptors::dense ) {
+ if( nnz( z ) < n || nnz( x ) < n || nnz( y ) < n ) {
+ return ILLEGAL;
+ }
+ }
+
+ // handle trivial resize
+ if( config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() &&
+ phase == RESIZE
+ ) {
+ return SUCCESS;
+ }
+
+ // delegate
+ RC ret = eWiseMul< descr >(
+ internal::getLocal( z ),
+ internal::getLocal( x ), internal::getLocal( y ),
+ ring, phase
+ );
+ if( !config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() ) {
+ if( collectives< BSP1D >::allreduce(
+ ret, grb::operators::any_or< RC >()
+ ) != SUCCESS ) {
+ return PANIC;
+ }
+ }
+
+ // handle try and execute phases
+ if( phase != RESIZE ) {
+ if( ret == SUCCESS ) {
+ ret = internal::updateNnz( z );
+ } else if( ret == FAILED ) {
+ const RC subrc = internal::updateNnz( z );
+ if( subrc != SUCCESS ) { ret = PANIC; }
+ }
+ }
+
+ // done
+ return ret;
+ }
+
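+ /*
+  * Usage sketch for the above (illustrative only; assumes double-valued
+  * vectors and the standard plus-times semiring):
+  *
+  *   grb::Semiring<
+  *     grb::operators::add< double >, grb::operators::mul< double >,
+  *     grb::identities::zero, grb::identities::one
+  *   > ring;
+  *   // z[ i ] += x[ i ] * y[ i ] over the intersection of x and y
+  *   grb::RC rc = grb::eWiseMul( z, x, y, ring );
+  */
+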
+ /** \internal Requires syncing of output nonzero count. */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, BSP1D, Coords > &z,
+ const InputType1 alpha,
+ const Vector< InputType2, BSP1D, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMul",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMul",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ),
+ "grb::eWiseMul",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+
+ // dynamic checks
+ const size_t n = grb::size( z );
+ if( n != grb::size( y ) ) {
+ return MISMATCH;
+ }
+ if( descr & descriptors::dense ) {
+ if( nnz( z ) < n || nnz( y ) < n ) {
+ return ILLEGAL;
+ }
+ }
+
+ // handle trivial resize
+ if( config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() &&
+ phase == RESIZE
+ ) {
+ return SUCCESS;
+ }
+
+ // delegate
+ RC ret = eWiseMul< descr >( internal::getLocal( z ), alpha,
+ internal::getLocal( y ), ring, phase );
+ if( !config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() ) {
+ if( collectives< BSP1D >::allreduce(
+ ret, grb::operators::any_or< RC >()
+ ) != SUCCESS ) {
+ return PANIC;
+ }
+ }
+
+ // handle execute and try phases
+ if( phase != RESIZE ) {
+ if( ret == SUCCESS ) {
+ ret = internal::updateNnz( z );
+ } else if( ret == FAILED ) {
+ const RC subrc = internal::updateNnz( z );
+ if( subrc != SUCCESS ) { ret = PANIC; }
+ }
+ }
+
+ // done
+ return ret;
+ }
+
+ /** \internal Requires syncing of output nonzero count. */
template<
Descriptor descr = descriptors::no_operation, class Ring,
- typename InputType1, typename InputType2, typename InputType3,
- typename OutputType, typename Coords
+ typename InputType1, typename InputType2, typename OutputType,
+ typename Coords
>
- RC eWiseMulAdd( Vector< OutputType, BSP1D, Coords > &z,
- const InputType1 alpha,
+ RC eWiseMul(
+ Vector< OutputType, BSP1D, Coords > &z,
+ const Vector< InputType1, BSP1D, Coords > &x,
const InputType2 beta,
- const Vector< InputType3, BSP1D, Coords > & y,
const Ring &ring = Ring(),
const Phase &phase = EXECUTE,
const typename std::enable_if< !grb::is_object< OutputType >::value &&
!grb::is_object< InputType1 >::value &&
!grb::is_object< InputType2 >::value &&
- !grb::is_object< InputType3 >::value &&
grb::is_semiring< Ring >::value, void
>::type * const = nullptr
) {
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMul",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMul",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ),
+ "grb::eWiseMul",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+
+ // dynamic checks
const size_t n = grb::size( z );
- if( n != grb::size( y ) ) {
+ if( n != grb::size( x ) ) {
return MISMATCH;
}
- if( config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() && phase == RESIZE ) {
+ if( descr & descriptors::dense ) {
+ if( nnz( z ) < n ) { return ILLEGAL; }
+ if( nnz( x ) < n ) { return ILLEGAL; }
+ }
+
+ // handle trivial resize
+ if( config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() &&
+ phase == RESIZE
+ ) {
return SUCCESS;
}
- if( phase == RESIZE ) {
- return resize( z, n );
+
+ // delegate
+ RC ret = eWiseMul< descr >( internal::getLocal( z ),
+ internal::getLocal( x ), beta, ring, phase );
+ if( !config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() ) {
+ if( collectives< BSP1D >::allreduce(
+ ret, grb::operators::any_or< RC >()
+ ) != SUCCESS ) {
+ return PANIC;
+ }
}
- assert( phase == EXECUTE );
- internal::setDense( z );
- return grb::eWiseMulAdd< descr >(
- internal::getLocal( z ), alpha, beta, internal::getLocal( y ), ring
- );
+ // handle try and execute phases
+ if( phase != RESIZE ) {
+ if( ret == SUCCESS ) {
+ ret = internal::updateNnz( z );
+ } else if( ret == FAILED ) {
+ const RC subrc = internal::updateNnz( z );
+ if( subrc != SUCCESS ) { ret = PANIC; }
+ }
+ }
+
+ // done
+ return ret;
}
- /**
- * \internal Does not require communication.
- *
- * \warning This function has been deprecated since version 0.5. If required,
- * consider instead a sequence of grb::foldl using the additive
- * monoid, followed by a call to grb::eWiseMul.
- */
+ /** \internal No implementation notes. */
template<
- Descriptor descr = descriptors::no_operation, class Ring,
- typename InputType1, typename InputType2, typename InputType3,
- typename OutputType, typename Coords
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename Coords
>
- RC eWiseMulAdd( Vector< OutputType, BSP1D, Coords > &z,
+ RC eWiseMul(
+ Vector< OutputType, BSP1D, Coords > &z,
const InputType1 alpha,
const InputType2 beta,
- const InputType3 gamma,
const Ring &ring = Ring(),
const Phase &phase = EXECUTE,
const typename std::enable_if< !grb::is_object< OutputType >::value &&
!grb::is_object< InputType1 >::value &&
!grb::is_object< InputType2 >::value &&
- !grb::is_object< InputType3 >::value &&
- grb::is_semiring< Ring >::value,
- void >::type * const = nullptr
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
) {
- if( config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() && phase == RESIZE ) {
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMul",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMul",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ),
+ "grb::eWiseMul",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+
+ // dynamic checks
+ const size_t n = grb::size( z );
+ if( descr & descriptors::dense ) {
+ if( nnz( z ) < n ) { return ILLEGAL; }
+ }
+
+ // handle trivial resize
+ if( config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() &&
+ phase == RESIZE
+ ) {
return SUCCESS;
}
- if( phase == RESIZE ) {
- return resize( z, size( z ) );
+
+ // delegate
+ RC ret = eWiseMul< descr >( internal::getLocal( z ),
+ alpha, beta, ring, phase );
+ if( !config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() ) {
+ if( collectives< BSP1D >::allreduce(
+ ret, grb::operators::any_or< RC >()
+ ) != SUCCESS ) {
+ return PANIC;
+ }
}
- assert( phase == EXECUTE );
- internal::setDense( z );
- return grb::eWiseMulAdd< descr >( internal::getLocal( z ), alpha, beta,
- gamma, ring );
+
+ // handle try and execute phases
+ if( phase != RESIZE ) {
+ if( ret == SUCCESS ) {
+ internal::setDense( z );
+ }
+ }
+
+ // done
+ return ret;
}
/** \internal Requires syncing of output nonzero count. */
template<
- Descriptor descr = descriptors::no_operation, class Ring,
+ Descriptor descr = descriptors::no_operation,
+ class Ring, typename MaskType,
typename InputType1, typename InputType2, typename OutputType,
typename Coords
>
RC eWiseMul(
Vector< OutputType, BSP1D, Coords > &z,
+ const Vector< MaskType, BSP1D, Coords > &m,
const Vector< InputType1, BSP1D, Coords > &x,
const Vector< InputType2, BSP1D, Coords > &y,
const Ring &ring = Ring(),
@@ -2318,8 +3320,28 @@ namespace grb {
grb::is_semiring< Ring >::value, void
>::type * const = nullptr
) {
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ), "grb::eWiseMul",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ), "grb::eWiseMul",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ), "grb::eWiseMul",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ), "grb::eWiseMulAdd",
+ "called with a mask vector with a non-bool element type" );
+
// dynamic checks
const size_t n = grb::size( z );
+ if( n != grb::size( m ) ) {
+ return MISMATCH;
+ }
if( n != grb::size( x ) ) {
return MISMATCH;
}
@@ -2327,7 +3349,7 @@ namespace grb {
return MISMATCH;
}
if( descr & descriptors::dense ) {
- if( nnz( x ) < n || nnz( y ) < n ) {
+ if( nnz( z ) < n || nnz( m ) < n || nnz( x ) < n || nnz( y ) < n ) {
return ILLEGAL;
}
}
@@ -2341,7 +3363,7 @@ namespace grb {
// delegate
RC ret = eWiseMul< descr >(
- internal::getLocal( z ),
+ internal::getLocal( z ), internal::getLocal( m ),
internal::getLocal( x ), internal::getLocal( y ),
ring, phase
);
@@ -2369,12 +3391,14 @@ namespace grb {
/** \internal Requires syncing of output nonzero count. */
template<
- Descriptor descr = descriptors::no_operation, class Ring,
+ Descriptor descr = descriptors::no_operation,
+ class Ring, typename MaskType,
typename InputType1, typename InputType2, typename OutputType,
typename Coords
>
RC eWiseMul(
Vector< OutputType, BSP1D, Coords > &z,
+ const Vector< MaskType, BSP1D, Coords > &m,
const InputType1 alpha,
const Vector< InputType2, BSP1D, Coords > &y,
const Ring &ring = Ring(),
@@ -2385,13 +3409,39 @@ namespace grb {
grb::is_semiring< Ring >::value, void
>::type * const = nullptr
) {
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMul",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMul",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ),
+ "grb::eWiseMul",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a mask vector _m with a non-bool element type" );
+
+ // check empty mask
+ if( size( m ) == 0 ) {
+ return eWiseMul< descr >( z, alpha, y, ring, phase );
+ }
+
// dynamic checks
- const size_t n = grb::size( z );
- if( n != grb::size( y ) ) {
+ const size_t n = size( z );
+ if( n != size( m ) || n != size( y ) ) {
return MISMATCH;
}
if( descr & descriptors::dense ) {
- if( nnz( y ) < n ) {
+ if( nnz( z ) < n || nnz( m ) < n || nnz( y ) < n ) {
return ILLEGAL;
}
}
@@ -2404,8 +3454,11 @@ namespace grb {
}
// delegate
- RC ret = eWiseMul< descr >( internal::getLocal( z ), alpha,
- internal::getLocal( y ), ring, phase );
+ RC ret = eWiseMul< descr >(
+ internal::getLocal( z ), internal::getLocal( m ),
+ alpha, internal::getLocal( y ),
+ ring, phase
+ );
if( !config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() ) {
if( collectives< BSP1D >::allreduce(
ret, grb::operators::any_or< RC >()
@@ -2430,12 +3483,14 @@ namespace grb {
/** \internal Requires syncing of output nonzero count. */
template<
- Descriptor descr = descriptors::no_operation, class Ring,
+ Descriptor descr = descriptors::no_operation,
+ class Ring, typename MaskType,
typename InputType1, typename InputType2, typename OutputType,
typename Coords
>
RC eWiseMul(
Vector< OutputType, BSP1D, Coords > &z,
+ const Vector< MaskType, BSP1D, Coords > &m,
const Vector< InputType1, BSP1D, Coords > &x,
const InputType2 beta,
const Ring &ring = Ring(),
@@ -2446,17 +3501,132 @@ namespace grb {
grb::is_semiring< Ring >::value, void
>::type * const = nullptr
) {
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMul",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMul",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ),
+ "grb::eWiseMul",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a mask vector _m with a non-bool element type" );
+
+ // check empty mask
+ if( size( m ) == 0 ) {
+ return eWiseMul< descr >( z, x, beta, ring, phase );
+ }
+
// dynamic checks
- const size_t n = grb::size( z );
- if( n != grb::size( x ) ) {
+ const size_t n = size( z );
+ if( n != size( m ) || n != size( x ) ) {
return MISMATCH;
}
if( descr & descriptors::dense ) {
- if( nnz( x ) < n ) {
- return ILLEGAL;
+ if( nnz( z ) < n ) { return ILLEGAL; }
+ if( nnz( m ) < n ) { return ILLEGAL; }
+ if( nnz( x ) < n ) { return ILLEGAL; }
+ }
+
+ // handle trivial resize
+ if( config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() &&
+ phase == RESIZE
+ ) {
+ return SUCCESS;
+ }
+
+ // delegate
+ RC ret = eWiseMul< descr >(
+ internal::getLocal( z ), internal::getLocal( m ),
+ internal::getLocal( x ), beta,
+ ring, phase
+ );
+ if( !config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() ) {
+ if( collectives< BSP1D >::allreduce(
+ ret, grb::operators::any_or< RC >()
+ ) != SUCCESS ) {
+ return PANIC;
+ }
+ }
+
+ // handle try and execute phases
+ if( phase != RESIZE ) {
+ if( ret == SUCCESS ) {
+ ret = internal::updateNnz( z );
+ } else if( ret == FAILED ) {
+ const RC subrc = internal::updateNnz( z );
+ if( subrc != SUCCESS ) { ret = PANIC; }
}
}
+ // done
+ return ret;
+ }
+
+ /** \internal Requires syncing of output nonzero count. */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring, typename MaskType,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, BSP1D, Coords > &z,
+ const Vector< MaskType, BSP1D, Coords > &m,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMul",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMul",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ),
+ "grb::eWiseMul",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a mask vector _m with a non-bool element type" );
+
+ // check empty mask
+ if( size( m ) == 0 ) {
+ return eWiseMul< descr >( z, alpha, beta, ring, phase );
+ }
+
+ // dynamic checks
+ const size_t n = size( z );
+ if( n != size( m ) ) { return MISMATCH; }
+ if( descr & descriptors::dense ) {
+ if( nnz( z ) < n ) { return ILLEGAL; }
+ if( nnz( m ) < n ) { return ILLEGAL; }
+ }
+
// handle trivial resize
if( config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() &&
phase == RESIZE
@@ -2465,8 +3635,11 @@ namespace grb {
}
// delegate
- RC ret = eWiseMul< descr >( internal::getLocal( z ),
- internal::getLocal( x ), beta, ring, phase );
+ RC ret = eWiseMul< descr >(
+ internal::getLocal( z ), internal::getLocal( m ),
+ alpha, beta,
+ ring, phase
+ );
if( !config::IMPLEMENTATION< BSP1D >::fixedVectorCapacities() ) {
if( collectives< BSP1D >::allreduce(
ret, grb::operators::any_or< RC >()
diff --git a/include/graphblas/bsp1d/blas2.hpp b/include/graphblas/bsp1d/blas2.hpp
index 7a0124bcc..42c5875d9 100644
--- a/include/graphblas/bsp1d/blas2.hpp
+++ b/include/graphblas/bsp1d/blas2.hpp
@@ -506,6 +506,39 @@ namespace grb {
}
}
+ /** \internal Dispatches to bsp1d_vxm or bsp1d_mxv */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring, typename Coords, typename RIT, typename CIT, typename NIT,
+ typename IOType = typename Ring::D4,
+ typename InputType1 = typename Ring::D1,
+ typename InputType2 = typename Ring::D2,
+ typename InputType3 = bool
+ >
+ RC vxm(
+ Vector< IOType, BSP1D, Coords > &u,
+ const Vector< InputType3, BSP1D, Coords > &u_mask,
+ const Vector< InputType1, BSP1D, Coords > &v,
+ const Matrix< InputType2, BSP1D, RIT, CIT, NIT > &A,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+ const Vector< bool, BSP1D, Coords > empty_mask( 0 );
+ // transpose is delegated to mxv
+ if( descr & descriptors::transpose_matrix ) {
+ return internal::bsp1d_mxv<
+ descr & ~( descriptors::transpose_matrix ), true, false, true
+ >( u, u_mask, A, v, empty_mask, ring, phase );
+ } else {
+ return internal::bsp1d_vxm< descr, true, false, true >(
+ u, u_mask, v, empty_mask, A, ring, phase
+ );
+ }
+ }
+
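+ /*
+  * Usage sketch for the masked vxm overload above (illustrative only; all
+  * names are placeholders):
+  *
+  *   // u = u_mask .* ( v A ); with transpose_matrix the call instead
+  *   // evaluates u = u_mask .* ( v A^T ) and is dispatched to mxv
+  *   grb::RC rc = grb::vxm< grb::descriptors::transpose_matrix >(
+  *     u, u_mask, v, A, ring
+  *   );
+  */
+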
/** \internal Dispatches to bsp1d_vxm or bsp1d_mxv */
template<
Descriptor descr = descriptors::no_operation,
diff --git a/include/graphblas/bsp1d/config.hpp b/include/graphblas/bsp1d/config.hpp
index 6134e2b58..12641135b 100644
--- a/include/graphblas/bsp1d/config.hpp
+++ b/include/graphblas/bsp1d/config.hpp
@@ -18,7 +18,7 @@
/**
* @file
*
- * Implements the various grb::config items for the grb::BSP1D backend.
+ * Contains the configuration parameters for the BSP1D backend.
*
* @author A. N. Yzelman
* @date 5th of May, 2017
@@ -41,64 +41,95 @@
namespace grb {
- /**
- * \defgroup bsp1d The BSP1D backend implementation
- *
- * Groups all definitions and documentations corresponding to the #BSP1D
- * backend.
- * @{
- */
-
namespace config {
/**
- * Defaults for the BSP1D implementation
+ * \defgroup bsp1dConfig BSP1D backend configuration
+ * \ingroup config
+ *
+ * All configuration parameters for the #BSP1D and #hybrid backends.
+ *
+ * @{
+ */
+
+ /**
+ * This class collects configuration parameters that are specific to the
+ * #grb::BSP1D and #grb::hybrid backends.
+ *
+ * \note The full set of implementation details are only visible within the
+ * developer documentation.
+ *
+ * \ingroup bsp1d
*/
template<>
- class IMPLEMENTATION< grb::Backend::BSP1D > {
+ class IMPLEMENTATION< BSP1D > {
private:
/**
+ * \internal
* \a true if and only if \a mode was set. By default, value is \a false.
+ * \endinternal
*/
static bool set;
/**
+ * \internal
* The selected mode. Only set if \a set is \a true.
+ * \endinternal
*/
static grb::config::ALLOC_MODE mode;
- /** Attempts to automatically deduce the best value for \a mode. */
+ /**
+ * \internal
+ * Attempts to automatically deduce the best value for \a mode.
+ * \endinternal
+ */
static void deduce() noexcept;
public:
/**
- * For private memory segments, which is the default, simply choose aligned
- * allocations.
+ * @returns The default allocation strategy for private memory segments.
*/
static constexpr ALLOC_MODE defaultAllocMode() {
return grb::config::ALLOC_MODE::ALIGNED;
}
/**
+ * \internal
* Whether the backend has vector capacities always fixed to their
* defaults.
+ * \endinternal
*/
static constexpr bool fixedVectorCapacities() {
return IMPLEMENTATION< _GRB_BSP1D_BACKEND >::fixedVectorCapacities();
}
/**
- * For the BSP1D backend, a shared memory-segment should use interleaved
- * alloc only if is running one process per compute node.
+ * @returns The default allocation strategy for shared memory regions.
+ *
+ * By default, for the BSP1D backend, a shared memory segment should use
+ * interleaved allocation only if the run deploys one process per compute
+ * node. This implies a run-time component to this function, which is why,
+ * for this backend, this function is \em not constexpr.
+ *
+ * \warning This function assumes that the number of processes does not
+ * change over the lifetime of an ALP context.
+ *
+ * \note While the above may seem a reasonably safe assumption, use of the
+ * launcher in #MANUAL mode may make this issue realistic. In such cases
+ * the deduction should be re-initiated. If you encounter this problem,
+ * please report it so that such a fix can be implemented.
*/
static grb::config::ALLOC_MODE sharedAllocMode() noexcept;
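+
+ /*
+  * Illustrative query of the above (sketch only):
+  *
+  *   const grb::config::ALLOC_MODE mode =
+  *     grb::config::IMPLEMENTATION< grb::BSP1D >::sharedAllocMode();
+  */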
/**
+ * \internal
* Select the coordinates backend of the selected process-local backend.
+ * \endinternal
*/
static constexpr Backend coordinatesBackend() {
return IMPLEMENTATION< _GRB_BSP1D_BACKEND >::coordinatesBackend();
@@ -106,9 +137,9 @@ namespace grb {
};
- } // namespace config
+ /** @} */
- /** @} */
+ } // namespace config
} // namespace grb
diff --git a/include/graphblas/bsp1d/exec.hpp b/include/graphblas/bsp1d/exec.hpp
index b520ada6d..e8e627aa9 100644
--- a/include/graphblas/bsp1d/exec.hpp
+++ b/include/graphblas/bsp1d/exec.hpp
@@ -337,11 +337,17 @@ namespace grb {
/** No implementation notes. */
template< typename U >
- RC exec( void ( *grb_program )( const void *, const size_t, U & ),
+ RC exec(
+ void ( *grb_program )( const void *, const size_t, U & ),
const void * data_in, const size_t in_size,
U &data_out,
const bool broadcast = false
) const {
+ // check input arguments
+ if( in_size > 0 && data_in == nullptr ) {
+ return ILLEGAL;
+ }
+
// prepare args
lpf_func_t fargs[ 2 ];
lpf_args_t args;
@@ -364,8 +370,9 @@ namespace grb {
/** No implementation notes. */
template< typename T, typename U >
- RC exec( void ( *grb_program )( const T &, U & ), // user GraphBLAS program
- const T &data_in, U &data_out, // input & output data
+ RC exec(
+ void ( *grb_program )( const T &, U & ), // user GraphBLAS program
+ const T &data_in, U &data_out, // input & output data
const bool broadcast = false
) {
// prepare args
@@ -451,14 +458,13 @@ namespace grb {
* @throws runtime_error When the requested launcher group
* could not be created.
*/
- Launcher( const size_t process_id = 0, // user process ID
+ Launcher(
+ const size_t process_id = 0, // user process ID
const size_t nprocs = 1, // total number of user processes
const std::string hostname = "localhost", // one of the process' hostnames
const std::string port = "0", // a free port at hostname
const bool is_mpi_inited = false
- ) : _s( process_id ),
- _P( nprocs ), _hostname( hostname ), _port( port )
- {
+ ) : _s( process_id ), _P( nprocs ), _hostname( hostname ), _port( port ) {
// sanity check
if( nprocs == 0 ) {
throw std::invalid_argument( "Total number of user processes must be "
@@ -556,6 +562,11 @@ namespace grb {
U &data_out,
const bool broadcast = false
) const {
+ // check input arguments
+ if( in_size > 0 && data_in == nullptr ) {
+ return ILLEGAL;
+ }
+
// prepare args
lpf_func_t fargs[ 2 ];
lpf_args_t args;
@@ -587,8 +598,9 @@ namespace grb {
/** No implementation notes. */
template< typename T, typename U >
- RC exec( void ( *grb_program )( const T &, U & ), // user GraphBLAS program
- const T &data_in, U &data_out, // input & output data
+ RC exec(
+ void ( *grb_program )( const T &, U & ), // user GraphBLAS program
+ const T &data_in, U &data_out, // input & output data
const bool broadcast = false
) {
// prepare args
diff --git a/include/graphblas/bsp1d/io.hpp b/include/graphblas/bsp1d/io.hpp
index 5302783e5..3643ca559 100644
--- a/include/graphblas/bsp1d/io.hpp
+++ b/include/graphblas/bsp1d/io.hpp
@@ -504,6 +504,13 @@ namespace grb {
) noexcept {
const size_t n = size( x );
const size_t old_nnz = nnz( x );
+
+ // dynamic checks
+ if( (descr & descriptors::dense) && nnz( x ) < n ) {
+ return ILLEGAL;
+ }
+
+ // capacity check
if( capacity( x ) < n ) {
if( phase == RESIZE ) {
return resize( x, n );
@@ -517,16 +524,20 @@ namespace grb {
}
}
+ // handle trivial resize
assert( capacity( x ) == n );
if( phase == RESIZE ) {
return SUCCESS;
}
+ // dispatch
assert( phase == EXECUTE );
RC ret = internal::set_handle_use_index< descr >( x, old_nnz, val );
if( ret == SUCCESS ) {
internal::setDense( x );
}
+
+ // done
return ret;
}
@@ -622,7 +633,7 @@ namespace grb {
return MISMATCH;
}
if( descr & descriptors::dense ) {
- if( nnz( y ) < size( y ) ) {
+ if( nnz( x ) < size( x ) || nnz( y ) < size( y ) ) {
return ILLEGAL;
}
}
@@ -699,7 +710,10 @@ namespace grb {
return MISMATCH;
}
if( descr & descriptors::dense ) {
- if( nnz( y ) < size( y ) || nnz( mask ) < size( mask ) ) {
+ if( nnz( x ) < size( x ) ||
+ nnz( y ) < size( y ) ||
+ nnz( mask ) < size( mask )
+ ) {
return ILLEGAL;
}
}
@@ -765,11 +779,21 @@ namespace grb {
return MISMATCH;
}
+ // dynamic checks
+ if( (descr & descriptors::dense) && nnz( x ) < size( x ) ) {
+ return ILLEGAL;
+ }
+ if( (descr & descriptors::dense) && nnz( mask ) < size( mask ) ) {
+ return ILLEGAL;
+ }
+
// on capacity pre-check, see above
// all OK, try to do assignment
- RC ret = set< descr >( internal::getLocal( x ),
- internal::getLocal( mask ), y, phase );
+ RC ret = set< descr >(
+ internal::getLocal( x ),
+ internal::getLocal( mask ), y, phase
+ );
if( collectives< BSP1D >::allreduce( ret, operators::any_or< RC >() )
!= SUCCESS
diff --git a/include/graphblas/bsp1d/properties.hpp b/include/graphblas/bsp1d/properties.hpp
index f87cb1c54..8c28386bf 100644
--- a/include/graphblas/bsp1d/properties.hpp
+++ b/include/graphblas/bsp1d/properties.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Gathers the properties of the BSP1D and hybrid backends.
+ *
* @author A. N. Yzelman
* @date 5th of May 2017
*/
@@ -30,11 +34,31 @@ namespace grb {
/** No implementation notes. */
template<>
class Properties< BSP1D > {
- public:
- /** No implementation notes. */
- constexpr static bool writableCaptured = Properties< _GRB_BSP1D_BACKEND >::writableCaptured;
+
+ public:
+
+ /** This property is inherited from the backend it depends on. */
+ static constexpr const bool writableCaptured =
+ Properties< _GRB_BSP1D_BACKEND >::writableCaptured;
+
+ /**
+ * This implementation at present only supports blocking execution.
+ */
+ static constexpr const bool isBlockingExecution = true;
+
+ /**
+ * This implementation at present only supports blocking execution.
+ */
+ static constexpr const bool isNonblockingExecution = false;
+
+ static_assert( Properties< _GRB_BSP1D_BACKEND >::isBlockingExecution,
+ "This implementation assumes blocking behaviour of the underlying "
+ "process-local backend"
+ );
+
};
} // namespace grb
#endif // end ``_H_GRB_BSP1D_PROPERTIES''
+
diff --git a/include/graphblas/bsp1d/vector.hpp b/include/graphblas/bsp1d/vector.hpp
index 465f5a661..1e85db74e 100644
--- a/include/graphblas/bsp1d/vector.hpp
+++ b/include/graphblas/bsp1d/vector.hpp
@@ -618,14 +618,15 @@ namespace grb {
const size_t bufferSize =
internal::Coordinates< _GRB_BSP1D_BACKEND >::bufferSize( _local_n ) +
internal::Coordinates< _GRB_BSP1D_BACKEND >::bufferSize( cap_in );
+ // allocate raw, assigned, and stack arrays
const RC rc = grb::utils::alloc(
"grb::Vector< T, BSP1D, C > (initialize)", sstream.str(),
- _raw, cap_in, true, _raw_deleter, // allocate raw array
+ _raw, cap_in, true, _raw_deleter,
new_assigned,
internal::Coordinates< _GRB_BSP1D_BACKEND >::arraySize( cap_in ),
true,
- _assigned_deleter, // allocate assigned array
- _buffer, bufferSize, true, _buffer_deleter // allocate (stack) buffer
+ _assigned_deleter,
+ _buffer, bufferSize, true, _buffer_deleter
);
// identify error and throw
if( rc == OUTOFMEM ) {
@@ -2439,6 +2440,26 @@ namespace grb {
// done
}
+ /**
+ * Copy-assignment.
+ *
+ * Same performance semantics as #grb::set.
+ *
+ * \warning Errors will be thrown as standard C++ exceptions. Users who would
+ * rather not deal with exceptions are encouraged to use #grb::set directly.
+ *
+ * \internal Dispatches to #grb::set.
+ */
+ Vector< D, BSP1D, C > & operator=( Vector< D, BSP1D, C > &x ) {
+ const auto rc = set( *this, x );
+ if( rc != SUCCESS ) {
+ throw std::runtime_error( "grb::set inside copy-assignment: "
+ + toString( rc )
+ );
+ }
+ return *this;
+ }
+
/**
* Assign-from-temporary. This is a \f$ \Theta(1) \f$ operation.
*
diff --git a/include/graphblas/collectives.hpp b/include/graphblas/collectives.hpp
index a99eac739..8ca63fd3e 100644
--- a/include/graphblas/collectives.hpp
+++ b/include/graphblas/collectives.hpp
@@ -28,13 +28,19 @@
// include template specialisations
#ifdef _GRB_WITH_REFERENCE
-#include <graphblas/reference/collectives.hpp>
+ #include <graphblas/reference/collectives.hpp>
+#endif
+#ifdef _GRB_WITH_HYPERDAGS
+ #include <graphblas/hyperdags/collectives.hpp>
+#endif
+#ifdef _GRB_WITH_NONBLOCKING
+ #include "graphblas/nonblocking/collectives.hpp"
#endif
#ifdef _GRB_WITH_LPF
-#include <graphblas/bsp1d/collectives.hpp>
+ #include <graphblas/bsp1d/collectives.hpp>
#endif
#ifdef _GRB_WITH_BANSHEE
-#include <graphblas/banshee/collectives.hpp>
+ #include <graphblas/banshee/collectives.hpp>
#endif
// specify default only if requested during compilation
@@ -46,3 +52,4 @@ namespace grb {
#endif
#endif // end ``_H_GRB_COLL''
+
diff --git a/include/graphblas/config.hpp b/include/graphblas/config.hpp
index 8ef4e6a8d..d7c2a650f 100644
--- a/include/graphblas/config.hpp
+++ b/include/graphblas/config.hpp
@@ -32,6 +32,9 @@
#ifdef _GRB_WITH_HYPERDAGS
#include "graphblas/hyperdags/config.hpp"
#endif
+#ifdef _GRB_WITH_NONBLOCKING
+ #include "graphblas/nonblocking/config.hpp"
+#endif
#ifdef _GRB_WITH_OMP
#include "graphblas/omp/config.hpp"
#endif
diff --git a/include/graphblas/coordinates.hpp b/include/graphblas/coordinates.hpp
index 3ddf662bb..43f5c9845 100644
--- a/include/graphblas/coordinates.hpp
+++ b/include/graphblas/coordinates.hpp
@@ -27,13 +27,17 @@
// now include all specialisations contained in the backend directories:
#ifdef _GRB_WITH_REFERENCE
-#include <graphblas/reference/coordinates.hpp>
+ #include <graphblas/reference/coordinates.hpp>
+#endif
+#ifdef _GRB_WITH_NONBLOCKING
+ #include "graphblas/nonblocking/coordinates.hpp"
#endif
#ifdef _GRB_WITH_LPF
// #include <graphblas/bsp1d/coordinates.hpp>
#endif
#ifdef _GRB_WITH_BANSHEE
-#include <graphblas/banshee/coordinates.hpp>
+ #include <graphblas/banshee/coordinates.hpp>
#endif
#endif // _H_GRB_COORDINATES
+
diff --git a/include/graphblas/descriptors.hpp b/include/graphblas/descriptors.hpp
index 1fe3f9836..c90cb5c3d 100644
--- a/include/graphblas/descriptors.hpp
+++ b/include/graphblas/descriptors.hpp
@@ -18,7 +18,7 @@
/**
* @file
*
- * Defines the GraphBLAS various descriptors.
+ * Defines all ALP/GraphBLAS descriptors.
*
* @author A. N. Yzelman
* @date 15 March, 2016
@@ -117,11 +117,11 @@ namespace grb {
static constexpr Descriptor structural_complement = structural | invert_mask;
/**
- * Indicates that all input vectors to an ALP/GraphBLAS primitive are
- * structurally dense.
+ * Indicates that all input and output vectors to an ALP/GraphBLAS primitive
+ * are structurally dense.
*
- * If a user passes this descriptor but one or more vectors input to the call
- * are \em not structurally dense, then #ILLEGAL shall be returned.
+ * If a user passes this descriptor but one or more vectors to the call are
+ * \em not structurally dense, then #ILLEGAL shall be returned.
*
* \warning All vectors includes any vectors that operate as masks.
* Thus if the primitive is to operate with structurally sparse masks
@@ -134,6 +134,10 @@ namespace grb {
* passing this descriptor to such primitive indicates that also the
* output vector is structurally dense.
*
+ * \warning For out-of-place operations with vector output(s), passing this
+ * descriptor also demands that the output vectors are already
+ * dense.
+ *
* \warning Vectors with explicit zeroes (under the semiring passed to the
* related primitive) will be computed with explicitly.
*
@@ -141,6 +145,7 @@ namespace grb {
* 1) less run-time overhead as code handling sparsity is disabled;
* 2) smaller binary sizes as code handling structurally sparse vectors is
* not emitted (unless required elsewhere).
+ *
* The consistent use of this descriptor is hence strongly encouraged.
*/
static constexpr Descriptor dense = 16;
diff --git a/include/graphblas/distribution.hpp b/include/graphblas/distribution.hpp
index 845400337..a382b27ef 100644
--- a/include/graphblas/distribution.hpp
+++ b/include/graphblas/distribution.hpp
@@ -26,10 +26,11 @@
#include "base/distribution.hpp"
#ifdef _GRB_WITH_LPF
-#include "graphblas/bsp1d/distribution.hpp"
+ #include "graphblas/bsp1d/distribution.hpp"
#endif
#ifdef _GRB_WITH_BANSHEE
-#include "graphblas/banshee/distribution.hpp"
+ #include "graphblas/banshee/distribution.hpp"
#endif
#endif // end `_H_GRB_DISTRIBUTION'
+
diff --git a/include/graphblas/exec.hpp b/include/graphblas/exec.hpp
index 22a5bc422..2bcf796aa 100644
--- a/include/graphblas/exec.hpp
+++ b/include/graphblas/exec.hpp
@@ -28,13 +28,19 @@
// include template specialisations
#ifdef _GRB_WITH_REFERENCE
-#include "graphblas/reference/exec.hpp"
+ #include "graphblas/reference/exec.hpp"
+#endif
+#ifdef _GRB_WITH_HYPERDAGS
+ #include "graphblas/hyperdags/exec.hpp"
+#endif
+#ifdef _GRB_WITH_NONBLOCKING
+ #include "graphblas/nonblocking/exec.hpp"
#endif
#ifdef _GRB_WITH_LPF
-#include "graphblas/bsp1d/exec.hpp"
+ #include "graphblas/bsp1d/exec.hpp"
#endif
#ifdef _GRB_WITH_BANSHEE
-#include "graphblas/banshee/exec.hpp"
+ #include "graphblas/banshee/exec.hpp"
#endif
#ifdef _GRB_BACKEND
@@ -45,3 +51,4 @@ namespace grb {
#endif
#endif // end ``_H_GRB_EXEC''
+
diff --git a/include/graphblas/hyperdags/README.md b/include/graphblas/hyperdags/README.md
new file mode 100644
index 000000000..4ebfe820f
--- /dev/null
+++ b/include/graphblas/hyperdags/README.md
@@ -0,0 +1,304 @@
+
+This backend gathers meta-data while user programs execute. The actual compute
+logic is executed by a compile-time selected secondary backend, which by default
+is the `reference` backend. The meta-data will be used to generate, at program
+exit, a HyperDAG representation of the executed computation. We foresee two
+possible HyperDAG representations:
+
+ 1. a coarse-grain representation where vertices correspond to a) source
+ containers (vectors or matrices-- not scalars), b) output containers, or
+ c) ALP/GraphBLAS primitives (such as grb::mxv or grb::dot). Hyperedges
+ capture which vertices act as sources to operations or to outputs in other
+ vertices. Each hyperedge has exactly one source vertex.
+
+ 2. a fine-grain representation where source vertices correspond to nonzeroes
+ in a source container, not the container as a whole, and likewise for output
+ vertices that correspond to individual elements of output containers. Also,
+ a single ALP/GraphBLAS primitive now expands into many fine-grained
+ operation vertices. For example, a call to grb::vxm will emit two
+ hyperedges for every nonzero in the sparse input matrix.
+
+Only the extraction of a coarse-grained representation is presently implemented.
+
+Usage
+=====
+
+To use the HyperDAG generation backend, take the following steps. Note that
+steps 1-5 are common to building the general ALP/GraphBLAS template library.
+Steps 6 and 7 showcase the HyperDAG generation using representation no. 1 on
+the `tests/unit/dot.cpp` unit test.
+
+1. `cd /path/to/ALP/GraphBLAS/root/directory`
+
+2. `./configure --prefix=/path/to/install/directory`
+
+3. `cd build`
+
+4. `make -j && make -j install`
+
+5. `source /path/to/install/directory/bin/setenv`
+
+6. `grbcxx -b hyperdags -g -O0 -Wall -o dot_hyperdag ../tests/unit/dot.cpp`
+
+7. `grbrun -b hyperdags ./dot_hyperdag`
+
+After these steps, something like the following will be produced:
+
+```
+This is functional test ./dot_hyperdag
+Info: grb::init (hyperdags) called.
+Info: grb::init (reference) called.
+Info: grb::finalize (hyperdags) called.
+ dumping HyperDAG to stdout
+%%MatrixMarket matrix coordinate pattern general
+% Source vertices:
+% 0: container initialised by a call to set no. 0
+% 1: container initialised by a call to set no. 1
+% 2: input scalar no. 0
+% 6: input scalar no. 1
+...more comment lines follow...
+% 212: input scalar no. 103
+% 213: user-initialised container no. 0
+% 214: user-initialised container no. 1
+214 216 428
+0 2
+0 3
+1 0
+1 3
+2 1
+2 3
+...more pins follow...
+213 214
+213 215
+Info: grb::finalize (reference) called.
+Test OK
+```
+
+This output contains the HyperDAG corresponding to the code in the given source
+file, `tests/unit/dot.cpp`. Let us examine it. First, ALP/GraphBLAS will always
+print info (and warning) statements to the standard error stream. These are:
+
+```
+$ grbrun -b hyperdags ./dot_hyperdag 1> /dev/null
+Info: grb::init (hyperdags) called.
+Info: grb::init (reference) called.
+Info: grb::finalize (hyperdags) called.
+ dumping HyperDAG to stdout
+Info: grb::finalize (reference) called.
+```
+
+These statements indicate which backends are used and when they are
+initialised and finalised, respectively. The info messages indicate that the
+hyperdags backend is used, which, in turn, employs the standard sequential
+reference backend for the actual computations. The second-to-last message
+reports that, as part of finalising the hyperdags backend, it dumps the
+HyperDAG constructed during computations to the standard output stream
+(stdout).
+
+The output to stdout starts with
+
+```
+%%MatrixMarket matrix coordinate pattern general
+```
+
+This indicates the HyperDAG is stored using a MatrixMarket format. As the name
+implies, this format stores sparse matrices, so we need a definition of how the
+sparse matrix is mapped back to a HyperDAG. Here, rows correspond to hyperedges
+while columns correspond to vertices.
+
+In the MatrixMarket format, comments are allowed and should start with a `%`.
+The hyperdags backend presently prints which vertices are sources as comment
+lines. Information on the operation and output vertices may be added later.
+
+After the comments follows the so-called header line:
+
+```
+214 216 428
+```
+
+This indicates that there are 214 hyperedges, 216 vertices, and 428 pins in
+the output HyperDAG. What then follows is one line for each of the pins,
+printed as a pair of hyperedge and vertex IDs.
+
+For example, the first two pins contain:
+
+```
+0 2
+0 3
+```
+
+These pins connect hyperedge 0 to vertices 2 and 3, which the comments note
+are an input scalar and a non-source vertex, respectively. The corresponding
+first statements of `tests/unit/dot.cpp` are as follows. It stands to reason
+that vertex 2 corresponds to the scalar `out` in the below code, while vertex
+3 corresponds to the scalar output of the `grb::dot`.
+
+```
+ double out = 2.55;
+ grb::Vector< double > left( n );
+ grb::Vector< double > right( n );
+ grb::set( left, 1.5 );
+ grb::set( right, -1.0 );
+ grb::dot( out, left, right, ring );
+```
+
+If this reading is correct, then there should also be two hyperedges connecting
+`left` and `right` to vertex 3, the output of `grb::dot`. Indeed the next four
+pins are
+
+```
+1 0
+1 3
+2 1
+2 3
+```
+
+which indeed correspond to two hyperedges connecting `left` and `right` to the
+output of `grb::dot`. Do note that thus far the HyperDAG is in fact just a DAG,
+given every hyperedge has exactly two pins.
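+
+Such a dump may be consumed programmatically like any other MatrixMarket
+coordinate pattern file. The below is a minimal sketch only: it assumes the
+MatrixMarket portion of the output was saved to a hypothetically-named file
+`hyperdag.mtx`, and it treats the printed IDs as zero-based (as in the dump
+above), whereas standard MatrixMarket files are one-based.
+
+```
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+int main() {
+	std::ifstream in( "hyperdag.mtx" );
+	std::string line;
+	// skip the banner and all comment lines
+	while( std::getline( in, line ) && !line.empty() && line[ 0 ] == '%' ) {}
+	// the remaining line is the header: #hyperedges #vertices #pins
+	std::istringstream header( line );
+	size_t m = 0, n = 0, pins = 0;
+	header >> m >> n >> pins;
+	// read each pin as a (hyperedge ID, vertex ID) pair
+	std::vector< std::vector< size_t > > hyperedges( m );
+	for( size_t k = 0; k < pins; ++k ) {
+		size_t e, v;
+		in >> e >> v;
+		hyperedges[ e ].push_back( v );
+	}
+	std::cout << "Read " << m << " hyperedges over " << n << " vertices\n";
+	return 0;
+}
+```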
+
+
+Extending the HyperDAGs backend
+===============================
+
+We now briefly visit the implementation of the HyperDAGs backend. The
+implementation of the `hyperdags` `grb::dot` is as follows:
+
+```
+template<
+ Descriptor descr = descriptors::no_operation,
+ class AddMonoid, class AnyOp,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+>
+RC dot( OutputType &z,
+ const Vector< InputType1, hyperdags, Coords > &x,
+ const Vector< InputType2, hyperdags, Coords > &y,
+ const AddMonoid &addMonoid = AddMonoid(),
+ const AnyOp &anyOp = AnyOp(),
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< AddMonoid >::value &&
+ grb::is_operator< AnyOp >::value,
+ void >::type * const = nullptr
+) {
+...
+```
+
+The signature of the `grb::dot` follows the specification that is found in
+`include/graphblas/reference/blas1.hpp`-- if we need to add a new primitive,
+the first step is to simply copy the signature from the reference backend and
+then change any container template arguments that read `reference` into
+`hyperdags`. This makes sure that the compiler will select the implementation
+we are providing here whenever it needs to generate code for a dot-product using
+the hyperdags backend.
+
+The source file continues:
+```
+ // always force input scalar to be a new source
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &z
+ );
+ ...
+```
+
+Here, we recognise that `z` is an input to the algorithm and needs to be
+registered as a source vertex. Recall that by the `grb::dot` specification,
+`z` is indeed computed in-place: `z += < x, y >`.
+
+The source continues with registering the sources and destinations (outputs) of
+the dot-operation itself:
+
+```
+ std::array< const void *, 1 > sourcesP{ &z };
+ std::array< uintptr_t, 2 > sourcesC{ getID( x ), getID( y ) };
+ std::array< uintptr_t, 1 > destinations{ getID( z ) };
+ ...
+```
+Note that this records auxiliary scalars using pointers, while ALP/GraphBLAS
+containers are registered using their container IDs. With that done, we finally
+record the operation, as follows:
+
+```
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::DOT,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ ...
+```
+
+Here, the `addOperation` needs to know the type of operation (`DOT`), what its
+sources are (given here by iterator pairs over the `sourcesP` and `sourcesC`
+arrays), and what its destinations are (ditto).
+
+The attentive reader will realise that so far no computation has occurred--
+we only recorded sources and the intended operation. We hence finish up by
+actually performing the requested computation, relying fully on the
+reference backend instead of reimplementing things all over again:
+
+```
+ return dot( z,
+ internal::getVector(x), internal::getVector(y),
+ addMonoid, anyOp
+ );
+}
+```
+
+Here, the `internal::getVector` wrapper function retrieves the reference
+backend's version of each input vector, so that the actual computation is
+carried out by the reference backend.
+
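+Putting the above together: a new primitive in the hyperdags backend typically
+(i) copies the reference signature, (ii) registers non-container inputs as
+sources, (iii) records the operation, and (iv) delegates the computation. The
+following minimal sketch illustrates this pattern for a hypothetical unary
+primitive `foo` with a likewise hypothetical operation type `FOO`; phases and
+error codes are again ignored:
+
+```
+template< typename IOType, typename InputType, typename Coords >
+RC foo(
+	Vector< IOType, hyperdags, Coords > &z,
+	const Vector< InputType, hyperdags, Coords > &x
+) {
+	// this example has no scalar inputs; otherwise, register each of them
+	// as a source vertex via internal::hyperdags::generator.addSource
+	std::array< const void *, 0 > sourcesP{};
+	std::array< uintptr_t, 2 > sourcesC{
+		getID( internal::getVector(x) ),
+		getID( internal::getVector(z) )
+	};
+	std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+	internal::hyperdags::generator.addOperation(
+		internal::hyperdags::FOO,
+		sourcesP.begin(), sourcesP.end(),
+		sourcesC.begin(), sourcesC.end(),
+		destinations.begin(), destinations.end()
+	);
+	// delegate the actual computation to the secondary backend
+	return foo( internal::getVector(z), internal::getVector(x) );
+}
+```
+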
+This quick description ignores phases and error codes -- please see some of the
+actual code snippets in the hyperdags backend for error-safe programming
+patterns.
+
+
+Registering new operation and source types
+==========================================
+
+One may want to register a new type of operation vertex or source vertex. For
+this, see `include/graphblas/hyperdags/hyperdags.hpp` and, in the case of source
+vertices, look for the following enum:
+
+```
+enum SourceVertexType {
+ SCALAR,
+ CONTAINER,
+ ITERATOR,
+ USER_INT
+};
+
+const constexpr size_t numSourceVertexTypes = 4;
+
+const constexpr enum SourceVertexType
+ allSourceVertexTypes[ numSourceVertexTypes ] =
+{
+ SCALAR,
+ CONTAINER,
+ ITERATOR,
+ USER_INT
+};
+```
+
+A new type of source vertex should:
+
+1. be added to the enum. While not copied here, every entry is accompanied by
+ documentation that describes unambiguously where such a source vertex comes
+ from and how and when it is generated;
+
+2. increment the `numSourceVertexTypes` counter;
+
+3. be added to the `allSourceVertexTypes` array; and, finally,
+
+4. be handled by the `toString` function in
+ `src/graphblas/hyperdags/hyperdags.cpp` (the sketch below applies all four
+ steps).
+
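+For illustration, the following minimal sketch registers a hypothetical new
+source type `FILE_SOURCE`-- the name is a placeholder only, not part of the
+actual backend:
+
+```
+enum SourceVertexType {
+	SCALAR,
+	CONTAINER,
+	ITERATOR,
+	USER_INT,
+	FILE_SOURCE // (1) hypothetical new entry, with documentation
+};
+
+const constexpr size_t numSourceVertexTypes = 5; // (2) was 4
+
+const constexpr enum SourceVertexType
+	allSourceVertexTypes[ numSourceVertexTypes ] =
+{
+	SCALAR,
+	CONTAINER,
+	ITERATOR,
+	USER_INT,
+	FILE_SOURCE // (3) also appended here
+};
+
+// (4) finally, extend toString in src/graphblas/hyperdags/hyperdags.cpp so
+// that it also maps FILE_SOURCE to a human-readable string
+```
+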
+To add new operation vertex types, the same recipe should be followed, using
+the `OperationVertexType` enum, the `numOperationVertexTypes` counter, and the
+`allOperationVertexTypes` array; and similarly for output vertex types.
+
diff --git a/include/graphblas/hyperdags/alloc.hpp b/include/graphblas/hyperdags/alloc.hpp
new file mode 100644
index 000000000..4806f694f
--- /dev/null
+++ b/include/graphblas/hyperdags/alloc.hpp
@@ -0,0 +1,58 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Provides allocators for the hyperdags backend
+ *
+ * @author A. N. Yzelman
+ * @date 31st of January, 2022
+ */
+
+#ifndef _H_GRB_HYPERDAGS_ALLOC
+#define _H_GRB_HYPERDAGS_ALLOC
+
+
+namespace grb {
+
+ namespace utils {
+
+ namespace internal {
+
+ template<>
+ class Allocator< hyperdags > {
+
+ private:
+
+ /** Prevent instantiation. */
+ Allocator();
+
+ public:
+
+ /** Refer to the standard allocation mechanism. */
+ typedef AllocatorFunctions< _GRB_WITH_HYPERDAGS_USING > functions;
+ };
+
+ } // namespace internal
+
+ } // namespace utils
+
+} // namespace grb
+
+#endif
+
diff --git a/include/graphblas/hyperdags/benchmark.hpp b/include/graphblas/hyperdags/benchmark.hpp
new file mode 100644
index 000000000..23502f33c
--- /dev/null
+++ b/include/graphblas/hyperdags/benchmark.hpp
@@ -0,0 +1,101 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Provides the Benchmarker for the HyperDAGs backend
+ *
+ * @author A. Karanasiou
+ * @date 11th of May, 2022
+ */
+
+#ifndef _H_GRB_HYPERDAGS_BENCH
+#define _H_GRB_HYPERDAGS_BENCH
+
+#include <graphblas/rc.hpp>
+#include <graphblas/base/benchmark.hpp>
+
+#include "exec.hpp"
+
+
+namespace grb {
+
+ /** \internal Simply wraps around the underlying Benchmarker implementation. */
+ template< enum EXEC_MODE mode >
+ class Benchmarker< mode, hyperdags > :
+ protected Launcher< mode, hyperdags >, protected internal::BenchmarkerBase
+ {
+
+ private:
+
+ typedef Benchmarker< mode, _GRB_WITH_HYPERDAGS_USING > MyBenchmarkerType;
+
+ MyBenchmarkerType benchmarker;
+
+
+ public:
+
+ /** \internal Simple delegation. */
+ Benchmarker(
+ const size_t process_id = 0,
+ const size_t nprocs = 1,
+ const std::string hostname = "localhost",
+ const std::string port = "0"
+ ) :
+ benchmarker( process_id, nprocs, hostname, port )
+ {}
+
+ /** \internal Simple delegation. */
+ template< typename U >
+ RC exec( void ( *grb_program )( const void *, const size_t, U & ),
+ const void * const data_in, const size_t in_size,
+ U &data_out,
+ const size_t inner, const size_t outer,
+ const bool broadcast = false
+ ) const {
+ return benchmarker.exec(
+ grb_program,
+ data_in, in_size,
+ data_out,
+ inner, outer,
+ broadcast
+ );
+ }
+
+ /** \internal Simple delegation. */
+ template< typename T, typename U >
+ RC exec(
+ void ( *grb_program )( const T &, U & ),
+ const T &data_in, U &data_out,
+ const size_t inner, const size_t outer,
+ const bool broadcast = false
+ ) {
+ return benchmarker.exec(
+ grb_program,
+ data_in, data_out,
+ inner, outer,
+ broadcast
+ );
+ }
+
+ };
+
+} // namespace grb
+
+#endif // end ``_H_GRB_HYPERDAGS_BENCH''
+
diff --git a/include/graphblas/hyperdags/blas1.hpp b/include/graphblas/hyperdags/blas1.hpp
new file mode 100644
index 000000000..623174345
--- /dev/null
+++ b/include/graphblas/hyperdags/blas1.hpp
@@ -0,0 +1,2987 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Provides the "level-1" primitives for the HyperDAGs backend
+ *
+ * @author A. N. Yzelman
+ * @date 31st of January, 2022
+ */
+
+#ifndef _H_GRB_HYPERDAGS_BLAS1
+#define _H_GRB_HYPERDAGS_BLAS1
+
+#include <array>
+
+#include <graphblas/rc.hpp>
+
+#include <graphblas/hyperdags/hyperdags.hpp>
+
+
+namespace grb {
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class AddMonoid, class AnyOp,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC dot(
+ OutputType &z,
+ const Vector< InputType1, hyperdags, Coords > &x,
+ const Vector< InputType2, hyperdags, Coords > &y,
+ const AddMonoid &addMonoid = AddMonoid(),
+ const AnyOp &anyOp = AnyOp(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< AddMonoid >::value &&
+ grb::is_operator< AnyOp >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = dot< descr >(
+ z, internal::getVector(x), internal::getVector(y),
+ addMonoid, anyOp, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &z
+ );
+ std::array< const void *, 1 > sourcesP{ &z };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) )
+ };
+ std::array< uintptr_t, 0 > destinations{};
+ // NOTE scalar output is ignored
+ //std::array< const void *, 1 > destinationsP{ &z };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::DOT,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename InputType1, typename InputType2,
+ class Semiring, typename Coords
+ >
+ RC dot(
+ OutputType &z,
+ const Vector< InputType1, hyperdags, Coords > &x,
+ const Vector< InputType2, hyperdags, Coords > &y,
+ const Semiring &ring = Semiring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Semiring >::value,
+ void >::type * const = nullptr
+ ) {
+ // note: dispatches to the above dot-variant, which will handle the HyperDAG
+ // generation.
+ return dot< descr >(
+ z, x, y,
+ ring.getAdditiveMonoid(), ring.getMultiplicativeOperator(),
+ phase
+ );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename T, typename U, typename Coords
+ >
+ RC zip(
+ Vector< std::pair< T, U >, hyperdags, Coords > &z,
+ const Vector< T, hyperdags, Coords > &x,
+ const Vector< U, hyperdags, Coords > &y,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< T >::value &&
+ !grb::is_object< U >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = zip< descr >(
+ internal::getVector(z),
+ internal::getVector(x), internal::getVector(y),
+ phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::ZIP,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename T, typename U, typename Coords
+ >
+ RC unzip(
+ Vector< T, hyperdags, Coords > &x,
+ Vector< U, hyperdags, Coords > &y,
+ const Vector< std::pair< T, U >, hyperdags, Coords > &in,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< T >::value &&
+ !grb::is_object< U >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = unzip< descr >(
+ internal::getVector(x), internal::getVector(y), internal::getVector(in),
+ phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(in) ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(in) )
+ };
+ std::array< uintptr_t, 2 > destinations{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) )
+ };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::UNZIP_VECTOR_VECTOR_VECTOR,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class OP,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< InputType1, hyperdags, Coords > &x,
+ const Vector< InputType2, hyperdags, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseApply< descr >(
+ internal::getVector(z),
+ internal::getVector(x), internal::getVector(y),
+ op, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::E_WISE_APPLY_VECTOR_VECTOR_VECTOR_OP,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename InputType, typename IOType, typename Coords
+ >
+ RC foldr(
+ const Vector< InputType, hyperdags, Coords > &x,
+ IOType &beta,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< InputType >::value &&
+ !grb::is_object< IOType >::value &&
+ grb::is_monoid< Monoid >::value, void
+ >::type * const = nullptr
+ ) {
+ const RC ret = foldr< descr >( internal::getVector(x), beta, monoid, phase );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 1 > sourcesP{ &beta };
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getVector(x) ) };
+ std::array< uintptr_t, 0 > destinations{};
+ // NOTE scalar output is ignored
+ //std::array< const void *, 1 > destinationsP{ &beta };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::FOLDR_VECTOR_SCALAR_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename InputType, typename MaskType, typename IOType, typename Coords
+ >
+ RC foldr(
+ const Vector< InputType, hyperdags, Coords > &x,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ IOType &beta,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< InputType >::value &&
+ !grb::is_object< IOType >::value &&
+ grb::is_monoid< Monoid >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return foldr< descr >( x, beta, monoid, phase );
+ }
+ const RC ret = foldr< descr >(
+ internal::getVector(x), internal::getVector(m),
+ beta, monoid, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 1 > sourcesP{ &beta };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(m) )
+ };
+ std::array< uintptr_t, 0 > destinations{};
+ // NOTE scalar output is ignored
+ // std::array< const void *, 1 > destinationsP{ &beta };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::FOLDR_VECTOR_MASK_SCALAR_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename IOType, typename InputType, typename Coords
+ >
+ RC foldr(
+ const InputType &alpha,
+ Vector< IOType, hyperdags, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< InputType >::value &&
+ !grb::is_object< IOType >::value &&
+ grb::is_monoid< Monoid >::value, void
+ >::type * const = nullptr
+ ) {
+ const RC ret = foldr< descr >( alpha, internal::getVector(y), monoid, phase );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(y) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ std::array< const void *, 1 > sourcesP{ &alpha };
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getVector(y) ) };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(y) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::FOLDR_APLHA_VECTOR_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP, typename IOType, typename InputType, typename Coords
+ >
+ RC foldr(
+ const InputType &alpha,
+ Vector< IOType, hyperdags, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< InputType >::value &&
+ !grb::is_object< IOType >::value &&
+ grb::is_operator< OP >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = foldr< descr >( alpha, internal::getVector(y), op, phase );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(y) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ std::array< const void *, 1 > sourcesP{ &alpha };
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getVector(y) ) };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(y) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::FOLDR_APLHA_VECTOR_OPERATOR,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class OP,
+ typename IOType, typename InputType, typename Coords
+ >
+ RC foldr(
+ const Vector< InputType, hyperdags, Coords > &x,
+ Vector< IOType, hyperdags, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_operator< OP >::value &&
+ !grb::is_object< InputType >::value &&
+ !grb::is_object< IOType >::value,
+ void >::type * = nullptr
+ ) {
+ const RC ret = foldr< descr >(
+ internal::getVector(x),
+ internal::getVector(y),
+ op, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(y) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::FOLDR_VECTOR_VECTOR_OPERATOR,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class OP,
+ typename IOType, typename MaskType, typename InputType, typename Coords
+ >
+ RC foldr(
+ const Vector< InputType, hyperdags, Coords > &x,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ Vector< IOType, hyperdags, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_operator< OP >::value &&
+ !grb::is_object< InputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< IOType >::value,
+ void >::type * = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return foldr< descr >( x, y, op, phase );
+ }
+ const RC ret = foldr< descr >(
+ internal::getVector(x),
+ internal::getVector(m),
+ internal::getVector(y),
+ op, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(m) ),
+ getID( internal::getVector(y) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(y) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::FOLDR_VECTOR_VECTOR_VECTOR_OPERATOR,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid, typename IOType, typename InputType, typename Coords
+ >
+ RC foldr(
+ const Vector< InputType, hyperdags, Coords > &x,
+ Vector< IOType, hyperdags, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_monoid< Monoid >::value &&
+ !grb::is_object< InputType >::value &&
+ !grb::is_object< IOType >::value,
+ void >::type * = nullptr
+ ) {
+ const RC ret = foldr< descr >(
+ internal::getVector(x), internal::getVector(y),
+ monoid, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(y) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::FOLDR_VECTOR_VECTOR_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename IOType, typename MaskType, typename InputType,
+ typename Coords
+ >
+ RC foldr(
+ const Vector< InputType, hyperdags, Coords > &x,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ Vector< IOType, hyperdags, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_monoid< Monoid >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType >::value &&
+ !grb::is_object< IOType >::value,
+ void >::type * = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return foldr< descr >( x, y, monoid, phase );
+ }
+ const RC ret = foldr< descr >(
+ internal::getVector(x), internal::getVector(m),
+ internal::getVector(y), monoid, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(m) ),
+ getID( internal::getVector(y) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(y) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::FOLDR_VECTOR_VECTOR_VECTOR_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename InputType, typename IOType, typename Coords
+ >
+ RC foldl(
+ IOType &x,
+ const Vector< InputType, hyperdags, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType >::value &&
+ grb::is_monoid< Monoid >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = foldl< descr >(
+ x, internal::getVector(y), monoid, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(y) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &x
+ );
+ std::array< const void *, 1 > sourcesP{ &x };
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getVector(y) ) };
+ std::array< uintptr_t, 0 > destinations{};
+ // NOTE scalar outputs are ignored
+ //std::array< const void *, 1 > destinationsP{ &x };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::FOLDL_SCALAR_VECTOR_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename InputType, typename IOType, typename MaskType,
+ typename Coords
+ >
+ RC foldl(
+ IOType &x,
+ const Vector< InputType, hyperdags, Coords > &y,
+ const Vector< MaskType, hyperdags, Coords > &mask,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ grb::is_monoid< Monoid >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(mask) ) == 0 ) {
+ return foldl< descr >( x, y, monoid, phase );
+ }
+ const RC ret = foldl< descr >(
+ x, internal::getVector(y), internal::getVector(mask),
+ monoid, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &x
+ );
+ std::array< const void *, 1 > sourcesP{ &x };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(mask) )
+ };
+ std::array< uintptr_t, 0 > destinations{};
+ // NOTE scalar outputs are ignored
+ // std::array< const void * const, 1 > destinationsP{ &x };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::FOLDL_SCALAR_VECTOR_MASK_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Op, typename IOType, typename InputType, typename Coords
+ >
+ RC foldl(
+ Vector< IOType, hyperdags, Coords > &x,
+ const InputType beta,
+ const Op &op = Op(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType >::value &&
+ grb::is_operator< Op >::value,
+ void >::type * = nullptr
+ ) {
+ const RC ret = foldl< descr >( internal::getVector(x), beta, op, phase );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 1 > sourcesP{ &beta };
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getVector(x) ) };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::FOLDL_VECTOR_BETA_OP,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Op,
+ typename IOType, typename MaskType, typename InputType, typename Coords
+ >
+ RC foldl(
+ Vector< IOType, hyperdags, Coords > &x,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ const InputType beta,
+ const Op &op = Op(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType >::value &&
+ grb::is_operator< Op >::value,
+ void >::type * = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return foldl< descr >( x, beta, op, phase );
+ }
+ const RC ret = foldl< descr >(
+ internal::getVector(x), internal::getVector(m),
+ beta, op, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 1 > sourcesP{ &beta };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(m) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::FOLDL_VECTOR_VECTOR_BETA_OP,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename IOType, typename InputType, typename Coords
+ >
+ RC foldl(
+ Vector< IOType, hyperdags, Coords > &x,
+ const InputType beta,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType >::value &&
+ grb::is_monoid< Monoid >::value, void
+ >::type * = nullptr
+ ) {
+ const RC ret = foldl< descr >( internal::getVector(x), beta, monoid, phase );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 1 > sourcesP{ &beta };
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getVector(x) ) };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::FOLDL_VECTOR_BETA_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename IOType, typename MaskType, typename InputType,
+ typename Coords
+ >
+ RC foldl(
+ Vector< IOType, hyperdags, Coords > &x,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ const InputType &beta,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType >::value &&
+ grb::is_monoid< Monoid >::value,
+ void >::type * = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return foldl< descr >( x, beta, monoid, phase );
+ }
+ const RC ret = foldl< descr >(
+ internal::getVector(x), internal::getVector(m),
+ beta, monoid, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 1 > sourcesP{ &beta };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(m) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::FOLDL_VECTOR_VECTOR_BETA_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template <
+ Descriptor descr = descriptors::no_operation,
+ class Monoid, typename IOType, typename InputType,
+ typename Coords
+ >
+ RC foldl(
+ Vector< IOType, hyperdags, Coords > &x,
+ const Vector< InputType, hyperdags, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_monoid< Monoid >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType >::value,
+ void >::type * = nullptr
+ ) {
+ const RC ret = foldl< descr >(
+ internal::getVector(x), internal::getVector(y),
+ monoid, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(y) ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::FOLDL_VECTOR_VECTOR_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template <
+ Descriptor descr = descriptors::no_operation, class OP,
+ typename IOType, typename MaskType, typename InputType,
+ typename Coords
+ >
+ RC foldl(
+ Vector< IOType, hyperdags, Coords > &x,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ const Vector< InputType, hyperdags, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_operator< OP >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType >::value, void
+ >::type * = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return foldl< descr >( x, y, op, phase );
+ }
+ const RC ret = foldl< descr >(
+ internal::getVector(x), internal::getVector(m),
+ internal::getVector(y), op, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(m) ),
+ getID( internal::getVector(y) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::FOLDL_VECTOR_VECTOR_VECTOR_OP,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename IOType, typename MaskType, typename InputType,
+ typename Coords
+ >
+ RC foldl(
+ Vector< IOType, hyperdags, Coords > &x,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ const Vector< InputType, hyperdags, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_monoid< Monoid >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType >::value,
+ void >::type * = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return foldl< descr >( x, y, monoid, phase );
+ }
+ const RC ret = foldl< descr >(
+ internal::getVector(x), internal::getVector(m),
+ internal::getVector(y), monoid, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(m) ),
+ getID( internal::getVector(y) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::FOLDL_VECTOR_VECTOR_VECTOR_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP, typename IOType, typename InputType,
+ typename Coords
+ >
+ RC foldl(
+ Vector< IOType, hyperdags, Coords > &x,
+ const Vector< InputType, hyperdags, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_operator< OP >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType >::value,
+ void >::type * = nullptr
+ ) {
+ const RC ret = foldl< descr >(
+ internal::getVector(x), internal::getVector(y),
+ op, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::FOLDL_VECTOR_VECTOR_OP,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template< typename Func, typename DataType, typename Coords >
+ RC eWiseLambda(
+ const Func f, const Vector< DataType, hyperdags, Coords > &x
+ ) {
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getVector(x) ) };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISELAMBDA,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return eWiseLambda( f, internal::getVector(x) );
+ }
+
+ namespace internal {
+
+ /** \internal This is the end recursion */
+ template<
+ typename Func, typename DataType,
+ typename Coords
+ >
+ RC hyperdag_ewisevector(
+ const Func f,
+ const Vector< DataType, grb::hyperdags, Coords > &x,
+ std::vector< uintptr_t > &sources,
+ std::vector< uintptr_t > &destinations
+ ) {
+ const RC ret = grb::eWiseLambda( f, internal::getVector(x) );
+ if( ret != grb::SUCCESS ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ sources.push_back( getID( internal::getVector(x) ) );
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISELAMBDA_FUNC_VECTOR,
+ sourcesP.cbegin(), sourcesP.cend(),
+ sources.cbegin(), sources.cend(),
+ destinations.cbegin(), destinations.cend()
+ );
+ return ret;
+ }
+
+ /** \internal This is the base recursion */
+ template<
+ typename Func, typename DataType1, typename DataType2,
+ typename Coords, typename... Args
+ >
+ RC hyperdag_ewisevector(
+ const Func f,
+ const Vector< DataType1, grb::hyperdags, Coords > &x,
+ std::vector< uintptr_t > &sources,
+ std::vector< uintptr_t > &destinations,
+ const Vector< DataType2, grb::hyperdags, Coords > &y,
+ Args... args
+ ) {
+ sources.push_back( getID( internal::getVector(y) ) );
+ destinations.push_back( getID( internal::getVector(y) ) );
+ return hyperdag_ewisevector( f, x, sources, destinations, args... );
+ }
+
+ } // end namespace grb::internal
+
+ template<
+ typename Func,
+ typename DataType1, typename DataType2, typename Coords,
+ typename... Args
+ >
+ RC eWiseLambda(
+ const Func f,
+ const Vector< DataType1, hyperdags, Coords > &x,
+ const Vector< DataType2, hyperdags, Coords > &y,
+ Args const &... args
+ ) {
+ std::vector< uintptr_t > sources, destinations;
+ return internal::hyperdag_ewisevector(
+ f, x, sources, destinations, y, args...
+ );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class OP,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseApply< descr >(
+ internal::getVector(z), alpha, beta,
+ op, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(z) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 2 > sourcesP{ &alpha, &beta };
+ std::array< uintptr_t, 1 > sourcesC{
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEAPPLY_VECTOR_ALPHA_BETA_OP,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename OutputType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseApply< descr >(
+ internal::getVector(z), alpha, beta,
+ monoid, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 2 > sourcesP{ &alpha, &beta };
+ std::array< uintptr_t, 1 > sourcesC{
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEAPPLY_VECTOR_ALPHA_BETA_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
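+ /*
+  * Masked variants share a common dispatch rule: an empty mask (of size
+  * zero) means "no mask", so such calls forward to the corresponding
+  * unmasked variant before any hyperDAG state is touched. A sketch:
+  *
+  *   grb::Vector< bool > empty_mask( 0 );
+  *   grb::eWiseApply( z, empty_mask, alpha, beta, op );
+  *   // behaves exactly as grb::eWiseApply( z, alpha, beta, op )
+  */
+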
+ template<
+ Descriptor descr = descriptors::no_operation, class OP,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &mask,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(mask) ) == 0 ) {
+ return eWiseApply< descr >( z, alpha, beta, op, phase );
+ }
+ const RC ret = eWiseApply< descr >(
+ internal::getVector(z), internal::getVector(mask),
+ alpha, beta,
+ op, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(z) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 2 > sourcesP{ &alpha, &beta };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(mask) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{
+ getID( internal::getVector(z) )
+ };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEAPPLY_VECTOR_MASK_ALPHA_BETA_OP,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &mask,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(mask) ) == 0 ) {
+ return eWiseApply< descr >( z, alpha, beta, monoid, phase );
+ }
+ const RC ret = eWiseApply< descr >(
+ internal::getVector(z), internal::getVector(mask),
+ alpha, beta,
+ monoid, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 2 > sourcesP{ &alpha, &beta };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(mask) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEAPPLY_VECTOR_MASK_ALPHA_BETA_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class OP,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< InputType1, hyperdags, Coords > &x,
+ const InputType2 beta,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value
+ && !grb::is_object< InputType1 >::value
+ && !grb::is_object< InputType2 >::value
+ && grb::is_operator< OP >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseApply< descr >(
+ internal::getVector(z), internal::getVector(x), beta,
+ op, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 1 > sourcesP{ &beta };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEAPPLY_VECTOR_VECTOR_BETA_OP,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class OP, typename OutputType,
+ typename InputType1, typename InputType2, typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const InputType1 alpha,
+ const Vector< InputType2, hyperdags, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value
+ && grb::is_operator< OP >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseApply< descr >(
+ internal::getVector(z), alpha, internal::getVector(y),
+ op, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(z) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ std::array< const void *, 1 > sourcesP{ &alpha };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEAPPLY_VECTOR_ALPHA_VECTOR_OP,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &mask,
+ const Vector< InputType1, hyperdags, Coords > &x,
+ const InputType2 beta,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(mask) ) == 0 ) {
+ return eWiseApply< descr >( z, x, beta, monoid, phase );
+ }
+ const RC ret = eWiseApply< descr >(
+ internal::getVector(z), internal::getVector(mask),
+ internal::getVector(x), beta,
+ monoid, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 1 > sourcesP{ &beta };
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(mask) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEAPPLY_VECTOR_MASK_VECTOR_BETA_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class OP,
+ typename OutputType, typename MaskType, typename InputType1,
+ typename InputType2, typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &mask,
+ const Vector< InputType1, hyperdags, Coords > &x,
+ const InputType2 beta,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(mask) ) == 0 ) {
+ return eWiseApply< descr >( z, x, beta, op, phase );
+ }
+ const RC ret = eWiseApply< descr >(
+ internal::getVector(z), internal::getVector(mask),
+ internal::getVector(x), beta,
+ op, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 1 > sourcesP{ &beta };
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(mask) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEAPPLY_VECTOR_MASK_VECTOR_BETA_OP,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &mask,
+ const InputType1 alpha,
+ const Vector< InputType2, hyperdags, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(mask) ) == 0 ) {
+ return eWiseApply< descr >( z, alpha, y, monoid, phase );
+ }
+ const RC ret = eWiseApply< descr >(
+ internal::getVector(z), internal::getVector(mask),
+ alpha, internal::getVector(y),
+ monoid, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ std::array< const void *, 1 > sourcesP{ &alpha };
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(mask) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEAPPLY_VECTOR_MASK_ALPHA_VECTOR_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class OP,
+ typename OutputType, typename MaskType, typename InputType1,
+ typename InputType2, typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &mask,
+ const InputType1 alpha,
+ const Vector< InputType2, hyperdags, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(mask) ) == 0 ) {
+ return eWiseApply< descr >( z, alpha, y, op, phase );
+ }
+ const RC ret = eWiseApply< descr >(
+ internal::getVector(z), internal::getVector(mask),
+ alpha, internal::getVector(y),
+ op, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ std::array< const void *, 1 > sourcesP{ &alpha };
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(mask) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEAPPLY_VECTOR_MASK_ALPHA_VECTOR_OP,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class OP,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &mask,
+ const Vector< InputType1, hyperdags, Coords > &x,
+ const Vector< InputType2, hyperdags, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(mask) ) == 0 ) {
+ return eWiseApply< descr >( z, x, y, op, phase );
+ }
+ const RC ret = eWiseApply< descr >(
+ internal::getVector(z), internal::getVector(mask),
+ internal::getVector(x), internal::getVector(y),
+ op, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 4 > sourcesC{
+ getID( internal::getVector(mask) ),
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEAPPLY_VECTOR_MASK_VECTOR_VECTOR_OP,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< InputType1, hyperdags, Coords > &x,
+ const InputType2 beta,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseApply< descr >(
+ internal::getVector(z),
+ internal::getVector(x), beta,
+ monoid, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 1 > sourcesP{ &beta };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEAPPLY_VECTOR_VECTOR_BETA_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const InputType1 alpha,
+ const Vector< InputType2, hyperdags, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseApply< descr >(
+ internal::getVector(z),
+ alpha, internal::getVector(y),
+ monoid, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(y) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ std::array< const void *, 1 > sourcesP{ &alpha };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEAPPLY_VECTOR_ALPHA_VECTOR_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2, typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &mask,
+ const Vector< InputType1, hyperdags, Coords > &x,
+ const Vector< InputType2, hyperdags, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(mask) ) == 0 ) {
+ return eWiseApply< descr >( z, x, y, monoid, phase );
+ }
+ const RC ret = eWiseApply< descr >(
+ internal::getVector(z), internal::getVector(mask),
+ internal::getVector(x), internal::getVector(y),
+ monoid, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 4 > sourcesC{
+ getID( internal::getVector(mask) ),
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEAPPLY_VECTOR_MASK_VECTOR_VECTOR_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Monoid,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< InputType1, hyperdags, Coords > &x,
+ const Vector< InputType2, hyperdags, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseApply< descr >(
+ internal::getVector(z),
+ internal::getVector(x), internal::getVector(y),
+ monoid, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEAPPLY_VECTOR_VECTOR_VECTOR_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
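+ /*
+  * The deprecated eWiseMulAdd family is nonetheless traced: each variant
+  * forwards to the underlying backend and, on a successful EXECUTE
+  * phase, records a single fused multiply-add node whose sources are all
+  * scalar and container inputs plus the output, and whose destination is
+  * the output. A sketch, assuming a semiring `ring` and initialised
+  * vectors of matching size:
+  *
+  *   grb::eWiseMulAdd( z, a, x, y, ring );
+  *   // roughly z_i gets a_i * x_i + y_i under ring; one hyperDAG node
+  */
+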
+ /** \warning This function is deprecated */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename InputType3,
+ typename OutputType, typename MaskType, typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ const Vector< InputType1, hyperdags, Coords > &a,
+ const Vector< InputType2, hyperdags, Coords > &x,
+ const Vector< InputType3, hyperdags, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value &&
+ !grb::is_object< MaskType >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return eWiseMulAdd< descr >( z, a, x, y, ring, phase );
+ }
+ const RC ret = eWiseMulAdd< descr >(
+ internal::getVector(z), internal::getVector(m),
+ internal::getVector(a), internal::getVector(x), internal::getVector(y),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 5 > sourcesC{
+ getID( internal::getVector(m) ),
+ getID( internal::getVector(a) ),
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISE_MUL_ADD,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ /** \warning This function is deprecated */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename InputType3,
+ typename OutputType, typename MaskType, typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ const Vector< InputType1, hyperdags, Coords > &a,
+ const Vector< InputType2, hyperdags, Coords > &x,
+ const InputType3 gamma,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value &&
+ !grb::is_object< MaskType >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return eWiseMulAdd< descr >( z, a, x, gamma, ring, phase );
+ }
+ const RC ret = eWiseMulAdd< descr >(
+ internal::getVector(z), internal::getVector(m),
+ internal::getVector(a), internal::getVector(x), gamma,
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &gamma
+ );
+ std::array< const void *, 1 > sourcesP{ &gamma };
+ std::array< uintptr_t, 4 > sourcesC{
+ getID( internal::getVector(m) ),
+ getID( internal::getVector(a) ),
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISE_MUL_ADD_FOUR_VECTOR,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ /** \warning This function is deprecated */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename InputType3,
+ typename OutputType, typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const InputType1 alpha,
+ const Vector< InputType2, hyperdags, Coords > &x,
+ const Vector< InputType3, hyperdags, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseMulAdd< descr >(
+ internal::getVector(z), alpha,
+ internal::getVector(x), internal::getVector(y),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ std::array< const void *, 1 > sourcesP{ &alpha };
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISE_MUL_ADD_THREE_VECTOR_ALPHA,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ /** \warning This function is deprecated */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring, typename InputType1,
+ typename InputType2, typename InputType3, typename OutputType, typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< InputType1, hyperdags, Coords > &a,
+ const InputType2 chi,
+ const Vector< InputType3, hyperdags, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseMulAdd< descr >(
+ internal::getVector(z),
+ internal::getVector(a), chi, internal::getVector(y),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(y) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &chi
+ );
+ std::array< const void *, 1 > sourcesP{ &chi };
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(a) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISE_MUL_ADD_THREE_VECTOR_CHI,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ /** \warning This function is deprecated */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename InputType3,
+ typename OutputType, typename MaskType, typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ const InputType1 alpha,
+ const Vector< InputType2, hyperdags, Coords > &x,
+ const Vector< InputType3, hyperdags, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value &&
+ !grb::is_object< MaskType >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return eWiseMulAdd< descr >( z, alpha, x, y, ring, phase );
+ }
+ const RC ret = eWiseMulAdd< descr >(
+ internal::getVector(z), internal::getVector(m),
+ alpha, internal::getVector(x), internal::getVector(y),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ std::array< const void *, 1 > sourcesP{ &alpha };
+ std::array< uintptr_t, 4 > sourcesC{
+ getID( internal::getVector(m) ),
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISE_MUL_ADD_FOUR_VECTOR_CHI,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ /** \warning This function is deprecated */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename InputType3,
+ typename OutputType, typename MaskType, typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ const Vector< InputType1, hyperdags, Coords > &a,
+ const InputType2 chi,
+ const Vector< InputType3, hyperdags, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value &&
+ !grb::is_object< MaskType >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return eWiseMulAdd< descr >( z, a, chi, y, ring, phase );
+ }
+ const RC ret = eWiseMulAdd< descr >(
+ internal::getVector(z), internal::getVector(m),
+ internal::getVector(a), chi, internal::getVector(y),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &chi
+ );
+ std::array< const void *, 1 > sourcesP{ &chi };
+ std::array< uintptr_t, 4 > sourcesC{
+ getID( internal::getVector(m) ),
+ getID( internal::getVector(a) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISE_MUL_ADD_FOUR_VECTOR_CHI_RING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ /** \warning This function is deprecated */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename InputType3,
+ typename OutputType, typename MaskType, typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ const Vector< InputType1, hyperdags, Coords > &a,
+ const InputType2 beta,
+ const InputType3 gamma,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value &&
+ !grb::is_object< MaskType >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return eWiseMulAdd< descr >( z, a, beta, gamma, ring, phase );
+ }
+ const RC ret = eWiseMulAdd< descr >(
+ internal::getVector(z), internal::getVector(m),
+ internal::getVector(a), beta, gamma,
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &gamma
+ );
+ std::array< const void *, 2 > sourcesP{ &beta, &gamma };
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(m) ),
+ getID( internal::getVector(a) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISE_MUL_ADD_THREE_VECTOR_BETA,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ /** \warning This function is deprecated */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename InputType3,
+ typename OutputType, typename MaskType, typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ const InputType1 alpha,
+ const Vector< InputType2, hyperdags, Coords > &x,
+ const InputType3 gamma,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value &&
+ !grb::is_object< MaskType >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return eWiseMulAdd< descr >( z, alpha, x, gamma, ring, phase );
+ }
+ const RC ret = eWiseMulAdd< descr >(
+ internal::getVector(z), internal::getVector(m),
+ alpha, internal::getVector(x), gamma,
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &gamma
+ );
+ std::array< const void *, 2 > sourcesP{ &alpha, &gamma };
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(m) ),
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISE_MUL_ADD_THREE_VECTOR_ALPHA_GAMMA,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ /** \warning This function is deprecated */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2, typename InputType3,
+ typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Vector< InputType3, hyperdags, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value &&
+ !grb::is_object< MaskType >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return eWiseMulAdd< descr >( z, alpha, beta, y, ring, phase );
+ }
+ const RC ret = eWiseMulAdd< descr >(
+ internal::getVector(z), internal::getVector(m),
+ alpha, beta, internal::getVector(y),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 2 > sourcesP{ &alpha, &beta };
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(m) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISE_MUL_ADD_TWO_VECTOR_ALPHA_BETA,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ /** \warning This function is deprecated */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename OutputType, typename MaskType, typename InputType1,
+ typename InputType2, typename InputType3, typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const InputType3 gamma,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ !grb::is_object< MaskType >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return eWiseMulAdd< descr >( z, alpha, beta, gamma, ring, phase );
+ }
+ const RC ret = eWiseMulAdd< descr >(
+ internal::getVector(z), internal::getVector(m),
+ alpha, beta, gamma,
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &gamma
+ );
+ std::array< const void *, 3 > sourcesP{ &alpha, &beta, &gamma };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(m) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISE_MUL_ADD_TWO_VECTOR_ALPHA_BETA_GAMMA,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ /** \warning This function is deprecated */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename InputType3,
+ typename OutputType, typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< InputType1, hyperdags, Coords > &a,
+ const Vector< InputType2, hyperdags, Coords > &x,
+ const InputType3 gamma,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseMulAdd< descr >(
+ internal::getVector(z),
+ internal::getVector(a), internal::getVector(x), gamma,
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &gamma
+ );
+ std::array< const void *, 1 > sourcesP{ &gamma };
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(a) ),
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEMULADD_VECTOR_VECTOR_VECTOR_GAMMA_RING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ /** \warning This function is deprecated */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename InputType3,
+ typename OutputType, typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< InputType1, hyperdags, Coords > &a,
+ const InputType2 beta,
+ const InputType3 gamma,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseMulAdd< descr >(
+ internal::getVector(z),
+ internal::getVector(a), beta, gamma,
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(z) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &gamma
+ );
+ std::array< const void *, 2 > sourcesP{ &beta, &gamma };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(a) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEMULADD_VECTOR_VECTOR_BETA_GAMMA_RING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ /** \warning This function is deprecated */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename InputType3,
+ typename OutputType, typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const InputType1 alpha,
+ const Vector< InputType2, hyperdags, Coords > &x,
+ const InputType3 gamma,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseMulAdd< descr >(
+ internal::getVector(z),
+ alpha, internal::getVector(x), gamma,
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &gamma
+ );
+ std::array< const void *, 2 > sourcesP{ &alpha, &gamma };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEMULADD_VECTOR_ALPHA_VECTOR_GAMMA_RING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ /** \warning This function is deprecated */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename InputType3, typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Vector< InputType3, hyperdags, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseMulAdd< descr >(
+ internal::getVector(z),
+ alpha, beta, internal::getVector(y),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(y) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 2 > sourcesP{ &alpha, &beta };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEMULADD_VECTOR_ALPHA_BETA_VECTOR_RING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ /** \warning This function is deprecated */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename InputType3, typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const InputType3 gamma,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseMulAdd< descr >(
+ internal::getVector(z),
+ alpha, beta, gamma,
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(z) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &gamma
+ );
+ std::array< const void *, 3 > sourcesP{ &alpha, &beta, &gamma };
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getVector(z) ) };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEMULADD_VECTOR_ALPHA_BETA_GAMMA_RING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ /** \warning This function is deprecated */
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename InputType3,
+ typename OutputType, typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< InputType1, hyperdags, Coords > &a,
+ const Vector< InputType2, hyperdags, Coords > &x,
+ const Vector< InputType3, hyperdags, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseMulAdd< descr >(
+ internal::getVector(z),
+ internal::getVector(a), internal::getVector(x), internal::getVector(y),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 4 > sourcesC{
+ getID( internal::getVector(a) ),
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEMULADD_VECTOR_VECTOR_VECTOR_VECTOR_RING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
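+ /*
+  * The eWiseMul variants below follow the same pattern. As elsewhere in
+  * this backend, only the EXECUTE phase is recorded; e.g. (sketch):
+  *
+  *   grb::eWiseMul( z, x, y, ring, grb::RESIZE );  // not recorded
+  *   grb::eWiseMul( z, x, y, ring );               // EXECUTE: recorded
+  */
+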
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< InputType1, hyperdags, Coords > &x,
+ const Vector< InputType2, hyperdags, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseMul< descr >(
+ internal::getVector(z), internal::getVector(x), internal::getVector(y),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEMUL_VECTOR_VECTOR_VECTOR_RING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const InputType1 alpha,
+ const Vector< InputType2, hyperdags, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseMul< descr >(
+ internal::getVector(z),
+ alpha, internal::getVector(y),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(y) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ std::array< const void *, 1 > sourcesP{ &alpha };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEMUL_VECTOR_ALPHA_VECTOR_RING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< InputType1, hyperdags, Coords > &x,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseMul< descr >(
+ internal::getVector(z),
+ internal::getVector(x), beta,
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( internal::getVector(x) ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 1 > sourcesP{ &beta };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEMUL_VECTOR_VECTOR_BETA_RING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseMul< descr >(
+ internal::getVector(z),
+ alpha, beta,
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 2 > sourcesP{ &alpha, &beta };
+ std::array< uintptr_t, 1 > sourcesC{
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEMUL_VECTOR_ALPHA_BETA_RING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename MaskType, typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ const Vector< InputType1, hyperdags, Coords > &x,
+ const Vector< InputType2, hyperdags, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< MaskType >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return eWiseMul< descr >( z, x, y, ring, phase );
+ }
+ const RC ret = eWiseMul< descr >(
+ internal::getVector(z),
+ internal::getVector(m), internal::getVector(x), internal::getVector(y),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 4 > sourcesC{
+ getID( internal::getVector(m) ),
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEMUL_VECTOR_VECTOR_VECTOR_VECTOR_RING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename MaskType, typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ const InputType1 alpha,
+ const Vector< InputType2, hyperdags, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< MaskType >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return eWiseMul< descr >( z, alpha, y, ring, phase );
+ }
+ const RC ret = eWiseMul< descr >(
+ internal::getVector(z), internal::getVector(m),
+ alpha, internal::getVector(y),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ std::array< const void *, 1 > sourcesP{ &alpha };
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(m) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEMUL_VECTOR_VECTOR_ALPHA_VECTOR_RING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename MaskType, typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ const Vector< InputType1, hyperdags, Coords > &x,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< MaskType >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return eWiseMul< descr >( z, x, beta, ring, phase );
+ }
+ const RC ret = eWiseMul< descr >(
+ internal::getVector(z), internal::getVector(m),
+ internal::getVector(x), beta,
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 1 > sourcesP{ &beta };
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(m) ),
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEMUL_VECTOR_VECTOR_VECTOR_BETA_RING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename MaskType, typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, hyperdags, Coords > &z,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< MaskType >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(m) ) == 0 ) {
+ return eWiseMul< descr >( z, alpha, beta, ring, phase );
+ }
+ const RC ret = eWiseMul< descr >(
+ internal::getVector(z), internal::getVector(m),
+ alpha, beta,
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &alpha
+ );
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &beta
+ );
+ std::array< const void *, 2 > sourcesP{ &alpha, &beta };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(m) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(z) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEMUL_VECTOR_VECTOR_ALPHA_BETA_RING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+} // end namespace grb
+
+#endif
+
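All of the eWiseMul wrappers above follow one interception pattern: delegate to the underlying backend, return early unless the call succeeded in the EXECUTE phase, register any scalar inputs as source vertices, and then record a single operation vertex whose sources are the scalar pointers plus the IDs of all containers read, and whose destination is the ID of the container written. The stand-alone sketch below mimics that pattern with a hypothetical, much-simplified recorder that only prints the hyperedge; none of its names are part of the ALP API.

	#include <array>
	#include <cstdint>
	#include <iostream>

	// Hypothetical stand-in for internal::hyperdags::generator.addOperation:
	// prints the container IDs that form one recorded operation vertex.
	template< typename FwdIt >
	void recordOperation(
		const char * const type,
		FwdIt srcBegin, const FwdIt srcEnd,
		const uintptr_t destination
	) {
		std::cout << type << ": sources =";
		for( ; srcBegin != srcEnd; ++srcBegin ) {
			std::cout << " " << *srcBegin;
		}
		std::cout << ", destination = " << destination << "\n";
	}

	int main() {
		// Suppose getID returned 1 for a vector x and 2 for a vector z.
		const std::array< uintptr_t, 2 > sourcesC{ 1, 2 };
		// Mirrors the EWISEMUL_VECTOR_VECTOR_BETA_RING recording above.
		recordOperation( "EWISEMUL_VECTOR_VECTOR_BETA_RING",
			sourcesC.cbegin(), sourcesC.cend(), 2 );
		return 0;
	}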
diff --git a/include/graphblas/hyperdags/blas2.hpp b/include/graphblas/hyperdags/blas2.hpp
new file mode 100644
index 000000000..e2e781be2
--- /dev/null
+++ b/include/graphblas/hyperdags/blas2.hpp
@@ -0,0 +1,687 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Implements the BLAS-2 API for the HyperDAGs backend
+ *
+ * @author A. Karanasiou
+ * @date 3rd of March, 2022
+ */
+
+#ifndef _H_GRB_HYPERDAGS_BLAS2
+#define _H_GRB_HYPERDAGS_BLAS2
+
+#include
+
+#include
+
+#include
+
+
+namespace grb {
+
+ template<
+ Descriptor descr = descriptors::no_operation, class Ring,
+ typename IOType, typename InputType1, typename InputType2,
+ typename InputType3, typename Coords
+ >
+ RC vxm(
+ Vector< IOType, hyperdags, Coords > &u,
+ const Vector< InputType3, hyperdags, Coords > &mask,
+ const Vector< InputType1, hyperdags, Coords > &v,
+ const Matrix< InputType2, hyperdags > &A,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
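+ // An empty mask indicates an unmasked call: delegate so that the unmasked
+ // variant, and its operation type, is recorded instead.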
+ if( size( internal::getVector(mask) ) == 0 ) {
+ return vxm< descr >( u, v, A, ring, phase );
+ }
+ const RC ret = vxm< descr >(
+ internal::getVector(u), internal::getVector(mask),
+ internal::getVector(v), internal::getMatrix(A),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 4 > sourcesC{
+ getID( internal::getVector(mask) ),
+ getID( internal::getVector(v) ),
+ getID( internal::getMatrix(A) ),
+ getID( internal::getVector(u) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(u) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::VXM_VECTOR_VECTOR_VECTOR_MATRIX,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class AdditiveMonoid, class MultiplicativeOperator,
+ typename IOType, typename InputType1, typename InputType2,
+ typename InputType3, typename Coords
+ >
+ RC vxm(
+ Vector< IOType, hyperdags, Coords > &u,
+ const Vector< InputType3, hyperdags, Coords > &mask,
+ const Vector< InputType1, hyperdags, Coords > &v,
+ const Matrix< InputType2, hyperdags > &A,
+ const AdditiveMonoid &add = AdditiveMonoid(),
+ const MultiplicativeOperator &mul = MultiplicativeOperator(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_monoid< AdditiveMonoid >::value &&
+ grb::is_operator< MultiplicativeOperator >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ !std::is_same< InputType2, void >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(mask) ) == 0 ) {
+ return vxm< descr >( u, v, A, add, mul, phase );
+ }
+ const RC ret = vxm< descr >(
+ internal::getVector(u), internal::getVector(mask),
+ internal::getVector(v), internal::getMatrix(A),
+ add, mul, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 4 > sourcesC{
+ getID( internal::getVector(mask) ),
+ getID( internal::getVector(v) ),
+ getID( internal::getMatrix(A) ),
+ getID( internal::getVector(u) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(u) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::VXM_VECTOR_VECTOR_VECTOR_MATRIX_ADD_MUL,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename IOType = typename Ring::D4,
+ typename InputType1 = typename Ring::D1,
+ typename InputType2 = typename Ring::D2,
+ typename Coords
+ >
+ RC vxm(
+ Vector< IOType, hyperdags, Coords > &u,
+ const Vector< InputType1, hyperdags, Coords > &v,
+ const Matrix< InputType2, hyperdags > &A,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = vxm< descr >(
+ internal::getVector(u),
+ internal::getVector(v), internal::getMatrix(A),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(v) ),
+ getID( internal::getMatrix(A) ),
+ getID( internal::getVector(u) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(u) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::VXM_VECTOR_VECTOR_MATRIX_RING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename IOType = typename Ring::D4,
+ typename InputType1 = typename Ring::D1,
+ typename InputType2 = typename Ring::D2,
+ typename InputType3 = bool,
+ typename Coords
+ >
+ RC mxv(
+ Vector< IOType, hyperdags, Coords > &u,
+ const Vector< InputType3, hyperdags, Coords > &mask,
+ const Matrix< InputType2, hyperdags > &A,
+ const Vector< InputType1, hyperdags, Coords > &v,
+ const Ring &ring,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(mask) ) == 0 ) {
+ return mxv< descr >( u, A, v, ring, phase );
+ }
+ const RC ret = mxv< descr >(
+ internal::getVector(u), internal::getVector(mask),
+ internal::getMatrix(A), internal::getVector(v),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 4 > sourcesC{
+ getID( internal::getVector(mask) ),
+ getID( internal::getMatrix(A) ),
+ getID( internal::getVector(v) ),
+ getID( internal::getVector(u) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(u) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::MXV_VECTOR_VECTOR_MATRIX_VECTOR_RING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ bool output_may_be_masked = true,
+ bool input_may_be_masked = true,
+ class Ring,
+ typename IOType, typename InputType1, typename InputType2,
+ typename InputType3, typename InputType4, typename Coords
+ >
+ RC mxv(
+ Vector< IOType, hyperdags, Coords > &u,
+ const Vector< InputType3, hyperdags, Coords > &mask,
+ const Matrix< InputType2, hyperdags > &A,
+ const Vector< InputType1, hyperdags, Coords > &v,
+ const Vector< InputType4, hyperdags, Coords > &v_mask,
+ const Ring &ring,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ !grb::is_object< InputType4 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(v_mask) ) == 0 ) {
+ return mxv< descr >( u, mask, A, v, ring, phase );
+ }
+ const RC ret = mxv< descr >(
+ internal::getVector(u), internal::getVector(mask),
+ internal::getMatrix(A), internal::getVector(v), internal::getVector(v_mask),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ std::vector< uintptr_t > sourcesC{
+ getID( internal::getVector(v_mask) ),
+ getID( internal::getMatrix(A) ),
+ getID( internal::getVector(v) ),
+ getID( internal::getVector(u) )
+ };
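+ // The output mask is optional; record it as an extra source only when it
+ // is non-empty.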
+ if( size( internal::getVector(mask) ) > 0 ) {
+ sourcesC.push_back( getID( internal::getVector(mask) ) );
+ }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(u) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::MXV_VECTOR_VECTOR_MATRIX_VECTOR_VECTOR_R,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ bool output_may_be_masked = true,
+ bool input_may_be_masked = true,
+ class AdditiveMonoid, class MultiplicativeOperator,
+ typename IOType, typename InputType1, typename InputType2,
+ typename InputType3, typename InputType4, typename Coords
+ >
+ RC mxv(
+ Vector< IOType, hyperdags, Coords > &u,
+ const Vector< InputType3, hyperdags, Coords > &mask,
+ const Matrix< InputType2, hyperdags > &A,
+ const Vector< InputType1, hyperdags, Coords > &v,
+ const Vector< InputType4, hyperdags, Coords > &v_mask,
+ const AdditiveMonoid &add = AdditiveMonoid(),
+ const MultiplicativeOperator &mul = MultiplicativeOperator(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_monoid< AdditiveMonoid >::value &&
+ grb::is_operator< MultiplicativeOperator >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ !grb::is_object< InputType4 >::value &&
+ !std::is_same< InputType2, void >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(v_mask) ) == 0 ) {
+ return mxv< descr >( u, mask, A, v, add, mul, phase );
+ }
+ const RC ret = mxv< descr >(
+ internal::getVector(u), internal::getVector(mask),
+ internal::getMatrix(A), internal::getVector(v), internal::getVector(v_mask),
+ add, mul, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::vector< uintptr_t > sourcesC{
+ getID( internal::getMatrix(A) ),
+ getID( internal::getVector(v) ),
+ getID( internal::getVector(v_mask) ),
+ getID( internal::getVector(u) )
+ };
+ if( size( internal::getVector(mask) ) > 0 ) {
+ sourcesC.push_back( getID( internal::getVector(mask) ) );
+ }
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(u) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::MXV_VECTOR_VECTOR_MATRIX_VECTOR_VECTOR_A,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename IOType = typename Ring::D4,
+ typename InputType1 = typename Ring::D1,
+ typename InputType2 = typename Ring::D2,
+ typename Coords
+ >
+ RC mxv(
+ Vector< IOType, hyperdags, Coords > &u,
+ const Matrix< InputType2, hyperdags > &A,
+ const Vector< InputType1, hyperdags, Coords > &v,
+ const Ring &ring,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = mxv< descr >(
+ internal::getVector(u),
+ internal::getMatrix(A), internal::getVector(v),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getMatrix(A) ),
+ getID( internal::getVector(v) ),
+ getID( internal::getVector(u) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(u) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::MXV_VECTOR_MATRIX_VECTOR_RING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class AdditiveMonoid, class MultiplicativeOperator,
+ typename IOType, typename InputType1, typename InputType2, typename Coords
+ >
+ RC mxv(
+ Vector< IOType, hyperdags, Coords > &u,
+ const Matrix< InputType2, hyperdags > &A,
+ const Vector< InputType1, hyperdags, Coords > &v,
+ const AdditiveMonoid &add = AdditiveMonoid(),
+ const MultiplicativeOperator &mul = MultiplicativeOperator(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_monoid< AdditiveMonoid >::value &&
+ grb::is_operator< MultiplicativeOperator >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !std::is_same< InputType2, void >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = mxv< descr >(
+ internal::getVector(u),
+ internal::getMatrix(A), internal::getVector(v),
+ add, mul, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getMatrix(A) ),
+ getID( internal::getVector(v) ),
+ getID( internal::getVector(u) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(u) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::MXV_VECTOR_MATRIX_VECTOR_ADD_MUL,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ /** \internal Uses a direct implementation. */
+ template<
+ typename Func, typename DataType
+ >
+ RC eWiseLambda(
+ const Func f,
+ const Matrix< DataType, hyperdags > &A
+ ) {
+ const RC ret = eWiseLambda( f, internal::getMatrix(A) );
+ if( ret != SUCCESS ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getMatrix(A) ) };
+ std::array< uintptr_t, 0 > destinations{};
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISELAMBDA_FUNC_MATRIX,
+ sourcesP.cbegin(), sourcesP.cend(),
+ sourcesC.cbegin(), sourcesC.cend(),
+ destinations.cbegin(), destinations.cend()
+ );
+ return ret;
+ }
+
+ namespace internal {
+
+ /** \internal This is the end of the recursion */
+ template<
+ typename Func, typename DataType
+ >
+ RC hyperdag_ewisematrix(
+ const Func f,
+ const Matrix< DataType, grb::hyperdags > &A,
+ std::vector< uintptr_t > &sources,
+ std::vector< uintptr_t > &destinations
+ ) {
+ const RC ret = grb::eWiseLambda( f, internal::getMatrix(A) );
+ if( ret != SUCCESS ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ sources.push_back( getID( internal::getMatrix(A) ) );
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISELAMBDA_FUNC_MATRIX,
+ sourcesP.cbegin(), sourcesP.cend(),
+ sources.cbegin(), sources.cend(),
+ destinations.cbegin(), destinations.cend()
+ );
+ return ret;
+ }
+
+ /** \internal This is the recursive step */
+ template<
+ typename Func, typename DataType1, typename DataType2,
+ typename Coords, typename... Args
+ >
+ RC hyperdag_ewisematrix(
+ const Func f,
+ const Matrix< DataType1, grb::hyperdags > &A,
+ std::vector< uintptr_t > &sources,
+ std::vector< uintptr_t > &destinations,
+ const Vector< DataType2, grb::hyperdags, Coords > &x,
+ Args... args
+ ) {
+ sources.push_back( getID( internal::getVector(x) ) );
+ destinations.push_back( getID( internal::getVector(x) ) );
+ return hyperdag_ewisematrix( f, A, sources, destinations, args... );
+ }
+
+ } // end namespace grb::internal
+
+ /** \internal Starts the above recursion over all given vectors */
+ template<
+ typename Func,
+ typename DataType1, typename DataType2,
+ typename Coords, typename... Args
+ >
+ RC eWiseLambda(
+ const Func f,
+ const Matrix< DataType1, hyperdags > &A,
+ const Vector< DataType2, hyperdags, Coords > &x,
+ Args... args
+ ) {
+ std::vector< uintptr_t > sources, destinations;
+ return internal::hyperdag_ewisematrix(
+ f, A, sources, destinations, x, args...
+ );
+ }
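The eWiseLambda overload above gathers the IDs of all captured vectors through a variadic recursion: each step of hyperdag_ewisematrix peels one vector off the argument pack and appends its ID to both the source and the destination lists, and the end case records the operation once the lists are complete. A stripped-down illustration of that peeling, with plain integers standing in for getID results (all names below are illustrative, not ALP API):

	#include <cstdint>
	#include <iostream>
	#include <vector>

	// End of the recursion: all IDs collected; here we simply print them.
	void collect( std::vector< uintptr_t > &ids ) {
		for( const auto id : ids ) { std::cout << id << " "; }
		std::cout << "\n";
	}

	// Recursive step: peel one ID off the pack, then recurse on the rest.
	template< typename... Args >
	void collect( std::vector< uintptr_t > &ids,
		const uintptr_t head, Args... tail
	) {
		ids.push_back( head );
		collect( ids, tail... );
	}

	int main() {
		std::vector< uintptr_t > ids;
		collect( ids, 3, 1, 4 ); // prints: 3 1 4
		return 0;
	}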
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ bool output_may_be_masked = true,
+ bool input_may_be_masked = true,
+ class Ring,
+ typename IOType, typename InputType1, typename InputType2,
+ typename InputType3, typename InputType4, typename Coords
+ >
+ RC vxm(
+ Vector< IOType, hyperdags, Coords > &u,
+ const Vector< InputType3, hyperdags, Coords > &mask,
+ const Vector< InputType1, hyperdags, Coords > &v,
+ const Vector< InputType4, hyperdags, Coords > &v_mask,
+ const Matrix< InputType2, hyperdags > &A,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ !grb::is_object< InputType4 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(v_mask) ) == 0 ) {
+ return vxm< descr >( u, mask, v, A, ring, phase );
+ }
+ const RC ret = vxm< descr >(
+ internal::getVector(u), internal::getVector(mask),
+ internal::getVector(v), internal::getVector(v_mask), internal::getMatrix(A),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::vector< uintptr_t > sourcesC{
+ getID( internal::getVector(v) ),
+ getID( internal::getMatrix(A) ),
+ getID( internal::getVector(v_mask) ),
+ getID( internal::getVector(u) )
+ };
+ if( size( internal::getVector(mask) ) > 0 ) {
+ sourcesC.push_back( getID( internal::getVector(mask) ) );
+ }
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(u) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::VXM_GENERIC_VECTOR_VECTOR_VECTOR_VECTOR_MATRIX_ADD_MUL,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ bool output_may_be_masked = true,
+ bool input_may_be_masked = true,
+ class AdditiveMonoid, class MultiplicativeOperator,
+ typename IOType, typename InputType1, typename InputType2,
+ typename InputType3, typename InputType4, typename Coords
+ >
+ RC vxm(
+ Vector< IOType, hyperdags, Coords > &u,
+ const Vector< InputType3, hyperdags, Coords > &mask,
+ const Vector< InputType1, hyperdags, Coords > &v,
+ const Vector< InputType4, hyperdags, Coords > &v_mask,
+ const Matrix< InputType2, hyperdags > &A,
+ const AdditiveMonoid &add = AdditiveMonoid(),
+ const MultiplicativeOperator &mul = MultiplicativeOperator(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_monoid< AdditiveMonoid >::value &&
+ grb::is_operator< MultiplicativeOperator >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ !grb::is_object< InputType4 >::value &&
+ !std::is_same< InputType2, void >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( internal::getVector(v_mask) ) == 0 ) {
+ return vxm< descr >( u, mask, v, A, add, mul, phase );
+ }
+ const RC ret = vxm< descr >(
+ internal::getVector(u), internal::getVector(mask),
+ internal::getVector(v), internal::getVector(v_mask), internal::getMatrix(A),
+ add, mul, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::vector< uintptr_t > sourcesC{
+ getID( internal::getVector(v) ),
+ getID( internal::getMatrix(A) ),
+ getID( internal::getVector(v_mask) ),
+ getID( internal::getVector(u) )
+ };
+ if( size( internal::getVector(mask) ) > 0 ) {
+ sourcesC.push_back( getID( internal::getVector(mask) ) );
+ }
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(u) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::VXM_VECTOR_VECTOR_VECTOR_VECTOR_MATRIX_ADD_MUL,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class AdditiveMonoid, class MultiplicativeOperator,
+ typename IOType, typename InputType1, typename InputType2, typename Coords
+ >
+ RC vxm(
+ Vector< IOType, hyperdags, Coords > &u,
+ const Vector< InputType1, hyperdags, Coords > &v,
+ const Matrix< InputType2, hyperdags > &A,
+ const AdditiveMonoid &add = AdditiveMonoid(),
+ const MultiplicativeOperator &mul = MultiplicativeOperator(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_monoid< AdditiveMonoid >::value &&
+ grb::is_operator< MultiplicativeOperator >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !std::is_same< InputType2, void >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = vxm< descr >(
+ internal::getVector(u),
+ internal::getVector(v), internal::getMatrix(A),
+ add, mul, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(v) ),
+ getID( internal::getMatrix(A) ),
+ getID( internal::getVector(u) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(u) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::VXM_VECTOR_VECTOR_MATRIX_ADD_MUL,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+} // end namespace grb
+
+#endif
+
diff --git a/include/graphblas/hyperdags/blas3.hpp b/include/graphblas/hyperdags/blas3.hpp
new file mode 100644
index 000000000..9448f5f57
--- /dev/null
+++ b/include/graphblas/hyperdags/blas3.hpp
@@ -0,0 +1,334 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Implements the BLAS-3 API for the HyperDAGs backend
+ *
+ * @author A. Karanasiou
+ * @date 3rd of March, 2022
+ */
+
+#ifndef _H_GRB_HYPERDAGS_BLAS3
+#define _H_GRB_HYPERDAGS_BLAS3
+
+#include
+#include
+
+#include
+
+#include
+
+
+namespace grb {
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename RIT, typename CIT, typename NIT,
+ class MulMonoid
+ >
+ RC eWiseApply(
+ Matrix< OutputType, hyperdags, RIT, CIT, NIT > &C,
+ const Matrix< InputType1, hyperdags > &A,
+ const Matrix< InputType2, hyperdags > &B,
+ const MulMonoid &mulmono,
+ const Phase phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< MulMonoid >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseApply< descr >(
+ internal::getMatrix( C ),
+ internal::getMatrix( A ), internal::getMatrix( B ),
+ mulmono, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getMatrix(A) ),
+ getID( internal::getMatrix(B) ),
+ getID( internal::getMatrix(C) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getMatrix(C) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEAPPLY_MATRIX_MATRIX_MATRIX_MULMONOID_PHASE,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = grb::descriptors::no_operation,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename RIT, typename CIT, typename NIT,
+ class Operator
+ >
+ RC eWiseApply(
+ Matrix< OutputType, hyperdags, RIT, CIT, NIT > &C,
+ const Matrix< InputType1, hyperdags, RIT, CIT, NIT > &A,
+ const Matrix< InputType2, hyperdags, RIT, CIT, NIT > &B,
+ const Operator &mulOp,
+ const Phase phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< Operator >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = eWiseApply< descr >(
+ internal::getMatrix( C ),
+ internal::getMatrix( A ), internal::getMatrix( B ),
+ mulOp, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getMatrix(A) ),
+ getID( internal::getMatrix(B) ),
+ getID( internal::getMatrix(C) ),
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getMatrix(C) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::EWISEAPPLY_MATRIX_MATRIX_MATRIX_OPERATOR_PHASE,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation, typename OutputType,
+ typename InputType1, typename InputType2,
+ typename RIT, typename CIT, typename NIT,
+ class Semiring
+ >
+ RC mxm(
+ Matrix< OutputType, hyperdags, RIT, CIT, NIT > &C,
+ const Matrix< InputType1, hyperdags, RIT, CIT, NIT > &A,
+ const Matrix< InputType2, hyperdags, RIT, CIT, NIT > &B,
+ const Semiring &ring = Semiring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Semiring >::value, void
+ >::type * const = nullptr
+ ) {
+ const RC ret = mxm< descr >( internal::getMatrix( C ),
+ internal::getMatrix( A ), internal::getMatrix( B ),
+ ring, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getMatrix(A) ),
+ getID( internal::getMatrix(B) ),
+ getID( internal::getMatrix(C) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getMatrix(C) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::MXM_MATRIX_MATRIX_MATRIX_SEMIRING,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = grb::descriptors::no_operation,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename RIT, typename CIT, typename NIT,
+ class Operator, class Monoid
+ >
+ RC mxm(
+ Matrix< OutputType, hyperdags, RIT, CIT, NIT > &C,
+ const Matrix< InputType1, hyperdags, RIT, CIT, NIT > &A,
+ const Matrix< InputType2, hyperdags, RIT, CIT, NIT > &B,
+ const Monoid &addM,
+ const Operator &mulOp,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< Operator >::value &&
+ grb::is_monoid< Monoid >::value, void
+ >::type * const = nullptr
+ ) {
+ const RC ret = mxm< descr >(
+ internal::getMatrix( C ),
+ internal::getMatrix( A ), internal::getMatrix( B ),
+ addM, mulOp, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getMatrix(A) ),
+ getID( internal::getMatrix(B) ),
+ getID( internal::getMatrix(C) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getMatrix(C) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::MXM_MATRIX_MATRIX_MATRIX_MONOID,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType1, typename InputType2, typename OutputType,
+ typename RIT, typename CIT, typename NIT,
+ typename Coords, class Operator
+ >
+ RC outer(
+ Matrix< OutputType, hyperdags, RIT, CIT, NIT > &A,
+ const Vector< InputType1, hyperdags, Coords > &u,
+ const Vector< InputType2, hyperdags, Coords > &v,
+ const Operator &mul = Operator(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_operator< Operator >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< OutputType >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = outer< descr >(
+ internal::getMatrix( A ),
+ internal::getVector( u ), internal::getVector( v ),
+ mul, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(u) ),
+ getID( internal::getVector(v) ),
+ getID( internal::getMatrix(A) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getMatrix(A) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::OUTER,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename InputType3, typename RIT, typename CIT, typename NIT,
+ typename Coords
+ >
+ RC zip(
+ Matrix< OutputType, hyperdags, RIT, CIT, NIT > &A,
+ const Vector< InputType1, hyperdags, Coords > &x,
+ const Vector< InputType2, hyperdags, Coords > &y,
+ const Vector< InputType3, hyperdags, Coords > &z,
+ const Phase &phase = EXECUTE
+ ) {
+ const RC ret = zip< descr >(
+ internal::getMatrix( A ),
+ internal::getVector( x ), internal::getVector( y ),
+ internal::getVector( z ),
+ phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 4 > sourcesC{
+ getID( internal::getMatrix(A) ),
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(z) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getMatrix(A) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::ZIP_MATRIX_VECTOR_VECTOR_VECTOR,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType1, typename InputType2,
+ typename RIT, typename CIT, typename NIT,
+ typename Coords
+ >
+ RC zip(
+ Matrix< void, hyperdags, RIT, CIT, NIT > &A,
+ const Vector< InputType1, hyperdags, Coords > &x,
+ const Vector< InputType2, hyperdags, Coords > &y,
+ const Phase &phase = EXECUTE
+ ) {
+ const RC ret = zip< descr >(
+ internal::getMatrix( A ),
+ internal::getVector( x ), internal::getVector( y ),
+ phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getMatrix(A) ),
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(y) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getMatrix(A) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::ZIP_MATRIX_VECTOR_VECTOR,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+} // end namespace grb
+
+#endif
+
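Every wrapper in this file records its operation vertex only when the delegated call succeeds and the phase is EXECUTE, so a standard two-phase invocation contributes exactly one vertex to the HyperDAG. A hedged usage sketch of that behaviour for mxm, assuming the usual ALP plus-times semiring composition and illustrative container sizes:

	// Sketch only: assumes the ALP umbrella header and the hyperdags backend.
	#include <graphblas.hpp>

	grb::RC twoPhaseMxm() {
		grb::Matrix< double, grb::hyperdags > C( 10, 10 ), A( 10, 10 ), B( 10, 10 );
		grb::Semiring<
			grb::operators::add< double >, grb::operators::mul< double >,
			grb::identities::zero, grb::identities::one
		> ring;
		grb::RC rc = grb::mxm( C, A, B, ring, grb::RESIZE ); // no vertex recorded
		if( rc == grb::SUCCESS ) {
			rc = grb::mxm( C, A, B, ring, grb::EXECUTE );    // one vertex recorded
		}
		return rc;
	}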
diff --git a/include/graphblas/hyperdags/collectives.hpp b/include/graphblas/hyperdags/collectives.hpp
new file mode 100644
index 000000000..6102db382
--- /dev/null
+++ b/include/graphblas/hyperdags/collectives.hpp
@@ -0,0 +1,128 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Provides the collectives API for the HyperDAGs backend
+ *
+ * Copies the reference implementation
+ *
+ * @author A. N. Yzelman & J. M. Nash
+ * @date 12th of April, 2017
+ */
+
+#ifndef _H_GRB_HYPERDAGS_COLL
+#define _H_GRB_HYPERDAGS_COLL
+
+#include
+
+#include
+
+#define NO_CAST_ASSERT( x, y, z ) \
+ static_assert( x, \
+ "\n\n" \
+ "********************************************************************" \
+ "********************************************************************" \
+ "******************************\n" \
+ "* ERROR | " y " " z ".\n" \
+ "********************************************************************" \
+ "********************************************************************" \
+ "******************************\n" \
+ "* Possible fix 1 | Remove no_casting from the template parameters " \
+ "in this call to " y ".\n" \
+ "* Possible fix 2 | Provide a value of the same type as the first " \
+ "domain of the given operator.\n" \
+ "* Possible fix 3 | Ensure the operator given to this call to " y " h" \
+ "as all of its domains equal to each other.\n" \
+ "********************************************************************" \
+ "********************************************************************" \
+ "******************************\n" );
+
+
+namespace grb {
+
+ template<>
+ class collectives< hyperdags > {
+
+ private:
+
+ /** Disallow instantiation of this class. */
+ collectives() {}
+
+ public:
+
+ /**
+ * Implementation details: delegates to the collectives of the underlying
+ * backend.
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Operator, typename IOType
+ >
+ static RC allreduce(
+ IOType &inout, const Operator op = Operator()
+ ) {
+ return grb::collectives< grb::_GRB_WITH_HYPERDAGS_USING >::allreduce(
+ inout, op
+ );
+ }
+
+ /**
+ * Implementation details: delegates to the collectives of the underlying
+ * backend.
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Operator, typename IOType
+ >
+ static RC reduce(
+ IOType &inout, const size_t root = 0, const Operator op = Operator()
+ ) {
+ return grb::collectives< grb::_GRB_WITH_HYPERDAGS_USING >::reduce(
+ inout, root, op
+ );
+ }
+
+ /**
+ * Implementation details: delegates to the collectives of the underlying
+ * backend.
+ */
+ template< typename IOType >
+ static RC broadcast( IOType &inout, const size_t root = 0 ) {
+ return grb::collectives< grb::_GRB_WITH_HYPERDAGS_USING >::broadcast(
+ inout, root
+ );
+ }
+
+ /** Implementation details: delegates to the underlying backend. */
+ template< Descriptor descr = descriptors::no_operation, typename IOType >
+ static RC broadcast(
+ IOType * inout, const size_t size, const size_t root = 0
+ ) {
+ return grb::collectives< grb::_GRB_WITH_HYPERDAGS_USING >::broadcast(
+ inout, size, root
+ );
+ }
+
+ }; // end class `collectives< hyperdags >'
+
+} // namespace grb
+
+#endif // end ``_H_GRB_HYPERDAGS_COLL''
+
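The NO_CAST_ASSERT macro defined above wraps a static_assert so that violations of the no_casting descriptor produce a boxed, human-readable compile-time error. A minimal sketch of the same technique, with hypothetical names and a far shorter banner:

	#include <type_traits>

	// Hypothetical, much-shortened analogue of NO_CAST_ASSERT.
	#define MY_NO_CAST_ASSERT( cond, fname ) \
		static_assert( cond, "ERROR in " fname ": domains must match (no_casting)" );

	template< typename D, typename T >
	void applyChecked( const T &x ) {
		// Fires at instantiation whenever the argument type T differs from
		// the expected domain D.
		MY_NO_CAST_ASSERT( (std::is_same< D, T >::value), "applyChecked" )
		(void) x;
	}

	int main() {
		applyChecked< int >( 5 );      // OK: D = int, T = int
		// applyChecked< int >( 5.0 ); // would trigger the static_assert
		return 0;
	}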
diff --git a/include/graphblas/hyperdags/config.hpp b/include/graphblas/hyperdags/config.hpp
new file mode 100644
index 000000000..dbf0cc1ca
--- /dev/null
+++ b/include/graphblas/hyperdags/config.hpp
@@ -0,0 +1,107 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Contains the configuration parameters for the HyperDAGs backend
+ *
+ * @author A. N. Yzelman
+ * @date 31st of January 2022.
+ */
+
+#ifndef _H_GRB_HYPERDAGS_CONFIG
+#define _H_GRB_HYPERDAGS_CONFIG
+
+#include
+
+#ifndef _GRB_WITH_HYPERDAGS_USING
+ #error "_GRB_WITH_HYPERDAGS_USING must be defined"
+#endif
+
+
+namespace grb {
+
+ namespace config {
+
+ /**
+ * The implementation details of the #grb::hyperdags backend.
+ *
+ * Since the HyperDAGs backend simply intercepts primitive calls and relies
+ * on a second backend for its functional execution, this class simply
+ * delegates all fields to that underlying backend.
+ *
+ * \note The user documentation only specifies the fields that users may,
+ * under some circumstances, benefit from adapting. For a view of all
+ * fields, please see the developer documentation.
+ *
+ * \note Adapting the fields should be done with care and may require
+ * re-compilation and re-installation of the ALP framework.
+ */
+ template<>
+ class IMPLEMENTATION< hyperdags > {
+
+ public:
+
+ /**
+ * @returns The default allocation policy for private memory regions of the
+ * underlying backend.
+ */
+ static constexpr ALLOC_MODE defaultAllocMode() {
+ return IMPLEMENTATION< _GRB_WITH_HYPERDAGS_USING >::defaultAllocMode();
+ }
+
+ /**
+ * @returns The default allocation policy for shared memory regions of the
+ * underlying backend.
+ */
+ static constexpr ALLOC_MODE sharedAllocMode() {
+ return IMPLEMENTATION< _GRB_WITH_HYPERDAGS_USING >::sharedAllocMode();
+ }
+
+ /**
+ * \internal
+ * @returns The default vector coordinates instance of the underlying
+ * backend.
+ *
+ * \note This is an extension for compatibility with the reference and BSP1D
+ * backends.
+ * \endinternal
+ */
+ static constexpr Backend coordinatesBackend() {
+ return IMPLEMENTATION< _GRB_WITH_HYPERDAGS_USING >::coordinatesBackend();
+ }
+
+ /**
+ * \internal
+ * @returns The fixed vector capacity property of the underlying
+ * implementation.
+ * \endinternal
+ */
+ static constexpr bool fixedVectorCapacities() {
+ return IMPLEMENTATION< _GRB_WITH_HYPERDAGS_USING >::
+ fixedVectorCapacities();
+ }
+
+ };
+
+ }
+
+} // end namespace grb
+
+#endif
+
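The IMPLEMENTATION< hyperdags > specialisation above forwards every configuration field at compile time, so selecting the HyperDAGs backend cannot silently change any allocation or capacity policy of the backend it wraps. A simplified, hypothetical mirror of that constexpr delegation pattern:

	// Hypothetical, simplified mirror of the delegation; not the ALP types.
	enum class Backend { reference, hyperdags };

	template< Backend b >
	struct Impl;

	template<>
	struct Impl< Backend::reference > {
		static constexpr bool fixedVectorCapacities() { return true; }
	};

	template<>
	struct Impl< Backend::hyperdags > {
		// Forwards to the compiled-in underlying backend, at compile time.
		static constexpr bool fixedVectorCapacities() {
			return Impl< Backend::reference >::fixedVectorCapacities();
		}
	};

	static_assert( Impl< Backend::hyperdags >::fixedVectorCapacities(),
		"the delegated value is available as a constant expression" );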
diff --git a/include/graphblas/hyperdags/exec.hpp b/include/graphblas/hyperdags/exec.hpp
new file mode 100644
index 000000000..376e78b5b
--- /dev/null
+++ b/include/graphblas/hyperdags/exec.hpp
@@ -0,0 +1,104 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Provides the Launcher for the HyperDAGs backend
+ *
+ * @author A. N. Yzelman
+ * @date 31st of January, 2022
+ */
+
+#ifndef _H_GRB_HYPERDAGS_EXEC
+#define _H_GRB_HYPERDAGS_EXEC
+
+#include
+#include
+
+
+namespace grb {
+
+ /**
+ * No implementation notes.
+ */
+ template< EXEC_MODE mode >
+ class Launcher< mode, hyperdags > {
+
+ private:
+
+ /**
+ * Rely on underlying backend.
+ */
+ typedef Launcher< mode, _GRB_WITH_HYPERDAGS_USING > MyLauncherType;
+
+ /**
+ * Instantiate the sub-backend.
+ */
+ MyLauncherType launcher;
+
+
+ public:
+
+ /**
+ * Default constructor.
+ *
+ * Simply forwards to the constructor of the underlying launcher.
+ */
+ Launcher(
+ const size_t process_id = 0, const size_t nprocs = 1,
+ const std::string hostname = "localhost",
+ const std::string port = "0"
+ ) : launcher( process_id, nprocs, hostname, port ) {}
+
+ /**
+ * Variable input-size execution.
+ *
+ * Simply calls underlying launcher.
+ */
+ template< typename U >
+ RC exec(
+ void ( *grb_program )( const void *, const size_t, U & ),
+ const void * data_in,
+ const size_t in_size,
+ U &data_out,
+ const bool broadcast = false
+ ) {
+ return launcher.exec( grb_program, data_in, in_size, data_out, broadcast );
+ }
+
+ /**
+ * Fixed-size execution.
+ *
+ * Simply calls underlying launcher.
+ */
+ template< typename T, typename U >
+ RC exec(
+ void ( *grb_program )( const T &, U & ),
+ const T &data_in,
+ U &data_out,
+ const bool broadcast = false
+ ) {
+ return launcher.exec( grb_program, data_in, data_out, broadcast );
+ }
+
+ };
+
+}
+
+#endif
+
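Since this Launcher forwards both exec variants unchanged, running an ALP program under the HyperDAGs backend requires no changes to the launch code itself. A hedged fixed-size usage sketch (the program body and values are illustrative):

	#include <graphblas.hpp>

	// An arbitrary ALP program: reads an int, writes back its double.
	void grbProgram( const int &in, double &out ) {
		out = 2.0 * in;
	}

	int main() {
		grb::Launcher< grb::AUTOMATIC > launcher;
		double out = 0.0;
		const grb::RC rc = launcher.exec( &grbProgram, 21, out, true );
		return ( rc == grb::SUCCESS && out == 42.0 ) ? 0 : 1;
	}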
diff --git a/include/graphblas/hyperdags/hyperdags.hpp b/include/graphblas/hyperdags/hyperdags.hpp
new file mode 100644
index 000000000..4ef0e0059
--- /dev/null
+++ b/include/graphblas/hyperdags/hyperdags.hpp
@@ -0,0 +1,1305 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Provides mechanisms to track HyperDAG representations of ALP programs
+ *
+ * @author A. N. Yzelman
+ * @date 1st of February, 2022
+ */
+
+#ifndef _H_GRB_HYPERDAGS_STATE
+#define _H_GRB_HYPERDAGS_STATE
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+
+namespace grb {
+
+ namespace internal {
+
+ namespace hyperdags {
+
+ /** \internal The three vertex types in a HyperDAG */
+ enum VertexType {
+ SOURCE,
+ OPERATION,
+ OUTPUT
+ };
+
+ // 1: all source vertex definitions
+
+ /** \internal The types of source vertices that may be generated. */
+ enum SourceVertexType {
+
+ /**
+ * \internal Scalars are always handled as a new source. We do not track
+ * whether the same scalars are re-used, because we cannot reliably do so
+ * due to the lack of a grb::Scalar.
+ */
+ SCALAR,
+
+ /**
+ * \internal The source is a container managed by ALP.
+ */
+ CONTAINER,
+
+ /**
+ * \internal The source is an iterator passed to ALP.
+ */
+ ITERATOR,
+
+ /**
+ * \internal The source is a user integer passed to ALP, usually signifying
+ * an index or a size.
+ */
+ USER_INT
+
+ };
+
+ /** \internal The number of source vertex types. */
+ const constexpr size_t numSourceVertexTypes = 4;
+
+ /** \internal An array of all source vertex types. */
+ const constexpr enum SourceVertexType
+ allSourceVertexTypes[ numSourceVertexTypes ] =
+ {
+ SCALAR,
+ CONTAINER,
+ ITERATOR,
+ USER_INT
+ };
+
+ /** \internal @returns The type, as a string, of a source vertex. */
+ std::string toString( const enum SourceVertexType type ) noexcept;
+
+ /** \internal A source vertex. */
+ class SourceVertex {
+
+ private:
+
+ /** \internal The type of source */
+ enum SourceVertexType type;
+
+ /** \internal The ID amongst vertices of the same type */
+ size_t local_id;
+
+ /** \internal The global ID of the vertex */
+ size_t global_id;
+
+
+ public:
+
+ /**
+ * \internal The default source vertex constructor.
+ *
+ * @param[in] type The type of the vertex.
+ * @param[in] lid The ID amongst vertices of the same type.
+ * @param[in] gid The global ID of the vertex.
+ */
+ SourceVertex(
+ const enum SourceVertexType type,
+ const size_t lid, const size_t gid
+ ) noexcept;
+
+ /** \internal @returns The vertex type. */
+ enum SourceVertexType getType() const noexcept;
+
+ /** \internal @returns The type ID. */
+ size_t getLocalID() const noexcept;
+
+ /** \internal @returns The global ID. */
+ size_t getGlobalID() const noexcept;
+
+ };
+
+ /** \internal Helps create a new source vertex */
+ class SourceVertexGenerator {
+
+ private:
+
+ /** \internal Map of next local IDs. */
+ std::map< enum SourceVertexType, size_t > nextID;
+
+
+ public:
+
+ /** \internal Default constructor. */
+ SourceVertexGenerator();
+
+ /**
+ * \internal
+ *
+ * @param[in] type the type of source vertex
+ * @param[in] id a unique global ID
+ *
+ * @returns a new source vertex with a unique local ID
+ *
+ * \endinternal
+ */
+ SourceVertex create( const SourceVertexType type, const size_t id );
+
+ /**
+ * \internal
+ *
+ * @returns The total number of source vertices generated, of any type.
+ *
+ * \endinternal
+ */
+ size_t size() const;
+
+ };
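SourceVertexGenerator dispenses local IDs per vertex type by keeping a map from type to the next free ID; std::map::operator[] value-initialises absent entries to zero, so the first vertex of each type receives local ID 0. A compact sketch of that dispensing pattern (simplified, not the actual implementation):

	#include <cstddef>
	#include <iostream>
	#include <map>

	enum class Type { SCALAR, CONTAINER };

	// Simplified generator: returns a fresh local ID for the given type.
	struct Generator {
		std::map< Type, std::size_t > nextID;
		std::size_t create( const Type t ) { return nextID[ t ]++; }
	};

	int main() {
		Generator gen;
		std::cout << gen.create( Type::SCALAR )      // 0
			<< " " << gen.create( Type::SCALAR )     // 1
			<< " " << gen.create( Type::CONTAINER )  // 0
			<< "\n";
		return 0;
	}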
+
+ // 2: everything related to output vertices
+
+ /** \internal The types of output vertices that may be generated. */
+ enum OutputVertexType {
+
+ /**
+ * \internal The output is an ALP container.
+ */
+ CONTAINER_OUTPUT
+
+ };
+
+ /** \internal The number of distinct output vertex types. */
+ const constexpr size_t numOutputVertexTypes = 1;
+
+ /** \internal An array of output vertex types. */
+ const constexpr enum OutputVertexType
+ allOutputVertexTypes[ numOutputVertexTypes ] =
+ {
+ CONTAINER_OUTPUT
+ };
+
+ /** \internal @returns A string form of a given output vertex type. */
+ std::string toString( const enum OutputVertexType type ) noexcept;
+
+ /** \internal An output vertex. */
+ class OutputVertex {
+
+ private:
+
+ /** \internal The type of the output */
+ enum OutputVertexType type;
+
+ /** \internal The output vertex ID */
+ const size_t local_id;
+
+ /** \internal The global vertex ID */
+ const size_t global_id;
+
+
+ public:
+
+ /**
+ * \internal Default constructor.
+ *
+ * @param[in] lid The ID within vertices of this type.
+ * @param[in] gid The global vertex ID.
+ *
+ * Recall there is only one output vertex type, hence the precise type is
+ * not a constructor argument.
+ */
+ OutputVertex( const size_t lid, const size_t gid ) noexcept;
+
+ /** \internal @returns The type of this output vertex. */
+ enum OutputVertexType getType() const noexcept;
+
+ /** \internal @returns The ID amongst vertices of the same type. */
+ size_t getLocalID() const noexcept;
+
+ /** \internal @returns The ID amongst all vertices. */
+ size_t getGlobalID() const noexcept;
+
+ };
+
+ /** \internal Helps create output vertices. */
+ class OutputVertexGenerator {
+
+ private:
+
+ /** \internal Keeps track of the next output vertex ID. */
+ size_t nextID;
+
+
+ public:
+
+ /** \internal Default constructor. */
+ OutputVertexGenerator() noexcept;
+
+ /**
+ * \internal
+ *
+ * @param[in] id a unique global ID
+ *
+ * @returns a new output vertex with a unique local ID
+ *
+ * \endinternal
+ */
+ OutputVertex create( const size_t id );
+
+ /**
+ * \internal
+ *
+ * @returns The total number of output vertices generated.
+ *
+ * \endinternal
+ */
+ size_t size() const noexcept;
+
+ };
+
+ // 3: everything related to operation vertices
+
+ /** \internal Which operation an OperationVertex encodes. */
+ enum OperationVertexType {
+
+ NNZ_VECTOR,
+
+ NNZ_MATRIX,
+
+ CLEAR_VECTOR,
+
+ SET_VECTOR_ELEMENT,
+
+ DOT,
+
+ SET_USING_VALUE,
+
+ SET_USING_MASK_AND_VECTOR,
+
+ SET_USING_MASK_AND_SCALAR,
+
+ SET_FROM_VECTOR,
+
+ ZIP,
+
+ E_WISE_APPLY_VECTOR_VECTOR_VECTOR_OP,
+
+ FOLDR_VECTOR_SCALAR_MONOID,
+
+ FOLDR_VECTOR_MASK_SCALAR_MONOID,
+
+ FOLDL_SCALAR_VECTOR_MONOID,
+
+ FOLDL_SCALAR_VECTOR_MASK_MONOID,
+
+ EWISELAMBDA,
+
+ BUILD_VECTOR,
+
+ BUILD_VECTOR_WITH_VALUES,
+
+ SIZE,
+
+ NROWS,
+
+ NCOLS,
+
+ EWISEAPPLY_VECTOR_ALPHA_BETA_OP,
+
+ EWISEAPPLY_VECTOR_ALPHA_VECTOR_OP,
+
+ EWISEAPPLY_VECTOR_VECTOR_BETA_OP,
+
+ EWISEAPPLY_VECTOR_VECTOR_VECTOR_OP,
+
+ EWISEAPPLY_VECTOR_MASK_ALPHA_BETA_OP,
+
+ EWISEAPPLY_VECTOR_MASK_ALPHA_VECTOR_OP,
+
+ EWISEAPPLY_VECTOR_MASK_VECTOR_BETA_OP,
+
+ EWISEAPPLY_VECTOR_MASK_VECTOR_VECTOR_OP,
+
+ EWISEAPPLY_VECTOR_ALPHA_BETA_MONOID,
+
+ EWISEAPPLY_VECTOR_ALPHA_VECTOR_MONOID,
+
+ EWISEAPPLY_VECTOR_VECTOR_BETA_MONOID,
+
+ EWISEAPPLY_VECTOR_VECTOR_VECTOR_MONOID,
+
+ EWISEAPPLY_VECTOR_MASK_ALPHA_BETA_MONOID,
+
+ EWISEAPPLY_VECTOR_MASK_ALPHA_VECTOR_MONOID,
+
+ EWISEAPPLY_VECTOR_MASK_VECTOR_BETA_MONOID,
+
+ EWISEAPPLY_VECTOR_MASK_VECTOR_VECTOR_MONOID,
+
+ EWISE_MUL_ADD,
+
+ EWISE_MUL_ADD_FOUR_VECTOR,
+
+ EWISE_MUL_ADD_THREE_VECTOR_ALPHA,
+
+ EWISE_MUL_ADD_THREE_VECTOR_CHI,
+
+ EWISE_MUL_ADD_FOUR_VECTOR_CHI,
+
+ EWISE_MUL_ADD_FOUR_VECTOR_CHI_RING,
+
+ EWISE_MUL_ADD_THREE_VECTOR_BETA,
+
+ EWISE_MUL_ADD_THREE_VECTOR_ALPHA_GAMMA,
+
+ EWISE_MUL_ADD_TWO_VECTOR_ALPHA_BETA,
+
+ EWISE_MUL_ADD_TWO_VECTOR_ALPHA_BETA_GAMMA,
+
+ EWISEAPPLY_MATRIX_MATRIX_MATRIX_MULMONOID_PHASE,
+
+ EWISEAPPLY_MATRIX_MATRIX_MATRIX_OPERATOR_PHASE,
+
+ SET_MATRIX_MATRIX,
+
+ SET_MATRIX_MATRIX_INPUT2,
+
+ MXM_MATRIX_MATRIX_MATRIX_SEMIRING,
+
+ MXM_MATRIX_MATRIX_MATRIX_MONOID,
+
+ OUTER,
+
+ UNZIP_VECTOR_VECTOR_VECTOR,
+
+ ZIP_MATRIX_VECTOR_VECTOR_VECTOR,
+
+ ZIP_MATRIX_VECTOR_VECTOR,
+
+ CLEAR_MATRIX,
+
+ EWISEMULADD_VECTOR_VECTOR_VECTOR_GAMMA_RING,
+
+ EWISEMULADD_VECTOR_VECTOR_BETA_GAMMA_RING,
+
+ EWISEMULADD_VECTOR_ALPHA_VECTOR_GAMMA_RING,
+
+ EWISEMULADD_VECTOR_ALPHA_BETA_VECTOR_RING,
+
+ EWISEMULADD_VECTOR_ALPHA_BETA_GAMMA_RING,
+
+ EWISEMULADD_VECTOR_VECTOR_VECTOR_VECTOR_RING,
+
+ VXM_VECTOR_VECTOR_VECTOR_MATRIX,
+
+ VXM_VECTOR_VECTOR_VECTOR_MATRIX_ADD_MUL,
+
+ VXM_VECTOR_VECTOR_MATRIX_RING,
+
+ MXV_VECTOR_VECTOR_MATRIX_VECTOR_RING,
+
+ MXV_VECTOR_VECTOR_MATRIX_VECTOR_VECTOR_R,
+
+ MXV_VECTOR_VECTOR_MATRIX_VECTOR_VECTOR_A,
+
+ MXV_VECTOR_MATRIX_VECTOR_RING,
+
+ MXV_VECTOR_MATRIX_VECTOR_ADD_MUL,
+
+ BUILDMATRIXUNIQUE_MATRIX_START_END_MODE,
+
+ CAPACITY_VECTOR,
+
+ CAPACITY_MATRIX,
+
+ RESIZE,
+
+ RESIZE_MATRIX,
+
+ GETID_VECTOR,
+
+ GETID_MATRIX,
+
+ EWISELAMBDA_FUNC_MATRIX,
+
+ VXM_GENERIC_VECTOR_VECTOR_VECTOR_VECTOR_MATRIX_ADD_MUL,
+
+ VXM_VECTOR_VECTOR_VECTOR_VECTOR_MATRIX_ADD_MUL,
+
+ VXM_VECTOR_VECTOR_MATRIX_ADD_MUL,
+
+ FOLDL_VECTOR_BETA_OP,
+
+ FOLDL_VECTOR_VECTOR_BETA_OP,
+
+ FOLDL_VECTOR_BETA_MONOID,
+
+ FOLDL_VECTOR_VECTOR_BETA_MONOID,
+
+ FOLDL_VECTOR_VECTOR_MONOID,
+
+ FOLDL_VECTOR_VECTOR_VECTOR_MONOID,
+
+ FOLDL_VECTOR_VECTOR_VECTOR_OP,
+
+ FOLDL_VECTOR_VECTOR_OP,
+
+				FOLDR_ALPHA_VECTOR_MONOID,
+
+				FOLDR_ALPHA_VECTOR_OPERATOR,
+
+ FOLDR_VECTOR_VECTOR_OPERATOR,
+
+ FOLDR_VECTOR_VECTOR_VECTOR_OPERATOR,
+
+ FOLDR_VECTOR_VECTOR_MONOID,
+
+ FOLDR_VECTOR_VECTOR_VECTOR_MONOID,
+
+ EWISEMUL_VECTOR_VECTOR_VECTOR_RING,
+
+ EWISEMUL_VECTOR_ALPHA_VECTOR_RING,
+
+ EWISEMUL_VECTOR_VECTOR_BETA_RING,
+
+ EWISEMUL_VECTOR_ALPHA_BETA_RING,
+
+ EWISEMUL_VECTOR_VECTOR_VECTOR_VECTOR_RING,
+
+ EWISEMUL_VECTOR_VECTOR_ALPHA_VECTOR_RING,
+
+ EWISEMUL_VECTOR_VECTOR_VECTOR_BETA_RING,
+
+ EWISEMUL_VECTOR_VECTOR_ALPHA_BETA_RING,
+
+ EWISELAMBDA_FUNC_VECTOR
+
+ };
+
+			/** \internal How many operation vertex types exist; this count must
+			 *            match the number of entries in #OperationVertexType. */
+			const constexpr size_t numOperationVertexTypes = 106;
+
+ /** \internal An array of all operation vertex types. */
+ const constexpr enum OperationVertexType
+ allOperationVertexTypes[ numOperationVertexTypes ] =
+ {
+ NNZ_VECTOR,
+ NNZ_MATRIX,
+ CLEAR_VECTOR,
+ SET_VECTOR_ELEMENT,
+ DOT,
+ SET_USING_VALUE,
+ SET_USING_MASK_AND_VECTOR,
+ SET_USING_MASK_AND_SCALAR,
+ SET_FROM_VECTOR,
+ ZIP,
+ E_WISE_APPLY_VECTOR_VECTOR_VECTOR_OP,
+ FOLDR_VECTOR_SCALAR_MONOID,
+ FOLDR_VECTOR_MASK_SCALAR_MONOID,
+ FOLDL_SCALAR_VECTOR_MONOID,
+ FOLDL_SCALAR_VECTOR_MASK_MONOID,
+ EWISELAMBDA,
+ BUILD_VECTOR,
+ BUILD_VECTOR_WITH_VALUES,
+ SIZE,
+ NROWS,
+ NCOLS,
+ EWISEAPPLY_VECTOR_ALPHA_BETA_OP,
+ EWISEAPPLY_VECTOR_ALPHA_VECTOR_OP,
+ EWISEAPPLY_VECTOR_VECTOR_BETA_OP,
+ EWISEAPPLY_VECTOR_VECTOR_VECTOR_OP,
+ EWISEAPPLY_VECTOR_MASK_ALPHA_BETA_OP,
+ EWISEAPPLY_VECTOR_MASK_ALPHA_VECTOR_OP,
+ EWISEAPPLY_VECTOR_MASK_VECTOR_BETA_OP,
+ EWISEAPPLY_VECTOR_MASK_VECTOR_VECTOR_OP,
+ EWISEAPPLY_VECTOR_ALPHA_BETA_MONOID,
+ EWISEAPPLY_VECTOR_ALPHA_VECTOR_MONOID,
+ EWISEAPPLY_VECTOR_VECTOR_BETA_MONOID,
+ EWISEAPPLY_VECTOR_VECTOR_VECTOR_MONOID,
+ EWISEAPPLY_VECTOR_MASK_ALPHA_BETA_MONOID,
+ EWISEAPPLY_VECTOR_MASK_ALPHA_VECTOR_MONOID,
+ EWISEAPPLY_VECTOR_MASK_VECTOR_BETA_MONOID,
+ EWISEAPPLY_VECTOR_MASK_VECTOR_VECTOR_MONOID,
+ EWISE_MUL_ADD,
+ EWISE_MUL_ADD_FOUR_VECTOR,
+ EWISE_MUL_ADD_THREE_VECTOR_ALPHA,
+ EWISE_MUL_ADD_THREE_VECTOR_CHI,
+ EWISE_MUL_ADD_FOUR_VECTOR_CHI,
+ EWISE_MUL_ADD_FOUR_VECTOR_CHI_RING,
+ EWISE_MUL_ADD_THREE_VECTOR_BETA,
+ EWISE_MUL_ADD_THREE_VECTOR_ALPHA_GAMMA,
+ EWISE_MUL_ADD_TWO_VECTOR_ALPHA_BETA,
+ EWISE_MUL_ADD_TWO_VECTOR_ALPHA_BETA_GAMMA,
+ EWISEAPPLY_MATRIX_MATRIX_MATRIX_MULMONOID_PHASE,
+ EWISEAPPLY_MATRIX_MATRIX_MATRIX_OPERATOR_PHASE,
+ SET_MATRIX_MATRIX,
+ SET_MATRIX_MATRIX_INPUT2,
+ MXM_MATRIX_MATRIX_MATRIX_SEMIRING,
+ MXM_MATRIX_MATRIX_MATRIX_MONOID,
+ OUTER,
+ UNZIP_VECTOR_VECTOR_VECTOR,
+ ZIP_MATRIX_VECTOR_VECTOR_VECTOR,
+ ZIP_MATRIX_VECTOR_VECTOR,
+ CLEAR_MATRIX,
+ EWISEMULADD_VECTOR_VECTOR_VECTOR_GAMMA_RING,
+ EWISEMULADD_VECTOR_VECTOR_BETA_GAMMA_RING,
+ EWISEMULADD_VECTOR_ALPHA_VECTOR_GAMMA_RING,
+ EWISEMULADD_VECTOR_ALPHA_BETA_VECTOR_RING,
+ EWISEMULADD_VECTOR_ALPHA_BETA_GAMMA_RING,
+ EWISEMULADD_VECTOR_VECTOR_VECTOR_VECTOR_RING,
+ VXM_VECTOR_VECTOR_VECTOR_MATRIX,
+ VXM_VECTOR_VECTOR_VECTOR_MATRIX_ADD_MUL,
+ VXM_VECTOR_VECTOR_MATRIX_RING,
+ MXV_VECTOR_VECTOR_MATRIX_VECTOR_RING,
+ MXV_VECTOR_VECTOR_MATRIX_VECTOR_VECTOR_R,
+ MXV_VECTOR_VECTOR_MATRIX_VECTOR_VECTOR_A,
+ MXV_VECTOR_MATRIX_VECTOR_RING,
+ MXV_VECTOR_MATRIX_VECTOR_ADD_MUL,
+ BUILDMATRIXUNIQUE_MATRIX_START_END_MODE,
+ CAPACITY_VECTOR,
+ CAPACITY_MATRIX,
+ RESIZE,
+ RESIZE_MATRIX,
+ GETID_VECTOR,
+ GETID_MATRIX,
+ EWISELAMBDA_FUNC_MATRIX,
+ VXM_GENERIC_VECTOR_VECTOR_VECTOR_VECTOR_MATRIX_ADD_MUL,
+ VXM_VECTOR_VECTOR_VECTOR_VECTOR_MATRIX_ADD_MUL,
+ VXM_VECTOR_VECTOR_MATRIX_ADD_MUL,
+ FOLDL_VECTOR_BETA_OP,
+ FOLDL_VECTOR_VECTOR_BETA_OP,
+ FOLDL_VECTOR_BETA_MONOID,
+ FOLDL_VECTOR_VECTOR_BETA_MONOID,
+ FOLDL_VECTOR_VECTOR_MONOID,
+ FOLDL_VECTOR_VECTOR_VECTOR_MONOID,
+ FOLDL_VECTOR_VECTOR_VECTOR_OP,
+ FOLDL_VECTOR_VECTOR_OP,
+					FOLDR_ALPHA_VECTOR_MONOID,
+					FOLDR_ALPHA_VECTOR_OPERATOR,
+ FOLDR_VECTOR_VECTOR_OPERATOR,
+ FOLDR_VECTOR_VECTOR_VECTOR_OPERATOR,
+ FOLDR_VECTOR_VECTOR_MONOID,
+ FOLDR_VECTOR_VECTOR_VECTOR_MONOID,
+ EWISEMUL_VECTOR_VECTOR_VECTOR_RING,
+ EWISEMUL_VECTOR_ALPHA_VECTOR_RING,
+ EWISEMUL_VECTOR_VECTOR_BETA_RING,
+ EWISEMUL_VECTOR_ALPHA_BETA_RING,
+ EWISEMUL_VECTOR_VECTOR_VECTOR_VECTOR_RING,
+ EWISEMUL_VECTOR_VECTOR_ALPHA_VECTOR_RING,
+ EWISEMUL_VECTOR_VECTOR_VECTOR_BETA_RING,
+ EWISEMUL_VECTOR_VECTOR_ALPHA_BETA_RING,
+ EWISELAMBDA_FUNC_VECTOR
+ };
+
+ /** \internal @returns The operation vertex type as a string. */
+ std::string toString( const enum OperationVertexType ) noexcept;
+
+ /** \internal An operation vertex */
+ class OperationVertex {
+
+ private:
+
+ /** \internal The type of the vertex. */
+ const enum OperationVertexType type;
+
+ /** \internal The ID amongst vertices of the same type. */
+ const size_t local_id;
+
+ /** \internal The ID amongst all vertices. */
+ const size_t global_id;
+
+
+ public:
+
+ /**
+ * \internal
+ * Base constructor.
+ *
+ * @param[in] type The type of the new operation vertex.
+ * @param[in] lid An ID amongst vertices of the same type.
+ * @param[in] gid An ID unique amongst all vertices.
+ * \endinternal
+ */
+ OperationVertex(
+ const enum OperationVertexType type,
+ const size_t lid, const size_t gid
+ ) noexcept;
+
+ /** \internal @returns The type of this vertex. */
+ enum OperationVertexType getType() const noexcept;
+
+ /**
+ * \internal
+ * @returns An ID unique amongst all vertices of the same type.
+ * \endinternal
+ */
+ size_t getLocalID() const noexcept;
+
+ /**
+ * \internal
+ * @returns An ID unique amongst all vertices, regardless of type.
+ * \endinternal
+ */
+ size_t getGlobalID() const noexcept;
+
+ };
+
+ /** \internal Helps generate operation vertices. */
+ class OperationVertexGenerator {
+
+ private:
+
+ /**
+ * \internal
+ * A map that keeps track of the number of vertices of each type.
+ * \endinternal
+ */
+ std::map< enum OperationVertexType, size_t > nextID;
+
+
+ public:
+
+ /** \internal Base constructor. */
+ OperationVertexGenerator();
+
+ /**
+ * \internal
+ *
+				 * @param[in] type The type of the new operation vertex.
+				 * @param[in] id   A unique global ID.
+				 *
+				 * @returns a new operation vertex with a unique local ID
+ *
+ * \endinternal
+ */
+ OperationVertex create(
+ const OperationVertexType type,
+ const size_t id
+ );
+
+ /**
+ * \internal
+ *
+				 * @returns The total number of operation vertices generated.
+ *
+ * \endinternal
+ */
+ size_t size() const;
+
+ };
+
+ /**
+ * \internal
+ *
+ * Encodes any directed hypergraph that may yet grow.
+ *
+ * \endinternal
+ */
+ class DHypergraph {
+
+ private:
+
+ /** \internal The total number of vertices in the hypergraph. */
+ size_t num_vertices;
+
+ /**
+ * \internal
+ *
+				 * All hyperedges in the hypergraph, stored as a map from source
+				 * vertex ID to the set of destination vertex IDs.
+ *
+ * \endinternal
+ */
+ std::map< size_t, std::set< size_t > > hyperedges;
+
+ /** \internal The total number of pins in the hypergraph. */
+ size_t num_pins;
+
+
+ public:
+
+ DHypergraph() noexcept;
+
+ /**
+ * \internal
+ *
+				 * @param[in] source The global ID of the vertex from which the new
+				 *                   hyperedge emanates.
+				 * @param[in] start  An iterator over the vertex IDs to be added to
+				 *                   the hyperedge.
+				 * @param[in] end    The matching end iterator.
+ *
+ * There must be at least one vertex ID added, or undefined behaviour will
+ * occur.
+ *
+ * Non-unique elements in the IDs to be added will be filtered out.
+ *
+ * Performance is log-linear in the number of IDs to be added.
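+				 *
+				 * A minimal usage sketch (the instance and IDs are illustrative
+				 * only):
+				 * \code
+				 *  DHypergraph h;
+				 *  const size_t src = h.createVertex();
+				 *  const size_t dst = h.createVertex();
+				 *  std::vector< size_t > dsts{ dst };
+				 *  h.appendHyperedge( src, dsts.begin(), dsts.end() );
+				 * \endcode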
+ * \endinternal
+ */
+ template< typename FwdIt >
+ void appendHyperedge(
+ const size_t source,
+ FwdIt start, const FwdIt &end
+ ) {
+ static_assert( std::is_unsigned<
+ typename std::iterator_traits< FwdIt >::value_type
+				>::value, "Expected an iterator over unsigned integral values" );
+#ifdef _DEBUG
+ std::cerr << "in appendHyperedge\n\t source " << source
+ << "\n\t adds destinations ( ";
+ std::vector< size_t > warn;
+#endif
+ const auto it = hyperedges.find( source );
+ if( it == hyperedges.end() ) {
+ hyperedges[ source ] = std::set< size_t >();
+ }
+
+ std::set< size_t > &toAdd = hyperedges[ source ];
+ for( ; start != end; ++start ) {
+ assert( *start < num_vertices );
+ if( toAdd.find( static_cast< size_t >( *start ) ) == toAdd.end() ) {
+ toAdd.insert( *start );
+ (void) ++num_pins;
+#ifdef _DEBUG
+ std::cerr << *start << " ";
+#endif
+ } else {
+#ifdef _DEBUG
+ warn.push_back( *start );
+#endif
+ }
+ }
+#ifdef _DEBUG
+ std::cerr << ")\n";
+ if( warn.size() > 0 ) {
+ std::cerr << "\t Warning: the following edges were multiply-defined: ( ";
+ for( const auto &id : warn ) {
+ std::cerr << id << " ";
+						}
+						std::cerr << ")\n";
+					}
+					std::cerr << "\t exiting\n";
+#endif
+ }
+
+ /**
+ * \internal
+ *
+ * Creates a new vertex and returns its global ID.
+ *
+ * \endinternal
+ */
+ size_t createVertex() noexcept;
+
+ /** \internal @returns The number of vertices in the current graph. */
+ size_t numVertices() const noexcept;
+
+ /** \internal @returns The number of hyperedges in the current graph. */
+ size_t numHyperedges() const noexcept;
+
+ /** \internal @returns The total number of pins in the current graph. */
+ size_t numPins() const noexcept;
+
+ /**
+ * \internal
+ *
+ * Prints the hypergraph to a given output stream as a series of
+ * hyperedges. The output format is MatrixMarket-like, where every
+ * hyperedge is assigned a unique ID, and every hyperedge-to-vertex pair
+ * then is printed to \a out.
+ *
+ * @param[in,out] out Where to print the hypergraph to.
+ *
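+				 * For illustration, a hypergraph with two hyperedges, one covering
+				 * vertices { 0, 2 } and the other { 1, 2 }, could emit the pin pairs
+				 * \code
+				 *  0 0
+				 *  0 2
+				 *  1 1
+				 *  1 2
+				 * \endcode
+				 * where the first column is the hyperedge ID and the second the
+				 * vertex ID; the exact header and ordering depend on the
+				 * implementation.
+				 *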
+ * \endinternal
+ */
+ void render( std::ostream &out ) const;
+
+ };
+
+ /** \internal Represents a finalised HyperDAG */
+ class HyperDAG {
+
+ friend class HyperDAGGenerator;
+
+ private:
+
+ /** \internal The underlying hypergraph. */
+ DHypergraph hypergraph;
+
+ /** \internal The number of source vertices. */
+ size_t num_sources;
+
+ /** \internal The number of operation vertices. */
+ size_t num_operations;
+
+ /** \internal The number of output vertices. */
+ size_t num_outputs;
+
+ /** \internal A vector of source vertices. */
+ std::vector< SourceVertex > sourceVertices;
+
+ /** \internal A vector of operation vertices. */
+ std::vector< OperationVertex > operationVertices;
+
+ /** \internal A vector of output vertices. */
+ std::vector< OutputVertex > outputVertices;
+
+ /** \internal A map from source vertex IDs to global IDs. */
+ std::map< size_t, size_t > source_to_global_id;
+
+ /** \internal A map from operation vertex IDs to global IDs. */
+ std::map< size_t, size_t > operation_to_global_id;
+
+ /** \internal A map from output vertex IDs to global IDs. */
+ std::map< size_t, size_t > output_to_global_id;
+
+ /** \internal A map from global IDs to their types. */
+ std::map< size_t, enum VertexType > global_to_type;
+
+ /** \internal A map from global IDs to their local IDs. */
+ std::map< size_t, size_t > global_to_local_id;
+
+ /**
+ * \internal
+ *
+ * Base constructor.
+ *
+ * @param[in] _hypergraph The base hypergraph.
+ * @param[in] _srcVec Vector of source vertices.
+ * @param[in] _opVec Vector of operation vertices.
+ * @param[in] _outVec Vector of output vertices.
+ */
+ HyperDAG(
+ DHypergraph _hypergraph,
+ const std::vector< SourceVertex > &_srcVec,
+ const std::vector< OperationVertex > &_opVec,
+ const std::vector< OutputVertex > &_outVec
+ );
+
+
+ public:
+
+
+ /** @returns The hypergraph representation of the HyperDAG. */
+ DHypergraph get() const noexcept;
+
+ /** @returns The number of source vertices. */
+ size_t numSources() const noexcept;
+
+ /** @returns The number of operation vertices. */
+ size_t numOperations() const noexcept;
+
+ /** @returns The number of output vertices. */
+ size_t numOutputs() const noexcept;
+
+ /** @returns A start iterator to the source vertices. */
+ std::vector< SourceVertex >::const_iterator sourcesBegin() const;
+
+ /** @returns End iterator matching #sourcesBegin(). */
+ std::vector< SourceVertex >::const_iterator sourcesEnd() const;
+
+			/** @returns A start iterator to the operation vertices. */
+			std::vector< OperationVertex >::const_iterator operationsBegin() const;
+
+			/** @returns End iterator matching #operationsBegin. */
+ std::vector< OperationVertex >::const_iterator operationsEnd() const;
+
+ /** @returns A start iterator to the output vertices. */
+ std::vector< OutputVertex >::const_iterator outputsBegin() const;
+
+ /** @returns End iterator matching #outputsBegin. */
+ std::vector< OutputVertex >::const_iterator outputsEnd() const;
+
+ };
+
+ /** \internal Builds a HyperDAG representation of an ongoing computation. */
+ class HyperDAGGenerator {
+
+ private:
+
+ /** \internal The hypergraph under construction. */
+ DHypergraph hypergraph;
+
+ /**
+ * \internal
+ *
+			 * Once new source vertices are created, they are recorded here. This
+			 * storage differs from #sourceVerticesP and #sourceVerticesC in that
+			 * those maps only keep track of currently active source vertices, and
+			 * identify them by pointer or by container ID, respectively.
+ *
+ * \endinternal
+ */
+ std::vector< SourceVertex > sourceVec;
+
+ /**
+ * \internal
+ *
+			 * Once new operation vertices are created, they are recorded here. This
+			 * storage differs from #operationVertices in that the latter only keeps
+			 * track of currently active operation vertices, and identifies them by
+			 * their container ID.
+ *
+ * \endinternal
+ */
+ std::vector< OperationVertex > operationVec;
+
+ /** \internal Map of pointers to source vertices. */
+ std::map< const void *, SourceVertex > sourceVerticesP;
+
+ /** \internal Map of IDs to source vertices. */
+ std::map< uintptr_t, SourceVertex > sourceVerticesC;
+
+ /** \internal Map of IDs to operation vertices. */
+ std::map< uintptr_t, OperationVertex > operationVertices;
+
+ // note: there is no map of OutputVertices because only at the point we
+ // finalize to generate the final HyperDAG do we know for sure what
+ // the output vertices are. The same applies to an `outputVec`.
+
+ /**
+ * \internal
+ *
+ * During a computation, once an operation executes, its output container
+ * may be an intermediate result or an output. For as long as it is unknown
+			 * which it is, those containers are registered here. Each such vertex
+			 * is assigned a global ID which, together with its tentative operation
+			 * type, is stored as the value in this map.
+ *
+ * \endinternal
+ */
+ std::map< uintptr_t,
+ std::pair< size_t, OperationVertexType >
+ > operationOrOutputVertices;
+
+ /** \internal Source vertex generator. */
+ SourceVertexGenerator sourceGen;
+
+ /** \internal Operation vertex generator. */
+ OperationVertexGenerator operationGen;
+
+			// an OutputVertexGenerator is a local variable of #finalize()
+
+ /**
+ * \internal
+ * Adds a source vertex to the hypergraph.
+ *
+ * @param[in] type The type of source vertex.
+ * @param[in] pointer A unique identifier of the source.
+ * @param[in] id A unique identifier of the source.
+ *
+ * If the \a type corresponds to an ALP/GraphBLAS container, then
+			 * \a pointer is ignored; otherwise, \a id is ignored.
+			 *
+			 * @returns The global ID of the newly added source vertex.
+ * \endinternal
+ */
+ size_t addAnySource(
+ const SourceVertexType type,
+ const void * const pointer,
+ const uintptr_t id
+ );
+
+
+ public:
+
+ /**
+ * \internal Base constructor.
+ */
+ HyperDAGGenerator() noexcept;
+
+ /**
+ * \internal
+ *
+ * Sometimes a given \em operation generates a source vertex-- for example,
+ * the scalar input/output argument to grb::dot.
+ *
+ * In such cases, this function should be called to register the source
+ * vertex.
+ *
+ * @param[in] type The type of source vertex
+ * @param[in] pointer A unique identifier corresponding to the source
+ *
+ * \warning \a type cannot be #SourceVertexType::CONTAINER-- such source
+ * vertices should be automatically resolved via #addOperation.
+ *
+ * \endinternal
+ */
+ void addSource(
+ const SourceVertexType type,
+ const void * const pointer
+ );
+
+ /**
+ * \internal
+ *
+ * Registers a new source container with a given \a id.
+ *
+ * \endinternal
+ */
+ void addContainer( const uintptr_t id );
+
+ /**
+ * \internal
+ *
+ * Registers a new operation with the HyperDAG.
+ *
+ * @param[in] type The type of operation being registered.
+		 * @param[in] src_p_start, src_p_end Iterators over source scalar
+		 *                                   pointers.
+		 * @param[in] src_c_start, src_c_end Iterators over source container IDs.
+		 * @param[in] dst_start, dst_end     Iterators over destination container
+		 *                                   IDs.
+ *
+ * This function proceeds as follows:
+ * 1. for source pointers in #operationOrOutputVertices, a) upgrade them
+ * to #OperationVertex, and b) add them to #operationVertices. For
+ * source pointers in #operationVertices, do nothing.
+ * 2. for remaining source pointers that are not in #sourceVertices,
+ * upgrade them to #SourceVertex and add them to #sourceVertices.
+ * Otherwise, if already a source, add it from #sourceVertices
+ * directly.
+		 * 3. for each of the k source pointers, build a hyperedge. Each
+		 *    hyperedge contains only one entry at this point, namely the global
+		 *    ID of its source.
+		 * 4. if destination pointers already existed within this HyperDAG, the
+		 *    current operation cannot correspond to those earlier vertices-- we
+		 *    need to create new ones for them. Therefore, we first remove the
+		 *    old copies. Note that destinations that also doubled as sources are
+		 *    now safe to remove, because we already processed the source
+		 *    pointers.
+ * 5. Assign all destination pointers a new global ID, and add them to
+ * #operationOrOutputVertices.
+ * 6. Assign all these new global IDs to each of the k hyperedges that
+		 *    step 3 started to construct. Thus, if there are l destination
+		 *    pointers, we now have k hyperedges with l+1 entries each.
+ * 7. Store those k hyperedges and exit.
+ *
+ * \warning For in-place operations, the output container must be given
+ * both as a source \em and destination pointer.
+ *
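+		 * A minimal calling sketch mirroring the I/O wrappers (the instance
+		 * \a gen, scalar \a val, and container \a x are illustrative only):
+		 * \code
+		 *  std::array< const void *, 1 > sourcesP{ &val };
+		 *  std::array< uintptr_t, 1 > sourcesC{ getID( x ) };
+		 *  std::array< uintptr_t, 1 > destinations{ getID( x ) };
+		 *  gen.addOperation( SET_USING_VALUE,
+		 *  	sourcesP.begin(), sourcesP.end(),
+		 *  	sourcesC.begin(), sourcesC.end(),
+		 *  	destinations.begin(), destinations.end()
+		 *  );
+		 * \endcode
+		 *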
+ * \endinternal
+ */
+ template< typename SrcPIt, typename SrcCIt, typename DstIt >
+ void addOperation(
+ const OperationVertexType type,
+ SrcPIt src_p_start, const SrcPIt &src_p_end,
+ SrcCIt src_c_start, const SrcCIt &src_c_end,
+ DstIt dst_start, const DstIt &dst_end
+ ) {
+ static_assert( std::is_same< const void *,
+ typename std::iterator_traits< SrcPIt >::value_type
+ >::value,
+ "Source pointers should be given as const void pointers"
+ );
+ static_assert( std::is_same< uintptr_t,
+ typename std::iterator_traits< DstIt >::value_type
+ >::value,
+ "Destinations should be identified by their IDs"
+ );
+ static_assert( std::is_same< uintptr_t,
+ typename std::iterator_traits< SrcCIt >::value_type
+ >::value,
+ "Source containers should be identified by their IDs"
+ );
+
+#ifdef _DEBUG
+ std::cerr << "In HyperDAGGen::addOperation( "
+ << toString( type ) << ", ... )\n"
+ << "\t sourceVertices size: " << sourceVerticesP.size() << " pointers + "
+ << sourceVerticesC.size() << " containers\n"
+ << "\t sourceVec size: " << sourceVec.size() << "\n";
+#endif
+
+ // steps 1, 2, and 3
+ std::vector< std::pair< size_t, std::set< size_t > > > hyperedges;
+ for( ; src_p_start != src_p_end; ++src_p_start ) {
+#ifdef _DEBUG
+ std::cerr << "\t processing source pointer " << *src_p_start << "\n";
+#endif
+ // source pointers (input scalars, not input containers) are simple--
+ // they will never appear as operation vertices, nor as output vertices.
+ // Therefore step 1 does not apply.
+
+ // step 2
+ size_t sourceID;
+ const auto alreadySource = sourceVerticesP.find( *src_p_start );
+ if( alreadySource == sourceVerticesP.end() ) {
+#ifndef NDEBUG
+ const bool all_sources_should_already_be_added = false;
+ assert( all_sources_should_already_be_added );
+#endif
+ std::cerr << "Warning: unidentified source " << *src_p_start << ". "
+ << "Adding it as an input scalar.\n";
+ sourceID = addAnySource( SCALAR, *src_p_start, 0 );
+ } else {
+#ifdef _DEBUG
+ std::cerr << "\t found source in sourceVertices\n";
+#endif
+ sourceID = alreadySource->second.getGlobalID();
+ }
+ // step 3
+ hyperedges.push_back( std::make_pair( sourceID, std::set< size_t >() ) );
+ }
+ for( ; src_c_start != src_c_end; ++src_c_start ) {
+#ifdef _DEBUG
+ std::cerr << "\t processing source container " << *src_c_start << "\n";
+#endif
+ // step 1
+ size_t sourceID;
+ const auto &it = operationOrOutputVertices.find( *src_c_start );
+ const auto &it2 = operationVertices.find( *src_c_start );
+ if( it2 != operationVertices.end() ) {
+ // operation vertices are fine as a source -- no additional operations
+ // necessary
+ assert( it == operationOrOutputVertices.end() );
+#ifdef _DEBUG
+ std::cerr << "\t source was previously an operation\n";
+#endif
+ sourceID = it2->second.getGlobalID();
+ } else if( it == operationOrOutputVertices.end() ) {
+ // step 2
+ const auto alreadySource = sourceVerticesC.find( *src_c_start );
+ if( alreadySource == sourceVerticesC.end() ) {
+#ifndef NDEBUG
+ const bool all_sources_should_already_be_added = false;
+ assert( all_sources_should_already_be_added );
+#endif
+ std::cerr << "Warning: unidentified source " << *src_c_start << ". "
+ << "Adding it as a container.\n";
+ sourceID = addAnySource( CONTAINER, nullptr, *src_c_start );
+ } else {
+#ifdef _DEBUG
+ std::cerr << "\t found source in sourceVertices\n";
+#endif
+ sourceID = alreadySource->second.getGlobalID();
+ }
+ } else {
+#ifdef _DEBUG
+ std::cerr << "\t found source in operationOrOutputVertices\n";
+#endif
+					// step 1: upgrade this entry to an operation vertex
+ const auto &remove = operationVertices.find( it->first );
+ if( remove != operationVertices.end() ) {
+#ifdef _DEBUG
+ std::cerr << "\t found source in operationVertices; removing it\n";
+#endif
+ operationVertices.erase( remove );
+ }
+#ifdef _DEBUG
+					std::cerr << "\t moving the entry into operationVertices\n";
+#endif
+ const size_t global_id = it->second.first;
+ const auto &operationVertex = operationGen.create(
+ it->second.second, global_id
+ );
+ operationVertices.insert( std::make_pair( it->first, operationVertex ) );
+ operationVec.push_back( operationVertex );
+ operationOrOutputVertices.erase( it );
+ sourceID = global_id;
+ }
+ // step 3
+ hyperedges.push_back( std::make_pair( sourceID, std::set< size_t >() ) );
+ }
+
+
+ // step 4, 5, and 6
+ for( ; dst_start != dst_end; ++dst_start ) {
+#ifdef _DEBUG
+ std::cerr << "\t processing destination " << *dst_start << "\n";
+#endif
+ // step 4
+ {
+ const auto &it = sourceVerticesC.find( *dst_start );
+ if( it != sourceVerticesC.end() ) {
+#ifdef _DEBUG
+ std::cerr << "\t destination found in sources-- "
+ << "removing it from there\n";
+#endif
+ sourceVerticesC.erase( it );
+ }
+ }
+ {
+ const auto &it = operationVertices.find( *dst_start );
+ if( it != operationVertices.end() ) {
+#ifdef _DEBUG
+ std::cerr << "\t destination found in operations-- "
+ << "removing it from there\n";
+#endif
+ operationVertices.erase( it );
+ }
+ }
+ {
+ const auto &it = operationOrOutputVertices.find( *dst_start );
+ if( it != operationOrOutputVertices.end() ) {
+ std::cerr << "WARNING (hyperdags::addOperation): an unconsumed output "
+							<< "container was detected. This indicates the existence of "
+ << "an ALP primitive whose output is never used.\n";
+#ifdef _DEBUG
+ std::cerr << "\t destination found in operationsOrOutput-- "
+ << "removing it from there\n";
+#endif
+ operationOrOutputVertices.erase( it );
+ }
+ }
+ // step 5
+ const size_t global_id = hypergraph.createVertex();
+ operationOrOutputVertices.insert(
+ std::make_pair( *dst_start,
+ std::make_pair( global_id, type )
+ )
+ );
+#ifdef _DEBUG
+ std::cerr << "\t created a new operation vertex with global ID "
+ << global_id << "\n";
+#endif
+ // step 6
+ for( auto &hyperedge : hyperedges ) {
+ hyperedge.second.insert( global_id );
+ }
+ }
+
+ // step 7
+ for( const auto &hyperedge : hyperedges ) {
+#ifdef _DEBUG
+ std::cerr << "\t storing a hyperedge of size "
+ << (hyperedge.second.size()+1) << "\n";
+#endif
+ hypergraph.appendHyperedge(
+ hyperedge.first,
+ hyperedge.second.begin(), hyperedge.second.end()
+ );
+ }
+ }
+
+ /**
+ * \internal
+ *
+		 * Assumes that all remaining vertices in #operationOrOutputVertices
+		 * are of type #OutputVertex. It then generates a finalised HyperDAG.
+ *
+ * @returns The resulting HyperDAG.
+ *
+ * The current generator instance is left unmodified; this function takes
+ * a snapshot of the current state, and allows its further extension.
+ *
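+		 * A sketch of the intended lifecycle (the container ID \a id is
+		 * illustrative only):
+		 * \code
+		 *  HyperDAGGenerator gen;
+		 *  gen.addContainer( id );
+		 *  // ... record primitives via addSource and addOperation ...
+		 *  const HyperDAG dag = gen.finalize();
+		 *  dag.get().render( std::cout );
+		 * \endcode
+		 *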
+ * \endinternal
+ */
+ HyperDAG finalize() const;
+
+ };
+
+ } // end namespace grb::internal::hyperdags
+
+ } // end namespace grb::internal
+
+} // end namespace grb
+
+#endif // end _H_GRB_HYPERDAGS_STATE
+
diff --git a/include/graphblas/hyperdags/init.hpp b/include/graphblas/hyperdags/init.hpp
new file mode 100644
index 000000000..4afbb3765
--- /dev/null
+++ b/include/graphblas/hyperdags/init.hpp
@@ -0,0 +1,55 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Provides initialisers for the HyperDAGs backend
+ *
+ * @author A. N. Yzelman
+ * @date 31st of January, 2022
+ */
+
+#ifndef _H_GRB_HYPERDAGS_INIT
+#define _H_GRB_HYPERDAGS_INIT
+
+#include <graphblas/hyperdags/hyperdags.hpp> // declares HyperDAGGenerator
+
+
+namespace grb {
+
+ namespace internal {
+
+ namespace hyperdags {
+
+ /** Singleton generator instance. */
+ extern HyperDAGGenerator generator;
+
+ }
+
+ }
+
+ template<>
+ RC init< hyperdags >( const size_t, const size_t, void * const );
+
+ template<>
+ RC finalize< hyperdags >();
+
+} // end namespace grb
+
+#endif
+
diff --git a/include/graphblas/hyperdags/io.hpp b/include/graphblas/hyperdags/io.hpp
new file mode 100644
index 000000000..e68af3eb7
--- /dev/null
+++ b/include/graphblas/hyperdags/io.hpp
@@ -0,0 +1,562 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Provides the I/O primitives for the HyperDAGs backend
+ *
+ * @author A. Karanasiou
+ * @date 3rd of March 2022
+ */
+
+#include <array> // for std::array, used to pass sources and destinations
+
+#include <graphblas/hyperdags/init.hpp> // declares the generator singleton
+
+
+namespace grb {
+
+ // input:
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType, typename fwd_iterator, typename Coords,
+ class Dup = operators::right_assign< InputType >
+ >
+ RC buildVector(
+ Vector< InputType, hyperdags, Coords > &x,
+ fwd_iterator start, const fwd_iterator end,
+ const IOMode mode, const Dup &dup = Dup()
+ ) {
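+		// All hyperdags primitives in this file follow the same pattern: first
+		// delegate to the underlying backend, then, only on success and for
+		// non-trivial containers, record the corresponding source and operation
+		// vertices with the global HyperDAG generator.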
+		const RC ret = buildVector< descr >(
+ internal::getVector(x), start, end, mode, dup
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( size( x ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::ITERATOR,
+ &start
+ );
+ std::array< const void *, 1 > sourcesP{ &start };
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getVector(x) ) };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::BUILD_VECTOR,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType, typename fwd_iterator1, typename fwd_iterator2,
+ typename Coords, class Dup = operators::right_assign< InputType >
+ >
+ RC buildVector(
+ Vector< InputType, hyperdags, Coords > &x,
+ fwd_iterator1 ind_start, const fwd_iterator1 ind_end,
+ fwd_iterator2 val_start, const fwd_iterator2 val_end,
+ const IOMode mode,
+ const Dup &dup = Dup()
+ ) {
+ const RC ret = buildVector< descr >(
+ internal::getVector(x), ind_start, ind_end, val_start, val_end, mode, dup
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( size( x ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::ITERATOR,
+ &ind_start
+ );
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::ITERATOR,
+ &val_start
+ );
+ std::array< const void *, 2 > sourcesP{ &ind_start, &val_start };
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getVector(x) ) };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::BUILD_VECTOR_WITH_VALUES,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType, typename fwd_iterator
+ >
+ RC buildMatrixUnique(
+ Matrix< InputType, hyperdags > &A,
+ fwd_iterator start,
+ const fwd_iterator end,
+ const IOMode mode
+ ) {
+ const RC ret = buildMatrixUnique< descr >(
+ internal::getMatrix(A), start, end, mode
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( ncols( A ) == 0 || nrows( A ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::ITERATOR,
+ &start
+ );
+ std::array< const void *, 1 > sourcesP{ &start };
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getMatrix(A) ) };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getMatrix(A) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::BUILDMATRIXUNIQUE_MATRIX_START_END_MODE,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename DataType,
+ typename T, typename Coords
+ >
+ RC setElement(
+ Vector< DataType, hyperdags, Coords > &x,
+ const T val,
+ const size_t i,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< DataType >::value &&
+ !grb::is_object< T >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = setElement< descr >(
+ internal::getVector( x ), val, i, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ // x cannot be empty here or setElement would have failed-- no need to catch
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &val
+ );
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::USER_INT,
+ &i
+ );
+ std::array< const void *, 2 > sourcesP{ &val, &i };
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getVector(x) ) };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::SET_VECTOR_ELEMENT,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename DataType, typename Coords,
+ typename T
+ >
+ RC set(
+ Vector< DataType, hyperdags, Coords > &x, const T val,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< DataType >::value &&
+ !grb::is_object< T >::value,
+ void >::type * const = nullptr
+ ) {
+ const RC ret = set< descr >( internal::getVector( x ), val, phase );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( x ) == 0 ) { return ret; }
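+		// when the use_index descriptor is given, the underlying set ignores
+		// val, so val then is not recorded as a source of the operation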
+ if( !(descr & descriptors::use_index) ) {
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &val
+ );
+ std::array< const void *, 1 > sourcesP{ &val };
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getVector(x) ) };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::SET_USING_VALUE,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ } else {
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getVector(x) ) };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::SET_USING_VALUE,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ }
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename DataType, typename MaskType, typename T,
+ typename Coords
+ >
+ RC set(
+ Vector< DataType, hyperdags, Coords > &x,
+ const Vector< MaskType, hyperdags, Coords > &m,
+ const T val,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< DataType >::value &&
+ !grb::is_object< T >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( m ) == 0 ) { return set< descr >( x, val, phase ); }
+ const RC ret = set< descr >(
+ internal::getVector(x), internal::getVector(m),
+ val, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( x ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &val
+ );
+ std::array< const void *, 1 > sourcesP{ &val };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(x) ),
+ getID( internal::getVector(m) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::SET_USING_MASK_AND_SCALAR,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename MaskType, typename InputType,
+ typename Coords
+ >
+ RC set(
+ Vector< OutputType, hyperdags, Coords > &x,
+ const Vector< MaskType, hyperdags, Coords > &mask,
+ const Vector< InputType, hyperdags, Coords > &y,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType >::value,
+ void >::type * const = nullptr
+ ) {
+ if( size( mask ) == 0 ) { return set< descr >( x, y, phase ); }
+ const RC ret = set< descr >(
+ internal::getVector(x),
+ internal::getVector(mask), internal::getVector(y),
+ phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( x ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 3 > sourcesC{
+ getID( internal::getVector(mask) ),
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(x) )
+ };
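+		// note: x itself is listed among the source containers since a masked
+		// set is effectively in-place-- entries not covered by the mask retain
+		// their old values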
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::SET_USING_MASK_AND_VECTOR,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename InputType, typename Coords
+ >
+ RC set(
+ Vector< OutputType, hyperdags, Coords > &x,
+ const Vector< InputType, hyperdags, Coords > &y,
+ const Phase &phase = EXECUTE
+ ) {
+ const RC ret = set< descr >(
+ internal::getVector(x), internal::getVector(y), phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( size( x ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getVector(y) ),
+ getID( internal::getVector(x) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::SET_FROM_VECTOR,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename InputType,
+ typename RIT, typename CIT, typename NIT
+ >
+ RC set(
+ Matrix< OutputType, hyperdags, RIT, CIT, NIT > &C,
+ const Matrix< InputType, hyperdags, RIT, CIT, NIT > &A,
+ const Phase &phase = EXECUTE
+ ) {
+ const RC ret = set< descr >(
+ internal::getMatrix( C ), internal::getMatrix( A ), phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( nrows( C ) == 0 || ncols( C ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getMatrix(A) ),
+ getID( internal::getMatrix(C) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getMatrix(C) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::SET_MATRIX_MATRIX,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename InputType1, typename InputType2,
+ typename RIT, typename CIT, typename NIT
+ >
+ RC set(
+ Matrix< OutputType, hyperdags, RIT, CIT, NIT > &C,
+ const Matrix< InputType1, hyperdags, RIT, CIT, NIT > &A,
+ const InputType2 &val,
+ const Phase &phase = EXECUTE
+ ) {
+ const RC ret = set< descr >(
+ internal::getMatrix( C ), internal::getMatrix( A ),
+ val, phase
+ );
+ if( ret != SUCCESS ) { return ret; }
+ if( phase != EXECUTE ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::SCALAR,
+ &val
+ );
+ std::array< const void *, 1 > sourcesP{ &val };
+ std::array< uintptr_t, 2 > sourcesC{
+ getID( internal::getMatrix(A) ),
+ getID( internal::getMatrix(C) )
+ };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getMatrix(C) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::SET_MATRIX_MATRIX_INPUT2,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template< typename DataType, typename Coords >
+ RC clear( Vector< DataType, hyperdags, Coords > &x ) {
+ const RC ret = clear( internal::getVector( x ) );
+ if( ret != SUCCESS ) { return ret; }
+ if( size( x ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getVector(x) ) };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::CLEAR_VECTOR,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template< typename InputType, typename RIT, typename CIT, typename NIT >
+ RC clear( Matrix< InputType, hyperdags, RIT, CIT, NIT > &A ) noexcept {
+ const RC ret = clear( internal::getMatrix(A) );
+ if( ret != SUCCESS ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ std::array< const void *, 0 > sourcesP{};
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getMatrix(A) ) };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getMatrix(A) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::CLEAR_MATRIX,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ // getters:
+
+	template< typename DataType, typename Coords >
+	size_t size( const Vector< DataType, hyperdags, Coords > &x ) {
+		return size( internal::getVector( x ) );
+	}
+
+	template< typename InputType >
+	size_t nrows( const Matrix< InputType, hyperdags > &A ) noexcept {
+		return nrows( internal::getMatrix( A ) );
+	}
+
+	template< typename InputType >
+	size_t ncols( const Matrix< InputType, hyperdags > &A ) noexcept {
+		return ncols( internal::getMatrix( A ) );
+	}
+
+	template< typename DataType, typename Coords >
+	size_t capacity( const Vector< DataType, hyperdags, Coords > &x ) noexcept {
+		return capacity( internal::getVector( x ) );
+	}
+
+	template< typename DataType >
+	size_t capacity( const Matrix< DataType, hyperdags > &A ) noexcept {
+		return capacity( internal::getMatrix( A ) );
+	}
+
+	template< typename DataType, typename Coords >
+	size_t nnz( const Vector< DataType, hyperdags, Coords > &x ) noexcept {
+		return nnz( internal::getVector( x ) );
+	}
+
+	template< typename InputType >
+	size_t nnz( const Matrix< InputType, hyperdags > &A ) noexcept {
+		return nnz( internal::getMatrix( A ) );
+	}
+
+	template< typename InputType, typename Coords >
+	uintptr_t getID( const Vector< InputType, hyperdags, Coords > &x ) {
+		return getID( internal::getVector( x ) );
+	}
+
+	template< typename InputType >
+	uintptr_t getID( const Matrix< InputType, hyperdags > &A ) {
+		return getID( internal::getMatrix( A ) );
+	}
+
+ // resizers:
+
+ template< typename InputType, typename Coords >
+ RC resize(
+ Vector< InputType, hyperdags, Coords > &x,
+ const size_t new_nz
+ ) noexcept {
+ const RC ret = resize( internal::getVector( x ), new_nz );
+ if( ret != SUCCESS ) { return ret; }
+ if( size( x ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::USER_INT,
+ &new_nz
+ );
+ std::array< const void *, 1 > sourcesP{ &new_nz };
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getVector(x) ) };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getVector(x) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::RESIZE,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ template< typename InputType >
+ RC resize(
+ Matrix< InputType, hyperdags > &A,
+ const size_t new_nz
+ ) noexcept {
+ const RC ret = resize( internal::getMatrix(A), new_nz );
+ if( ret != SUCCESS ) { return ret; }
+ if( nrows( A ) == 0 || ncols( A ) == 0 ) { return ret; }
+ internal::hyperdags::generator.addSource(
+ internal::hyperdags::USER_INT,
+ &new_nz
+ );
+ std::array< const void *, 1 > sourcesP{ &new_nz };
+ std::array< uintptr_t, 1 > sourcesC{ getID( internal::getMatrix(A) ) };
+ std::array< uintptr_t, 1 > destinations{ getID( internal::getMatrix(A) ) };
+ internal::hyperdags::generator.addOperation(
+ internal::hyperdags::RESIZE_MATRIX,
+ sourcesP.begin(), sourcesP.end(),
+ sourcesC.begin(), sourcesC.end(),
+ destinations.begin(), destinations.end()
+ );
+ return ret;
+ }
+
+ // nonblocking I/O:
+
+ template<>
+ RC wait< hyperdags >();
+
+ /** \internal Dispatch to base wait implementation */
+ template<
+ typename InputType, typename Coords,
+ typename ... Args
+ >
+ RC wait(
+ const Vector< InputType, hyperdags, Coords > &x,
+ const Args &... args
+ ) {
+ (void) x;
+ return wait( args... );
+ }
+
+ /** \internal Dispatch to base wait implementation */
+ template< typename InputType, typename... Args >
+ RC wait(
+ const Matrix< InputType, hyperdags > &A,
+ const Args &... args
+ ) {
+ (void) A;
+ return wait( args... );
+ }
+
+} // namespace grb
+
diff --git a/include/graphblas/hyperdags/matrix.hpp b/include/graphblas/hyperdags/matrix.hpp
new file mode 100644
index 000000000..a80602bb6
--- /dev/null
+++ b/include/graphblas/hyperdags/matrix.hpp
@@ -0,0 +1,286 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Provides the matrix container for the HyperDAGs backend
+ *
+ * @author A. Karanasiou
+ * @date 3rd of March, 2022
+ */
+
+#ifndef _H_GRB_HYPERDAGS_MATRIX
+#define _H_GRB_HYPERDAGS_MATRIX
+
+#include <graphblas/hyperdags/init.hpp> // declares the generator singleton
+
+
+namespace grb {
+
+ namespace internal {
+
+ template< typename T, typename RIT, typename CIT, typename NIT >
+ Matrix< T, _GRB_WITH_HYPERDAGS_USING, RIT, CIT, NIT > & getMatrix(
+ Matrix< T, grb::hyperdags, RIT, CIT, NIT > &
+ );
+
+ template< typename T, typename RIT, typename CIT, typename NIT >
+ const Matrix< T, _GRB_WITH_HYPERDAGS_USING, RIT, CIT, NIT > & getMatrix(
+ const Matrix< T, grb::hyperdags, RIT, CIT, NIT > &x
+ );
+
+ template< typename T, typename RIT, typename CIT, typename NIT >
+ inline internal::Compressed_Storage<
+ T, RIT, NIT
+ > & getCRS( Matrix< T, grb::hyperdags, RIT, CIT, NIT > &A ) noexcept;
+
+ template< typename T, typename RIT, typename CIT, typename NIT >
+ inline const internal::Compressed_Storage<
+ T, RIT, NIT
+ > & getCRS( const Matrix< T, grb::hyperdags, RIT, CIT, NIT > &A ) noexcept;
+
+ template< typename T, typename RIT, typename CIT, typename NIT >
+ inline internal::Compressed_Storage<
+ T, CIT, NIT
+ > & getCCS( Matrix< T, grb::hyperdags, RIT, CIT, NIT > &A ) noexcept;
+
+ template< typename T, typename RIT, typename CIT, typename NIT >
+ inline const internal::Compressed_Storage<
+ T, CIT, NIT
+ > & getCCS( const Matrix< T, grb::hyperdags, RIT, CIT, NIT > &A ) noexcept;
+
+ }
+
+ template< typename T, typename RIT, typename CIT, typename NIT >
+ class Matrix< T, hyperdags, RIT, CIT, NIT > {
+
+ template< typename A, typename sRIT, typename sCIT, typename sNIT >
+ friend Matrix<
+ A, _GRB_WITH_HYPERDAGS_USING, sRIT, sCIT, sNIT
+ > & internal::getMatrix(
+ Matrix< A, grb::hyperdags, sRIT, sCIT, sNIT > &
+ );
+
+ template< typename A, typename sRIT, typename sCIT, typename sNIT >
+ friend const Matrix<
+ A, _GRB_WITH_HYPERDAGS_USING, sRIT, sCIT, sNIT
+ > & internal::getMatrix(
+ const Matrix< A, grb::hyperdags, sRIT, sCIT, sNIT > &
+ );
+
+
+ private:
+
+ /** \internal My own type */
+ typedef Matrix< T, hyperdags, RIT, CIT, NIT > SelfType;
+
+ /** \internal Simply use an underlying implementation */
+ typedef Matrix< T, _GRB_WITH_HYPERDAGS_USING, RIT, CIT, NIT > MyMatrixType;
+
+ /** \internal Underlying matrix */
+ MyMatrixType matrix;
+
+		/** \internal Registers this matrix. */
+ void register_matrix() {
+#ifdef _DEBUG
+ std::cout << "\t registering matrix with pointer " << this << "\n";
+#endif
+ if( nrows( matrix ) > 0 && ncols( matrix ) > 0 ) {
+ internal::hyperdags::generator.addContainer( getID( matrix ) );
+ }
+ }
+
+
+ public:
+
+ /** \internal Base constructor, no capacity */
+ Matrix( const size_t rows, const size_t columns ) :
+ matrix( rows, columns )
+ {
+#ifdef _DEBUG
+ std::cout << "Matrix (hyperdags) constructor\n";
+#endif
+ register_matrix();
+ }
+
+ /** \internal Base constructor with capacity */
+ Matrix( const size_t rows, const size_t columns, const size_t nz ) :
+ matrix( rows, columns, nz )
+ {
+#ifdef _DEBUG
+ std::cout << "Matrix (hyperdags) capacity constructor\n";
+#endif
+ register_matrix();
+ }
+
+ /** \internal Copy constructor */
+ Matrix( const SelfType &x ) : matrix( x.matrix ) {
+#ifdef _DEBUG
+ std::cout << "Matrix (hyperdags) copy constructor\n";
+#endif
+ register_matrix();
+ }
+
+ /** \internal Move constructor */
+ Matrix( SelfType &&x ) {
+#ifdef _DEBUG
+ std::cout << "Matrix (hyperdags) move constructor\n";
+#endif
+ matrix = std::move( x.matrix );
+ register_matrix();
+ }
+
+ ~Matrix() {
+#ifdef _DEBUG
+ std::cout << "Matrix (hyperdags) destructor\n";
+#endif
+ }
+
+ /** \internal Copy-assignment */
+ SelfType& operator=( const SelfType &x ) {
+#ifdef _DEBUG
+ std::cout << "Matrix (hyperdags) copy assignment\n";
+#endif
+ matrix = x.matrix;
+ return *this;
+ }
+
+ /** \internal Move-assignment */
+ SelfType& operator=( SelfType &&x ) {
+#ifdef _DEBUG
+ std::cout << "Matrix (hyperdags) move assignment\n";
+#endif
+ matrix = std::move( x.matrix );
+ return *this;
+ }
+
+ /** \internal Start const-iterator */
+ template<
+ class ActiveDistribution = internal::Distribution<
+ _GRB_WITH_HYPERDAGS_USING
+ >
+ >
+ typename internal::Compressed_Storage<
+ T, grb::config::RowIndexType, grb::config::NonzeroIndexType
+ >::template ConstIterator< ActiveDistribution > begin(
+ const IOMode mode = PARALLEL, const size_t s = 0, const size_t P = 1
+ ) const {
+ return matrix.begin( mode, s, P );
+ }
+
+ /** \internal Matching end-iterator to begin */
+ template<
+ class ActiveDistribution = internal::Distribution<
+ _GRB_WITH_HYPERDAGS_USING
+ >
+ >
+ typename internal::Compressed_Storage<
+ T, grb::config::RowIndexType, grb::config::NonzeroIndexType
+ >::template ConstIterator< ActiveDistribution > end(
+ const IOMode mode = PARALLEL, const size_t s = 0, const size_t P = 1
+ ) const {
+			return matrix.end( mode, s, P );
+ }
+
+ /** \internal Start const-iterator */
+ template<
+ class ActiveDistribution = internal::Distribution<
+ _GRB_WITH_HYPERDAGS_USING
+ >
+ >
+ typename internal::Compressed_Storage<
+ T, grb::config::RowIndexType, grb::config::NonzeroIndexType
+ >::template ConstIterator< ActiveDistribution > cbegin(
+ const IOMode mode = PARALLEL
+ ) const {
+			return matrix.cbegin( mode );
+ }
+
+ /** \internal Matching end iterator to cbegin */
+ template<
+ class ActiveDistribution = internal::Distribution<
+ _GRB_WITH_HYPERDAGS_USING
+ >
+ >
+ typename internal::Compressed_Storage<
+ T, grb::config::RowIndexType, grb::config::NonzeroIndexType
+ >::template ConstIterator< ActiveDistribution > cend(
+ const IOMode mode = PARALLEL
+ ) const {
+			return matrix.cend( mode );
+ }
+
+ };
+
+ /** \internal Basic type trait for matrices */
+ template< typename D, typename RIT, typename CIT, typename NIT >
+ struct is_container< Matrix< D, hyperdags, RIT, CIT, NIT > > {
+ /** A hyperdags matrix is an ALP container. */
+ static const constexpr bool value = true;
+ };
+
+ namespace internal {
+
+ template< typename T, typename RIT, typename CIT, typename NIT >
+ Matrix< T, _GRB_WITH_HYPERDAGS_USING, RIT, CIT, NIT > & getMatrix(
+ Matrix< T, grb::hyperdags, RIT, CIT, NIT > &x
+ ) {
+ return x.matrix;
+ }
+
+ template< typename T, typename RIT, typename CIT, typename NIT >
+ const Matrix< T, _GRB_WITH_HYPERDAGS_USING, RIT, CIT, NIT > & getMatrix(
+ const Matrix< T, grb::hyperdags, RIT, CIT, NIT > &x
+ ) {
+ return x.matrix;
+ }
+
+ template< typename T, typename RIT, typename CIT, typename NIT >
+ inline internal::Compressed_Storage<
+ T, RIT, NIT
+ > & getCRS( Matrix< T, grb::hyperdags, RIT, CIT, NIT > &A ) noexcept {
+ return getCRS( internal::getMatrix( A ) );
+ }
+
+ template< typename T, typename RIT, typename CIT, typename NIT >
+ inline const internal::Compressed_Storage<
+ T, RIT, NIT
+ > & getCRS( const Matrix< T, grb::hyperdags, RIT, CIT, NIT > &A ) noexcept {
+ return getCRS( internal::getMatrix(A) );
+ }
+
+ template< typename T, typename RIT, typename CIT, typename NIT >
+ inline internal::Compressed_Storage<
+ T, CIT, NIT
+ > & getCCS( Matrix< T, grb::hyperdags, RIT, CIT, NIT > &A ) noexcept {
+ return getCCS( internal::getMatrix(A) );
+ }
+
+ template< typename T, typename RIT, typename CIT, typename NIT >
+ inline const internal::Compressed_Storage<
+ T, CIT, NIT
+ > & getCCS( const Matrix< T, grb::hyperdags, RIT, CIT, NIT > &A ) noexcept {
+ return getCCS( internal::getMatrix(A) );
+ }
+
+ } // end ``grb::internal''
+
+}
+
+#endif // end ``_H_GRB_HYPERDAGS_MATRIX''
+
diff --git a/include/graphblas/hyperdags/pinnedvector.hpp b/include/graphblas/hyperdags/pinnedvector.hpp
new file mode 100644
index 000000000..184a4987e
--- /dev/null
+++ b/include/graphblas/hyperdags/pinnedvector.hpp
@@ -0,0 +1,103 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Contains the hyperdags implementations for the PinnedVector class
+ *
+ * @author A. Karanasiou
+ * @date August 17, 2022
+ */
+
+#ifndef _H_GRB_HYPERDAGS_PINNEDVECTOR
+#define _H_GRB_HYPERDAGS_PINNEDVECTOR
+
+#include <graphblas/base/pinnedvector.hpp> // for the PinnedVector base API
+#include <graphblas/iomode.hpp>            // for grb::IOMode
+
+#include "vector.hpp"
+
+
+namespace grb {
+
+ /** \internal No implementation notes. */
+ template< typename IOType >
+ class PinnedVector< IOType, hyperdags > {
+
+ private:
+
+ /** This implementation relies on the sub-backend. */
+ typedef PinnedVector< IOType, grb::_GRB_WITH_HYPERDAGS_USING >
+ MyPinnedVector;
+
+ /** Instance of the underlying backend. */
+ MyPinnedVector pinned_vector;
+
+
+ public:
+
+ /** \internal No implementation notes. */
+ PinnedVector() : pinned_vector() {}
+
+ /** \internal No implementation notes. */
+ PinnedVector(
+ const Vector< IOType, hyperdags, internal::hyperdags::Coordinates > &x,
+ const IOMode mode
+		) : pinned_vector( internal::getVector( x ), mode ) {}
+
+ // default destructor is allowed
+
+ /** \internal No implementation notes. */
+ inline size_t size() const noexcept {
+ return pinned_vector.size();
+ }
+
+ /** \internal No implementation notes. */
+ inline size_t nonzeroes() const noexcept {
+ return pinned_vector.nonzeroes();
+ }
+
+ /** \internal No implementation notes. */
+ template< typename OutputType = IOType >
+ inline OutputType getNonzeroValue(
+ const size_t k,
+ const OutputType one
+ ) const noexcept {
+ return pinned_vector.getNonzeroValue( k, one );
+ }
+
+ /** \internal No implementation notes. */
+ inline IOType getNonzeroValue(
+ const size_t k
+ ) const noexcept {
+ return pinned_vector.getNonzeroValue( k );
+ }
+
+ /** \internal No implementation notes. */
+ inline size_t getNonzeroIndex(
+ const size_t k
+ ) const noexcept {
+ return pinned_vector.getNonzeroIndex( k );
+ }
+
+ };
+
+} // namespace grb
+
+#endif // end ``_H_GRB_HYPERDAGS_PINNEDVECTOR''
+
diff --git a/include/graphblas/hyperdags/properties.hpp b/include/graphblas/hyperdags/properties.hpp
new file mode 100644
index 000000000..ce5f239d2
--- /dev/null
+++ b/include/graphblas/hyperdags/properties.hpp
@@ -0,0 +1,56 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Collects the hyperdags backend properties
+ *
+ * @author A. N. Yzelman
+ * @date 23rd of March, 2023
+ */
+
+#ifndef _H_GRB_HYPERDAGS_PROPERTIES
+#define _H_GRB_HYPERDAGS_PROPERTIES
+
+#include <graphblas/backends.hpp>
+#include <graphblas/base/properties.hpp> // for the Properties base template
+
+
+namespace grb {
+
+ /** All properties are inherited from the underlying backend. */
+ template<>
+ class Properties< hyperdags > {
+
+ public:
+
+ static constexpr const bool writableCaptured =
+ Properties< _GRB_WITH_HYPERDAGS_USING >::writableCaptured;
+
+ static constexpr const bool isBlockingExecution =
+ Properties< _GRB_WITH_HYPERDAGS_USING >::isBlockingExecution;
+
+ static constexpr const bool isNonblockingExecution =
+ Properties< _GRB_WITH_HYPERDAGS_USING >::isNonblockingExecution;
+
+ };
+
+} // namespace grb
+
+#endif // end ``_H_GRB_HYPERDAGS_PROPERTIES''
+
diff --git a/include/graphblas/hyperdags/spmd.hpp b/include/graphblas/hyperdags/spmd.hpp
new file mode 100644
index 000000000..270d7967c
--- /dev/null
+++ b/include/graphblas/hyperdags/spmd.hpp
@@ -0,0 +1,59 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Provides the SPMD API for the HyperDAGs backend
+ *
+ * @author A. Karanasiou
+ * @date 15th of March 2022
+ */
+
+#ifndef _H_GRB_HYPERDAGS_SPMD
+#define _H_GRB_HYPERDAGS_SPMD
+
+#include <cstddef> // size_t
+
+#include
+
+namespace grb {
+
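+	/**
+	 * \internal The HyperDAGs SPMD API simply delegates to that of the
+	 * underlying backend.
+	 */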
+ template<>
+ class spmd< hyperdags > {
+
+ public:
+
+ static inline size_t nprocs() noexcept {
+ return spmd< _GRB_WITH_HYPERDAGS_USING >::nprocs();
+ }
+
+ static inline size_t pid() noexcept {
+ return spmd< _GRB_WITH_HYPERDAGS_USING >::pid();
+ }
+
+ static RC sync(
+ const size_t msgs_in = 0, const size_t msgs_out = 0
+ ) noexcept {
+ return spmd< _GRB_WITH_HYPERDAGS_USING >::sync( msgs_in, msgs_out );
+ }
+
+ static RC barrier() noexcept {
+ return spmd< _GRB_WITH_HYPERDAGS_USING >::barrier();
+ }
+
+	}; // end class ``spmd'' hyperdags implementation
+
+} // namespace grb
+
+#endif // end ``_H_GRB_HYPERDAGS_SPMD''
+
diff --git a/include/graphblas/hyperdags/vector.hpp b/include/graphblas/hyperdags/vector.hpp
new file mode 100644
index 000000000..5f422399e
--- /dev/null
+++ b/include/graphblas/hyperdags/vector.hpp
@@ -0,0 +1,284 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Provides the vector container for the HyperDAGs backend
+ *
+ * @author A. N. Yzelman
+ * @date 31st of January, 2022
+ */
+
+#ifndef _H_GRB_HYPERDAGS_VECTOR
+#define _H_GRB_HYPERDAGS_VECTOR
+
+#include
+#include
+
+
+namespace grb {
+
+ template< typename T, typename RIT, typename CIT, typename NIT >
+ class Matrix< T, hyperdags, RIT, CIT, NIT >;
+
+ namespace internal {
+
+ namespace hyperdags {
+ typedef grb::internal::Coordinates<
+ grb::config::IMPLEMENTATION< grb::hyperdags >::coordinatesBackend()
+ > Coordinates;
+ }
+
+ template< typename T >
+ Vector< T, _GRB_WITH_HYPERDAGS_USING, typename hyperdags::Coordinates > &
+ getVector(
+ Vector< T, grb::hyperdags, typename hyperdags::Coordinates > &
+ );
+
+ template< typename T >
+ const Vector< T, _GRB_WITH_HYPERDAGS_USING, typename hyperdags::Coordinates > &
+ getVector(
+ const Vector< T, grb::hyperdags, typename hyperdags::Coordinates > &x
+ );
+
+ template< typename T>
+ inline const T * getRaw(
+ const Vector<
+ T, grb::hyperdags,
+ typename internal::hyperdags::Coordinates
+ > &x
+ );
+
+ template< typename T>
+ inline T * getRaw(
+ Vector< T, grb::hyperdags, typename internal::hyperdags::Coordinates > &x
+ );
+
+ }
+
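+	/**
+	 * \internal A vector that wraps an instance of the underlying backend's
+	 * vector type and registers itself as a container with the HyperDAG
+	 * generator.
+	 */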
+ template< typename T >
+ class Vector< T, hyperdags, internal::hyperdags::Coordinates > {
+
+ template< typename A >
+ friend Vector<
+ A, _GRB_WITH_HYPERDAGS_USING,
+ internal::hyperdags::Coordinates
+ > & internal::getVector(
+ Vector< A, grb::hyperdags, internal::hyperdags::Coordinates > &
+ );
+
+ template< typename A >
+ friend const Vector<
+ A, _GRB_WITH_HYPERDAGS_USING,
+ internal::hyperdags::Coordinates
+ > & internal::getVector(
+ const Vector< A, grb::hyperdags, internal::hyperdags::Coordinates > &
+ );
+
+ friend class PinnedVector< T, hyperdags >;
+
+
+ private:
+
+ /** \internal My own type */
+ typedef Vector< T, hyperdags, internal::hyperdags::Coordinates > SelfType;
+
+ /** \internal Simply use an underlying implementation */
+ typedef Vector<
+ T, grb::_GRB_WITH_HYPERDAGS_USING,
+ internal::hyperdags::Coordinates
+ > MyVectorType;
+
+ /** \internal Iterator type inherited from underlying backend */
+ template< Backend A >
+ using ConstIterator = typename MyVectorType::template ConstIterator< A >;
+
+ /** \internal Simply wrap around underlying backend */
+ MyVectorType vector;
+
+ /** \internal Registers this vector as a source container */
+ void register_vector() {
+#ifdef _DEBUG
+ std::cout << "\t registering vector with pointer " << this << "\n";
+#endif
+ if( size( vector ) > 0 ) {
+ internal::hyperdags::generator.addContainer( getID( vector ) );
+ }
+ }
+
+
+ public:
+
+ typedef typename MyVectorType::const_iterator const_iterator;
+
+ Vector( const size_t n ) : vector( n ) {
+#ifdef _DEBUG
+ std::cout << "Vector (hyperdags) constructor\n";
+#endif
+ register_vector();
+ }
+
+ Vector() : Vector( 0 ) {
+#ifdef _DEBUG
+ std::cout << "Vector (hyperdags) default constructor\n";
+#endif
+ }
+
+ Vector( const SelfType &x ) : vector( x.vector ) {
+#ifdef _DEBUG
+ std::cout << "Vector (hyperdags) copy constructor\n";
+#endif
+ register_vector();
+ }
+
+ Vector( SelfType &&x ) noexcept {
+#ifdef _DEBUG
+ std::cout << "Vector (hyperdags) move constructor\n";
+#endif
+ vector = std::move( x.vector );
+ register_vector();
+ }
+
+ Vector( const size_t n, const size_t nz ) : vector( n, nz ) {
+#ifdef _DEBUG
+ std::cout << "Vector (hyperdags) capacity constructor\n";
+#endif
+ register_vector();
+ }
+
+ ~Vector() {
+#ifdef _DEBUG
+ std::cout << "Vector (hyperdags) destructor\n";
+#endif
+ }
+
+ SelfType & operator=( const SelfType &x ) {
+#ifdef _DEBUG
+ std::cout << "Vector (hyperdags) copy assignment\n";
+#endif
+ vector = x.vector;
+ return *this;
+ }
+
+ SelfType & operator=( SelfType &&x ) noexcept {
+#ifdef _DEBUG
+ std::cout << "Vector (hyperdags) move assignment\n";
+#endif
+ vector = std::move( x.vector );
+ return *this;
+ }
+
+ template< Backend spmd_backend = reference >
+ ConstIterator< spmd_backend > cbegin(
+ const size_t s = 0, const size_t P = 1
+ ) const {
+ return vector.cbegin( s, P );
+ }
+
+ template< Backend spmd_backend = reference >
+ ConstIterator< spmd_backend > cend(
+ const size_t s = 0, const size_t P = 1
+ ) const {
+ return vector.cend( s, P );
+ }
+
+ template< Backend spmd_backend = reference >
+ ConstIterator< spmd_backend > begin(
+ const size_t s = 0, const size_t P = 1
+ ) const {
+ return vector.begin( s, P );
+ }
+
+ template< Backend spmd_backend = reference >
+ ConstIterator< spmd_backend > end(
+ const size_t s = 0, const size_t P = 1
+ ) const {
+ return vector.end( s, P );
+ }
+
+ T & operator[]( const size_t i ) {
+ return vector[ i ];
+ }
+
+ T & operator[]( const size_t i ) const {
+ return vector[ i ];
+ }
+ /**
+ * Non-standard data accessor for debug purposes.
+ *
+		 * \warning Do not use this function.
+ *
+ * The user promises to never write to this data when GraphBLAS can operate
+ * on it. The user understands that data read out may be subject to incoming
+ * changes caused by preceding GraphBLAS calls.
+ *
+ * \warning This function is only defined for the reference and hyperdags backends--
+ * thus switching backends may cause your code to not compile.
+ *
+ * @return A const reference to the raw data this vector contains.
+ *
+ * \note This function is used internally for testing purposes.
+ */
+ T * raw() const {
+ return vector.raw();
+ }
+
+ };
+
+ namespace internal {
+
+ template< typename T >
+ Vector<
+ T, _GRB_WITH_HYPERDAGS_USING,
+ internal::hyperdags::Coordinates
+ > & getVector(
+ Vector< T, grb::hyperdags, internal::hyperdags::Coordinates > &x
+ ) {
+ return x.vector;
+ }
+
+ template< typename T >
+ const Vector<
+ T, _GRB_WITH_HYPERDAGS_USING,
+ internal::hyperdags::Coordinates
+ > & getVector(
+ const Vector< T, grb::hyperdags, internal::hyperdags::Coordinates > &x
+ ) {
+ return x.vector;
+ }
+
+ template< typename T>
+ inline const T * getRaw(
+ const Vector< T, grb::hyperdags, internal::hyperdags::Coordinates > &x
+ ) {
+ return getRaw(getVector(x));
+ };
+
+ template< typename T>
+ inline T * getRaw(
+ Vector< T, grb::hyperdags, internal::hyperdags::Coordinates > &x
+ ) {
+ return getRaw(getVector(x));
+ };
+
+ }
+
+}
+
+#endif
+
diff --git a/include/graphblas/identities.hpp b/include/graphblas/identities.hpp
index dd48fcf98..fdbb7c7f7 100644
--- a/include/graphblas/identities.hpp
+++ b/include/graphblas/identities.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Provides a set of standard identities for use with ALP.
+ *
* @author A. N. Yzelman
* @date 11th of August, 2016
*/
@@ -25,6 +29,7 @@
#include
+
namespace grb {
/**
@@ -195,3 +200,4 @@ namespace grb {
} // namespace grb
#endif
+
diff --git a/include/graphblas/init.hpp b/include/graphblas/init.hpp
index 2b1af0a52..dd34749ba 100644
--- a/include/graphblas/init.hpp
+++ b/include/graphblas/init.hpp
@@ -26,11 +26,16 @@
#include "backends.hpp"
#include "base/init.hpp"
-
// include all implementations
#ifdef _GRB_WITH_REFERENCE
#include "graphblas/reference/init.hpp"
#endif
+#ifdef _GRB_WITH_HYPERDAGS
+ #include "graphblas/hyperdags/init.hpp"
+#endif
+#ifdef _GRB_WITH_NONBLOCKING
+ #include "graphblas/nonblocking/init.hpp"
+#endif
#ifdef _GRB_WITH_LPF
#include "graphblas/bsp1d/init.hpp"
#endif
diff --git a/include/graphblas/interfaces/pregel.hpp b/include/graphblas/interfaces/pregel.hpp
new file mode 100644
index 000000000..3350b1e0e
--- /dev/null
+++ b/include/graphblas/interfaces/pregel.hpp
@@ -0,0 +1,960 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * This file defines a vertex-centric programming API called ALP/Pregel, which
+ * automatically translates to standard ALP/GraphBLAS primitives.
+ *
+ * @author A. N. Yzelman
+ * @date 2022
+ *
+ * \defgroup Pregel ALP/Pregel
+ * @{
+ *
+ * @brief ALP/Pregel enables vertex-centric programming.
+ *
+ * \par API introduction
+ *
+ * With vertex-centric programming, graph algorithms are written from the
+ * perspective of a vertex within an input graph. Each vertex executes a program
+ * on a round-by-round basis, while between rounds all vertex programs pass
+ * messages to neighbour vertices using the edges of the input graph. Edges may
+ * be directed or undirected; in the former, messages travel from the source
+ * vertex to the destination vertex only. Each vertex program sends the same
+ * message to all of its neighbours -- i.e., it broadcasts a single given
+ * message. In ALP/Pregel, incoming messages are furthermore \em accumulated
+ * using a #grb::Monoid. The accumulation of incoming messages is typically used
+ * by the vertex-centric program during the next round it executes.
+ *
+ * Pregel programs thus execute on a given graph, and hence constructing a
+ * #grb::interfaces::Pregel instance requires passing input iterators
+ * corresponding to the graph on which ALP/Pregel programs are executed. Such an
+ * instance logically corresponds to an execution engine of vertex-centric
+ * programs for a specific graph . Multiple #grb::interfaces::Pregel
+ * instances, each potentially built using a different input graph, may exist
+ * simultaneously.
+ *
+ * ALP/Pregel programs then are executed using #grb::interfaces::Pregel::execute.
+ * The first template argument to this function is the binary operator of the
+ * monoid to be used for accumulating incoming messages, while the second
+ * template argument corresponds to its identity-- see #grb::operators and
+ * #grb::identities for example operators and identities. The remaining
+ * template arguments to #grb::interfaces::Pregel::execute are automatically
+ * inferred.
+ *
+ * The first non-template argument is the vertex-centric program, for example,
+ * #grb::algorithms::pregel::ConnectedComponents-- a vertex-centric program in
+ * ALP/GraphBLAS hence is a class where the program is given as a public static
+ * function named \em program. This function takes five arguments:
+ * 1. the current state of the vertex (read-write),
+ * 2. the incoming message (after accumulation, read only),
+ * 3. the outgoing message (read-write),
+ * 4. the global program parameters (read only), and
+ * 5. the Pregel interface state (read only and read-write).
+ *
+ * The types of arguments 1-4 are defined by the program, but must be plain old
+ * data (POD) types-- similar to the requirements of an ALP operator. An example
+ * of an ALP/Pregel algorithm that has non-trivial algorithm parameters is
+ * #grb::algorithms::pregel::PageRank: #grb::algorithms::pregel::PageRank::Data.
+ *
+ * The type of the 5th argument to #grb::interfaces::Pregel::execute is an
+ * instance of #grb::interfaces::PregelState. Some of the ALP/Pregel state
+ * fields are read-only, such as the current round number
+ * #grb::interfaces::PregelState::round, while others are read-write.
+ * Please see the corresponding documentation for what read-only states may be
+ * inspected during program execution. Some fields are global (such as again the
+ * current round number), while others are specific to the vertex a program is
+ * running on (such as #grb::interfaces::PregelState::indegree).
+ *
+ * Read-write ALP/Pregel state is used for determining termination conditions.
+ * There are two associated flags:
+ * 1. #grb::interfaces::PregelState::active, and
+ * 2. #grb::interfaces::PregelState::voteToHalt.
+ *
+ * Each vertex has its own state of these two flags, with the defaults being
+ * true for the former and false for the latter.
+ *
+ * If, by the end of any round, a vertex sets its active flag to
+ * false, that vertex will not participate in any future rounds. For
+ * any neighbouring vertices it shall be as though the inactive vertex keeps
+ * broadcasting the identity of the given accumulation monoid.
+ *
+ * If at the end of any round all vertices are inactive, the program terminates.
+ * Similarly, if by the end of a round \em all vertices have the
+ * voteToHalt flag set to true, then that Pregel program
+ * terminates as well.
+ *
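+ * For illustration, a minimal sketch of such a program class follows. The
+ * name \em MyProgram, its state types, and the computation itself are
+ * hypothetical; only the five-argument signature follows the description
+ * above:
+ *
+ * \code{.cpp}
+ * struct MyProgram {
+ *
+ *     // global read-only algorithm parameters (may be empty)
+ *     struct Data {};
+ *
+ *     static void program(
+ *         double &state,            // 1. vertex-local state (read-write)
+ *         const double &incoming,   // 2. accumulated incoming message
+ *         double &outgoing,         // 3. message broadcast to neighbours
+ *         const Data &parameters,   // 4. global program parameters
+ *         grb::interfaces::PregelState &pregel // 5. Pregel interface state
+ *     ) {
+ *         (void) parameters;
+ *         outgoing = state += incoming; // a hypothetical computation
+ *         if( pregel.round > 0 ) { pregel.voteToHalt = true; }
+ *     }
+ *
+ * };
+ * \endcode
+ *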
+ * \par Using vertex-centric algorithms
+ *
+ * By convention, ALP/Pregel algorithms allow for a simplified way of executing
+ * them that does not require the Pregel algorithm user to pass the right monoid
+ * to #grb::interfaces::Pregel::execute each time they call one, such as, for
+ * example,
+ * - #grb::algorithms::pregel::ConnectedComponents::execute, or
+ * - #grb::algorithms::pregel::PageRank::execute.
+ *
+ * These functions only take the Pregel instance that is to execute the Pregel
+ * program, as well as a vector of initial states as mandatory input. As usual,
+ * optional parameters indicate the maximum number of rounds allotted to the
+ * program (zero for unbounded), and where to write back the number of rounds
+ * after which the program has terminated (NULL for no write back).
+ *
+ * All pre-defined ALP/Pregel algorithms reside in the #grb::algorithms::pregel
+ * namespace.
+ *
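+ * For illustration, and assuming the above convention, a call to such a
+ * simplified \em execute function could read as follows. The algorithm name
+ * \em SomeVertexCentricAlgorithm is a placeholder, \em pregel is a
+ * #grb::interfaces::Pregel instance, and \em state is a vector of initial
+ * vertex states:
+ *
+ * \code{.cpp}
+ * size_t rounds;
+ * grb::RC rc = SomeVertexCentricAlgorithm::execute(
+ *     pregel, state, // mandatory: the Pregel instance and initial states
+ *     0, &rounds     // optional: max. rounds (zero for unbounded), and
+ *                    //           where to write back the number of rounds
+ * );
+ * \endcode
+ *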
+ * \par Configuration settings
+ *
+ * The ALP/Pregel run-time system manages state for every vertex in the
+ * underlying graph. The execution time of a single round is always proportional
+ * to the number of active vertices. Since inactive vertices stay inactive in
+ * subsequent rounds, their state could be erased. This has two \em potential
+ * benefits:
+ * 1. it \em may (depending on the used backend's performance semantics) reduce
+ * memory use; and/or
+ * 2. it \em may result in faster execution (depending on the used backend's
+ * performance semantics).
+ *
+ * We may opt to always attempt to sparsify state, use some heuristic to
+ * determine when to sparsify, or simply never attempt such sparsification.
+ *
+ * This choice is configurable via #grb::interfaces::config::out_sparsify; see
+ * #grb::interfaces::config::SparsificationStrategy for options and more
+ * details.
+ *
+ * @}
+ */
+
+#ifndef _H_GRB_INTERFACES_PREGEL
+#define _H_GRB_INTERFACES_PREGEL
+
+#include
+#include
+
+#include <stdexcept> // std::runtime_error
+
+
+namespace grb {
+
+ namespace interfaces {
+
+ /**
+ * Contains configurations for programming models that are simulated on top of
+ * ALP/GraphBLAS.
+ */
+ namespace config {
+
+ /**
+ * The set of sparsification strategies supported by the ALP/Pregel
+ * interface.
+ *
+ * \ingroup Pregel
+ */
+ enum SparsificationStrategy {
+
+ /**
+ * No sparsification of internal and user-defined vertex states, beyond that
+ * which is necessary to bound the run-time by the number of active
+ * vertices.
+ */
+ NONE = 0,
+
+ /**
+ * Always applies the sparsification procedure on both internal and user-
+ * defined vertex states.
+ *
+ * Does not consider whether the resulting operation would reduce the number
+ * of vertex entries.
+ *
+			 * This variant was tested against #NONE for #out_sparsify, and was found
+			 * to always be slower.
+ *
+			 * \internal This strategy is necessarily always applied to the
+ * #Pregel::ActiveVertices vector.
+ */
+ ALWAYS,
+
+ /**
+ * Sparsify only when the resulting vector would indeed be sparser.
+ *
+ * While this sounds like it should be a minimal condition to check for
+ * before applying sparsification, this check itself comes at non-trivial
+ * overhead for any backend. The performance of this strategy versus
+ * #ALWAYS hence is a trade-off, one that varies with underlying graphs
+ * as well as with the vertex-centric program chosen.
+ *
+ * \internal
+ * \note This strategy should \em not be applied to #Pregel::ActiveVertices
+ * since doing so requires computing the number of active vertices,
+ * which has the same complexity as actually sparsifying that vector.
+ *
+ * \todo This variant has never been exhaustively tested for
+ * \a out_sparsify.
+ * \endinternal
+ */
+ WHEN_REDUCED,
+
+ /**
+ * Sparsify only when the resulting vector would have half (or less) its
+ * current number of nonzeroes. This is a simple heuristic that balances
+ * the trade-off of \em applying sparsification by amortising its overhead.
+			 * The overhead described at #WHEN_REDUCED, corresponding to determining the
+ * gain of sparsification, however, remains the same.
+ *
+ * \internal
+ * \note This strategy should \em not be applied to #Pregel::ActiveVertices
+ * since doing so requires computing the number of active vertices,
+ * which has the same complexity as actually sparsifying that vector.
+ *
+ * \todo This variant has never been exhaustively tested for
+ * \a out_sparsify.
+ * \endinternal
+ */
+ WHEN_HALVED
+
+ };
+
+ /**
+ * What sparsification strategy should be applied to the outgoing
+ * messages.
+ *
+ * \internal
+ * Only #NONE and #ALWAYS have been tested, with #NONE being faster on all
+ * test cases.
+ * \endinternal
+ *
+ * \ingroup Pregel
+ */
+ constexpr const SparsificationStrategy out_sparsify = NONE;
+
+ } // end namespace grb::interfaces::config
+
+ /**
+		 * The state of the vertex-centric Pregel program that the user may interface
+ * with.
+ *
+ * The state includes global data as well as vertex-centric state. The global
+		 * state is unmodifiable and includes:
+ * - #grb::interfaces::PregelState::num_vertices,
+ * - #grb::interfaces::PregelState::num_edges, and
+ * - #grb::interfaces::PregelState::round.
+ *
+		 * Vertex-centric state can be either constant or modifiable:
+ * - static vertex-centric state: #grb::interfaces::PregelState::indegree,
+ * #grb::interfaces::PregelState::outdegree, and
+ * #grb::interfaces::PregelState::vertexID.
+ * - modifiable vertex-centric state:
+ * #grb::interfaces::PregelState::voteToHalt, and
+ * #grb::interfaces::PregelState::active.
+ *
+ * \ingroup Pregel
+ */
+ struct PregelState {
+
+ /**
+ * Represents whether the current vertex is active.
+ *
+			 * Since this struct is only to be used within the computational phase of a
+			 * vertex-centric program, this always reads true at the start of a
+			 * round.
+ *
+			 * The program may set this field to false, which will cause this
+ * vertex to no longer trigger computational steps during subsequent rounds.
+ *
+ * An inactive vertex will no longer broadcast messages.
+ *
+ * If all vertices are inactive the program terminates.
+ */
+ bool &active;
+
+ /**
+ * Represents whether this (active) vertex votes to terminate the program.
+ *
+			 * On start of a round, this entry is set to false. If all active
+			 * vertices set this to true, the program will terminate after the
+ * current round.
+ */
+ bool &voteToHalt;
+
+ /**
+ * The number of vertices in the global graph.
+ */
+ const size_t &num_vertices;
+
+ /**
+ * The number of edges in the global graph.
+ */
+ const size_t &num_edges;
+
+ /**
+ * The out-degree of this vertex.
+ */
+ const size_t &outdegree;
+
+ /**
+ * The in-degree of this vertex.
+ */
+ const size_t &indegree;
+
+ /**
+			 * The round the vertex-centric program is currently executing.
+ */
+ const size_t &round;
+
+ /**
+ * A unique ID of this vertex.
+ *
+ * This number is an unsigned integer between 0 (inclusive) and
+ * the number of vertices the underlying graph holds (exclusive).
+ */
+ const size_t &vertexID;
+
+ };
+
+ /**
+ * A Pregel run-time instance.
+ *
+ * Pregel wraps around graph data and executes computations on said graph. A
+		 * runtime thus is constructed from a graph, and enables running any Pregel
+ * algorithm on said graph.
+ *
+ * \ingroup Pregel
+ */
+ template<
+ typename MatrixEntryType
+ >
+ class Pregel {
+
+ private:
+
+ /** \internal The number of vertices of the underlying #graph. */
+ const size_t n;
+
+ /** \internal The number of edges of the underlying #graph. */
+ size_t nz;
+
+ /** \internal The graph to run vertex-centric programs over. */
+ grb::Matrix< MatrixEntryType > graph;
+
+ /** \internal Which vertices are still active. */
+ grb::Vector< bool > activeVertices;
+
+ /** \internal Which vertices voted to halt. */
+ grb::Vector< bool > haltVotes;
+
+ /** \internal A buffer used to sparsify #activeVertices. */
+ grb::Vector< bool > buffer;
+
+ /** \internal Pre-computed outdegrees. */
+ grb::Vector< size_t > outdegrees;
+
+				/** \internal Pre-computed indegrees. */
+ grb::Vector< size_t > indegrees;
+
+ /** \internal Global vertex IDs. */
+ grb::Vector< size_t > IDs;
+
+ /**
+ * \internal
+ * Initialises the following fields:
+ * -# outdegrees
+ * -# indegrees
+ * -# IDs
+ * Other fields are set on program start.
+ * \endinternal
+ */
+ void initialize() {
+ grb::Semiring<
+ grb::operators::add< size_t >,
+ grb::operators::right_assign_if< bool, size_t, size_t >,
+ grb::identities::zero,
+ grb::identities::logical_true
+ > ring;
+ grb::Vector< size_t > ones( n );
+ if( grb::set( ones, 1 ) != SUCCESS ) {
+ throw std::runtime_error( "Could not set vector ones" );
+ }
+ if( grb::set( outdegrees, 0 ) != SUCCESS ) {
+ throw std::runtime_error( "Could not initialise outdegrees" );
+ }
+ if( grb::mxv< grb::descriptors::dense >(
+ outdegrees, graph, ones, ring
+ ) != SUCCESS
+ ) {
+ throw std::runtime_error( "Could not compute outdegrees" );
+ }
+ if( grb::set( indegrees, 0 ) != SUCCESS ) {
+ throw std::runtime_error( "Could not initialise indegrees" );
+ }
+ if( grb::mxv<
+ grb::descriptors::dense | grb::descriptors::transpose_matrix
+ >(
+ indegrees, graph, ones, ring
+ ) != SUCCESS ) {
+ throw std::runtime_error( "Could not compute indegrees" );
+ }
+ if( grb::set< grb::descriptors::use_index >(
+ IDs, 0
+ ) != SUCCESS
+ ) {
+ throw std::runtime_error( "Could not compute vertex IDs" );
+ }
+ }
+
+
+ protected:
+
+ /**
+ * \internal
+				 * Internal constructor for the cases where the number of vertex IDs,
+ * \a _n, is already known.
+ * \endinternal
+ */
+ template< typename IType >
+ Pregel(
+ const size_t _n,
+ IType _start, const IType _end,
+ const grb::IOMode _mode
+ ) :
+ n( _n ),
+ graph( _n, _n ),
+ activeVertices( _n ),
+ haltVotes( _n ),
+ buffer( _n ),
+ outdegrees( _n ),
+ indegrees( _n ),
+ IDs( _n )
+ {
+ if( grb::ncols( graph ) != grb::nrows( graph ) ) {
+ throw std::runtime_error( "Input graph is bipartite" );
+ }
+ if( grb::buildMatrixUnique(
+ graph, _start, _end, _mode
+ ) != SUCCESS ) {
+ throw std::runtime_error( "Could not build graph" );
+ }
+ nz = grb::nnz( graph );
+ initialize();
+ }
+
+
+ public:
+
+ /**
+ * Constructs a Pregel instance from input iterators over some graph.
+ *
+ * @tparam IType The type of the input iterator.
+ *
+ * @param[in] _m The maximum vertex ID for excident edges.
+ * @param[in] _n The maximum vertex ID for incident edges.
+ *
+				 * \note This is equivalent to the row- and column-size of an input matrix
+ * which represents the input graph.
+ *
+ * \note If these values are not known, please scan the input iterators to
+				 *       derive these values prior to calling this constructor. Should there
+				 *       be compelling reasons why such functionality would be useful to
+				 *       provide as a standard factory method, please feel welcome to submit
+				 *       an issue.
+ *
+ * \warning The graph is assumed to have contiguous IDs -- i.e., every
+				 *          vertex ID in the range of 0 (inclusive) to the maximum of \a _m
+				 *          and \a _n (exclusive) has at least one excident or at least one
+ * incident edge.
+ *
+				 * @param[in] _start An iterator pointing to the first element of a
+				 *                   collection of edges.
+ * @param[in] _end An iterator matching \a _start in end position.
+ *
+				 * All edges to be ingested thus are contained within \a _start and \a _end.
+ *
+ * @param[in] _mode Whether sequential or parallel I/O is to be used.
+ *
+ * The value of \a _mode only takes effect when there are multiple user
+ * processes, such as for example when executing over a distributed-memory
+				 * cluster. The choice between sequential and parallel I/O should be as follows:
+ * - If the edges pointed to by \a _start and \a _end correspond to the
+ * \em entire set of edges on \em each process, then the I/O mode should
+ * be #grb::SEQUENTIAL;
+ * - If the edges pointed to by \a _start and \a _end correspond to
+ * \em different sets of edges on each different process while their
+ * union represents the graph to be ingested, then the I/O mode should be
+ * #grb::PARALLEL.
+ *
+ * On errors during ingestion, this constructor throws exceptions.
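+				 *
+				 * For illustration, a minimal construction sketch follows; the edge
+				 * container \em edges is hypothetical, and must provide input iterators
+				 * that #grb::buildMatrixUnique accepts:
+				 *
+				 * \code{.cpp}
+				 * const size_t n = 1000; // maximum vertex ID plus one, assumed known
+				 * grb::interfaces::Pregel< double > pregel(
+				 *     n, n, edges.begin(), edges.end(), grb::SEQUENTIAL );
+				 * \endcode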
+ */
+ template< typename IType >
+ Pregel(
+ const size_t _m, const size_t _n,
+ IType _start, const IType _end,
+ const grb::IOMode _mode
+ ) : Pregel( std::max( _m, _n ), _start, _end, _mode ) {}
+
+ /**
+ * Executes a given vertex-centric \a program on this graph.
+ *
+ * The program must be a static function that returns void and takes five
+ * input arguments:
+ * - a reference to a vertex-defined state. The type of this reference may
+ * be defined by the program, but has to match the element type of
+ * \a vertex_state passed to this function.
+ * - a const-reference to an incoming message. The type of this reference
+ * may be defined by the program, but has to match the element type of
+ * \a in passed to this function. It must furthermore be compatible with
+ * the domains of \a Op (see below).
+ * - a reference to an outgoing message. The type of this reference may be
+ * defined by the program, but has to match the element type of \a out
+ * passed to this function. It must furthermore be compatible with the
+ * domains of \a Op (see below).
+ * - a const-reference to a program-defined type. The function of this
+ * argument is to collect global read-only algorithm parameters.
+ * - a reference to an instance of #grb::interfaces::PregelState. The
+				 *   function of this argument is two-fold: 1) to make available global read-
+				 *   only statistics of the graph the algorithm is executing on, and 2) to
+ * control algorithm termination conditions.
+ *
+ * The program will be called during each round of a Pregel computation. The
+ * program is expected to compute something based on the incoming message
+ * and vertex-local state, and (optionally) generate an outgoing message.
+				 * After each round, the outgoing message at each vertex is broadcast to
+				 * all of its neighbours. The Pregel runtime, again for each vertex, reduces
+				 * all incoming messages into a single message, after which the next round
+				 * of computation starts and the procedure is repeated.
+ *
+ * The program terminates in one of two ways:
+ * 1. there are no more active vertices; or
+ * 2. all active vertices vote to halt.
+ *
+ * On program start, i.e., during the first round, all vertices are active.
+ * During the computation phase, any vertex can set itself inactive for
+ * subsequent rounds by setting #grb::interfaces::PregelState::active to
+				 * false. Similarly, any active vertex can vote to halt by setting
+				 * #grb::interfaces::PregelState::voteToHalt to true.
+ *
+				 * Reduction of incoming messages to a vertex will occur through a user-
+ * defined monoid given by:
+ *
+ * @tparam Op The binary operation of the monoid. This includes its domain.
+ * @tparam Id The identity element of the monoid.
+ *
+ * The following template arguments will be automatically inferred:
+ *
+ * @tparam Program The type of the program to-be executed.
+ * @tparam IOType The type of the state of a single vertex.
+ * @tparam GlobalProgramData The type of globally accessible read-only
+ * program data.
+ * @tparam IncomingMessageType The type of an incoming message.
+ * @tparam OutgoingMessageType The type of an outgoing message.
+ *
+ * The arguments to this function are as follows:
+ *
+ * @param[in] program The vertex-centric program to execute.
+ *
+ * The same Pregel runtime instance hence can be re-used to execute multiple
+ * algorithms on the same graph.
+ *
+ * Vertex-centric programs have both vertex-local and global state:
+ *
+ * @param[in] vertex_state A vector that contains the state of each vertex.
+ * @param[in] data Global read-only state for the given \a program.
+ *
+ * The capacity, size, and number of nonzeroes of \a vertex_state must equal
+ * the maximum vertex ID.
+ *
+ * Finally, in the ALP spirit which aims to control all relevant performance
+ * aspects, the workspace required by the Pregel runtime must be pre-
+ * allocated and passed in:
+ *
+ * @param[in] in Where incoming messages are stored. Any initial values may
+ * or may not be ignored, depending on the \a program
+ * behaviour during the first round of computation.
+ *
+ * @param[in] out Where outgoing messages are stored. Any initial values
+ * will be ignored.
+ *
+ * The capacities and sizes of \a in and \a out must equal the maximum vertex
+ * ID. For sparse vectors \a in with more than zero nonzeroes, all initial
+ * contents will be overwritten by the identity of the reduction monoid. Any
+ * initial contents for \a out will always be ignored as every round of
+ * computation starts with the outgoing message set to the monoid identity.
+ *
+ * \note Thus if the program requires some initial incoming messages to be
+ * present during the first round of computation, those may be passed
+				 *       as part of a dense vector \a in.
+ *
+ * The contents of \a in and \a out after termination of a vertex-centric
+ * function are undefined, including when this function returns
+ * #grb::SUCCESS. Output of the program should be part of the vertex-centric
+ * state recorded in \a vertex_state.
+ *
+ * Some statistics are returned after a vertex-centric program terminates:
+ *
+ * @param[out] rounds The number of rounds the Pregel program has executed.
+				 *                    The initial value of \a rounds will be ignored.
+ *
+ * The contents of this field shall be undefined when this function does not
+ * return #grb::SUCCESS.
+ *
+				 * Vertex-programs execute in rounds and could, if the given program does
+				 * not enforce proper termination conditions, run forever. To curb the
+				 * number of rounds, the following \em optional parameters may be given:
+ *
+ * @param[in] out_buffer An optional buffer area that should only be set
+ * whenever the #config::out_sparsify configuration
+ * parameter is not set to #config::NONE. If that is
+ * the case, then \a out_buffer should have size and
+ * capacity equal to the maximum vertex ID.
+ *
+ * @param[in] max_rounds The maximum number of rounds the \a program may
+ * execute. Once reached and not terminated, the
+ * program will forcibly terminate.
+ *
+ * To turn off termination after a maximum number of rounds, \a max_rounds
+ * may be set to zero. This is also the default.
+ *
+ * Executing a Pregel function returns one of the following error codes:
+ *
+ * @returns #grb::SUCCESS The \a program executed (and terminated)
+ * successfully.
+ * @returns #grb::MISMATCH At least one of \a vertex_state, \a in, or \a out
+ * is not of the required size.
+ * @returns #grb::ILLEGAL At least one of \a vertex_state, \a in, or \a out
+ * does not have the required capacity.
+ * @returns #grb::ILLEGAL If \a vertex_state is not dense.
+ * @returns #grb::PANIC In case an unrecoverable error was encountered
+ * during execution.
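+				 *
+				 * For illustration, a hypothetical invocation that accumulates incoming
+				 * messages by addition could read as follows. Here, \em MyProgram is a
+				 * vertex-centric program class as sketched in the module documentation,
+				 * while all vectors are assumed to have size and capacity equal to the
+				 * maximum vertex ID:
+				 *
+				 * \code{.cpp}
+				 * size_t rounds;
+				 * grb::RC rc = pregel.execute<
+				 *     grb::operators::add< double >, grb::identities::zero
+				 * >( &MyProgram::program, vertex_state, params, in, out, rounds );
+				 * \endcode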
+ */
+ template<
+ class Op,
+ template< typename > class Id,
+ class Program,
+ typename IOType,
+ typename GlobalProgramData,
+ typename IncomingMessageType,
+ typename OutgoingMessageType
+ >
+ grb::RC execute(
+ const Program program,
+ grb::Vector< IOType > &vertex_state,
+ const GlobalProgramData &data,
+ grb::Vector< IncomingMessageType > &in,
+ grb::Vector< OutgoingMessageType > &out,
+ size_t &rounds,
+ grb::Vector< OutgoingMessageType > &out_buffer =
+ grb::Vector< OutgoingMessageType >(0),
+ const size_t max_rounds = 0
+ ) {
+ static_assert( grb::is_operator< Op >::value &&
+ grb::is_associative< Op >::value,
+					"The combiner must be an associative operator"
+ );
+ static_assert( std::is_same< typename Op::D1, IncomingMessageType >::value,
+ "The combiner left-hand input domain should match the incoming message "
+ "type." );
+				static_assert( std::is_same< typename Op::D2, IncomingMessageType >::value,
+ "The combiner right-hand input domain should match the incoming message "
+ "type." );
+				static_assert( std::is_same< typename Op::D3, IncomingMessageType >::value,
+ "The combiner output domain should match the incoming message type." );
+
+ // set default output
+ rounds = 0;
+
+ // sanity checks
+ if( grb::size(vertex_state) != n ) {
+ return MISMATCH;
+ }
+ if( grb::size(in) != n ) {
+ return MISMATCH;
+ }
+ if( grb::size(out) != n ) {
+ return MISMATCH;
+ }
+ if( grb::capacity(vertex_state) != n ) {
+ return ILLEGAL;
+ }
+ if( grb::capacity(in) != n ) {
+ return ILLEGAL;
+ }
+ if( grb::capacity(out) != n ) {
+ return ILLEGAL;
+ }
+ if( config::out_sparsify && grb::capacity(out_buffer) != n ) {
+ return ILLEGAL;
+ }
+ if( grb::nnz(vertex_state) != n ) {
+ return ILLEGAL;
+ }
+
+ // define some monoids and semirings
+ grb::Monoid<
+ grb::operators::logical_or< bool >,
+ grb::identities::logical_false
+ > orMonoid;
+
+ grb::Monoid<
+ grb::operators::logical_and< bool >,
+ grb::identities::logical_true
+ > andMonoid;
+
+ grb::Semiring<
+ Op,
+ grb::operators::left_assign_if<
+ IncomingMessageType, bool, IncomingMessageType
+ >,
+ Id,
+ grb::identities::logical_true
+ > ring;
+
+ // set initial round ID
+ size_t step = 0;
+
+ // activate all vertices
+ grb::RC ret = grb::set( activeVertices, true );
+
+ // initialise halt votes to all-false
+ if( ret == SUCCESS ) {
+ ret = grb::set( haltVotes, false );
+ }
+
+ // set default incoming message
+ if( ret == SUCCESS && grb::nnz(in) < n ) {
+#ifdef _DEBUG
+ if( grb::nnz(in) > 0 ) {
+ std::cerr << "Overwriting initial incoming messages since it was not a "
+ << "dense vector\n";
+ }
+#endif
+ ret = grb::set( in, Id< IncomingMessageType >::value() );
+ }
+
+ // reset outgoing buffer
+ size_t out_nnz = n;
+ if( ret == SUCCESS ) {
+ ret = grb::set( out, Id< OutgoingMessageType >::value() );
+ }
+
+ // return if initialisation failed
+ if( ret != SUCCESS ) {
+ assert( ret == FAILED );
+ std::cerr << "Error: initialisation failed, but if workspace holds full "
+ << "capacity, initialisation should never fail. Please submit a bug "
+ << "report.\n";
+ return PANIC;
+ }
+
+ // while there are active vertices, execute
+ while( ret == SUCCESS ) {
+
+ assert( max_rounds == 0 || step < max_rounds );
+ // run one step of the program
+ ret = grb::eWiseLambda(
+ [
+ this,
+ &vertex_state,
+ &in,
+ &out,
+ &program,
+ step,
+ &data
+ ]( const size_t i ) {
+ // create Pregel struct
+ PregelState pregel = {
+ activeVertices[ i ],
+ haltVotes[ i ],
+ n,
+ nz,
+ outdegrees[ i ],
+ indegrees[ i ],
+ step,
+ IDs[ i ]
+ };
+ // only execute program on active vertices
+ assert( activeVertices[ i ] );
+#ifdef _DEBUG
+ std::cout << "Vertex " << i << " remains active in step " << step
+ << "\n";
+#endif
+ program(
+ vertex_state[ i ],
+ in[ i ],
+ out[ i ],
+ data,
+ pregel
+ );
+#ifdef _DEBUG
+ std::cout << "Vertex " << i << " sends out message " << out[ i ]
+ << "\n";
+#endif
+ }, activeVertices, vertex_state, in, out, outdegrees, haltVotes, indegrees, IDs
+ );
+
+ // increment counter
+ (void) ++step;
+
+ // check if everyone voted to halt
+ if( ret == SUCCESS ) {
+ bool halt = true;
+ ret = grb::foldl< grb::descriptors::structural >(
+ halt, haltVotes, activeVertices, andMonoid
+ );
+ assert( ret == SUCCESS );
+ if( ret == SUCCESS && halt ) {
+#ifdef _DEBUG
+ std::cout << "\t All active vertices voted to halt; "
+ << "terminating Pregel program.\n";
+#endif
+ break;
+ }
+ }
+
+ // update active vertices
+ if( ret == SUCCESS ) {
+#ifdef _DEBUG
+ std::cout << "\t Number of active vertices was "
+ << grb::nnz( activeVertices ) << ", and ";
+#endif
+ ret = grb::clear( buffer );
+ ret = ret ? ret : grb::set( buffer, activeVertices, true );
+ std::swap( buffer, activeVertices );
+#ifdef _DEBUG
+ std::cout << " has now become " << grb::nnz( activeVertices ) << "\n";
+#endif
+ }
+
+ // check if there is a next round
+ const size_t curActive = grb::nnz( activeVertices );
+ if( ret == SUCCESS && curActive == 0 ) {
+#ifdef _DEBUG
+ std::cout << "\t All vertices are inactive; "
+ << "terminating Pregel program.\n";
+#endif
+ break;
+ }
+
+ // check if we exceed the maximum number of rounds
+ if( max_rounds > 0 && step > max_rounds ) {
+#ifdef _DEBUG
+ std::cout << "\t Maximum number of Pregel rounds met "
+ << "without the program returning a valid termination condition. "
+ << "Exiting prematurely with a FAILED error code.\n";
+#endif
+ ret = FAILED;
+ break;
+ }
+
+#ifdef _DEBUG
+ std::cout << "\t Starting message exchange\n";
+#endif
+
+ // reset halt votes
+ if( ret == SUCCESS ) {
+ ret = grb::clear( haltVotes );
+ ret = ret ? ret : grb::set< grb::descriptors::structural >(
+ haltVotes, activeVertices, false
+ );
+ }
+
+ // reset incoming buffer
+ if( ret == SUCCESS ) {
+ ret = grb::clear( in );
+ ret = ret ? ret : grb::set< grb::descriptors::structural >(
+ in, activeVertices, Id< IncomingMessageType >::value()
+ );
+ }
+
+ // execute communication
+ if( ret == SUCCESS ) {
+ ret = grb::vxm< grb::descriptors::structural >(
+ in, activeVertices, out, graph, ring
+ );
+ }
+
+ // sparsify and reset outgoing buffer
+ if( config::out_sparsify && ret == SUCCESS ) {
+ if( config::out_sparsify == config::ALWAYS ||
+ (config::out_sparsify == config::WHEN_REDUCED && out_nnz > curActive) ||
+ (config::out_sparsify == config::WHEN_HALVED && curActive <= out_nnz/2)
+ ) {
+ ret = grb::clear( out_buffer );
+ ret = ret ? ret : grb::set< grb::descriptors::structural >(
+ out_buffer, activeVertices, Id< OutgoingMessageType >::value()
+ );
+ std::swap( out, out_buffer );
+ out_nnz = curActive;
+ }
+ }
+
+#ifdef _DEBUG
+ std::cout << "\t Resetting outgoing message fields and "
+ << "starting next compute round\n";
+#endif
+
+ }
+
+#ifdef _DEBUG
+ if( grb::spmd<>::pid() == 0 ) {
+ std::cout << "Info: Pregel exits after " << step
+ << " rounds with error code " << ret
+ << " ( " << grb::toString(ret) << " )\n";
+ }
+#endif
+
+ // done
+ rounds = step;
+ return ret;
+ }
+
+ /**
+ * Queries the maximum vertex ID for programs running on this Pregel
+ * instance.
+ *
+ * @returns The maximum vertex ID.
+ */
+ size_t num_vertices() const noexcept { return n; }
+
+ /**
+ * Queries the number of edges of the graph this Pregel instance has been
+ * constructed over.
+ *
+ * @returns The number of edges within the underlying graph.
+ */
+ size_t num_edges() const noexcept { return nz; }
+
+ /**
+ * Returns the ALP/GraphBLAS matrix representation of the underlying
+ * graph.
+ *
+ * This is useful when an application prefers to sometimes use vertex-
+ * centric algorithms and other times prefers direct ALP/GraphBLAS
+ * algorithms.
+ *
+ * @returns The underlying ALP/GraphBLAS matrix corresponding to the
+ * underlying graph.
+ */
+ const grb::Matrix< MatrixEntryType > & get_matrix() const noexcept {
+ return graph;
+ }
+
+ };
+
+ } // end namespace ``grb::interfaces''
+
+} // end namespace ``grb''
+
+#endif // end ``_H_GRB_INTERFACES_PREGEL''
+
diff --git a/include/graphblas/io.hpp b/include/graphblas/io.hpp
index 9d09b95d2..8fbb70a13 100644
--- a/include/graphblas/io.hpp
+++ b/include/graphblas/io.hpp
@@ -29,6 +29,12 @@
#ifdef _GRB_WITH_REFERENCE
#include
#endif
+#ifdef _GRB_WITH_HYPERDAGS
+ #include
+#endif
+#ifdef _GRB_WITH_NONBLOCKING
+ #include "graphblas/nonblocking/io.hpp"
+#endif
#ifdef _GRB_WITH_LPF
#include
#endif
diff --git a/include/graphblas/iomode.hpp b/include/graphblas/iomode.hpp
index d93a31c63..7bf16559e 100644
--- a/include/graphblas/iomode.hpp
+++ b/include/graphblas/iomode.hpp
@@ -15,7 +15,12 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Defines the various I/O modes a user could employ with ALP data ingestion
+ * or extraction.
+ *
* @author A. N. Yzelman
* @date 21st of February, 2017
*/
@@ -23,6 +28,7 @@
#ifndef _H_GRB_IOMODE
#define _H_GRB_IOMODE
+
namespace grb {
/**
@@ -86,8 +92,7 @@ namespace grb {
PARALLEL
};
- /** @} */
-
} // namespace grb
#endif // end ``_H_GRB_IOMODE''
+
diff --git a/include/graphblas/matrix.hpp b/include/graphblas/matrix.hpp
index 08d715df5..557a40c66 100644
--- a/include/graphblas/matrix.hpp
+++ b/include/graphblas/matrix.hpp
@@ -15,9 +15,9 @@
* limitations under the License.
*/
-/*
+/**
* @author A. N. Yzelman
- * @date 10 of August
+ * @date 10 of August, 2016
*/
#ifndef _H_GRB_MATRIX
@@ -30,6 +30,12 @@
#ifdef _GRB_WITH_REFERENCE
#include
#endif
+#ifdef _GRB_WITH_HYPERDAGS
+ #include
+#endif
+#ifdef _GRB_WITH_NONBLOCKING
+ #include "graphblas/nonblocking/matrix.hpp"
+#endif
#ifdef _GRB_WITH_LPF
#include
#endif
@@ -51,3 +57,4 @@ namespace grb {
#endif
#endif // end ``_H_GRB_MATRIX''
+
diff --git a/include/graphblas/monoid.hpp b/include/graphblas/monoid.hpp
index 56f21b1a8..bd3b65195 100644
--- a/include/graphblas/monoid.hpp
+++ b/include/graphblas/monoid.hpp
@@ -15,7 +15,11 @@
* limitations under the License.
*/
-/*
+/**
+ * @file
+ *
+ * Provides an ALP monoid.
+ *
* @author A. N. Yzelman
* @date 15 March, 2016
*/
@@ -37,12 +41,7 @@
#include
#include
-/**
- * The main Sparse Library namespace.
- *
- * All classes, enums, constants, and functions are declared in this namespace.
- * This source file only contains testing code outside this namespace.
- */
+
namespace grb {
/**
@@ -65,6 +64,7 @@ namespace grb {
"one of its input domains" );
public:
+
/** The left-hand side input domain. */
typedef typename _OP::D1 D1;
@@ -81,7 +81,9 @@ namespace grb {
template< typename IdentityType >
using Identity = _ID< IdentityType >;
+
private:
+
/**
* The underlying binary operator.
*
@@ -136,3 +138,4 @@ namespace grb {
} // namespace grb
#endif
+
diff --git a/include/graphblas/nonblocking/alloc.hpp b/include/graphblas/nonblocking/alloc.hpp
new file mode 100644
index 000000000..2938d6755
--- /dev/null
+++ b/include/graphblas/nonblocking/alloc.hpp
@@ -0,0 +1,65 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Allocator functions for the nonblocking backend
+ *
+ * @author Aristeidis Mastoras
+ * @date 16th of May, 2022
+ */
+
+#ifndef _H_GRB_ALLOC_NONBLOCKING
+#define _H_GRB_ALLOC_NONBLOCKING
+
+#include
+
+#include
+
+#include "config.hpp"
+
+
+namespace grb {
+
+ namespace utils {
+
+ namespace internal {
+
+ template<>
+ class Allocator< nonblocking > {
+
+ private:
+
+ /** Prevent initialisation. */
+ Allocator();
+
+ public:
+
+ /** Refer to the standard allocation mechanism. */
+ typedef AllocatorFunctions< reference > functions;
+
+ };
+
+ } // namespace internal
+
+ } // namespace utils
+
+} // namespace grb
+
+#endif
+
diff --git a/include/graphblas/nonblocking/analytic_model.hpp b/include/graphblas/nonblocking/analytic_model.hpp
new file mode 100644
index 000000000..536b3e95b
--- /dev/null
+++ b/include/graphblas/nonblocking/analytic_model.hpp
@@ -0,0 +1,122 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Configurations for the nonblocking backend
+ *
+ * @author Aristeidis Mastoras
+ * @date 16th of May, 2022
+ */
+
+#ifndef _H_GRB_UTILS_ANALYTIC_MODEL
+#define _H_GRB_UTILS_ANALYTIC_MODEL
+
+#include "config.hpp"
+
+
+namespace grb {
+
+ namespace internal {
+
+ /**
+		 * The analytic model used for the automatic selection of the tile size and
+		 * of the number of threads.
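+		 *
+		 * For illustration, a hypothetical use could read:
+		 *
+		 * \code{.cpp}
+		 * // select the number of threads and a tile size for a pipeline that
+		 * // accesses three vectors of one million doubles each
+		 * AnalyticModel am( sizeof( double ), 1000000, 3 );
+		 * const size_t nthreads  = am.getNumThreads();
+		 * const size_t tile_size = am.getTileSize();
+		 * \endcode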
+ */
+ class AnalyticModel {
+
+ private:
+
+ /**
+ * The size of the data type of the containers (may vary between different
+ * containers). The current design uses the maximum size of all used data
+ * types.
+ */
+ size_t size_of_data_type;
+
+ /**
+ * The size of the containers accessed in the pipeline.
+ */
+ size_t size_of_vector;
+
+ /**
+ * The number of vectors accessed in the pipeline.
+ */
+ size_t num_accessed_vectors;
+
+ /**
+ * The number of threads selected by the analytic model.
+ */
+ size_t num_threads;
+
+ /**
+ * The tile size selected by the analytic model.
+ */
+ size_t tile_size;
+
+ /**
+ * The number of total tiles that result from the selected tile size.
+ */
+ size_t num_tiles;
+
+
+ public:
+
+ /**
+ * The default constructor.
+ */
+ AnalyticModel() noexcept;
+
+ /**
+ * The parameterized constructor.
+ */
+ AnalyticModel(
+ const size_t data_type_size,
+ const size_t vector_size,
+ const size_t accessed_vectors
+ ) noexcept;
+
+ /**
+ * A getter function that returns the size of the containers.
+ */
+ size_t getVectorsSize() const noexcept;
+
+ /**
+ * A getter function that returns the number of threads selected by
+ * the analytic model.
+ */
+ size_t getNumThreads() const noexcept;
+
+ /**
+ * A getter function that returns the tile size selected by the
+ * analytic model.
+ */
+ size_t getTileSize() const noexcept;
+
+ /**
+ * A getter function that returns the number of tiles.
+ */
+ size_t getNumTiles() const noexcept;
+
+ };
+
+ }
+}
+
+#endif
+
diff --git a/include/graphblas/nonblocking/benchmark.hpp b/include/graphblas/nonblocking/benchmark.hpp
new file mode 100644
index 000000000..8b62cb016
--- /dev/null
+++ b/include/graphblas/nonblocking/benchmark.hpp
@@ -0,0 +1,95 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Nonblocking implementation of the benchmarker.
+ *
+ * @author Aristeidis Mastoras
+ * @date 16th of May, 2022
+ */
+
+#ifndef _H_GRB_NONBLOCKING_BENCH
+#define _H_GRB_NONBLOCKING_BENCH
+
+#include
+#include
+
+#include "exec.hpp"
+
+
+namespace grb {
+
+ /**
+	 * The Benchmarker class is based on that of the reference backend.
+ *
+ * \internal The public API simply wraps the reference Benchmarker.
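+	 *
+	 * For illustration, a hypothetical use could read as follows, where
+	 * \em grb_program, \em data_in, and \em data_out are user-supplied and the
+	 * #grb::AUTOMATIC execution mode is assumed:
+	 *
+	 * \code{.cpp}
+	 * grb::Benchmarker< grb::AUTOMATIC, grb::nonblocking > bench;
+	 * // ten inner and five outer benchmark repetitions
+	 * grb::RC rc = bench.exec( &grb_program, data_in, data_out, 10, 5 );
+	 * \endcode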
+ */
+ template< enum EXEC_MODE mode >
+ class Benchmarker< mode, nonblocking > {
+
+ private:
+
+ /** \internal Reuse reference benchmarker. */
+ Benchmarker< mode, reference > ref;
+
+
+ public:
+
+ /** \internal Mirror reference constructor. */
+ Benchmarker(
+ size_t process_id = 0,
+ size_t nprocs = 1,
+ std::string hostname = "localhost",
+ std::string port = "0"
+ ) :
+ ref(process_id, nprocs, hostname, port)
+ {}
+
+ /** \internal Mirror reference exec. */
+ template< typename U >
+ RC exec(
+ void ( *grb_program )( const void *, const size_t, U & ),
+ const void * data_in, const size_t in_size,
+ U &data_out,
+ const size_t inner, const size_t outer,
+ const bool broadcast = false
+ ) const {
+ return ref.exec(
+ grb_program, data_in, in_size, data_out, inner, outer, broadcast
+ );
+ }
+
+ /** \internal Mirror reference exec. */
+ template< typename T, typename U >
+ RC exec(
+ void ( *grb_program )( const T &, U & ),
+ const T &data_in, U &data_out,
+ const size_t inner,
+ const size_t outer,
+ const bool broadcast = false
+ ) {
+ return ref.exec( grb_program, data_in, data_out, inner, outer, broadcast );
+ }
+
+ };
+
+} // namespace grb
+
+#endif // end ``_H_GRB_NONBLOCKING_BENCH''
+
diff --git a/include/graphblas/nonblocking/blas1.hpp b/include/graphblas/nonblocking/blas1.hpp
new file mode 100644
index 000000000..f9f14cafc
--- /dev/null
+++ b/include/graphblas/nonblocking/blas1.hpp
@@ -0,0 +1,11489 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Level-1 primitive implementation for nonblocking.
+ *
+ * \internal
+ * \todo Relies significantly on a past reference level-1 implementation. Can we
+ * reuse?
+ * \endinternal
+ *
+ * @author Aristeidis Mastoras
+ * @date 16th of May, 2022
+ */
+
+#ifndef _H_GRB_NONBLOCKING_BLAS1
+#define _H_GRB_NONBLOCKING_BLAS1
+
+#include <iostream> // for printing to stderr
+#include <type_traits> // for std::enable_if
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "coordinates.hpp"
+#include "vector.hpp"
+#include "lazy_evaluation.hpp"
+#include "vector_wrapper.hpp"
+#include "boolean_dispatcher_blas1.hpp"
+
+#define NO_CAST_ASSERT( x, y, z ) \
+ static_assert( x, \
+ "\n\n" \
+ "********************************************************************" \
+ "********************************************************************" \
+ "******************************\n" \
+ "* ERROR | " y " " z ".\n" \
+ "********************************************************************" \
+ "********************************************************************" \
+ "******************************\n" \
+ "* Possible fix 1 | Remove no_casting from the template parameters " \
+ "in this call to " y ".\n" \
+ "* Possible fix 2 | Provide a value that matches the expected type.\n" \
+ "********************************************************************" \
+ "********************************************************************" \
+ "******************************\n" );
+
+#define NO_CAST_OP_ASSERT( x, y, z ) \
+ static_assert( x, \
+ "\n\n" \
+ "********************************************************************" \
+ "********************************************************************" \
+ "******************************\n" \
+ "* ERROR | " y " " z ".\n" \
+ "********************************************************************" \
+ "********************************************************************" \
+ "******************************\n" \
+ "* Possible fix 1 | Remove no_casting from the template parameters " \
+ "in this call to " y ".\n" \
+ "* Possible fix 2 | For all mismatches in the domains of input " \
+ "parameters and the operator domains, as specified in the " \
+ "documentation of the function " y ", supply an input argument of " \
+ "the expected type instead.\n" \
+ "* Possible fix 3 | Provide a compatible operator where all domains " \
+ "match those of the input parameters, as specified in the " \
+ "documentation of the function " y ".\n" \
+ "********************************************************************" \
+ "********************************************************************" \
+ "******************************\n" );
+
+
+namespace grb {
+
+ namespace internal {
+
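+		/** \internal The global lazy-evaluation state of the nonblocking backend. */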
+ extern LazyEvaluation le;
+
+ }
+
+}
+
+namespace grb {
+
+ /**
+ * \defgroup BLAS1_NB The Level-1 ALP/GraphBLAS routines -- nonblocking backend
+ *
+ * @{
+ */
+
+ namespace internal {
+
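+		/**
+		 * \internal Computes a thread-local fold of a dense vector into a scalar
+		 * over the local range [lower_bound, upper_bound), using the monoid's
+		 * array-based folds.
+		 */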
+ template<
+ bool left,
+ class Monoid,
+ typename InputType,
+ class Coords
+ >
+ RC fold_from_vector_to_scalar_dense(
+ typename Monoid::D3 &thread_local_output,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ const Vector< InputType, nonblocking, Coords > &to_fold,
+ const Monoid &monoid
+ ) {
+ const InputType *__restrict__ const raw = internal::getRaw( to_fold );
+
+ const size_t start = lower_bound;
+ const size_t end = upper_bound;
+
+ if( start < end ) {
+ if( left ) {
+ monoid.getOperator().foldlArray(
+ thread_local_output, raw + start, end - start );
+ } else {
+ monoid.getOperator().foldrArray(
+ raw + start, thread_local_output, end - start );
+ }
+ }
+ return SUCCESS;
+ }
+
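+		/**
+		 * \internal Computes a thread-local fold of a sparse vector into a scalar
+		 * by iterating over the nonzeroes of the vector to be folded, optionally
+		 * subject to a mask.
+		 */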
+ template<
+ Descriptor descr,
+ bool masked,
+ bool left,
+#ifdef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_input_to_fold,
+ bool already_dense_mask,
+#endif
+ class Monoid,
+ typename InputType,
+ typename MaskType,
+ class Coords
+ >
+ RC fold_from_vector_to_scalar_vectorDriven(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_input_to_fold,
+ bool already_dense_mask,
+#endif
+ typename Monoid::D3 &thread_local_output,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ const Coords &local_to_fold,
+ const Coords &local_mask,
+ const Vector< InputType, nonblocking, Coords > &to_fold,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Monoid &monoid
+ ) {
+ const size_t n = internal::getCoordinates( to_fold ).size();
+ const size_t local_n = upper_bound - lower_bound;
+ const size_t local_to_fold_nz = ( already_dense_input_to_fold )
+ ? local_n
+ : local_to_fold.nonzeroes();
+
+ assert( n > 0 );
+ assert( !masked || internal::getCoordinates( mask ).size() == n );
+
+#ifdef NDEBUG
+ (void) n;
+ (void) local_n;
+#endif
+
+ RC ret = SUCCESS;
+
+ const size_t start = 0;
+ const size_t end = local_to_fold_nz;
+
+ // compute thread-local partial reduction
+ for( size_t k = start; k < end; ++k ) {
+ const size_t i = ( (already_dense_input_to_fold)
+ ? k
+ : local_to_fold.index( k ) ) + lower_bound;
+ if( masked ) {
+ if( already_dense_mask ) {
+ if( !utils::interpretMask< descr >(
+ internal::getCoordinates( mask ).assigned( i ),
+ internal::getRaw( mask ), i )
+ ) {
+ continue;
+ }
+ } else {
+ if( !utils::interpretMask< descr >(
+ local_mask.assigned( i - lower_bound ), internal::getRaw( mask ), i )
+ ) {
+ continue;
+ }
+ }
+ }
+ RC local_rc;
+ if( left ) {
+ local_rc = foldl< descr >( thread_local_output,
+ internal::getRaw( to_fold )[ i ], monoid.getOperator() );
+ } else {
+ local_rc = foldr< descr >( internal::getRaw( to_fold )[ i ],
+ thread_local_output, monoid.getOperator() );
+ }
+ assert( local_rc == SUCCESS );
+ if( local_rc != SUCCESS ) {
+ ret = local_rc;
+ }
+ }
+
+ return ret;
+ }
+
+ template<
+ Descriptor descr,
+ bool left,
+#ifdef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_input_to_fold,
+ bool already_dense_mask,
+#endif
+ class Monoid,
+ typename InputType,
+ typename MaskType,
+ class Coords
+ >
+ RC fold_from_vector_to_scalar_maskDriven(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_input_to_fold,
+ bool already_dense_mask,
+#endif
+ typename Monoid::D3 &thread_local_output,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ const Coords &local_to_fold,
+ const Coords &local_mask,
+ const Vector< InputType, nonblocking, Coords > &to_fold,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Monoid &monoid
+ ) {
+ const size_t n = internal::getCoordinates( to_fold ).size();
+
+ assert( internal::getCoordinates( mask ).size() == n );
+ assert( n > 0 );
+#ifdef NDEBUG
+ (void) n;
+#endif
+ const size_t local_n = upper_bound - lower_bound;
+ const size_t local_mask_nz = ( already_dense_mask )
+ ? local_n
+ : local_mask.nonzeroes();
+
+ RC ret = SUCCESS;
+
+ const size_t start = 0;
+ const size_t end = local_mask_nz;
+
+ // compute thread-local partial reduction
+ for( size_t k = start; k < end; ++k ) {
+ const size_t i = ( (already_dense_mask)
+ ? k
+ : local_mask.index( k )
+ ) + lower_bound;
+ if( !( already_dense_input_to_fold ||
+ local_to_fold.assigned( i - lower_bound ) )
+ ) {
+ continue;
+ }
+ if( !utils::interpretMask< descr >( true, internal::getRaw( mask ), i ) ) {
+ continue;
+ }
+ RC local_rc;
+ if( left ) {
+ local_rc = foldl< descr >( thread_local_output,
+ internal::getRaw( to_fold )[ i ], monoid.getOperator() );
+ } else {
+ local_rc = foldr< descr >( internal::getRaw( to_fold )[ i ],
+ thread_local_output, monoid.getOperator() );
+ }
+ assert( local_rc == SUCCESS );
+ if( local_rc != SUCCESS ) {
+ ret = local_rc;
+ }
+ }
+
+ return ret;
+ }
+
+ template<
+ Descriptor descr,
+ bool masked,
+ bool left,
+#ifdef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_input_to_fold,
+ bool already_dense_mask,
+#endif
+ class Monoid,
+ typename InputType,
+ typename MaskType,
+ class Coords
+ >
+ RC fold_from_vector_to_scalar_fullLoopSparse(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_input_to_fold,
+ bool already_dense_mask,
+#endif
+ typename Monoid::D3 &thread_local_output,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ const Coords &local_to_fold,
+ const Coords &local_mask,
+ const Vector< InputType, nonblocking, Coords > &to_fold,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Monoid &monoid
+ ) {
+#ifdef _DEBUG
+ std::cout << "Entered fold_from_vector_to_scalar_fullLoopSparse\n";
+#endif
+
+#ifndef NDEBUG
+ const size_t n = internal::getCoordinates( to_fold ).size();
+ const size_t local_n = already_dense_input_to_fold
+ ? upper_bound - lower_bound
+ : local_to_fold.size();
+ assert( local_n > 0 );
+
+ (void) n;
+#endif
+ RC ret = SUCCESS;
+
+ size_t i = lower_bound;
+ const size_t end = upper_bound;
+
+ // some sanity checks
+ assert( i <= end );
+ assert( end <= n );
+
+ // assume current i needs to be processed, forward until we find an index
+ // for which the mask evaluates true
+ bool process_current_i = true;
+ if( masked && i < end ) {
+ process_current_i = utils::interpretMask< descr >(
+ already_dense_mask
+ ? internal::getCoordinates( mask ).assigned( i )
+ : local_mask.assigned( i - lower_bound ),
+ internal::getRaw( mask ), i ) && (
+ already_dense_input_to_fold || local_to_fold.assigned( i - lower_bound )
+ );
+ // if not
+ while( !process_current_i ) {
+ // forward to next element
+ (void) ++i;
+ // check that we are within bounds
+ if( i == end ) {
+ break;
+ }
+ // evaluate whether we should process this i-th element
+ process_current_i = utils::interpretMask< descr >(
+ already_dense_mask
+ ? internal::getCoordinates( mask ).assigned( i )
+ : local_mask.assigned( i - lower_bound ),
+ internal::getRaw( mask ), i ) && (
+ already_dense_input_to_fold || local_to_fold.assigned( i - lower_bound )
+ );
+ }
+ }
+
+ if( !masked && i < end ) {
+ process_current_i = already_dense_input_to_fold ||
+ local_to_fold.assigned( i - lower_bound );
+ while( !process_current_i ) {
+ (void) ++i;
+ if( i == end ) {
+ break;
+ }
+ process_current_i = already_dense_input_to_fold ||
+ local_to_fold.assigned( i - lower_bound );
+ }
+ }
+
+#ifndef NDEBUG
+ if( i < end ) {
+ assert( i < n );
+ }
+#endif
+
+ // declare a local accumulator and initialise it to the first value to be
+ // processed in our block, if any
+ typename Monoid::D3 local =
+ monoid.template getIdentity< typename Monoid::D3 >();
+ if( end > 0 ) {
+ if( i < end ) {
+#ifdef _DEBUG
+ std::cout << "\t processing start index " << i << "\n";
+#endif
+ local = static_cast< typename Monoid::D3 >(
+ internal::getRaw( to_fold )[ i ] );
+ }
+ }
+
+ // if we have more values to fold
+ if( i + 1 < end ) {
+
+ // keep going until we run out of values to fold
+ while( true ) {
+
+ // forward to next variable
+ (void) ++i;
+
+ // forward more (possibly) if in the masked case
+ if( masked && i < end ) {
+ assert( i < n );
+ process_current_i = utils::interpretMask< descr >(
+ already_dense_mask
+ ? internal::getCoordinates( mask ).assigned( i )
+ : local_mask.assigned( i - lower_bound ),
+ internal::getRaw( mask ), i
+ ) && (
+ already_dense_input_to_fold ||
+ local_to_fold.assigned( i - lower_bound )
+ );
+ while( !process_current_i ) {
+ (void) ++i;
+ if( i == end ) {
+ break;
+ }
+ assert( i < end );
+ assert( i < n );
+ process_current_i = utils::interpretMask< descr >(
+ already_dense_mask
+ ? internal::getCoordinates( mask ).assigned( i )
+ : local_mask.assigned( i - lower_bound ),
+ internal::getRaw( mask ), i
+ ) && (
+ already_dense_input_to_fold ||
+ local_to_fold.assigned( i - lower_bound )
+ );
+ }
+ }
+ if( !masked && i < end ) {
+ assert( i < n );
+ process_current_i = already_dense_input_to_fold ||
+ local_to_fold.assigned( i - lower_bound );
+ while( !process_current_i ) {
+ (void) ++i;
+ if( i == end ) {
+ break;
+ }
+ assert( i < end );
+ assert( i < n );
+ process_current_i = already_dense_input_to_fold ||
+ local_to_fold.assigned( i - lower_bound );
+ }
+ }
+
+ // stop if past end
+ if( i >= end ) {
+ break;
+ }
+
+#ifdef _DEBUG
+ std::cout << "\t processing index " << i << "\n";
+#endif
+
+ // do fold
+ assert( i < n );
+ if( left ) {
+ ret = ret ? ret : foldl< descr >( local, internal::getRaw( to_fold )[ i ],
+ monoid.getOperator() );
+ } else {
+ ret = ret ? ret : foldr< descr >( internal::getRaw( to_fold )[ i ], local,
+ monoid.getOperator() );
+ }
+ assert( ret == SUCCESS );
+
+ if( ret != SUCCESS ) {
+ break;
+ }
+ }
+ }
+
+ if( left ) {
+ ret = ret ? ret : foldl< descr >( thread_local_output, local,
+ monoid.getOperator() );
+ } else {
+ ret = ret ? ret : foldr< descr >( local, thread_local_output,
+ monoid.getOperator() );
+ }
+ assert( ret == SUCCESS );
+
+ return ret;
+ }
+
+ /**
+ * Dispatches to any of the four above variants depending on asymptotic cost
+ * analysis.
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ bool masked,
+ bool left, // if this is false, assumes right-looking fold
+ class Monoid,
+ typename IOType,
+ typename InputType,
+ typename MaskType,
+ typename Coords
+ >
+ RC fold_from_vector_to_scalar_generic(
+ IOType &fold_into,
+ const Vector< InputType, nonblocking, Coords > &to_fold,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Monoid &monoid
+ ) {
+ // static sanity checks
+ static_assert( grb::is_monoid< Monoid >::value,
+ "grb::foldl can only be called using monoids. This "
+ "function should not have been called-- please submit a "
+ "bugreport." );
+
+ const size_t n = internal::getCoordinates( to_fold ).size();
+
+ // mask must be of equal size as input vector
+ if( masked && n != size( mask ) ) {
+ return MISMATCH;
+ }
+
+ // handle trivial cases
+ if( n == 0 ) {
+ return SUCCESS;
+ }
+
+ // some globals used during the folding
+ RC ret = SUCCESS;
+ typename Monoid::D3 global =
+ monoid.template getIdentity< typename Monoid::D3 >();
+
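+ // one thread-local reduction slot per hardware thread; slots are spaced a
+ // full cache line apart so that concurrent updates do not falsely share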
+ size_t local_reduced_size = sysconf( _SC_NPROCESSORS_ONLN ) *
+ config::CACHE_LINE_SIZE::value();
+ IOType local_reduced[ local_reduced_size ];
+
+ for(
+ size_t i = 0;
+ i < local_reduced_size;
+ i += config::CACHE_LINE_SIZE::value()
+ ) {
+ local_reduced[ i ] = monoid.template getIdentity< typename Monoid::D3 >();
+ }
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func =
+ [&to_fold, &mask, &monoid, &local_reduced] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound,
+ const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage fold_from_vector_to_scalar_generic "
+ "in the range(" << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC ret = SUCCESS;
+
+ Coords local_to_fold, local_mask;
+ size_t local_n = upper_bound - lower_bound;
+ size_t local_to_fold_nz = local_n;
+ size_t local_mask_nz = local_n;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ bool already_dense_input_to_fold = true;
+ bool already_dense_mask = true;
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_input_to_fold = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( to_fold ) );
+ if( !already_dense_input_to_fold ) {
+#else
+ already_dense_input_to_fold = false;
+#endif
+ local_to_fold = internal::getCoordinates( to_fold ).asyncSubset(
+ lower_bound, upper_bound );
+ local_to_fold_nz = local_to_fold.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ if( masked ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_mask = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( mask ) );
+ if( !already_dense_mask ) {
+#else
+ already_dense_mask = false;
+#endif
+ local_mask = internal::getCoordinates( mask ).asyncSubset( lower_bound,
+ upper_bound );
+ local_mask_nz = local_mask.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+ }
+
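+ // note: thread_id is pre-multiplied by the cache line size, i.e., it is
+ // this thread's offset into the padded local_reduced buffer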
+ unsigned int thread_id = omp_get_thread_num() *
+ config::CACHE_LINE_SIZE::value();
+
+ // dispatch, dense variant
+ if( ( (descr & descriptors::dense) || local_to_fold_nz == local_n ) && (
+ !masked || (
+ (descr & descriptors::structural) &&
+ !(descr & descriptors::invert_mask) &&
+ local_mask_nz == local_n
+ )
+ )
+ ) {
+#ifdef _DEBUG
+ std::cout << "\t dispatching to dense variant\n";
+#endif
+ ret = fold_from_vector_to_scalar_dense< left >(
+ local_reduced[ thread_id ], lower_bound, upper_bound, to_fold, monoid );
+ } else if( masked && (descr & descriptors::invert_mask ) ) {
+ // in this case we are forced to dispatch to O(n)
+#ifdef _DEBUG
+ std::cout << "\t forced dispatch to O(n) sparse variant\n";
+#endif
+
+#ifdef GRB_BOOLEAN_DISPATCHER
+ ret = boolean_dispatcher_fold_from_vector_to_scalar_fullLoopSparse<
+#else
+ ret = fold_from_vector_to_scalar_fullLoopSparse<
+#endif
+ descr, true, left
+ >(
+ already_dense_input_to_fold, already_dense_mask,
+ local_reduced[ thread_id ], lower_bound, upper_bound,
+ local_to_fold, local_mask, to_fold, mask, monoid
+ );
+ } else {
+ constexpr const size_t threeWs =
+ sizeof( typename Coords::StackType ) +
+ sizeof( typename Coords::ArrayType ) +
+ MaskWordSize< descr, MaskType >::value;
+ const size_t fullLoop = masked
+ ? 2 * sizeof( typename Coords::ArrayType ) * local_n +
+ sizeof( MaskType ) * local_mask_nz
+ : sizeof( typename Coords::ArrayType ) * local_n;
+ const size_t vectorLoop = masked
+ ? threeWs * local_to_fold_nz
+ : sizeof( typename Coords::StackType ) * local_to_fold_nz;
+ const size_t maskLoop = masked
+ ? threeWs * local_mask_nz
+ : std::numeric_limits< size_t >::max();
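+ // the three estimates above approximate the number of bytes each loop
+ // strategy must touch; the cheapest strategy is selected below. For
+ // example, unmasked with local_n = 1000 and local_to_fold_nz = 10, the
+ // full loop touches on the order of 1000 ArrayType words whereas the
+ // vector-driven loop touches only about 10 StackType words, hence the
+ // vector-driven variant is chosen.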
+ if( fullLoop >= vectorLoop && maskLoop >= vectorLoop ) {
+#ifdef _DEBUG
+ std::cout << "\t dispatching to vector-driven sparse variant\n";
+#endif
+
+#ifdef GRB_BOOLEAN_DISPATCHER
+ ret = boolean_dispatcher_fold_from_vector_to_scalar_vectorDriven<
+#else
+ ret = fold_from_vector_to_scalar_vectorDriven<
+#endif
+ descr, masked, left
+ >(
+ already_dense_input_to_fold, already_dense_mask,
+ local_reduced[ thread_id ], lower_bound, upper_bound,
+ local_to_fold, local_mask, to_fold, mask, monoid
+ );
+ } else if( vectorLoop >= fullLoop && maskLoop >= fullLoop ) {
+#ifdef _DEBUG
+ std::cout << "\t dispatching to O(n) sparse variant\n";
+#endif
+
+#ifdef GRB_BOOLEAN_DISPATCHER
+ ret = boolean_dispatcher_fold_from_vector_to_scalar_fullLoopSparse<
+#else
+ ret = fold_from_vector_to_scalar_fullLoopSparse<
+#endif
+ descr, masked, left
+ >(
+ already_dense_input_to_fold, already_dense_mask,
+ local_reduced[ thread_id ], lower_bound, upper_bound,
+ local_to_fold, local_mask, to_fold, mask, monoid
+ );
+ } else {
+ assert( maskLoop < fullLoop && maskLoop < vectorLoop );
+ assert( masked );
+#ifdef _DEBUG
+ std::cout << "\t dispatching to mask-driven sparse variant\n";
+#endif
+
+#ifdef GRB_BOOLEAN_DISPATCHER
+ ret = boolean_dispatcher_fold_from_vector_to_scalar_maskDriven<
+#else
+ ret = fold_from_vector_to_scalar_maskDriven<
+#endif
+ descr, left
+ >(
+ already_dense_input_to_fold, already_dense_mask,
+ local_reduced[ thread_id ], lower_bound, upper_bound,
+ local_to_fold, local_mask, to_fold, mask, monoid
+ );
+ }
+ }
+
+ return ret;
+ };
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: "
+ << "fold_from_vector_to_scalar_generic" << std::endl;
+#endif
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_FOLD_VECTOR_SCALAR_GENERIC,
+ n,
+ sizeof( IOType ),
+ dense_descr,
+ true,
+ nullptr, nullptr, nullptr, nullptr,
+ &to_fold,
+ ( masked ) ? &mask : nullptr,
+ nullptr,
+ nullptr,
+ &internal::getCoordinates( to_fold ),
+ (masked) ? &internal::getCoordinates( mask ) : nullptr,
+ nullptr,
+ nullptr,
+ nullptr
+ );
+
+ if( ret == SUCCESS ) {
+ for(
+ size_t i = 0;
+ i < local_reduced_size;
+ i += config::CACHE_LINE_SIZE::value()
+ ) {
+ RC rc;
+ if( left ) {
+ rc = foldl< descr >( global, local_reduced[ i ], monoid.getOperator() );
+ } else {
+ rc = foldr< descr >( local_reduced[ i ], global, monoid.getOperator() );
+ }
+ assert( rc == SUCCESS );
+ if( rc != SUCCESS ) {
+ ret = rc;
+ }
+ }
+ }
+
+ // accumulate
+#ifdef _DEBUG
+ std::cout << "\t accumulating " << global << " into " << fold_into << "\n";
+#endif
+
+ if( ret == SUCCESS ) {
+ if( left ) {
+ ret = foldl< descr >( fold_into, global, monoid.getOperator() );
+ } else {
+ ret = foldr< descr >( global, fold_into, monoid.getOperator() );
+ }
+ }
+
+ return ret;
+ }
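+
+ /**
+ * \internal
+ * A user-level sketch of what reaches this code path (illustrative only,
+ * assuming the nonblocking backend is selected):
+ *
+ * \code
+ * grb::Vector< double > x( n );
+ * // ... fill x ...
+ * double sum = 0.0;
+ * grb::RC rc = grb::foldl( sum, x,
+ * grb::Monoid< grb::operators::add< double >, grb::identities::zero >() );
+ * \endcode
+ *
+ * Note that the scalar result is accumulated immediately after the stage is
+ * added; i.e., a fold into a scalar forces the execution of the pipeline
+ * this stage belongs to.
+ * \endinternal
+ */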
+
+ /**
+ * \internal
+ * @tparam left If false, right-looking fold is assumed (and left-looking
+ * otherwise)
+ * @tparam sparse Whether \a vector was sparse
+ * @tparam monoid Whether \a op is actually a monoid
+ * \endinternal
+ */
+ template<
+ Descriptor descr,
+ bool left,
+ bool sparse,
+ bool masked,
+ bool monoid,
+#ifdef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_output,
+ bool already_dense_mask,
+#endif
+ typename MaskType,
+ typename IOType,
+ typename InputType,
+ typename Coords,
+ class OP
+ >
+ RC fold_from_scalar_to_vector_generic(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_output,
+ bool already_dense_mask,
+#endif
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_vector,
+ const Coords * const local_mask_ptr,
+ Vector< IOType, nonblocking, Coords > &vector,
+ const Vector< MaskType, nonblocking, Coords > * const mask,
+ const InputType &scalar,
+ const OP &op,
+ const Phase &phase
+ ) {
+ constexpr const bool dense_descr = descr & descriptors::dense;
+ assert( !masked || mask != nullptr );
+ assert( !masked || local_mask_ptr != nullptr );
+
+ Coords local_mask;
+ if( masked ) {
+ local_mask = *local_mask_ptr;
+ }
+
+ const size_t local_n = upper_bound - lower_bound;
+ const size_t local_vector_nz = (sparse || !already_dense_output)
+ ? local_vector.nonzeroes() : local_n;
+ const size_t local_mask_nz = ( masked )
+ ? ( ( already_dense_mask )
+ ? local_n
+ : local_mask.nonzeroes()
+ )
+ : 0;
+
+ const size_t n = internal::getCoordinates( vector ).size();
+
+ if( masked && internal::getCoordinates( *mask ).size() != n ) {
+ return MISMATCH;
+ }
+ if( dense_descr && sparse ) {
+ return ILLEGAL;
+ }
+ if( n == 0 ) {
+ return SUCCESS;
+ }
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+
+ assert( phase == EXECUTE );
+ IOType * __restrict__ const x = internal::getRaw( vector );
+ const MaskType * __restrict__ const m = ( masked )
+ ? internal::getRaw( *mask )
+ : nullptr;
+
+ if( sparse && monoid && !masked ) {
+ for( size_t i = lower_bound; i < upper_bound; ++i ) {
+ if( already_dense_output || local_vector.assigned( i - lower_bound ) ) {
+ if( left ) {
+ (void) foldl< descr >( x[ i ], scalar, op );
+ } else {
+ (void) foldr< descr >( scalar, x[ i ], op );
+ }
+ } else {
+ x[ i ] = static_cast< IOType >( scalar );
+ }
+ }
+
+ if( !already_dense_output ) {
+ local_vector.local_assignAllNotAlreadyAssigned();
+ }
+ } else if( sparse && monoid && masked ) {
+ for( size_t i = 0; i < local_mask_nz; ++i ) {
+ const size_t index = ( ( already_dense_mask )
+ ? i
+ : local_mask.index( i ) ) + lower_bound;
+ if( already_dense_mask ) {
+ if( !internal::getCoordinates( *mask ).template mask< descr >(
+ index, m )
+ ) {
+ continue;
+ }
+ } else {
+ if( !local_mask.template mask< descr >( index - lower_bound,
+ m + lower_bound )
+ ) {
+ continue;
+ }
+ }
+ if( already_dense_output || local_vector.assign( index - lower_bound ) ) {
+ if( left ) {
+ (void) foldl< descr >( x[ index ], scalar, op );
+ } else {
+ (void) foldr< descr >( scalar, x[ index ], op );
+ }
+ } else {
+ x[ index ] = static_cast< IOType >( scalar );
+ }
+ }
+ } else if( sparse && !monoid ) {
+ const bool maskDriven = masked ? local_mask_nz < local_vector_nz : false;
+ if( maskDriven ) {
+ for( size_t i = 0; i < local_mask_nz; ++i ) {
+ const size_t index = ( ( already_dense_mask )
+ ? i
+ : local_mask.index( i ) ) + lower_bound;
+ if( already_dense_mask ) {
+ if( !internal::getCoordinates( *mask ).template mask< descr >(
+ index, m )
+ ) {
+ continue;
+ }
+ } else {
+ if( !local_mask.template mask< descr >( index - lower_bound,
+ m + lower_bound )
+ ) {
+ continue;
+ }
+ }
+ if( already_dense_output || local_vector.assign( index - lower_bound ) ) {
+ if( left ) {
+ (void) foldl< descr >( x[ index ], scalar, op );
+ } else {
+ (void) foldr< descr >( scalar, x[ index ], op );
+ }
+ }
+ }
+ } else {
+ for( size_t i = 0; i < local_vector_nz; ++i ) {
+ const size_t index = (already_dense_output
+ ? i
+ : local_vector.index( i )
+ ) + lower_bound;
+ if( masked ) {
+ if( already_dense_mask ) {
+ if( !( internal::getCoordinates( *mask ).template mask< descr >(
+ index, m ) )
+ ) {
+ continue;
+ }
+ } else {
+ if( !local_mask.template mask< descr >( index - lower_bound, m +
+ lower_bound )
+ ) {
+ continue;
+ }
+ }
+ }
+ if( left ) {
+ (void) foldl< descr >( x[ index ], scalar, op );
+ } else {
+ (void) foldr< descr >( scalar, x[ index ], op );
+ }
+ }
+ }
+ } else if( !sparse && masked ) {
+ for( size_t i = 0; i < local_mask_nz; ++i ) {
+ const size_t index = ( ( already_dense_mask )
+ ? i
+ : local_mask.index( i ) ) + lower_bound;
+ if( already_dense_mask ) {
+ if( !( internal::getCoordinates( *mask ).template mask< descr >(
+ index, m ) )
+ ) {
+ continue;
+ }
+ } else {
+ if( !local_mask.template mask< descr >( index - lower_bound, m +
+ lower_bound )
+ ) {
+ continue;
+ }
+ }
+
+ if( left ) {
+ (void) foldl< descr >( x[ index ], scalar, op );
+ } else {
+ (void) foldr< descr >( scalar, x[ index ], op );
+ }
+ }
+ } else {
+ // if target vector is dense and there is no mask, then
+ // there is no difference between monoid or non-monoid behaviour.
+ assert( !sparse );
+ assert( !masked );
+ assert( local_vector_nz == local_n );
+
+ if( local_n > 0 ) {
+ if( left ) {
+ op.eWiseFoldlAS( x + lower_bound, scalar, local_n );
+ } else {
+ op.eWiseFoldrSA( scalar, x + lower_bound, local_n );
+ }
+ }
+ }
+
+ return SUCCESS;
+ }
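+
+ /**
+ * \internal
+ * Semantics sketch for the sparse branches above. Illustrative example:
+ * left-folding the scalar 1 with operators::add into a vector of size 3
+ * with nonzeroes { x[0] = 4, x[2] = 5 }:
+ *   -# with a monoid, x becomes dense, { 5, 1, 6 }: unassigned entries are
+ *      overwritten with the scalar;
+ *   -# with a plain operator, only assigned entries are updated,
+ *      { 5, -, 6 }: the sparsity structure is left unchanged.
+ * \endinternal
+ */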
+
+ template<
+ Descriptor descr,
+ bool left, // if this is false, the right-looking fold is assumed
+ bool sparse,
+ bool masked,
+ bool monoid,
+#ifdef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_output,
+ bool already_dense_input_to_fold,
+ bool already_dense_mask,
+#endif
+ typename MaskType,
+ typename IOType,
+ typename IType,
+ typename Coords,
+ class OP
+ >
+ RC fold_from_vector_to_vector_generic(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_output,
+ bool already_dense_input_to_fold,
+ bool already_dense_mask,
+#endif
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_fold_into,
+ const Coords * const local_m_ptr,
+ const Coords &local_to_fold,
+ Vector< IOType, nonblocking, Coords > &fold_into,
+ const Vector< MaskType, nonblocking, Coords > * const m,
+ const Vector< IType, nonblocking, Coords > &to_fold,
+ const OP &op,
+ const Phase phase
+ ) {
+ constexpr const bool dense_descr = descr & descriptors::dense;
+ assert( !masked || (m != nullptr) );
+
+ Coords local_m;
+ if( masked && !already_dense_mask ) {
+ local_m = *local_m_ptr;
+ }
+
+ const size_t local_n = upper_bound - lower_bound;
+ const size_t local_fold_into_nz = already_dense_output
+ ? local_n
+ : local_fold_into.nonzeroes();
+ const size_t local_to_fold_nz = already_dense_input_to_fold
+ ? local_n
+ : local_to_fold.nonzeroes();
+ const size_t local_m_nz = ( masked )
+ ? ( already_dense_mask
+ ? local_n
+ : local_m.nonzeroes()
+ )
+ : 0;
+
+ const size_t n = size( fold_into );
+ if( n != size( to_fold ) ) {
+ return MISMATCH;
+ }
+ if( masked && size( *m ) != n ) {
+ return MISMATCH;
+ }
+ if( dense_descr && sparse ) {
+ return ILLEGAL;
+ }
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+
+ assert( phase == EXECUTE );
+
+ if( !sparse && !masked ) {
+#ifdef _DEBUG
+ std::cout << "fold_from_vector_to_vector_generic: in dense variant\n";
+ std::cout << "fold_from_vector_to_vector_generic: in sequential variant\n";
+#endif
+ if( left ) {
+ op.eWiseFoldlAA( internal::getRaw( fold_into ) + lower_bound,
+ internal::getRaw( to_fold ) + lower_bound, local_n );
+ } else {
+ op.eWiseFoldrAA( internal::getRaw( to_fold ) + lower_bound,
+ internal::getRaw( fold_into ) + lower_bound, local_n );
+ }
+ } else {
+#ifdef _DEBUG
+ std::cout << "fold_from_vector_to_vector_generic: in sparse variant\n";
+ std::cout << "\tfolding vector of " << local_to_fold_nz << " nonzeroes "
+ << "into a vector of " << local_fold_into_nz << " nonzeroes...\n";
+#endif
+ if(
+ masked &&
+ local_fold_into_nz == local_n &&
+ local_to_fold_nz == local_n
+ ) {
+ // use sparsity structure of mask for this eWiseFold
+ if( left ) {
+#ifdef _DEBUG
+ std::cout << "fold_from_vector_to_vector_generic: foldl, using the "
+ << "mask's sparsity structure\n";
+#endif
+ for( size_t k = 0; k < local_m_nz; ++k ) {
+ const size_t i = ( already_dense_mask
+ ? k
+ : local_m.index( k )
+ ) + lower_bound;
+#ifdef _DEBUG
+ std::cout << "Left-folding " << to_fold[ i ] << " into "
+ << fold_into[ i ];
+#endif
+ (void) foldl< descr >( fold_into[ i ], to_fold[ i ], op );
+#ifdef _DEBUG
+ std::cout << " resulting into " << fold_into[ i ] << "\n";
+#endif
+ }
+ } else {
+#ifdef _DEBUG
+ std::cout << "fold_from_vector_to_vector_generic: foldl, using the "
+ << "mask's sparsity structure\n";
+#endif
+ for( size_t k = 0; k < local_m_nz; ++k ) {
+ const size_t i = ( already_dense_mask
+ ? k
+ : local_m.index( k )
+ ) + lower_bound;
+#ifdef _DEBUG
+ std::cout << "Right-folding " << to_fold[ i ] << " into "
+ << fold_into[ i ];
+#endif
+ (void) foldr< descr >( to_fold[ i ], fold_into[ i ], op );
+#ifdef _DEBUG
+ std::cout << " resulting into " << fold_into[ i ] << "\n";
+#endif
+ }
+ }
+ } else if( !masked && local_fold_into_nz == local_n ) {
+ // use sparsity structure of to_fold for this eWiseFold
+ if( left ) {
+#ifdef _DEBUG
+ std::cout << "fold_from_vector_to_vector_generic: foldl, using "
+ << "to_fold's sparsity\n";
+#endif
+ for( size_t k = 0; k < local_to_fold_nz; ++k ) {
+ const size_t i = ( already_dense_input_to_fold
+ ? k
+ : local_to_fold.index( k )
+ ) + lower_bound;
+#ifdef _DEBUG
+ std::cout << "Left-folding " << to_fold[ i ] << " into "
+ << fold_into[ i ];
+#endif
+ (void) foldl< descr >( fold_into[ i ], to_fold[ i ], op );
+#ifdef _DEBUG
+ std::cout << " resulting into " << fold_into[ i ] << "\n";
+#endif
+ }
+ } else {
+#ifdef _DEBUG
+ std::cout << "fold_from_vector_to_vector_generic: foldl, using "
+ << "to_fold's sparsity\n";
+#endif
+ for( size_t k = 0; k < local_to_fold_nz; ++k ) {
+ const size_t i = ( already_dense_input_to_fold
+ ? k
+ : local_to_fold.index( k )
+ ) + lower_bound;
+#ifdef _DEBUG
+ std::cout << "Right-folding " << to_fold[ i ] << " into "
+ << fold_into[ i ];
+#endif
+ (void) foldr< descr >( to_fold[ i ], fold_into[ i ], op );
+#ifdef _DEBUG
+ std::cout << " resulting into " << fold_into[ i ] << "\n";
+#endif
+ }
+ }
+ } else if( !masked && local_to_fold_nz == local_n ) {
+ // use sparsity structure of fold_into for this eWiseFold
+ if( left ) {
+#ifdef _DEBUG
+ std::cout << "fold_from_vector_to_vector_generic: foldl, using "
+ << "fold_into's sparsity\n";
+#endif
+ for( size_t k = 0; k < local_fold_into_nz; ++k ) {
+ const size_t i = ( already_dense_output
+ ? k
+ : local_fold_into.index( k )
+ ) + lower_bound;
+#ifdef _DEBUG
+ std::cout << "Left-folding " << to_fold[ i ] << " into "
+ << fold_into[ i ];
+#endif
+ (void) foldl< descr >( fold_into[ i ], to_fold[ i ], op );
+#ifdef _DEBUG
+ std::cout << " resulting into " << fold_into[ i ] << "\n";
+#endif
+ }
+ } else {
+#ifdef _DEBUG
+ std::cout << "fold_from_vector_to_vector_generic: foldr, using "
+ << "fold_into's sparsity\n";
+#endif
+ for( size_t k = 0; k < local_fold_into_nz; ++k ) {
+ const size_t i = ( already_dense_output ?
+ k :
+ local_fold_into.index( k )
+ ) + lower_bound;
+#ifdef _DEBUG
+ std::cout << "Right-folding " << to_fold[ i ] << " into " << fold_into[ i ];
+#endif
+ (void) foldr< descr >( to_fold[ i ], fold_into[ i ], op );
+#ifdef _DEBUG
+ std::cout << " resulting into " << fold_into[ i ] << "\n";
+#endif
+ }
+ }
+ } else {
+#ifdef _DEBUG
+ std::cout << "fold_from_vector_to_vector_generic: using specialised "
+ << "code to merge two sparse vectors and, potentially, "
+ << "output masks\n";
+#endif
+ const IType * __restrict__ const tf_raw = internal::getRaw( to_fold );
+ IOType * __restrict__ const fi_raw = internal::getRaw( fold_into );
+#ifdef _DEBUG
+ std::cout << "\tin sequential version...\n";
+#endif
+ for( size_t k = 0; k < local_to_fold_nz; ++k ) {
+ const size_t i = ( already_dense_input_to_fold
+ ? k
+ : local_to_fold.index( k )
+ ) + lower_bound;
+ if( masked ) {
+ if( already_dense_mask ) {
+ if( !internal::getCoordinates( *m ).template mask< descr >( i,
+ internal::getRaw( *m ) )
+ ) {
+ continue;
+ }
+ } else {
+ if( !local_m.template mask< descr >( i - lower_bound,
+ internal::getRaw( *m ) + lower_bound )
+ ) {
+ continue;
+ }
+ }
+ }
+
+ assert( i < n );
+ if( already_dense_output ||
+ local_fold_into.assigned( i - lower_bound )
+ ) {
+ if( left ) {
+#ifdef _DEBUG
+ std::cout << "\tfoldl< descr >( fi_raw[ i ], tf_raw[ i ], op ), i = "
+ << i << ": " << tf_raw[ i ] << " goes into " << fi_raw[ i ];
+#endif
+ (void)foldl< descr >( fi_raw[ i ], tf_raw[ i ], op );
+#ifdef _DEBUG
+ std::cout << " which results in " << fi_raw[ i ] << "\n";
+#endif
+ } else {
+#ifdef _DEBUG
+ std::cout << "\tfoldr< descr >( tf_raw[ i ], fi_raw[ i ], op ), i = "
+ << i << ": " << tf_raw[ i ] << " goes into " << fi_raw[ i ];
+#endif
+ (void) foldr< descr >( tf_raw[ i ], fi_raw[ i ], op );
+#ifdef _DEBUG
+ std::cout << " which results in " << fi_raw[ i ] << "\n";
+#endif
+ }
+ } else if( monoid ) {
+#ifdef _DEBUG
+ std::cout << "\tindex " << i << " is unset. Old value " << fi_raw[ i ]
+ << " will be overwritten with " << tf_raw[ i ] << "\n";
+#endif
+ fi_raw[ i ] = tf_raw[ i ];
+ (void) local_fold_into.assign( i - lower_bound );
+ }
+ }
+ }
+ }
+
+#ifdef _DEBUG
+ std::cout << "\tCall to fold_from_vector_to_vector_generic done. "
+ << "Output now contains " << local_fold_into_nz << " / "
+ << local_n << " nonzeroes.\n";
+#endif
+ return SUCCESS;
+ }
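+
+ /**
+ * \internal
+ * Merge semantics sketch for the final (both-sparse) branch above.
+ * Illustrative example: left-folding to_fold = { -, 2, 3 } into
+ * fold_into = { 1, 4, - } with operators::add and \a monoid <tt>true</tt>
+ * yields { 1, 6, 3 }: overlapping entries are folded, while entries present
+ * only in to_fold are copied and their coordinates newly assigned. With
+ * \a monoid <tt>false</tt> the result is { 1, 6, - }, i.e., an
+ * intersection-style update.
+ * \endinternal
+ */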
+
+ } // namespace internal
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename InputType,
+ typename IOType,
+ typename MaskType,
+ typename Coords
+ >
+ RC foldr(
+ const Vector< InputType, nonblocking, Coords > &x,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ IOType &beta,
+ const Monoid &monoid = Monoid(),
+ const typename std::enable_if<
+ !grb::is_object< InputType >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< MaskType >::value &&
+ grb::is_monoid< Monoid >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< IOType, InputType >::value ), "grb::foldr",
+ "called with a scalar IO type that does not match the input vector type" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType, typename Monoid::D1 >::value ), "grb::foldr",
+ "called with an input vector value type that does not match the first "
+ "domain of the given monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType, typename Monoid::D2 >::value ), "grb::foldr",
+ "called with an input vector type that does not match the second domain of "
+ "the given monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType, typename Monoid::D3 >::value ), "grb::foldr",
+ "called with an input vector type that does not match the third domain of "
+ "the given monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ), "grb::foldr",
+ "called with a vector mask type that is not boolean" );
+
+ if( size( mask ) > 0 ) {
+ return internal::template fold_from_vector_to_scalar_generic<
+ descr, true, false
+ >( beta, x, mask, monoid );
+ } else {
+ return internal::template fold_from_vector_to_scalar_generic<
+ descr, false, false
+ >( beta, x, mask, monoid );
+ }
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename InputType,
+ typename IOType,
+ typename Coords
+ >
+ RC foldr(
+ const Vector< InputType, nonblocking, Coords > &x,
+ IOType &beta,
+ const Monoid &monoid = Monoid(),
+ const typename std::enable_if<
+ !grb::is_object< InputType >::value &&
+ !grb::is_object< IOType >::value &&
+ grb::is_monoid< Monoid >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< IOType, InputType >::value ), "grb::foldr",
+ "called with a scalar IO type that does not match the input vector type" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType, typename Monoid::D1 >::value ), "grb::foldr",
+ "called with an input vector value type that does not match the first "
+ "domain of the given monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType, typename Monoid::D2 >::value ), "grb::foldr",
+ "called with an input vector type that does not match the second domain of "
+ "the given monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType, typename Monoid::D3 >::value ), "grb::foldr",
+ "called with an input vector type that does not match the third domain of "
+ "the given monoid" );
+
+ Vector< bool, nonblocking, Coords > empty_mask( 0 );
+ return internal::template fold_from_vector_to_scalar_generic<
+ descr, false, false
+ >( beta, x, empty_mask, monoid );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename IOType,
+ typename InputType,
+ typename Coords
+ >
+ RC foldr(
+ const InputType &alpha,
+ Vector< IOType, nonblocking, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< InputType >::value &&
+ !grb::is_object< IOType >::value &&
+ grb::is_monoid< Monoid >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D1, IOType >::value ), "grb::foldr",
+ "called with a vector y of a type that does not match the first domain "
+ "of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D2, InputType >::value ), "grb::foldr",
+ "called with a scalar alpha of a type that does not match the second "
+ "domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D3, IOType >::value ), "grb::foldr",
+ "called on a vector y of a type that does not match the third domain "
+ "of the given monoid" );
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func =
+ [alpha, &y, &monoid, phase] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage foldl(alpha, y, monoid) in the range(" << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+ const Coords * const local_null_mask = nullptr;
+
+ Coords local_y;
+ size_t local_y_nz;
+ bool sparse = false;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ bool already_dense_output = true;
+
+ if( !already_dense_vectors ) {
+ const size_t local_n = upper_bound - lower_bound;
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_output ) {
+#else
+ already_dense_output = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+ if( local_y_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( sparse ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic<
+#else
+ rc = internal::fold_from_scalar_to_vector_generic<
+#endif
+ descr, false, true, false, true
+ >(
+ already_dense_output, true,
+ lower_bound, upper_bound, local_y, local_null_mask,
+ y, null_mask, alpha, monoid.getOperator(), phase
+ );
+ } else {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic<
+#else
+ rc = internal::fold_from_scalar_to_vector_generic<
+#endif
+ descr, false, false, false, true
+ >(
+ already_dense_output, true,
+ lower_bound, upper_bound, local_y, local_null_mask,
+ y, null_mask, alpha, monoid.getOperator(), phase
+ );
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#else
+ if( !already_dense_vectors ) {
+#endif
+ internal::getCoordinates( y ).asyncJoinSubset( local_y, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_FOLD_SCALAR_VECTOR_GENERIC,
+ internal::getCoordinates( y ).size(),
+ sizeof( IOType ),
+ dense_descr, true,
+ &y, nullptr,
+ &internal::getCoordinates( y ), nullptr,
+ nullptr, nullptr, nullptr, nullptr,
+ nullptr, nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: foldl(alpha, y, monoid)"
+ << std::endl;
+#endif
+ return ret;
+ }
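+
+ /**
+ * \internal
+ * Usage sketch (illustrative only): fold the scalar 1.5 into every entry of
+ * y; since a monoid is given, unassigned entries are materialised as well:
+ *
+ * \code
+ * grb::Vector< double > y( n );
+ * grb::RC rc = grb::foldr( 1.5, y,
+ * grb::Monoid< grb::operators::add< double >, grb::identities::zero >() );
+ * \endcode
+ * \endinternal
+ */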
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP,
+ typename IOType,
+ typename InputType,
+ typename Coords
+ >
+ RC foldr(
+ const InputType &alpha,
+ Vector< IOType, nonblocking, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< InputType >::value &&
+ !grb::is_object< IOType >::value &&
+ grb::is_operator< OP >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D1, IOType >::value ), "grb::foldr",
+ "called with a vector y of a type that does not match the first domain "
+ "of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D2, InputType >::value ), "grb::foldr",
+ "called with a scalar alpha of a type that does not match the second "
+ "domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D3, IOType >::value ), "grb::foldr",
+ "called on a vector y of a type that does not match the third domain "
+ "of the given operator" );
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func =
+ [alpha, &y, &op, phase] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ {
+ std::cout << "\t\tExecution of stage foldl(alpha, y, op) in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+ }
+#endif
+ RC rc = SUCCESS;
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+ const Coords * const local_null_mask = nullptr;
+
+ Coords local_y;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_y_nz;
+ bool sparse = false;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+ bool already_dense_output = true;
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_output ) {
+#else
+ already_dense_output = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+ if( local_y_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( sparse ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic<
+#else
+ rc = internal::fold_from_scalar_to_vector_generic<
+#endif
+ descr, false, true, false, false
+ >(
+ already_dense_output, true,
+ lower_bound, upper_bound,
+ local_y, local_null_mask, y, null_mask,
+ alpha, op, phase
+ );
+ } else {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic<
+#else
+ rc = internal::fold_from_scalar_to_vector_generic<
+#endif
+ descr, false, false, false, false
+ >(
+ already_dense_output, true,
+ lower_bound, upper_bound, local_y, local_null_mask,
+ y, null_mask, alpha, op, phase
+ );
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#else
+ if( !already_dense_vectors ) {
+#endif
+ internal::getCoordinates( y ).asyncJoinSubset( local_y, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_FOLD_SCALAR_VECTOR_GENERIC,
+ internal::getCoordinates( y ).size(),
+ sizeof( IOType ),
+ dense_descr, true,
+ &y, nullptr,
+ &internal::getCoordinates( y ), nullptr,
+ nullptr, nullptr, nullptr, nullptr,
+ nullptr, nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: foldl(alpha, y, op)"
+ << std::endl;
+#endif
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP,
+ typename IOType,
+ typename InputType,
+ typename Coords
+ >
+ RC foldr(
+ const Vector< InputType, nonblocking, Coords > &x,
+ Vector< IOType, nonblocking, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_operator< OP >::value &&
+ !grb::is_object< InputType >::value &&
+ !grb::is_object< IOType >::value,
+ void
+ >::type * = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D1, InputType >::value ), "grb::eWiseFoldr",
+ "called with a vector x of a type that does not match the first domain "
+ "of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D2, IOType >::value ), "grb::eWiseFoldr",
+ "called on a vector y of a type that does not match the second domain "
+ "of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D3, IOType >::value ), "grb::eWiseFoldr",
+ "called on a vector y of a type that does not match the third domain "
+ "of the given operator" );
+
+ const size_t n = size( x );
+ if( n != size( y ) ) {
+ return MISMATCH;
+ }
+
+#ifdef _DEBUG
+ std::cout << "In foldr ([T]<-[T])\n";
+#endif
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func =
+ [&x, &y, &op, phase] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage foldr(x, y, operator) in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+ const Coords * const local_null_mask = nullptr;
+
+ Coords local_x, local_y;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_x_nz, local_y_nz;
+ bool sparse = false;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+ bool already_dense_output = true;
+ bool already_dense_input = true;
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_output ) {
+#else
+ already_dense_output = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+ if( local_y_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_input ) {
+#else
+ already_dense_input = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( local_x_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( sparse ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic<
+#else
+ rc = internal::fold_from_vector_to_vector_generic<
+#endif
+ descr, false, true, false, false
+ >(
+ already_dense_output, already_dense_input, true,
+ lower_bound, upper_bound,
+ local_y, local_null_mask,
+ local_x, y,
+ null_mask, x,
+ op, phase
+ );
+ } else {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic<
+#else
+ rc = internal::fold_from_vector_to_vector_generic<
+#endif
+ descr, false, false, false, false
+ >(
+ already_dense_output, already_dense_input, true,
+ lower_bound, upper_bound,
+ local_y, local_null_mask,
+ local_x,
+ y, null_mask,
+ x,
+ op, phase
+ );
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#else
+ if( !already_dense_vectors ) {
+#endif
+ internal::getCoordinates( y ).asyncJoinSubset( local_y, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_FOLD_VECTOR_VECTOR_GENERIC,
+ n, sizeof( IOType ),
+ dense_descr, true,
+ &y, nullptr,
+ &internal::getCoordinates( y ), nullptr,
+ &x, nullptr, nullptr, nullptr,
+ &internal::getCoordinates( x ), nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: foldr(x, y, operator)"
+ << std::endl;
+#endif
+ return ret;
+ }
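+
+ /**
+ * \internal
+ * Usage sketch (illustrative only): fold x into y from the right, i.e.,
+ * compute y[ i ] = x[ i ] + y[ i ] wherever both entries are assigned:
+ *
+ * \code
+ * grb::Vector< double > x( n ), y( n );
+ * // ... fill x and y ...
+ * grb::RC rc = grb::foldr( x, y, grb::operators::add< double >() );
+ * // execution is delayed until the containing pipeline is forced,
+ * // e.g., via grb::wait( y ) or an operation with scalar output
+ * \endcode
+ * \endinternal
+ */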
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP,
+ typename IOType,
+ typename MaskType,
+ typename InputType,
+ typename Coords
+ >
+ RC foldr(
+ const Vector< InputType, nonblocking, Coords > &x,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ Vector< IOType, nonblocking, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< grb::is_operator< OP >::value &&
+ !grb::is_object< InputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< IOType >::value, void
+ >::type * = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D1, InputType >::value ), "grb::eWiseFoldr",
+ "called with a vector x of a type that does not match the first domain "
+ "of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D2, IOType >::value ), "grb::eWiseFoldr",
+ "called on a vector y of a type that does not match the second domain "
+ "of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D3, IOType >::value ), "grb::eWiseFoldr",
+ "called on a vector y of a type that does not match the third domain "
+ "of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ), "grb::eWiseFoldr",
+ "called with a non-Boolean mask" );
+
+ if( size( m ) == 0 ) {
+ return foldr< descr >( x, y, op, phase );
+ }
+
+ const size_t n = size( x );
+ if( n != size( y ) || n != size( m ) ) {
+ return MISMATCH;
+ }
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func =
+ [&x, &m, &y, &op, phase] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage foldr(x, m, y, operator) in the range(" << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_x, local_m, local_y;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_x_nz, local_y_nz;
+ bool sparse = false;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+ bool already_dense_output = true;
+ bool already_dense_mask = true;
+ bool already_dense_input = true;
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_output ) {
+#else
+ already_dense_output = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+ if( local_y_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_mask = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( m ) );
+ if( !already_dense_mask ) {
+#else
+ already_dense_mask = false;
+#endif
+ local_m = internal::getCoordinates( m ).asyncSubset( lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_input ) {
+#else
+ already_dense_input = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( local_x_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( sparse ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic<
+#else
+ rc = internal::fold_from_vector_to_vector_generic<
+#endif
+ descr, false, true, true, false
+ >(
+ already_dense_output, already_dense_input, already_dense_mask,
+ lower_bound, upper_bound,
+ local_y, &local_m, local_x,
+ y, &m, x,
+ op, phase
+ );
+ } else {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic<
+#else
+ rc = internal::fold_from_vector_to_vector_generic<
+#endif
+ descr, false, false, true, false
+ >(
+ already_dense_output, already_dense_input, already_dense_mask,
+ lower_bound, upper_bound,
+ local_y, &local_m, local_x,
+ y, &m, x,
+ op, phase
+ );
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#else
+ if( !already_dense_vectors ) {
+#endif
+ internal::getCoordinates( y ).asyncJoinSubset( local_y, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_FOLD_MASKED_VECTOR_VECTOR_GENERIC,
+ n, sizeof( IOType ),
+ dense_descr, true,
+ &y, nullptr, &internal::getCoordinates( y ), nullptr,
+ &x, &m, nullptr, nullptr,
+ &internal::getCoordinates( x ), &internal::getCoordinates( m ),
+ nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: foldr(x, m, y, operator)"
+ << std::endl;
+#endif
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename IOType,
+ typename InputType,
+ typename Coords
+ >
+ RC foldr(
+ const Vector< InputType, nonblocking, Coords > &x,
+ Vector< IOType, nonblocking, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< grb::is_monoid< Monoid >::value &&
+ !grb::is_object< InputType >::value &&
+ !grb::is_object< IOType >::value,
+ void
+ >::type * = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D1, InputType >::value ), "grb::eWiseFoldr",
+ "called with a vector x of a type that does not match the first domain "
+ "of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D2, IOType >::value ), "grb::eWiseFoldr",
+ "called on a vector y of a type that does not match the second domain "
+ "of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D3, IOType >::value ), "grb::eWiseFoldr",
+ "called on a vector y of a type that does not match the third domain "
+ "of the given monoid" );
+
+ // dynamic sanity checks
+ const size_t n = size( x );
+ if( n != size( y ) ) {
+ return MISMATCH;
+ }
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func =
+ [&x, &y, &monoid, phase] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage foldr(x, y, monoid) in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+ const Coords * const local_null_mask = nullptr;
+
+ Coords local_x, local_y;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_x_nz, local_y_nz;
+ bool sparse = false;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+ bool already_dense_output = true;
+ bool already_dense_input = true;
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_output ) {
+#else
+ already_dense_output = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+ if( local_y_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_input ) {
+#else
+ already_dense_input = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( local_x_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( sparse ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic<
+#else
+ rc = internal::fold_from_vector_to_vector_generic<
+#endif
+ descr, false, true, false, true
+ >(
+ already_dense_output, already_dense_input, true,
+ lower_bound, upper_bound,
+ local_y, local_null_mask, local_x,
+ y, null_mask, x,
+ monoid.getOperator(), phase
+ );
+ } else {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic<
+#else
+ rc = internal::fold_from_vector_to_vector_generic<
+#endif
+ descr, false, false, false, true
+ >(
+ already_dense_output, already_dense_input, true,
+ lower_bound, upper_bound,
+ local_y, local_null_mask, local_x,
+ y, null_mask, x,
+ monoid.getOperator(), phase
+ );
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#else
+ if( !already_dense_vectors ) {
+#endif
+ internal::getCoordinates( y ).asyncJoinSubset( local_y, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_FOLD_VECTOR_VECTOR_GENERIC,
+ n, sizeof( IOType ), dense_descr, true,
+ &y, nullptr, &internal::getCoordinates( y ), nullptr,
+ &x, nullptr, nullptr, nullptr,
+ &internal::getCoordinates( x ), nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: foldr(x, y, monoid)"
+ << std::endl;
+#endif
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename IOType,
+ typename MaskType,
+ typename InputType,
+ typename Coords
+ >
+ RC foldr(
+ const Vector< InputType, nonblocking, Coords > &x,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ Vector< IOType, nonblocking, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< grb::is_monoid< Monoid >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType >::value &&
+ !grb::is_object< IOType >::value, void
+ >::type * = nullptr
+ ) {
+ // static sanity checks
+		NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+			std::is_same< typename Monoid::D1, InputType >::value ), "grb::foldr",
+			"called with a vector x of a type that does not match the first domain "
+			"of the given monoid" );
+		NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+			std::is_same< typename Monoid::D2, IOType >::value ), "grb::foldr",
+			"called on a vector y of a type that does not match the second domain "
+			"of the given monoid" );
+		NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+			std::is_same< typename Monoid::D3, IOType >::value ), "grb::foldr",
+			"called on a vector y of a type that does not match the third domain "
+			"of the given monoid" );
+		NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+			std::is_same< bool, MaskType >::value ), "grb::foldr",
+			"called with a mask of non-Boolean type" );
+
+ // check empty mask
+ if( size( m ) == 0 ) {
+ return foldr< descr >( x, y, monoid, phase );
+ }
+
+ // dynamic sanity checks
+ const size_t n = size( x );
+ if( n != size( y ) || n != size( m ) ) {
+ return MISMATCH;
+ }
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func =
+ [&x, &m, &y, &monoid, phase] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage foldr(x, m, y, monoid) in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_x, local_m, local_y;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_x_nz, local_y_nz;
+ bool sparse = false;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+			(void) pipeline;
+			constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ bool already_dense_output = true;
+ bool already_dense_mask = true;
+ bool already_dense_input = true;
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_output ) {
+#else
+ already_dense_output = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+ if( local_y_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_mask = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( m ) );
+ if( !already_dense_mask ) {
+#else
+ already_dense_mask = false;
+#endif
+ local_m = internal::getCoordinates( m ).asyncSubset( lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_input ) {
+#else
+ already_dense_input = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( local_x_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
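+			// dispatch to the generic fold kernel; the sparsity case is passed as a
+			// compile-time boolean so that each case compiles into a specialised loop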
+ if( sparse ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic<
+#else
+ rc = internal::fold_from_vector_to_vector_generic<
+#endif
+ descr, false, true, true, true
+ >(
+ already_dense_output, already_dense_input, already_dense_mask,
+ lower_bound, upper_bound,
+ local_y, &local_m, local_x,
+ y, &m, x,
+ monoid.getOperator(), phase
+ );
+ } else {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic<
+#else
+ rc = internal::fold_from_vector_to_vector_generic<
+#endif
+ descr, false, false, true, true
+ >(
+ already_dense_output, already_dense_input, already_dense_mask,
+ lower_bound, upper_bound,
+ local_y, &local_m, local_x,
+ y, &m, x,
+ monoid.getOperator(), phase
+ );
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#else
+ if( !already_dense_vectors ) {
+#endif
+ internal::getCoordinates( y ).asyncJoinSubset( local_y, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_FOLD_MASKED_VECTOR_VECTOR_GENERIC,
+ n, sizeof( IOType ), dense_descr, true,
+ &y, nullptr, &internal::getCoordinates( y ), nullptr,
+ &x, &m, nullptr, nullptr,
+ &internal::getCoordinates( x ), &internal::getCoordinates( m ),
+ nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: foldr(x, m, y, monoid)"
+ << std::endl;
+#endif
+ return ret;
+ }
+
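+	// foldl( x, beta, op ): folds the scalar beta into every assigned entry of x
+	// using the given operator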
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Op,
+ typename IOType,
+ typename InputType,
+ typename Coords
+ >
+ RC foldl(
+ Vector< IOType, nonblocking, Coords > &x,
+ const InputType beta,
+ const Op &op = Op(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType >::value &&
+ grb::is_operator< Op >::value, void
+ >::type * = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Op::D1, IOType >::value ),
+ "grb::foldl",
+ "called with a vector x of a type that does not match the first domain "
+ "of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Op::D2, InputType >::value ),
+ "grb::foldl",
+			"called with a scalar beta of a type that does not match the second domain "
+ "of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Op::D3, IOType >::value ),
+ "grb::foldl",
+ "called on a vector x of a type that does not match the third domain "
+ "of the given operator" );
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func =
+ [&x, beta, &op, phase] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage foldl(x, beta, op) in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
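+			// this is the unmasked variant: explicit null masks are forwarded to the
+			// generic kernel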
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+ const Coords * const local_null_mask = nullptr;
+
+ Coords local_x;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_x_nz = local_n;
+ bool sparse = false;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+			(void) pipeline;
+			constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ bool already_dense_output = true;
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_output ) {
+#else
+ already_dense_output = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( local_x_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( sparse ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic<
+#else
+ rc = internal::fold_from_scalar_to_vector_generic<
+#endif
+ descr, true, true, false, false
+ >(
+ already_dense_output, true,
+ lower_bound, upper_bound,
+ local_x, local_null_mask,
+ x, null_mask,
+ beta,
+ op, phase
+ );
+ } else {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic<
+#else
+ rc = internal::fold_from_scalar_to_vector_generic<
+#endif
+ descr, true, false, false, false
+ >(
+ already_dense_output, true,
+ lower_bound, upper_bound,
+ local_x, local_null_mask,
+ x, null_mask, beta,
+ op, phase
+ );
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#else
+ if( !already_dense_vectors ) {
+#endif
+				internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound,
+					upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_FOLD_SCALAR_VECTOR_GENERIC,
+ internal::getCoordinates( x ).size(), sizeof( IOType ),
+ dense_descr, true,
+ &x, nullptr,
+ &internal::getCoordinates( x ), nullptr,
+ nullptr, nullptr, nullptr, nullptr,
+ nullptr, nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: foldl(x, beta, op)" << std::endl;
+#endif
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Op,
+ typename IOType,
+ typename MaskType,
+ typename InputType,
+ typename Coords
+ >
+ RC foldl(
+ Vector< IOType, nonblocking, Coords > &x,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const InputType beta,
+ const Op &op = Op(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< IOType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType >::value &&
+ grb::is_operator< Op >::value, void
+ >::type * = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Op::D1, IOType >::value ),
+ "grb::foldl",
+ "called with a vector x of a type that does not match the first domain "
+ "of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Op::D2, InputType >::value ),
+ "grb::foldl",
+			"called with a scalar beta of a type that does not match the second domain "
+ "of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Op::D3, IOType >::value ),
+ "grb::foldl",
+ "called on a vector x of a type that does not match the third domain "
+ "of the given operator" );
+		NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+			std::is_same< bool, MaskType >::value ),
+			"grb::foldl (nonblocking, vector <- scalar, masked)",
+ "provided mask does not have boolean entries" );
+
+ // check empty mask
+ if( size( m ) == 0 ) {
+ return foldl< descr >( x, beta, op, phase );
+ }
+
+ // dynamic checks
+ const size_t n = size( x );
+ if( size( m ) != n ) {
+ return MISMATCH;
+ }
+
+ // catch trivial phase
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func =
+ [&x, &m, beta, &op, phase] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage foldl(x, m, beta, op) in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_x, local_mask;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_x_nz = local_n;
+ bool sparse = false;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+			(void) pipeline;
+			constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ bool already_dense_output = true;
+ bool already_dense_mask = true;
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_output ) {
+#else
+ already_dense_output = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( local_x_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_mask = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( m ) );
+ if( !already_dense_mask ) {
+#else
+ already_dense_mask = false;
+#endif
+ local_mask = internal::getCoordinates( m ).asyncSubset( lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( sparse ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic<
+#else
+ rc = internal::fold_from_scalar_to_vector_generic<
+#endif
+ descr, true, true, true, false
+ >(
+ already_dense_output, already_dense_mask,
+ lower_bound, upper_bound,
+ local_x, &local_mask,
+ x, &m,
+ beta,
+ op, phase
+ );
+ } else {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic<
+#else
+ rc = internal::fold_from_scalar_to_vector_generic<
+#endif
+ descr, true, false, true, false
+ >(
+ already_dense_output, already_dense_mask,
+ lower_bound, upper_bound,
+ local_x, &local_mask,
+ x, &m,
+ beta,
+ op, phase
+ );
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#else
+ if( !already_dense_vectors ) {
+#endif
+ internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_FOLD_MASKED_SCALAR_VECTOR_GENERIC,
+ n, sizeof( IOType ),
+ dense_descr, true,
+ &x, nullptr, &internal::getCoordinates( x ), nullptr,
+ &m, nullptr, nullptr, nullptr,
+ &internal::getCoordinates( m ), nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: foldl(x, m, beta, op)"
+ << std::endl;
+#endif
+ return ret;
+ }
+
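+	// monoid variant of foldl( x, beta ): missing entries of x are interpreted
+	// via the monoid identity, so that sparse x is supported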
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename IOType,
+ typename InputType,
+ typename Coords
+ >
+ RC foldl(
+ Vector< IOType, nonblocking, Coords > &x,
+ const InputType beta,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType >::value &&
+ grb::is_monoid< Monoid >::value, void
+ >::type * = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D1, IOType >::value ), "grb::foldl",
+ "called with a vector x of a type that does not match the first domain "
+ "of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D2, InputType >::value ), "grb::foldl",
+			"called with a scalar beta of a type that does not match the second domain "
+ "of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D3, IOType >::value ), "grb::foldl",
+ "called on a vector x of a type that does not match the third domain "
+ "of the given monoid" );
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func =
+ [&x, beta, &monoid, phase] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage foldl(x, beta, monoid) in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+ const Coords * const local_null_mask = nullptr;
+
+ Coords local_x;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_x_nz = local_n;
+ bool sparse = false;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ bool already_dense_output = true;
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_output ) {
+#else
+ already_dense_output = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( local_x_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( sparse ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic<
+#else
+ rc = internal::fold_from_scalar_to_vector_generic<
+#endif
+ descr, true, true, false, true
+ >(
+ already_dense_output, true,
+ lower_bound, upper_bound,
+ local_x, local_null_mask,
+ x, null_mask,
+ beta,
+ monoid.getOperator(), phase
+ );
+ } else {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic<
+#else
+ rc = internal::fold_from_scalar_to_vector_generic<
+#endif
+ descr, true, false, false, true
+ >(
+ already_dense_output, true,
+ lower_bound, upper_bound,
+ local_x, local_null_mask,
+ x, null_mask,
+ beta,
+ monoid.getOperator(), phase
+ );
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#else
+ if( !already_dense_vectors ) {
+#endif
+ internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_FOLD_SCALAR_VECTOR_GENERIC,
+ internal::getCoordinates( x ).size(), sizeof( IOType ),
+ dense_descr, true,
+ &x, nullptr, &internal::getCoordinates( x ), nullptr,
+ nullptr, nullptr, nullptr, nullptr,
+ nullptr, nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: foldl(x, beta, monoid)"
+ << std::endl;
+#endif
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename IOType,
+ typename MaskType,
+ typename InputType,
+ typename Coords
+ >
+ RC foldl(
+ Vector< IOType, nonblocking, Coords > &x,
+ const Vector< MaskType, nonblocking, Coords > &m,
+		const InputType beta,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType >::value &&
+ grb::is_monoid< Monoid >::value,
+ void
+ >::type * = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D1, IOType >::value ), "grb::foldl",
+ "called with a vector x of a type that does not match the first domain "
+ "of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D2, InputType >::value ), "grb::foldl",
+			"called with a scalar beta of a type that does not match the second domain "
+ "of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D3, IOType >::value ), "grb::foldl",
+ "called on a vector x of a type that does not match the third domain "
+ "of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ),
+ "grb::foldl (nonblocking, vector <- scalar, masked, monoid)",
+ "provided mask does not have boolean entries" );
+
+ // check for empty mask
+ if( size( m ) == 0 ) {
+ return foldl< descr >( x, beta, monoid, phase );
+ }
+
+ // dynamic checks
+ const size_t n = size( x );
+ if( n != size( m ) ) { return MISMATCH; }
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func =
+ [&x, &m, beta, &monoid, phase] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage foldl(x, m, beta, monoid) in the "
+ << "range(" << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_x, local_m;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_x_nz = local_n;
+ bool sparse = false;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ bool already_dense_output = true;
+ bool already_dense_mask = true;
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_output ) {
+#else
+ already_dense_output = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( local_x_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_mask = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( m ) );
+ if( !already_dense_mask ) {
+#else
+ already_dense_mask = false;
+#endif
+ local_m = internal::getCoordinates( m ).asyncSubset( lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( sparse ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic<
+#else
+ rc = internal::fold_from_scalar_to_vector_generic<
+#endif
+ descr, true, true, true, true
+ >(
+ already_dense_output, already_dense_mask,
+ lower_bound, upper_bound,
+ local_x, &local_m,
+ x, &m,
+ beta,
+ monoid.getOperator(), phase
+ );
+ } else {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic<
+#else
+ rc = internal::fold_from_scalar_to_vector_generic<
+#endif
+ descr, true, false, true, true
+ >(
+ already_dense_output, already_dense_mask,
+ lower_bound, upper_bound,
+ local_x, &local_m,
+ x, &m,
+ beta,
+ monoid.getOperator(), phase
+ );
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#else
+ if( !already_dense_vectors ) {
+#endif
+ internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_FOLD_MASKED_SCALAR_VECTOR_GENERIC,
+ internal::getCoordinates( x ).size(), sizeof( IOType ),
+ dense_descr, true,
+ &x, nullptr, &internal::getCoordinates( x ), nullptr,
+ &m, nullptr, nullptr, nullptr,
+ &internal::getCoordinates( m ), nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: foldl(x, m, beta, monoid)"
+ << std::endl;
+#endif
+ return ret;
+ }
+
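+	// foldl( x, y, op ): element-wise fold of y into x, i.e., x_i = x_i op y_i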
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP,
+ typename IOType,
+ typename InputType,
+ typename Coords
+ >
+ RC foldl(
+ Vector< IOType, nonblocking, Coords > &x,
+ const Vector< InputType, nonblocking, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< grb::is_operator< OP >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType >::value, void
+ >::type * = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D1, IOType >::value ), "grb::foldl",
+ "called with a vector x of a type that does not match the first domain "
+ "of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D2, InputType >::value ), "grb::foldl",
+ "called on a vector y of a type that does not match the second domain "
+ "of the given operator" );
+		NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D3, IOType >::value ), "grb::foldl",
+ "called on a vector x of a type that does not match the third domain "
+ "of the given operator" );
+
+ // dynamic sanity checks
+ const size_t n = size( x );
+ if( n != size( y ) ) {
+ return MISMATCH;
+ }
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func =
+ [&x, &y, &op, phase] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage foldl(x, y, operator) in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+ const Coords * const local_null_mask = nullptr;
+
+ Coords local_x, local_y;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_x_nz, local_y_nz;
+ bool sparse = false;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ bool already_dense_output = true;
+ bool already_dense_input = true;
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_output ) {
+#else
+ already_dense_output = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( local_x_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_input ) {
+#else
+ already_dense_input = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+ if( local_y_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( sparse ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic<
+#else
+ rc = internal::fold_from_vector_to_vector_generic<
+#endif
+ descr, true, true, false, false
+ >(
+ already_dense_output, already_dense_input, true,
+ lower_bound, upper_bound,
+ local_x, local_null_mask, local_y,
+ x, null_mask, y,
+ op, phase
+ );
+ } else {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic<
+#else
+ rc = internal::fold_from_vector_to_vector_generic<
+#endif
+ descr, true, false, false, false
+ >(
+ already_dense_output, already_dense_input, true,
+ lower_bound, upper_bound,
+ local_x, local_null_mask, local_y,
+ x, null_mask, y,
+ op, phase
+ );
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#else
+ if( !already_dense_vectors ) {
+#endif
+ internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_FOLD_VECTOR_VECTOR_GENERIC,
+ n, sizeof( IOType ), dense_descr, true,
+ &x, nullptr, &internal::getCoordinates( x ), nullptr,
+ &y, nullptr, nullptr, nullptr,
+ &internal::getCoordinates( y ), nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: foldl(x, y, operator)"
+ << std::endl;
+#endif
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename IOType,
+ typename InputType,
+ typename Coords
+ >
+ RC foldl(
+ Vector< IOType, nonblocking, Coords > &x,
+ const Vector< InputType, nonblocking, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< grb::is_monoid< Monoid >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType >::value, void
+ >::type * = nullptr
+ ) {
+ // static sanity checks
+		NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+			std::is_same< typename Monoid::D1, IOType >::value ), "grb::foldl",
+			"called with a vector x of a type that does not match the first domain "
+			"of the given monoid" );
+		NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+			std::is_same< typename Monoid::D2, InputType >::value ), "grb::foldl",
+			"called on a vector y of a type that does not match the second domain "
+			"of the given monoid" );
+		NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+			std::is_same< typename Monoid::D3, IOType >::value ), "grb::foldl",
+			"called on a vector x of a type that does not match the third domain "
+			"of the given monoid" );
+
+ // dynamic sanity checks
+ const size_t n = size( x );
+ if( n != size( y ) ) {
+ return MISMATCH;
+ }
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func =
+ [&x, &y, &monoid, phase] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage foldl(x, y, monoid) in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+ const Coords * const local_null_mask = nullptr;
+
+ Coords local_x, local_y;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_x_nz, local_y_nz;
+ bool sparse = false;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ bool already_dense_output = true;
+ bool already_dense_input = true;
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_output ) {
+#else
+ already_dense_output = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( local_x_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_input ) {
+#else
+ already_dense_input = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+ if( local_y_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( sparse ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic<
+#else
+ rc = internal::fold_from_vector_to_vector_generic<
+#endif
+ descr, true, true, false, true
+ >(
+ already_dense_output, already_dense_input, true,
+ lower_bound, upper_bound,
+ local_x, local_null_mask, local_y,
+ x, null_mask, y,
+ monoid.getOperator(), phase
+ );
+ } else {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic<
+#else
+ rc = internal::fold_from_vector_to_vector_generic<
+#endif
+ descr, true, false, false, true
+ >(
+ already_dense_output, already_dense_input, true,
+ lower_bound, upper_bound,
+ local_x, local_null_mask, local_y,
+ x, null_mask, y,
+ monoid.getOperator(), phase
+ );
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#else
+ if( !already_dense_vectors ) {
+#endif
+ internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_FOLD_VECTOR_VECTOR_GENERIC,
+ n, sizeof( IOType ), dense_descr, true,
+ &x, nullptr, &internal::getCoordinates( x ), nullptr,
+ &y, nullptr, nullptr, nullptr,
+ &internal::getCoordinates( y ), nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: foldl(x, y, monoid)"
+ << std::endl;
+#endif
+ return ret;
+ }
+
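+	// masked variant of foldl( x, y, op ): entries at which the mask m evaluates
+	// false are left untouched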
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP,
+ typename IOType,
+ typename MaskType,
+ typename InputType,
+ typename Coords
+ >
+ RC foldl(
+ Vector< IOType, nonblocking, Coords > &x,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const Vector< InputType, nonblocking, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< grb::is_operator< OP >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType >::value, void
+ >::type * = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D1, IOType >::value ), "grb::foldl",
+ "called with a vector x of a type that does not match the first domain "
+ "of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D2, InputType >::value ), "grb::foldl",
+ "called on a vector y of a type that does not match the second domain "
+ "of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D3, IOType >::value ), "grb::foldl",
+ "called on a vector x of a type that does not match the third domain "
+ "of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ), "grb::foldl",
+			"called with a mask that does not have boolean entries" );
+
+ // catch empty mask
+ if( size( m ) == 0 ) {
+ return foldl< descr >( x, y, op, phase );
+ }
+
+ // dynamic sanity checks
+ const size_t n = size( x );
+ if( n != size( y ) || n != size( m ) ) {
+ return MISMATCH;
+ }
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func =
+ [&x, &m, &y, &op, phase] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage foldl(x, m, y, op) in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_x, local_y, local_m;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_x_nz = local_n;
+ size_t local_y_nz = local_n;
+ bool sparse = false;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+ bool already_dense_output = true;
+ bool already_dense_input = true;
+ bool already_dense_mask = true;
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_output ) {
+#else
+ already_dense_output = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( local_x_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_mask = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( m ) );
+ if( !already_dense_mask ) {
+#else
+ already_dense_mask = false;
+#endif
+ local_m = internal::getCoordinates( m ).asyncSubset( lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_input ) {
+#else
+ already_dense_input = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+ if( local_y_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( sparse ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic<
+#else
+ rc = internal::fold_from_vector_to_vector_generic<
+#endif
+ descr, true, true, true, false
+ >(
+ already_dense_output, already_dense_input, already_dense_mask,
+ lower_bound, upper_bound,
+ local_x, &local_m, local_y,
+ x, &m, y,
+ op, phase
+ );
+ } else {
+ assert( local_x_nz == local_n );
+ assert( local_y_nz == local_n );
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic<
+#else
+ rc = internal::fold_from_vector_to_vector_generic<
+#endif
+ descr, true, false, true, false
+ >(
+ already_dense_output, already_dense_input, already_dense_mask,
+ lower_bound, upper_bound,
+ local_x, &local_m, local_y,
+ x, &m, y,
+ op, phase
+ );
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#else
+ if( !already_dense_vectors ) {
+#endif
+ internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_FOLD_MASKED_VECTOR_VECTOR_GENERIC,
+ n, sizeof( IOType ), dense_descr, true,
+ &x, nullptr, &internal::getCoordinates( x ), nullptr,
+ &y, &m, nullptr, nullptr,
+			&internal::getCoordinates( y ), &internal::getCoordinates( m ),
+			nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: foldl(x, m, y, op)" << std::endl;
+#endif
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename IOType,
+ typename MaskType,
+ typename InputType,
+ typename Coords
+ >
+ RC foldl(
+ Vector< IOType, nonblocking, Coords > &x,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const Vector< InputType, nonblocking, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< grb::is_monoid< Monoid >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType >::value, void
+ >::type * = nullptr
+ ) {
+ // static sanity checks
+		NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+			std::is_same< typename Monoid::D1, IOType >::value ), "grb::foldl",
+			"called with a vector x of a type that does not match the first domain "
+			"of the given monoid" );
+		NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+			std::is_same< typename Monoid::D2, InputType >::value ), "grb::foldl",
+			"called on a vector y of a type that does not match the second domain "
+			"of the given monoid" );
+		NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+			std::is_same< typename Monoid::D3, IOType >::value ), "grb::foldl",
+			"called on a vector x of a type that does not match the third domain "
+			"of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ), "grb::foldl",
+ "called with a mask that does not have boolean entries" );
+
+ // catch empty mask
+ if( size( m ) == 0 ) {
+ return foldl< descr >( x, y, monoid, phase );
+ }
+
+ // dynamic sanity checks
+ const size_t n = size( x );
+ if( n != size( y ) || n != size( m ) ) {
+ return MISMATCH;
+ }
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func =
+ [&x, &m, &y, &monoid, phase] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage foldl(x, m, y, monoid) in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_x, local_y, local_m;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_x_nz = local_n;
+ size_t local_y_nz = local_n;
+ bool sparse = false;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+ bool already_dense_output = true;
+ bool already_dense_input = true;
+ bool already_dense_mask = true;
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_output ) {
+#else
+ already_dense_output = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( local_x_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_mask = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( m ) );
+ if( !already_dense_mask ) {
+#else
+ already_dense_mask = false;
+#endif
+ local_m = internal::getCoordinates( m ).asyncSubset( lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_input ) {
+#else
+ already_dense_input = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+ if( local_y_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( sparse ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic<
+#else
+ rc = internal::fold_from_vector_to_vector_generic<
+#endif
+ descr, true, true, true, true
+ >(
+ already_dense_output, already_dense_input, already_dense_mask,
+ lower_bound, upper_bound,
+ local_x, &local_m, local_y,
+ x, &m, y,
+ monoid.getOperator(), phase
+ );
+ } else {
+ assert( local_x_nz == local_n );
+ assert( local_y_nz == local_n );
+
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic<
+#else
+ rc = internal::fold_from_vector_to_vector_generic<
+#endif
+ descr, true, false, true, true
+ >(
+ already_dense_output, already_dense_input, already_dense_mask,
+ lower_bound, upper_bound,
+ local_x, &local_m, local_y,
+ x, &m, y,
+ monoid.getOperator(), phase
+ );
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#else
+ if( !already_dense_vectors ) {
+#endif
+ internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_FOLD_MASKED_VECTOR_VECTOR_GENERIC,
+ n, sizeof( IOType ),
+ dense_descr, true,
+ &x, nullptr, &internal::getCoordinates( x ), nullptr,
+ &y, &m, nullptr, nullptr,
+ &internal::getCoordinates( y ), &internal::getCoordinates( m ),
+ nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: foldl(x, m, y, monoid)"
+ << std::endl;
+#endif
+ return ret;
+ }
+
+ namespace internal {
+
+ template<
+ bool left_scalar,
+ bool right_scalar,
+ bool left_sparse,
+ bool right_sparse,
+ Descriptor descr, class OP,
+#ifdef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_input_x,
+ bool already_dense_input_y,
+#endif
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC dense_apply_generic(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_input_x,
+ bool already_dense_input_y,
+#endif
+ const size_t lower_bound,
+ const size_t upper_bound,
+ const Coords &local_x,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &z_vector,
+ const internal::Wrapper< left_scalar, InputType1, Coords > x_wrapper,
+ const internal::Wrapper< right_scalar, InputType2, Coords > y_wrapper,
+ const OP &op
+ ) {
+#ifdef _DEBUG
+ std::cout << "\t internal::dense_apply_generic called\n";
+#endif
+ static_assert( !(left_scalar && left_sparse),
+ "The left-hand side must be scalar OR sparse, but cannot be both!" );
+ static_assert( !(right_scalar && right_sparse),
+ "The right-hand side must be scalar OR sparse, but cannot be both!" );
+ static_assert( !(left_sparse && right_sparse),
+ "If both left- and right-hand sides are sparse, use sparse_apply_generic "
+ "instead." );
+
+ // create local copies of the input const pointers
+ OutputType * __restrict__ const z_p = internal::getRaw( z_vector );
+ const InputType1 * __restrict__ x_p = x_wrapper.getRaw();
+ const InputType2 * __restrict__ y_p = y_wrapper.getRaw();
+
+ const size_t local_n = upper_bound - lower_bound;
+
+ constexpr const size_t block_size = OP::blocksize;
+ const size_t num_blocks = local_n / block_size;
+
+#ifndef NDEBUG
+ const bool has_coda = local_n % block_size > 0;
+#endif
+			size_t i = lower_bound;
+ const size_t start = 0;
+ const size_t end = num_blocks;
+
+ // declare and initialise local buffers for SIMD
+ OutputType z_b[ block_size ];
+ InputType1 x_b[ block_size ];
+ InputType2 y_b[ block_size ];
+ bool x_m[ block_size ];
+ bool y_m[ block_size ];
+ for( size_t k = 0; k < block_size; ++k ) {
+ if( left_scalar ) {
+ x_b[ k ] = x_wrapper.getValue();
+ }
+ if( right_scalar ) {
+ y_b[ k ] = y_wrapper.getValue();
+ }
+ }
+
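+			// main blocked loop: streams full blocks of block_size elements through
+			// the local buffers so that the compiler may vectorise each inner loop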
+ for( size_t block = start; block < end; ++block ) {
+ size_t local_i = i;
+ for( size_t k = 0; k < block_size; ++k ) {
+ if( !left_scalar ) {
+ x_b[ k ] = x_p[ local_i ];
+ }
+ if( !right_scalar ) {
+ y_b[ k ] = y_p[ local_i ];
+ }
+ if( left_sparse ) {
+ x_m[ k ] = already_dense_input_x || local_x.assigned( local_i -
+ lower_bound );
+ }
+ if( right_sparse ) {
+ y_m[ k ] = already_dense_input_y || local_y.assigned( local_i -
+ lower_bound );
+ }
+ (void) ++local_i;
+ }
+ for( size_t k = 0; k < block_size; ++k ) {
+ RC rc = SUCCESS;
+ if( left_sparse && !x_m[ k ] ) {
+ z_b[ k ] = y_b[ k ]; // WARNING: assumes monoid semantics!
+ } else if( right_sparse && !y_m[ k ] ) {
+ z_b[ k ] = x_b[ k ]; // WARNING: assumes monoid semantics!
+ } else {
+ rc = apply( z_b[ k ], x_b[ k ], y_b[ k ], op );
+ }
+ assert( rc == SUCCESS );
+#ifdef NDEBUG
+ (void) rc;
+#endif
+ }
+ for( size_t k = 0; k < block_size; ++k, ++i ) {
+ z_p[ i ] = z_b[ k ];
+ }
+ }
+
+#ifndef NDEBUG
+ if( has_coda ) {
+ assert( i < local_n + lower_bound );
+ } else {
+ assert( i == local_n + lower_bound );
+ }
+#endif
+
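+			// scalar coda: handles the trailing local_n % block_size elements that do
+			// not fill a complete block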
+ i = end * block_size + lower_bound;
+ for( ; i < local_n + lower_bound; ++i ) {
+ RC rc = SUCCESS;
+ if( left_scalar && right_scalar ) {
+ rc = apply( z_p[ i ], x_wrapper.getValue(), y_wrapper.getValue(), op );
+ } else if( left_scalar && !right_scalar ) {
+ if( right_sparse && !( already_dense_input_y || local_y.assigned( i -
+ lower_bound ) )
+ ) {
+ z_p[ i ] = x_wrapper.getValue();
+ } else {
+ rc = apply( z_p[ i ], x_wrapper.getValue(), y_p[ i ], op );
+ }
+ } else if( !left_scalar && right_scalar ) {
+ if( left_sparse && !( already_dense_input_x || local_x.assigned( i -
+ lower_bound ) )
+ ) {
+ z_p[ i ] = y_wrapper.getValue();
+ } else {
+ rc = apply( z_p[ i ], x_p[ i ], y_wrapper.getValue(), op );
+ }
+ } else {
+ assert( !left_scalar && !right_scalar );
+ if( left_sparse && !(already_dense_input_x || local_x.assigned( i -
+ lower_bound ) )
+ ) {
+ z_p[ i ] = y_p[ i ];
+ } else if( right_sparse && !(already_dense_input_y || local_y.assigned( i -
+ lower_bound ) )
+ ) {
+ z_p[ i ] = x_p[ i ];
+ } else {
+ assert( !left_sparse && !right_sparse );
+ rc = apply( z_p[ i ], x_p[ i ], y_p[ i ], op );
+ }
+ }
+ assert( rc == SUCCESS );
+#ifdef NDEBUG
+ (void) rc;
+#endif
+ }
+
+ return SUCCESS;
+ }
+
+ template<
+ bool masked,
+ bool monoid,
+ bool x_scalar,
+ bool y_scalar,
+ Descriptor descr,
+ class OP,
+#ifdef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_mask,
+ bool already_dense_input_x,
+ bool already_dense_input_y,
+#endif
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC sparse_apply_generic(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_mask,
+ bool already_dense_input_x,
+ bool already_dense_input_y,
+#endif
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_z,
+ const Coords * const local_mask_ptr,
+ const Coords &local_x,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &z_vector,
+ const Vector< MaskType, nonblocking, Coords > * const mask_vector,
+ const internal::Wrapper< x_scalar, InputType1, Coords > x_wrapper,
+ const internal::Wrapper< y_scalar, InputType2, Coords > y_wrapper,
+ const OP &op
+ ) {
+#ifndef GRB_NO_NOOP_CHECKS
+ static_assert( !internal::maybe_noop< OP >::value, "Warning: you may be "
+ "generating an output vector with uninitialised values. Define "
+ "the GRB_NO_NOOP_CHECKS macro to disable this check.\n" );
+#endif
+ // create local copies of the input const pointers
+ OutputType * __restrict__ const z_p = internal::getRaw( z_vector );
+ const MaskType * __restrict__ const mask_p = ( masked )
+ ? internal::getRaw( *mask_vector )
+ : nullptr;
+ const InputType1 * __restrict__ x_p = x_wrapper.getRaw();
+ const InputType2 * __restrict__ y_p = y_wrapper.getRaw();
+
+ Coords local_mask;
+ if( masked ) {
+ local_mask = *local_mask_ptr;
+ }
+
+ const size_t local_n = upper_bound - lower_bound;
+ const size_t local_x_nz = already_dense_input_x
+ ? local_n
+ : local_x.nonzeroes();
+ const size_t local_y_nz = already_dense_input_y
+ ? local_n
+ : local_y.nonzeroes();
+
+ // assertions
+ assert( !masked || local_mask_ptr != nullptr );
+ assert( !masked || local_mask_ptr->size() == local_n );
+ assert( x_scalar || local_x_nz <= local_n );
+ assert( y_scalar || local_y_nz <= local_n );
+
+#ifdef _DEBUG
+ std::cout << "\tinternal::sparse_apply_generic called\n";
+#endif
+ constexpr const size_t block_size = OP::blocksize;
+
+ // swap so that we do the expensive pass over the container with the fewest
+ // nonzeroes first
+ assert( !x_scalar || !y_scalar );
+ const bool swap = ( ( x_scalar || already_dense_input_x )
+ ? local_n
+ : local_x_nz
+ ) > ( ( y_scalar || already_dense_input_y )
+ ? local_n
+ : local_y_nz
+ );
+ const Coordinates< nonblocking > &loop_coors = swap ? local_y : local_x;
+ const Coordinates< nonblocking > &chk_coors = swap ? local_x : local_y;
+ const bool already_dense_loop = swap
+ ? already_dense_input_y
+ : already_dense_input_x;
+ const bool already_dense_chk = swap
+ ? already_dense_input_x
+ : already_dense_input_y;
+
+ const size_t loop_coors_nz = swap ? local_y_nz : local_x_nz;
+ const size_t chk_coors_nz = swap ? local_x_nz : local_y_nz;
+#ifdef _DEBUG
+			std::cout << "\t\tfirst-phase loop of size " << loop_coors_nz << "\n";
+ if( x_scalar || y_scalar ) {
+ std::cout << "\t\tthere will be no second phase because one of the inputs "
+ << "is scalar\n";
+ } else {
+				std::cout << "\t\tsecond-phase loop of size " << chk_coors_nz << "\n";
+ }
+#endif
+ // declare buffers for vectorisation
+ size_t offsets[ block_size ];
+ OutputType z_b[ block_size ];
+ InputType1 x_b[ block_size ];
+ InputType2 y_b[ block_size ];
+ bool mask[ block_size ];
+ bool x_m[ block_size ];
+ bool y_m[ block_size ];
+
+ if( x_scalar ) {
+ for( size_t k = 0; k < block_size; ++k ) {
+ x_b[ k ] = x_wrapper.getValue();
+ }
+ }
+ if( y_scalar ) {
+ for( size_t k = 0; k < block_size; ++k ) {
+ y_b[ k ] = y_wrapper.getValue();
+ }
+ }
+
+ // expensive pass #1
+ size_t start = 0;
+ size_t end = loop_coors_nz / block_size;
+ size_t k = 0;
+ for( size_t b = start; b < end; ++b ) {
+ // perform gathers
+ for( size_t i = 0; i < block_size; ++i ) {
+ const size_t index = ( already_dense_loop )
+ ? ( ( k++ ) + lower_bound )
+ : ( loop_coors.index( k++ ) + lower_bound );
+ offsets[ i ] = index;
+ assert( index < local_n + lower_bound );
+ if( masked ) {
+ if( already_dense_mask ) {
+ mask[ i ] = internal::getCoordinates( *mask_vector ).template
+ mask< descr >( index, mask_p );
+ } else {
+ mask[ i ] = local_mask.template mask< descr >( index - lower_bound,
+ mask_p + lower_bound );
+ }
+ }
+ }
+				// gather the input values and the check-side sparsity structure
+ for( size_t i = 0; i < block_size; ++i ) {
+ if( !masked || mask[ i ] ) {
+ if( !x_scalar ) {
+ x_b[ i ] = x_p[ offsets[ i ] ];
+ }
+ if( !x_scalar && !y_scalar ) {
+ y_m[ i ] = already_dense_chk || chk_coors.assigned( offsets[ i ] -
+ lower_bound );
+ } else {
+ y_m[ i ] = true;
+ }
+ if( !y_scalar ) {
+ y_b[ i ] = y_p[ offsets[ i ] ];
+ }
+ } else {
+ y_m[ i ] = false;
+ }
+ }
+ // perform compute
+ for( size_t i = 0; i < block_size; ++i ) {
+ RC rc = SUCCESS;
+ if( y_m[ i ] ) {
+ rc = apply( z_b[ i ], x_b[ i ], y_b[ i ], op );
+					} else if( monoid ) {
+						// the chk-side input is missing here; under monoid semantics the
+						// result then equals the loop-side value
+						if( swap ) {
+							z_b[ i ] = static_cast< typename OP::D3 >( y_b[ i ] );
+						} else {
+							z_b[ i ] = static_cast< typename OP::D3 >( x_b[ i ] );
+						}
+					}
+ assert( rc == SUCCESS );
+#ifdef NDEBUG
+ (void) rc;
+#endif
+ }
+				// the following may or may not vectorise: the data-dependent updates to
+				// local_z are scatter-like
+ for( size_t i = 0; i < block_size; ++i ) {
+ if( !masked || mask[ i ] ) {
+ if( y_m[ i ] || monoid ) {
+ (void) local_z.assign( offsets[ i ] - lower_bound );
+ }
+ }
+ }
+ // perform scatter
+ for( size_t i = 0; i < block_size; ++i ) {
+ if( !masked || mask[ i ] ) {
+ if( monoid || y_m[ i ] ) {
+ GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // the only way the below could write
+ // an uninitialised value is if the
+ // static_assert at the top of this
+ z_p[ offsets[ i ] ] = z_b[ i ]; // function had triggered. See also
+ GRB_UTIL_RESTORE_WARNINGS // internal issue #321.
+ }
+ }
+ }
+ }
+
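+			// coda of pass #1: handles the remaining loop-side nonzeroes one at a time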
+ for( ; k < loop_coors_nz; ++k ) {
+ const size_t index = ( already_dense_loop )
+ ? k + lower_bound
+ : loop_coors.index( k ) + lower_bound;
+ if( masked ) {
+ if( already_dense_mask ) {
+ if( !internal::getCoordinates( *mask_vector ).template mask< descr >(
+ index, mask_p )
+ ) {
+ continue;
+ }
+ } else {
+ if( !local_mask.template mask< descr >( index - lower_bound, mask_p +
+ lower_bound )
+ ) {
+ continue;
+ }
+ }
+ }
+ RC rc = SUCCESS;
+ (void) local_z.assign( index - lower_bound );
+ if( x_scalar || y_scalar || already_dense_chk || chk_coors.assigned(
+ index - lower_bound )
+ ) {
+ rc = apply(
+ z_p[ index ],
+ ( x_scalar )
+ ? x_wrapper.getValue()
+ : x_p[ index ],
+ ( y_scalar )
+ ? y_wrapper.getValue()
+ : y_p[ index ],
+ op
+ );
+				} else if( monoid ) {
+					// only the loop-side input is present here; monoid semantics then
+					// yield the loop-side value as the result
+					if( swap ) {
+						z_p[ index ] = y_scalar ?
+							static_cast< typename OP::D3 >( y_wrapper.getValue() ) :
+							static_cast< typename OP::D3 >( y_p[ index ] );
+					} else {
+						z_p[ index ] = x_scalar ?
+							static_cast< typename OP::D3 >( x_wrapper.getValue() ) :
+							static_cast< typename OP::D3 >( x_p[ index ] );
+					}
+				}
+ assert( rc == SUCCESS );
+#ifdef NDEBUG
+ (void) rc;
+#endif
+ }
+
+ // cheaper pass #2, only required if we are using monoid semantics
+ // AND if both inputs are vectors
+ if( monoid && !x_scalar && !y_scalar ) {
+ start = 0;
+ end = chk_coors_nz / block_size;
+ k = 0;
+ for( size_t b = start; b < end; ++b ) {
+ // streaming load
+ for( size_t i = 0; i < block_size; i++ ) {
+ offsets[ i ] = ( already_dense_chk )
+ ? ( ( k++ ) + lower_bound )
+ : ( chk_coors.index( k++ ) + lower_bound );
+ assert( offsets[ i ] < local_n + lower_bound );
+ }
+ // pure gather
+ for( size_t i = 0; i < block_size; i++ ) {
+ x_m[ i ] = already_dense_loop || loop_coors.assigned( offsets[ i ] -
+ lower_bound );
+ }
+ // gather-like
+ for( size_t i = 0; i < block_size; i++ ) {
+ if( masked ) {
+ if( already_dense_mask ) {
+ mask[ i ] = utils::interpretMask< descr >(
+ internal::getCoordinates( *mask_vector ).assigned( offsets[ i ] ),
+ mask_p, offsets[ i ]
+ );
+ } else {
+ mask[ i ] = utils::interpretMask< descr >(
+ local_mask.assigned( offsets[ i ] - lower_bound ),
+ mask_p, offsets[ i ]
+ );
+ }
+ }
+ }
+ // SIMD
+ for( size_t i = 0; i < block_size; i++ ) {
+ x_m[ i ] = ! x_m[ i ];
+ }
+ // SIMD
+ for( size_t i = 0; i < block_size; i++ ) {
+ if( masked ) {
+ mask[ i ] = mask[ i ] && x_m[ i ];
+ }
+ }
+ if( !swap ) {
+ // gather
+ for( size_t i = 0; i < block_size; ++i ) {
+ if( masked ) {
+ if( mask[ i ] ) {
+ y_b[ i ] = y_p[ offsets[ i ] ];
+ }
+ } else {
+ if( x_m[ i ] ) {
+ y_b[ i ] = y_p[ offsets[ i ] ];
+ }
+ }
+ }
+ // SIMD
+ for( size_t i = 0; i < block_size; i++ ) {
+ if( masked ) {
+ if( mask[ i ] ) {
+ z_b[ i ] = y_b[ i ];
+ }
+ } else {
+ if( x_m[ i ] ) {
+ z_b[ i ] = y_b[ i ];
+ }
+ }
+ }
+ } else {
+ // gather
+ for( size_t i = 0; i < block_size; ++i ) {
+ if( masked ) {
+ if( mask[ i ] ) {
+ x_b[ i ] = x_p[ offsets[ i ] ];
+ }
+ } else {
+ if( x_m[ i ] ) {
+ x_b[ i ] = x_p[ offsets[ i ] ];
+ }
+ }
+ }
+ // SIMD
+ for( size_t i = 0; i < block_size; i++ ) {
+ if( masked ) {
+ if( mask[ i ] ) {
+ z_b[ i ] = static_cast< typename OP::D3 >( x_b[ i ] );
+ }
+ } else {
+ if( x_m[ i ] ) {
+ z_b[ i ] = static_cast< typename OP::D3 >( x_b[ i ] );
+ }
+ }
+ }
+ }
+ // SIMD-like
+ for( size_t i = 0; i < block_size; i++ ) {
+ if( masked ) {
+ if( mask[ i ] ) {
+							(void) local_z.assign( offsets[ i ] - lower_bound );
+ }
+ } else {
+ if( x_m[ i ] ) {
+							(void) local_z.assign( offsets[ i ] - lower_bound );
+ }
+ }
+ }
+ // scatter
+ for( size_t i = 0; i < block_size; i++ ) {
+ if( masked ) {
+ if( mask[ i ] ) {
+ GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED
+
+ z_p[ offsets[ i ] ] = z_b[ i ];
+
+ GRB_UTIL_RESTORE_WARNINGS
+ }
+ } else {
+ if( x_m[ i ] ) {
+#ifdef _DEBUG
+ std::cout << "\t\t writing out " << z_b[ i ] << " to index "
+ << offsets[ i ] << "\n";
+#endif
+ GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // the only way the below could write
+ // an uninitialised value is if the
+ // static_assert at the top of this
+ z_p[ offsets[ i ] ] = z_b[ i ]; // function had triggered. See also
+ GRB_UTIL_RESTORE_WARNINGS // internal issue #321.
+ }
+ }
+ }
+ }
+ for( ; k < chk_coors_nz; ++k ) {
+ const size_t index = ( ( already_dense_chk )
+ ? k
+ : chk_coors.index( k ) ) + lower_bound;
+ assert( index < local_n + lower_bound );
+ if( already_dense_loop || loop_coors.assigned( index - lower_bound ) ) {
+ continue;
+ }
+ if( masked ) {
+ if( already_dense_mask ) {
+ if( !internal::getCoordinates( *mask_vector ).template mask< descr >(
+ index, mask_p )
+ ) {
+ continue;
+ }
+ } else {
+ if( !local_mask.template mask< descr >( index - lower_bound, mask_p +
+ lower_bound )
+ ) {
+ continue;
+ }
+ }
+ }
+ (void) local_z.assign( index - lower_bound );
+ z_p[ index ] = swap ? x_p[ index ] : y_p[ index ];
+ }
+ }
+
+ return SUCCESS;
+ }
+
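+ // Computes z = x .op. y at every index where the mask evaluates to true.
+ // Each input may independently be a scalar (left_scalar/right_scalar) or
+ // a sparse vector (left_sparse/right_sparse); for a sparse input, the
+ // given left/right identity is substituted at unassigned positions. An
+ // input may not be both scalar and sparse. Depending on the mask density
+ // and the invert_mask descriptor, either a Theta(n) loop over the full
+ // local range or a Theta(nnz(mask)) loop over the mask pattern is chosen.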
+ template<
+ bool left_scalar,
+ bool right_scalar,
+ bool left_sparse,
+ bool right_sparse,
+ Descriptor descr,
+ class OP,
+#ifdef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_mask,
+ bool already_dense_input_x,
+ bool already_dense_input_y,
+#endif
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC masked_apply_generic(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_mask,
+ bool already_dense_input_x,
+ bool already_dense_input_y,
+#endif
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_z,
+ const Coords &local_mask,
+ const Coords &local_x,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &z_vector,
+ const Vector< MaskType, nonblocking, Coords > &mask_vector,
+ const internal::Wrapper< left_scalar, InputType1, Coords > x_wrapper,
+ const internal::Wrapper< right_scalar, InputType2, Coords > y_wrapper,
+ const OP &op,
+#ifdef GRB_BOOLEAN_DISPATCHER
+ const InputType1 * const left_identity,
+ const InputType2 * const right_identity
+#else
+ const InputType1 * const left_identity = nullptr,
+ const InputType2 * const right_identity = nullptr
+#endif
+ ) {
+#ifdef _DEBUG
+ std::cout << "In masked_apply_generic< " << left_scalar << ", "
+ << right_scalar << ", " << left_sparse << ", " << right_sparse << ", "
+ << descr << " > with lower_bound = " << lower_bound << " and upper_bound = "
+ << upper_bound << "\n";
+#endif
+ // assertions
+ static_assert( !(left_scalar && left_sparse),
+ "left_scalar and left_sparse cannot both be set!"
+ );
+ static_assert( !(right_scalar && right_sparse),
+ "right_scalar and right_sparse cannot both be set!"
+ );
+ assert( !left_sparse || left_identity != nullptr );
+ assert( !right_sparse || right_identity != nullptr );
+
+ // create local copies of the input const pointers
+ OutputType * __restrict__ const z_p = internal::getRaw( z_vector );
+ const MaskType * __restrict__ const mask_p = internal::getRaw( mask_vector );
+ const InputType1 * __restrict__ x_p = x_wrapper.getRaw();
+ const InputType2 * __restrict__ y_p = y_wrapper.getRaw();
+
+ const size_t local_n = upper_bound - lower_bound;
+ const size_t local_mask_nz = ( already_dense_mask )
+ ? local_n
+ : local_mask.nonzeroes();
+#ifdef _DEBUG
+ std::cout << "\tinternal::masked_apply_generic called with nnz(mask)="
+ << local_mask_nz << " and descriptor " << descr << "\n";
+ if( local_mask_nz > 0 ) {
+ std::cout << "\t\tNonzero mask indices: "
+ << ( already_dense_mask ? 0 : local_mask.index( 0 ) );
+ assert( already_dense_mask ||
+ local_mask.assigned( local_mask.index( 0 ) )
+ );
+ for( size_t k = 1; k < local_mask_nz; ++k ) {
+ std::cout << ", "
+ << ( ( already_dense_mask ) ? k : local_mask.index( k ) );
+ assert(
+ already_dense_mask ||
+ local_mask.assigned( local_mask.index( k ) )
+ );
+ }
+ std::cout << "\n";
+ }
+
+ size_t unset = 0;
+ for( size_t i = 0; i < local_n; ++i ) {
+ if( !( already_dense_mask || local_mask.assigned( i ) ) ) {
+ (void) ++unset;
+ }
+ }
+ assert( unset == local_n - local_mask_nz );
+#endif
+ // whether to use a Theta(n) or a Theta(nnz(mask)) loop
+ const bool bigLoop = local_mask_nz == local_n ||
+ (descr & descriptors::invert_mask);
+
+ // get block size
+ constexpr size_t size_t_block_size = config::SIMD_SIZE::value() /
+ sizeof( size_t );
+ constexpr size_t op_block_size = OP::blocksize;
+ constexpr size_t min_block_size = op_block_size > size_t_block_size
+ ? size_t_block_size
+ : op_block_size;
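+ // for example, assuming a 64-byte SIMD width (config::SIMD_SIZE) and an
+ // 8-byte size_t, size_t_block_size equals 8; if OP::blocksize were 4,
+ // min_block_size would be min( 4, 8 ) = 4. Both quantities are platform-
+ // and operator-dependent, so these numbers are purely illustrative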
+
+ if( bigLoop ) {
+#ifdef _DEBUG
+ std::cerr << "\t in bigLoop variant\n";
+#endif
+ size_t i = lower_bound;
+
+ constexpr const size_t block_size = op_block_size;
+ const size_t num_blocks = local_n / block_size;
+ const size_t start = 0;
+ const size_t end = num_blocks;
+
+ // declare buffers that fit in a single SIMD register and initialise if
+ // needed
+ bool mask_b[ block_size ];
+ OutputType z_b[ block_size ];
+ InputType1 x_b[ block_size ];
+ InputType2 y_b[ block_size ];
+ for( size_t k = 0; k < block_size; ++k ) {
+ if( left_scalar ) {
+ x_b[ k ] = x_wrapper.getValue();
+ }
+ if( right_scalar ) {
+ y_b[ k ] = y_wrapper.getValue();
+ }
+ }
+ for( size_t b = start; b < end; ++b ) {
+ for( size_t k = 0; k < block_size; ++k ) {
+ const size_t index = i + k;
+ assert( index < local_n + lower_bound );
+ if( already_dense_mask ) {
+ mask_b[ k ] = internal::getCoordinates( mask_vector ).template
+ mask< descr >( index, mask_p );
+ } else {
+ mask_b[ k ] = local_mask.template
+ mask< descr >( index - lower_bound, mask_p + lower_bound );
+ }
+ }
+ // check for no output
+ if( left_sparse && right_sparse ) {
+ for( size_t k = 0; k < block_size; ++k ) {
+ const size_t index = i + k;
+ assert( index < local_n + lower_bound );
+ if( mask_b[ k ] ) {
+ if( !( already_dense_input_x ||
+ local_x.assigned( index - lower_bound )
+ ) && !(
+ already_dense_input_y ||
+ local_y.assigned( index - lower_bound )
+ )
+ ) {
+ mask_b[ k ] = false;
+ }
+ }
+ }
+ }
+ for( size_t k = 0; k < block_size; ++k ) {
+ const size_t index = i + k;
+ assert( index < local_n + lower_bound );
+ if( mask_b[ k ] ) {
+ if( !left_scalar ) {
+ if( left_sparse && !(
+ already_dense_input_x || local_x.assigned( index - lower_bound )
+ ) ) {
+ x_b[ k ] = *left_identity;
+ } else {
+ x_b[ k ] = *( x_p + index );
+ }
+ }
+ if( !right_scalar ) {
+ if( right_sparse && !(
+ already_dense_input_y || local_y.assigned( index - lower_bound )
+ ) ) {
+ y_b[ k ] = *right_identity;
+ } else {
+ y_b[ k ] = *( y_p + index );
+ }
+ }
+ }
+ }
+ for( size_t k = 0; k < block_size; ++k ) {
+ if( mask_b[ k ] ) {
+ apply( z_b[ k ], x_b[ k ], y_b[ k ], op );
+ }
+ }
+ for( size_t k = 0; k < block_size; ++k ) {
+ const size_t index = i + k;
+ assert( index < local_n + lower_bound );
+ if( mask_b[ k ] ) {
+ (void) local_z.assign( index - lower_bound );
+ GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // This is only triggered with
+ *( z_p + index ) = z_b[ k ]; // mask_b[ k ], which in the above
+ GRB_UTIL_RESTORE_WARNINGS // loop also triggers initialising
+ // z_b[ k ]
+ }
+ }
+
+ i += block_size;
+ }
+ // scalar coda
+ for(
+ size_t i = end * block_size + lower_bound;
+ i < local_n + lower_bound;
+ ++i
+ ) {
+ if( already_dense_mask ) {
+ if( !internal::getCoordinates( mask_vector ).template mask< descr >( i,
+ mask_p )
+ ) {
+ continue;
+ }
+ } else {
+ if( !local_mask.template mask< descr >( i - lower_bound, mask_p +
+ lower_bound )
+ ) {
+ continue;
+ }
+ }
+
+ if( left_sparse && right_sparse ) {
+ if( !( already_dense_input_x || local_x.assigned( i - lower_bound ) ) &&
+ !( already_dense_input_y || local_y.assigned( i - lower_bound ) )
+ ) {
+ continue;
+ }
+ }
+ (void) local_z.assign( i - lower_bound );
+ const InputType1 x_e = left_scalar
+ ? x_wrapper.getValue()
+ : ( (!left_sparse || already_dense_input_x ||
+ local_x.assigned( i - lower_bound ))
+ ? *(x_p + i)
+ : *left_identity
+ );
+ const InputType2 y_e = right_scalar
+ ? y_wrapper.getValue()
+ : ( (!right_sparse || already_dense_input_y ||
+ local_y.assigned( i - lower_bound ))
+ ? *(y_p + i)
+ : *right_identity
+ );
+ OutputType * const z_e = z_p + i;
+ apply( *z_e, x_e, y_e, op );
+ }
+ } else {
+#ifdef _DEBUG
+ std::cerr << "\t in smallLoop variant\n";
+#endif
+ // declare buffers that fit in a single SIMD register and initialise if
+ // needed
+ constexpr const size_t block_size = size_t_block_size > 0
+ ? min_block_size
+ : op_block_size;
+ bool mask_b[ block_size ];
+ OutputType z_b[ block_size ];
+ InputType1 x_b[ block_size ];
+ InputType2 y_b[ block_size ];
+ size_t indices[ block_size ];
+ for( size_t k = 0; k < block_size; ++k ) {
+ if( left_scalar ) {
+ x_b[ k ] = x_wrapper.getValue();
+ }
+ if( right_scalar ) {
+ y_b[ k ] = y_wrapper.getValue();
+ }
+ }
+
+ // loop over mask pattern
+ const size_t mask_nnz = local_mask_nz;
+ const size_t num_blocks = mask_nnz / block_size;
+ const size_t start = 0;
+ const size_t end = num_blocks;
+
+ size_t k = 0;
+
+ // vectorised code
+ for( size_t b = start; b < end; ++b ) {
+ for( size_t t = 0; t < block_size; ++t ) {
+ indices[ t ] = ( already_dense_mask ) ? k + t : local_mask.index( k + t );
+ }
+ for( size_t t = 0; t < block_size; ++t ) {
+ if( already_dense_mask ) {
+ mask_b[ t ] = internal::getCoordinates( mask_vector ).template
+ mask< descr >( indices[ t ], mask_p );
+ } else {
+ mask_b[ t ] = local_mask.template
+ mask< descr >( indices[ t ], mask_p + lower_bound );
+ }
+ }
+ for( size_t t = 0; t < block_size; ++t ) {
+ if( mask_b[ t ] ) {
+ if( !left_scalar ) {
+ if( left_sparse && !( already_dense_input_x ||
+ local_x.assigned( indices[ t ] ) )
+ ) {
+ x_b[ t ] = *left_identity;
+ } else {
+ x_b[ t ] = *( x_p + indices[ t ] + lower_bound );
+ }
+ }
+ if( !right_scalar ) {
+ if( right_sparse && !( already_dense_input_y ||
+ local_y.assigned( indices[ t ] ) )
+ ) {
+ y_b[ t ] = *right_identity;
+ } else {
+ y_b[ t ] = *( y_p + indices[ t ] + lower_bound );
+ }
+ }
+ }
+ }
+ // check for no output
+ if( left_sparse && right_sparse ) {
+ for( size_t t = 0; t < block_size; ++t ) {
+ const size_t index = indices[ t ];
+ assert( index < local_n + lower_bound );
+ if( mask_b[ t ] ) {
+ if( !( already_dense_input_x || local_x.assigned( index ) ) &&
+ !( already_dense_input_y || local_y.assigned( index ) )
+ ) {
+ mask_b[ t ] = false;
+ }
+ }
+ }
+ }
+ for( size_t t = 0; t < block_size; ++t ) {
+ if( mask_b[ t ] ) {
+ apply( z_b[ t ], x_b[ t ], y_b[ t ], op );
+ }
+ }
+ for( size_t t = 0; t < block_size; ++t ) {
+ if( mask_b[ t ] ) {
+ (void) local_z.assign( indices[ t ] );
+ GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // z_b is computed from
+ *( z_p + indices[ t ] + lower_bound ) = z_b[ t ]; // x_b and y_b, which
+ GRB_UTIL_RESTORE_WARNINGS // are both initialised
+ // if mask_b is true
+ }
+ }
+
+ k += block_size;
+ }
+
+ // scalar coda
+ for( size_t k = end * block_size; k < mask_nnz; ++k ) {
+ const size_t i = already_dense_mask
+ ? k + lower_bound
+ : local_mask.index( k ) + lower_bound;
+ // consult either the dense mask view or the local subset, never both
+ if( already_dense_mask
+ ? internal::getCoordinates( mask_vector ).template mask< descr >(
+ i, mask_p
+ )
+ : local_mask.template mask< descr >(
+ i - lower_bound, mask_p + lower_bound
+ )
+ ) {
+ if( left_sparse && right_sparse ) {
+ if( !( already_dense_input_x || local_x.assigned( i - lower_bound ) ) &&
+ !( already_dense_input_y || local_y.assigned( i - lower_bound ) )
+ ) {
+ continue;
+ }
+ }
+ (void) local_z.assign( i - lower_bound );
+ const InputType1 x_e = left_scalar
+ ? x_wrapper.getValue()
+ : (
+ (!left_sparse || already_dense_input_x ||
+ local_x.assigned( i - lower_bound ) )
+ ? *(x_p + i)
+ : *left_identity
+ );
+ const InputType2 y_e = right_scalar
+ ? y_wrapper.getValue()
+ : (
+ (!right_sparse || already_dense_input_y ||
+ local_y.assigned( i - lower_bound ) )
+ ? *(y_p + i)
+ : *right_identity
+ );
+ OutputType * const z_e = z_p + i;
+ apply( *z_e, x_e, y_e, op );
+ }
+ }
+ }
+ return SUCCESS;
+ }
+
+ } // end namespace ``grb::internal''
+
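+ // The below operator variant computes z = x .op. beta element-wise and
+ // registers the computation as a lazily-executed nonblocking pipeline
+ // stage. A minimal usage sketch (names and sizes illustrative only,
+ // assuming the standard add operator over doubles):
+ //
+ //   grb::Vector< double, grb::nonblocking > x( n ), z( n );
+ //   grb::operators::add< double > op;
+ //   grb::RC rc = grb::eWiseApply( z, x, 1.5, op );
+ //
+ // The call only enqueues the stage; execution is deferred until the
+ // nonblocking backend must materialise z.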
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const InputType2 beta,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value, void
+ >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D1, InputType1 >::value ), "grb::eWiseApply",
+ "called with a left-hand input element type that does not match the "
+ "first domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D2, InputType2 >::value ), "grb::eWiseApply",
+ "called with a right-hand input element type that does not match the "
+ "second domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D3, OutputType >::value ), "grb::eWiseApply",
+ "called with an output element type that does not match the "
+ "third domain of the given operator" );
+#ifdef _DEBUG
+ std::cout << "In eWiseApply ([T1]<-[T2]<-T3), operator variant\n";
+#endif
+ // sanity check
+ auto &z_coors = internal::getCoordinates( z );
+ const size_t n = z_coors.size();
+ if( internal::getCoordinates( x ).size() != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func =
+ [&z, &x, beta, &op] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage eWiseApply(z, x, beta, operator) in "
+ << "the range(" << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+ const Coords * const local_null_mask = nullptr;
+
+ Coords local_mask, local_x, local_y, local_z;
+ const size_t local_n = upper_bound - lower_bound;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ bool already_dense_input_x = true;
+
+ size_t local_x_nz = local_n;
+
+ if( !already_dense_vectors ) {
+ local_z = internal::getCoordinates( z ).asyncSubset( lower_bound,
+ upper_bound );
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_input_x = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_input_x ) {
+#else
+ already_dense_input_x = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ const internal::Wrapper< false, InputType1, Coords > x_wrapper( x );
+ const internal::Wrapper< true, InputType2, Coords > y_wrapper( beta );
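+
+ // the boolean template argument of internal::Wrapper selects scalar
+ // semantics: x_wrapper exposes the vector x via getRaw(), whereas
+ // y_wrapper broadcasts the scalar beta via getValue()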
+
+ // the global stack counter must be reset to 0 unless it is guaranteed
+ // that neither local_clear nor local_assignAll will be invoked:
+ // - local_clear is not invoked when the dense descriptor is given,
+ //   since the output vector will eventually become dense
+ // - local_assignAll is not invoked when the output vector is already dense
+ // the condition below therefore relies on global information only,
+ // i.e., the dense descriptor and the already-dense status of the output
+ if( !already_dense_vectors ) {
+ if( lower_bound == 0 ) {
+ internal::getCoordinates( z ).reset_global_nnz_counter();
+ }
+ }
+
+ if( local_x_nz == local_n ) {
+ if( !already_dense_vectors ) {
+ local_z.local_assignAll();
+ }
+
+ // call dense apply
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_dense_apply_generic<
+#else
+ rc = internal::dense_apply_generic<
+#endif
+ false, true, false, false, descr | descriptors::dense, OP,
+ OutputType, InputType1, InputType2, Coords
+ >(
+ already_dense_input_x, true,
+ lower_bound, upper_bound,
+ local_x, local_y,
+ z, x_wrapper, y_wrapper,
+ op
+ );
+ } else {
+ if( !already_dense_vectors ) {
+ local_z.local_clear();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) );
+#endif
+ }
+
+ // since z and x may not perfectly overlap, and since the intersection is
+ // unknown a priori, we must iterate over the nonzeroes of x
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_sparse_apply_generic<
+#else
+ rc = internal::sparse_apply_generic<
+#endif
+ false, false, false, true, descr, OP,
+ OutputType, bool, InputType1, InputType2, Coords
+ >(
+ true, already_dense_input_x, true,
+ lower_bound, upper_bound,
+ local_z, local_null_mask, local_x, local_y,
+ z, null_mask, x_wrapper, y_wrapper, op
+ );
+ }
+
+ if( !already_dense_vectors ) {
+ internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_EWISEAPPLY,
+ n, sizeof( OutputType ), dense_descr, true,
+ &z, nullptr, &internal::getCoordinates( z ), nullptr,
+ &x, nullptr, nullptr, nullptr,
+ &internal::getCoordinates( x ), nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: eWiseApply(z, x, beta, operator)"
+ << std::endl;
+#endif
+
+ return ret;
+ }
+
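+ // the below variant takes two scalar inputs: it computes alpha .op. beta
+ // once and then broadcasts the result to z by delegating to set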
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value, void
+ >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D1, InputType1 >::value ), "grb::eWiseApply",
+ "called with a left-hand input element type that does not match the "
+ "first domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D2, InputType2 >::value ), "grb::eWiseApply",
+ "called with a right-hand input element type that does not match the "
+ "second domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D3, OutputType >::value ), "grb::eWiseApply",
+ "called with an output element type that does not match the "
+ "third domain of the given operator" );
+#ifdef _DEBUG
+ std::cout << "In eWiseApply ([T1]<-T2<-T3), operator variant\n";
+#endif
+ if( (descr & descriptors::dense) && nnz( z ) < size( z ) ) {
+ return ILLEGAL;
+ }
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ typename OP::D3 val;
+ RC ret = apply< descr >( val, alpha, beta, op );
+ ret = ret ? ret : set< descr >( z, val );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value, void
+ >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D1, InputType1 >::value ), "grb::eWiseApply",
+ "called with a left-hand input element type that does not match the "
+ "first domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D2, InputType2 >::value ), "grb::eWiseApply",
+ "called with a right-hand input element type that does not match the "
+ "second domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D3, OutputType >::value ), "grb::eWiseApply",
+ "called with an output element type that does not match the "
+ "third domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ), "grb::eWiseApply",
+ "called with an output mask element type that is not Boolean " );
+#ifdef _DEBUG
+ std::cout << "In masked eWiseApply ([T1]<-T2<-T3), operator variant\n";
+#endif
+ // check trivial dispatch
+ if( size( mask ) == 0 ) {
+ return eWiseApply< descr >( z, alpha, beta, op, phase );
+ }
+
+ // dynamic checks
+ if( size( mask ) != size( z ) ) {
+ return MISMATCH;
+ }
+ if( (descr & descriptors::dense) &&
+ ( nnz( z ) < size( z ) || nnz( mask ) < size( mask ) )
+ ) {
+ return ILLEGAL;
+ }
+
+ // check trivial dispatch
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ typename OP::D3 val;
+ RC ret = apply< descr >( val, alpha, beta, op );
+ ret = ret ? ret : set< descr >( z, mask, val );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D1, InputType1 >::value ), "grb::eWiseApply",
+ "called with a left-hand input element type that does not match the "
+ "first domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D2, InputType2 >::value ), "grb::eWiseApply",
+ "called with a right-hand input element type that does not match the "
+ "second domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D3, OutputType >::value ), "grb::eWiseApply",
+ "called with an output element type that does not match the "
+ "third domain of the given monoid" );
+#ifdef _DEBUG
+ std::cout << "In eWiseApply ([T1]<-T2<-T3), monoid variant\n";
+#endif
+ // simply delegate to operator variant
+ return eWiseApply< descr >( z, alpha, beta, monoid.getOperator(), phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value, void
+ >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D1, InputType1 >::value ), "grb::eWiseApply",
+ "called with a left-hand input element type that does not match the "
+ "first domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D2, InputType2 >::value ), "grb::eWiseApply",
+ "called with a right-hand input element type that does not match the "
+ "second domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D3, OutputType >::value ), "grb::eWiseApply",
+ "called with an output element type that does not match the "
+ "third domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ), "grb::eWiseApply",
+ "called with an output mask element type that is not Boolean " );
+#ifdef _DEBUG
+ std::cout << "In masked eWiseApply ([T1]<-T2<-T3), monoid variant\n";
+#endif
+ // simply delegate to operator variant
+ return eWiseApply< descr >( z, mask, alpha, beta, monoid.getOperator(),
+ phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP,
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const InputType2 beta,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D1, InputType1 >::value ), "grb::eWiseApply",
+ "called with a left-hand input element type that does not match the "
+ "first domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D2, InputType2 >::value ), "grb::eWiseApply",
+ "called with a right-hand input element type that does not match the "
+ "second domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D3, OutputType >::value ), "grb::eWiseApply",
+ "called with an output element type that does not match the "
+ "third domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ), "grb::eWiseApply",
+ "called with an output mask element type that is not Boolean " );
+#ifdef _DEBUG
+ std::cout << "In masked eWiseApply ([T1]<-[T2]<-T3, using operator)\n";
+#endif
+ // check for empty mask
+ if( size( mask ) == 0 ) {
+ return eWiseApply< descr >( z, x, beta, op, phase );
+ }
+
+ // other run-time checks
+ const size_t n = internal::getCoordinates( z ).size();
+ if( internal::getCoordinates( x ).size() != n ) {
+ return MISMATCH;
+ }
+ if( internal::getCoordinates( mask ).size() != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+ constexpr const bool dense_mask = dense_descr &&
+ (descr & descriptors::structural) && !(descr & descriptors::invert_mask);
+
+ internal::Pipeline::stage_type func =
+ [&z, &mask, &x, beta, &op] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage eWiseApply(z, mask, x, beta, "
+ << "operator) in the range(" << lower_bound << ", " << upper_bound << ")"
+ << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_mask, local_x, local_y, local_z;
+ const size_t local_n = upper_bound - lower_bound;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ const bool mask_is_dense = (descr & descriptors::structural) &&
+ !(descr & descriptors::invert_mask) && already_dense_vectors;
+
+ bool already_dense_mask = true;
+ bool already_dense_input_x = true;
+
+ size_t local_mask_nz = local_n;
+ size_t local_x_nz = local_n;
+
+ if( !mask_is_dense ) {
+ local_z = internal::getCoordinates( z ).asyncSubset( lower_bound,
+ upper_bound );
+ if( dense_descr && local_z.nonzeroes() < local_n ) {
+ return ILLEGAL;
+ }
+ }
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_mask = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( mask ) );
+ if( !already_dense_mask ) {
+#else
+ already_dense_mask = false;
+#endif
+ local_mask = internal::getCoordinates( mask ).asyncSubset( lower_bound,
+ upper_bound );
+ local_mask_nz = local_mask.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input_x = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_input_x ) {
+#else
+ already_dense_input_x = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ const internal::Wrapper< false, InputType1, Coords > x_wrapper( x );
+ const internal::Wrapper< true, InputType2, Coords > y_wrapper( beta );
+
+ if( !mask_is_dense ) {
+ // the output sparsity structure is implied by mask and descr
+ local_z.local_clear();
+ if( lower_bound == 0 ) {
+ internal::getCoordinates( z ).reset_global_nnz_counter();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) );
+#endif
+ if( dense_descr ) {
+ pipeline.markMaybeSparseDenseDescriptorVerification(
+ &internal::getCoordinates( z ) );
+ }
+ }
+ }
+
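+ // dispatch heuristic: if the dense descriptor is given, if x is locally
+ // dense, or if the mask holds no more nonzeroes than x, loop over the
+ // mask via masked_apply_generic; otherwise looping over the nonzeroes
+ // of x via sparse_apply_generic is the cheaper option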
+ if(
+ (descr & descriptors::dense) ||
+ (local_x_nz == local_n) ||
+ (local_mask_nz <= local_x_nz)
+ ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_masked_apply_generic<
+#else
+ rc = internal::masked_apply_generic<
+#endif
+ false, true, false, false, descr, OP,
+ OutputType, MaskType, InputType1, InputType2, Coords
+ >(
+ already_dense_mask, already_dense_input_x, true,
+ lower_bound, upper_bound,
+ local_z, local_mask, local_x, local_y,
+ z, mask, x_wrapper, y_wrapper,
+ op
+ );
+ } else {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_sparse_apply_generic<
+#else
+ rc = internal::sparse_apply_generic<
+#endif
+ true, false, false, true, descr, OP,
+ OutputType, bool, InputType1, InputType2, Coords
+ >(
+ already_dense_mask, already_dense_input_x, true,
+ lower_bound, upper_bound,
+ local_z, &local_mask, local_x, local_y,
+ z, &mask, x_wrapper, y_wrapper,
+ op
+ );
+ }
+
+ if( !mask_is_dense ) {
+ internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_MASKED_EWISEAPPLY,
+ n, sizeof( OutputType ), dense_descr, dense_mask,
+ &z, nullptr, &internal::getCoordinates( z ), nullptr,
+ &x, &mask, nullptr, nullptr,
+ &internal::getCoordinates( x ), &internal::getCoordinates( mask ),
+ nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: eWiseApply(z, mask, x, beta, "
+ << "operator)" << std::endl;
+#endif
+ return ret;
+ }
+
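+ // note on the below monoid variant: under monoid semantics the output
+ // pattern is the union of the two input patterns, with the monoid
+ // identity standing in for a missing operand; the dispatch to
+ // sparse_apply_generic below therefore covers both input patterns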
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D1, InputType1 >::value ), "grb::eWiseApply",
+ "called with a left-hand input element type that does not match the "
+ "first domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D2, InputType2 >::value ), "grb::eWiseApply",
+ "called with a right-hand input element type that does not match the "
+ "second domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D3, OutputType >::value ), "grb::eWiseApply",
+ "called with an output element type that does not match the "
+ "third domain of the given monoid" );
+#ifdef _DEBUG
+ std::cout << "In unmasked eWiseApply ([T1]<-[T2]<-[T3], using monoid)\n";
+#endif
+ // other run-time checks
+ const size_t n = internal::getCoordinates( z ).size();
+ if( internal::getCoordinates( x ).size() != n ) {
+ return MISMATCH;
+ }
+ if( internal::getCoordinates( y ).size() != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ // check if we can dispatch to dense variant
+ if( (descr & descriptors::dense) ) {
+ return eWiseApply< descr >( z, x, y, monoid.getOperator() );
+ }
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func = [&z, &x, &y, &monoid] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage eWiseApply(z, x, y, monoid) in the "
+ << "range(" << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+ const Coords * const local_null_mask = nullptr;
+
+ Coords local_x, local_y, local_z;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ bool already_dense_input_x = true;
+ bool already_dense_input_y = true;
+
+ if( !already_dense_vectors ) {
+ local_z = internal::getCoordinates( z ).asyncSubset( lower_bound,
+ upper_bound );
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_input_x = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_input_x ) {
+#else
+ already_dense_input_x = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input_y = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_input_y ) {
+#else
+ already_dense_input_y = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ const internal::Wrapper< false, InputType1, Coords > x_wrapper( x );
+ const internal::Wrapper< false, InputType2, Coords > y_wrapper( y );
+
+ // we are in the unmasked sparse variant
+ const auto op = monoid.getOperator();
+
+ if( !already_dense_vectors ) {
+ // z will have an a-priori unknown sparsity structure
+ local_z.local_clear();
+ if( lower_bound == 0 ) {
+ internal::getCoordinates( z ).reset_global_nnz_counter();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) );
+#endif
+ }
+ }
+
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_sparse_apply_generic<
+#else
+ rc = internal::sparse_apply_generic<
+#endif
+ false, true, false, false, descr, typename Monoid::Operator,
+ OutputType, bool, InputType1, InputType2, Coords
+ >(
+ true, already_dense_input_x, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_z, local_null_mask, local_x, local_y,
+ z, null_mask, x_wrapper, y_wrapper,
+ op
+ );
+
+ if( !already_dense_vectors ) {
+ internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_EWISEAPPLY,
+ n, sizeof( OutputType ), dense_descr, true,
+ &z, nullptr, &internal::getCoordinates( z ), nullptr,
+ &x, &y, nullptr, nullptr,
+ &internal::getCoordinates( x ), &internal::getCoordinates( y ),
+ nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: eWiseApply(z, x, y, monoid)"
+ << std::endl;
+#endif
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const InputType1 alpha,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D1, InputType1 >::value ), "grb::eWiseApply",
+ "called with a left-hand input element type that does not match the "
+ "first domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D2, InputType2 >::value ), "grb::eWiseApply",
+ "called with a right-hand input element type that does not match the "
+ "second domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D3, OutputType >::value ), "grb::eWiseApply",
+ "called with an output element type that does not match the "
+ "third domain of the given monoid" );
+#ifdef _DEBUG
+ std::cout << "In unmasked eWiseApply ([T1]<-T2<-[T3], using monoid)\n";
+#endif
+ // other run-time checks
+ const size_t n = internal::getCoordinates( z ).size();
+ if( internal::getCoordinates( y ).size() != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ RC ret = SUCCESS;
+
+ // check if we can dispatch to dense variant
+ if( (descr & descriptors::dense) ) {
+ return eWiseApply< descr >( z, alpha, y, monoid.getOperator() );
+ }
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func = [&z, alpha, &y, &monoid] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage eWiseApply(z, alpha, y, monoid) in the "
+ << "range(" << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_x, local_y, local_z;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ bool already_dense_output = true;
+#endif
+ bool already_dense_input_y = true;
+
+ // since the output is guaranteed to become dense, the only criterion
+ // for skipping the read of the local coordinates is whether the output
+ // is already dense
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( z ) );
+ if( !already_dense_output ) {
+#endif
+ local_z = internal::getCoordinates( z ).asyncSubset( lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_input_y = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_input_y ) {
+#else
+ already_dense_input_y = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ const internal::Wrapper< true, InputType1, Coords > x_wrapper( alpha );
+ const internal::Wrapper< false, InputType2, Coords > y_wrapper( y );
+
+ // we are in the unmasked sparse variant
+ const auto &op = monoid.getOperator();
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#endif
+ local_z.local_assignAllNotAlreadyAssigned();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+
+ // dispatch to generic function
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_dense_apply_generic<
+#else
+ rc = internal::dense_apply_generic<
+#endif
+ true, false, false, true, descr, typename Monoid::Operator,
+ OutputType, InputType1, InputType2, Coords
+ >(
+ true, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_x, local_y,
+ z, x_wrapper, y_wrapper, op
+ );
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#else
+ if( !already_dense_vectors ) {
+#endif
+ internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_EWISEAPPLY,
+ n, sizeof( OutputType ), dense_descr, true,
+ &z, nullptr, &internal::getCoordinates( z ), nullptr,
+ &y, nullptr, nullptr, nullptr,
+ &internal::getCoordinates( y ), nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: eWiseApply(z, alpha, y, monoid)"
+ << std::endl;
+#endif
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const InputType2 beta,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value,
+ void >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D1, InputType1 >::value ), "grb::eWiseApply",
+ "called with a left-hand input element type that does not match the "
+ "first domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D2, InputType2 >::value ), "grb::eWiseApply",
+ "called with a right-hand input element type that does not match the "
+ "second domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D3, OutputType >::value ), "grb::eWiseApply",
+ "called with an output element type that does not match the "
+ "third domain of the given monoid" );
+#ifdef _DEBUG
+ std::cout << "In unmasked eWiseApply ([T1]<-[T2]<-T3, using monoid)\n";
+#endif
+ // other run-time checks
+ const size_t n = internal::getCoordinates( z ).size();
+ if( internal::getCoordinates( x ).size() != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ // check if we can dispatch to dense variant
+ if( (descr & descriptors::dense) ) {
+ return eWiseApply< descr >( z, x, beta, monoid.getOperator() );
+ }
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func = [&z, &x, beta, &monoid] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage eWiseApply(z, x, beta, monoid) in the "
+ << "range(" << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_x, local_y, local_z;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ bool already_dense_output = true;
+#endif
+ bool already_dense_input_x = true;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( z ) );
+ if( !already_dense_output ) {
+#endif
+ local_z = internal::getCoordinates( z ).asyncSubset( lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_input_x = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_input_x ) {
+#else
+ already_dense_input_x = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ const internal::Wrapper< false, InputType1, Coords > x_wrapper( x );
+ const internal::Wrapper< true, InputType2, Coords > y_wrapper( beta );
+
+ // we are in the unmasked sparse variant
+ const auto &op = monoid.getOperator();
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#endif
+ // the result will always be dense
+ local_z.local_assignAllNotAlreadyAssigned();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+
+ // dispatch
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_dense_apply_generic<
+#else
+ rc = internal::dense_apply_generic<
+#endif
+ false, true, true, false, descr, typename Monoid::Operator,
+ OutputType, InputType1, InputType2, Coords
+ >(
+ already_dense_input_x, true,
+ lower_bound, upper_bound,
+ local_x, local_y,
+ z, x_wrapper, y_wrapper,
+ op
+ );
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#else
+ if( !already_dense_vectors ) {
+#endif
+ internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_EWISEAPPLY,
+ n, sizeof( OutputType ), dense_descr, true,
+ &z, nullptr, &internal::getCoordinates( z ), nullptr,
+ &x, nullptr, nullptr, nullptr,
+ &internal::getCoordinates( x ), nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: eWiseApply(z, x, beta, monoid)"
+ << std::endl;
+#endif
+ return ret;
+ }
+
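+ // the below masked monoid variant extracts both identities up front so
+ // that masked_apply_generic may substitute them at unassigned entries of
+ // a sparse input; which identity is required depends on the local
+ // sparsity of x and y, as decided by the dispatch within the stage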
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D1, InputType1 >::value ), "grb::eWiseApply",
+ "called with a left-hand input element type that does not match the "
+ "first domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D2, InputType2 >::value ), "grb::eWiseApply",
+ "called with a right-hand input element type that does not match the "
+ "second domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D3, OutputType >::value ), "grb::eWiseApply",
+ "called with an output element type that does not match the "
+ "third domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ), "grb::eWiseApply",
+ "called with an output mask element type that is not Boolean " );
+#ifdef _DEBUG
+ std::cout << "In masked eWiseApply ([T1]<-[T2]<-[T3], using monoid)\n";
+#endif
+ if( size( mask ) == 0 ) {
+ return eWiseApply< descr >( z, x, y, monoid, phase );
+ }
+
+ // other run-time checks
+ const size_t n = internal::getCoordinates( z ).size();
+ if( internal::getCoordinates( x ).size() != n ) {
+ return MISMATCH;
+ }
+ if( internal::getCoordinates( y ).size() != n ) {
+ return MISMATCH;
+ }
+ if( internal::getCoordinates( mask ).size() != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ // check if we can dispatch to dense variant
+ if( (descr & descriptors::dense) ) {
+ return eWiseApply< descr >( z, mask, x, y, monoid.getOperator() );
+ }
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+ constexpr const bool dense_mask = dense_descr &&
+ (descr & descriptors::structural) && !(descr & descriptors::invert_mask);
+
+ internal::Pipeline::stage_type func = [&z, &mask, &x, &y, &monoid] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage eWiseApply(z, mask, x, y, monoid) in "
+ << "the range(" << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_mask, local_x, local_y, local_z;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_mask_nz = local_n;
+ size_t local_x_nz = local_n;
+ size_t local_y_nz = local_n;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ const bool mask_is_dense = (descr & descriptors::structural) &&
+ !(descr & descriptors::invert_mask) && already_dense_vectors;
+
+ bool already_dense_mask = true;
+ bool already_dense_input_x = true;
+ bool already_dense_input_y = true;
+
+ if( !mask_is_dense ) {
+ local_z = internal::getCoordinates( z ).asyncSubset( lower_bound,
+ upper_bound );
+ if( dense_descr && local_z.nonzeroes() < local_n ) {
+ return ILLEGAL;
+ }
+ }
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_mask = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( mask ) );
+ if( !already_dense_mask ) {
+#else
+ already_dense_mask = false;
+#endif
+ local_mask = internal::getCoordinates( mask ).asyncSubset( lower_bound,
+ upper_bound );
+ local_mask_nz = local_mask.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input_x = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_input_x ) {
+#else
+ already_dense_input_x = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input_y = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_input_y ) {
+#else
+ already_dense_input_y = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ const internal::Wrapper< false, InputType1, Coords > x_wrapper( x );
+ const internal::Wrapper< false, InputType2, Coords > y_wrapper( y );
+
+ // we are in the masked sparse variant
+ const InputType1 left_identity = monoid.template getIdentity< InputType1 >();
+ const InputType2 right_identity =
+ monoid.template getIdentity< InputType2 >();
+ const auto &op = monoid.getOperator();
+
+ if( !mask_is_dense ) {
+ // z will have an a priori unknown sparsity structure
+ local_z.local_clear();
+ if( lower_bound == 0 ) {
+ internal::getCoordinates( z ).reset_global_nnz_counter();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) );
+#endif
+ if( dense_descr ) {
+ pipeline.markMaybeSparseDenseDescriptorVerification(
+ &internal::getCoordinates( z ) );
+ }
+ }
+ }
+
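+ // dispatch heuristic: if both inputs are locally sparse and together
+ // hold fewer nonzeroes than the mask, loop over the union of the input
+ // patterns via sparse_apply_generic; otherwise loop over the mask,
+ // passing the monoid identity for each input that may still be sparse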
+ if( local_x_nz < local_n &&
+ local_y_nz < local_n &&
+ local_x_nz + local_y_nz < local_mask_nz
+ ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_sparse_apply_generic<
+#else
+ rc = internal::sparse_apply_generic<
+#endif
+ true, true, false, false, descr, typename Monoid::Operator,
+ OutputType, bool, InputType1, InputType2, Coords
+ >(
+ already_dense_mask, already_dense_input_x, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_z, &local_mask, local_x, local_y,
+ z, &mask, x_wrapper, y_wrapper,
+ op
+ );
+ } else if( local_x_nz < local_n && local_y_nz == local_n ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_masked_apply_generic<
+#else
+ rc = internal::masked_apply_generic<
+#endif
+ false, false, true, false, descr, typename Monoid::Operator,
+ OutputType, MaskType, InputType1, InputType2, Coords
+ >(
+ already_dense_mask, already_dense_input_x, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_z, local_mask, local_x, local_y,
+ z, mask, x_wrapper, y_wrapper,
+ op,
+ &left_identity, nullptr
+ );
+ } else if( local_y_nz < local_n && local_x_nz == local_n ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_masked_apply_generic<
+#else
+ rc = internal::masked_apply_generic<
+#endif
+ false, false, false, true, descr, typename Monoid::Operator,
+ OutputType, MaskType, InputType1, InputType2, Coords
+ >(
+ already_dense_mask, already_dense_input_x, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_z, local_mask, local_x, local_y,
+ z, mask, x_wrapper, y_wrapper,
+ op,
+ nullptr, &right_identity
+ );
+ } else {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_masked_apply_generic<
+#else
+ rc = internal::masked_apply_generic<
+#endif
+ false, false, true, true, descr, typename Monoid::Operator,
+ OutputType, MaskType, InputType1, InputType2, Coords
+ >(
+ already_dense_mask, already_dense_input_x, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_z, local_mask, local_x, local_y,
+ z, mask, x_wrapper, y_wrapper,
+ op,
+ &left_identity, &right_identity
+ );
+ }
+
+ if( !mask_is_dense ) {
+ internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_MASKED_EWISEAPPLY,
+ n, sizeof( OutputType ), dense_descr, dense_mask,
+ &z, nullptr, &internal::getCoordinates( z ), nullptr,
+ &x, &y, &mask, nullptr,
+ &internal::getCoordinates( x ), &internal::getCoordinates( y ),
+ &internal::getCoordinates( mask ), nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: eWiseApply(z, mask, x, y, "
+ << "monoid)" << std::endl;
+#endif
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const InputType1 alpha,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value,
+ void >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D1, InputType1 >::value ), "grb::eWiseApply",
+ "called with a left-hand input element type that does not match the "
+ "first domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D2, InputType2 >::value ), "grb::eWiseApply",
+ "called with a right-hand input element type that does not match the "
+ "second domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D3, OutputType >::value ), "grb::eWiseApply",
+ "called with an output element type that does not match the "
+ "third domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ), "grb::eWiseApply",
+ "called with an output mask element type that is not Boolean " );
+#ifdef _DEBUG
+ std::cout << "In masked eWiseApply ([T1]<-T2<-[T3], using monoid)\n";
+#endif
+ if( size( mask ) == 0 ) {
+ return eWiseApply< descr >( z, alpha, y, monoid );
+ }
+
+ // other run-time checks
+ const size_t n = internal::getCoordinates( z ).size();
+ if( internal::getCoordinates( y ).size() != n ) {
+ return MISMATCH;
+ }
+ if( internal::getCoordinates( mask ).size() != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ // check if we can dispatch to dense variant
+ if( descr & descriptors::dense ) {
+ return eWiseApply< descr >( z, mask, alpha, y, monoid.getOperator() );
+ }
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+ constexpr const bool dense_mask = dense_descr &&
+ (descr & descriptors::structural) && !(descr & descriptors::invert_mask);
+
+ internal::Pipeline::stage_type func = [&z, &mask, alpha, &y, &monoid] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage eWiseApply(z, mask, alpha, y, monoid) "
+ << "in the range(" << lower_bound << ", " << upper_bound << ")"
+ << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_mask, local_x, local_y, local_z;
+ const size_t local_n = upper_bound - lower_bound;
+
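+			// when the already-dense optimisation is compiled in, the pipeline may
+			// detect at run time that all operands are dense, in which case the
+			// local coordinate bookkeeping below is skipped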
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ const bool mask_is_dense = (descr & descriptors::structural) &&
+ !(descr & descriptors::invert_mask) && already_dense_vectors;
+
+ bool already_dense_mask = true;
+ bool already_dense_input_y = true;
+
+ if( !mask_is_dense ) {
+ local_z = internal::getCoordinates( z ).asyncSubset( lower_bound,
+ upper_bound );
+ if( dense_descr && local_z.nonzeroes() < local_n ) {
+ return ILLEGAL;
+ }
+ }
+
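+			// take local views of the mask and input coordinates only when they are
+			// not known to be dense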
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_mask = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( mask ) );
+ if( !already_dense_mask ) {
+#else
+ already_dense_mask = false;
+#endif
+ local_mask = internal::getCoordinates( mask ).asyncSubset( lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input_y = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_input_y ) {
+#else
+ already_dense_input_y = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ const internal::Wrapper< true, InputType1, Coords > x_wrapper( alpha );
+ const internal::Wrapper< false, InputType2, Coords > y_wrapper( y );
+
+ // we are in the masked sparse variant
+ const InputType2 right_identity =
+ monoid.template getIdentity< InputType2 >();
+ const auto &op = monoid.getOperator();
+
+ if( !mask_is_dense ) {
+ // the sparsity structure of z will be a result of the given mask and descr
+ local_z.local_clear();
+ if( lower_bound == 0 ) {
+ internal::getCoordinates( z ).reset_global_nnz_counter();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) );
+#endif
+ if( dense_descr ) {
+ pipeline.markMaybeSparseDenseDescriptorVerification(
+ &internal::getCoordinates( z ) );
+ }
+ }
+ }
+
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_masked_apply_generic<
+#else
+ rc = internal::masked_apply_generic<
+#endif
+ true, false, false, true, descr, typename Monoid::Operator,
+ OutputType, MaskType, InputType1, InputType2, Coords
+ >(
+ already_dense_mask, true, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_z, local_mask, local_x, local_y,
+ z, mask, x_wrapper, y_wrapper,
+ op,
+ nullptr, &right_identity
+ );
+
+ if( !mask_is_dense ) {
+ internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_MASKED_EWISEAPPLY,
+ n, sizeof( OutputType ), dense_descr, dense_mask,
+ &z, nullptr, &internal::getCoordinates( z ), nullptr,
+ &y, &mask, nullptr, nullptr,
+ &internal::getCoordinates( y ), &internal::getCoordinates( mask ),
+ nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: eWiseApply(z, mask, alpha, y, "
+ << "monoid)" << std::endl;
+#endif
+ return ret;
+ }
+
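+	// masked monoid variant with a scalar right-hand input:
+	// z[i] = x[i] (op) beta under the given mask, substituting the left
+	// identity of the monoid wherever x has no entry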
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const InputType2 beta,
+ const Monoid &monoid = Monoid(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< Monoid >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D1, InputType1 >::value ), "grb::eWiseApply",
+ "called with a left-hand input element type that does not match the "
+ "first domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D2, InputType2 >::value ), "grb::eWiseApply",
+ "called with a right-hand input element type that does not match the "
+ "second domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D3, OutputType >::value ), "grb::eWiseApply",
+ "called with an output element type that does not match the "
+ "third domain of the given monoid" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ), "grb::eWiseApply",
+ "called with an output mask element type that is not Boolean " );
+#ifdef _DEBUG
+ std::cout << "In masked eWiseApply ([T1]<-[T2]<-T3, using monoid)\n";
+#endif
+ if( size( mask ) == 0 ) {
+ return eWiseApply< descr >( z, x, beta, monoid );
+ }
+
+ // other run-time checks
+ const size_t n = internal::getCoordinates( z ).size();
+ if( internal::getCoordinates( x ).size() != n ) {
+ return MISMATCH;
+ }
+ if( internal::getCoordinates( mask ).size() != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ // check if we can dispatch to dense variant
+ if( (descr & descriptors::dense) ) {
+ return eWiseApply< descr >( z, mask, x, beta, monoid.getOperator() );
+ }
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+ constexpr const bool dense_mask = dense_descr &&
+ (descr & descriptors::structural) && !(descr & descriptors::invert_mask);
+
+ internal::Pipeline::stage_type func = [&z, &mask, &x, beta, &monoid] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage eWiseApply(z, mask, x, beta, monoid) "
+ << "in the range(" << lower_bound << ", " << upper_bound << ")"
+ << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_mask, local_x, local_y, local_z;
+ const size_t local_n = upper_bound - lower_bound;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ const bool mask_is_dense = (descr & descriptors::structural) &&
+ !(descr & descriptors::invert_mask) && already_dense_vectors;
+
+ bool already_dense_mask = true;
+ bool already_dense_input_x = true;
+
+ if( !mask_is_dense ) {
+ local_z = internal::getCoordinates( z ).asyncSubset( lower_bound,
+ upper_bound );
+ if( dense_descr && local_z.nonzeroes() < local_n ) {
+ return ILLEGAL;
+ }
+ }
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_mask = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( mask ) );
+ if( !already_dense_mask ) {
+#else
+ already_dense_mask = false;
+#endif
+ local_mask = internal::getCoordinates( mask ).asyncSubset( lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input_x = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_input_x ) {
+#else
+ already_dense_input_x = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ const internal::Wrapper< false, InputType1, Coords > x_wrapper( x );
+ const internal::Wrapper< true, InputType2, Coords > y_wrapper( beta );
+
+ // we are in the masked sparse variant
+ const InputType1 left_identity = monoid.template getIdentity< InputType1 >();
+ const auto &op = monoid.getOperator();
+
+ if( !mask_is_dense ) {
+ local_z.local_clear();
+ if( lower_bound == 0 ) {
+ internal::getCoordinates( z ).reset_global_nnz_counter();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) );
+#endif
+ if( dense_descr ) {
+ pipeline.markMaybeSparseDenseDescriptorVerification(
+ &internal::getCoordinates( z ) );
+ }
+ }
+ }
+
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_masked_apply_generic<
+#else
+ rc = internal::masked_apply_generic<
+#endif
+ false, true, true, false, descr, typename Monoid::Operator,
+ OutputType, MaskType, InputType1, InputType2, Coords
+ >(
+ already_dense_mask, already_dense_input_x, true,
+ lower_bound, upper_bound,
+ local_z, local_mask, local_x, local_y,
+ z, mask, x_wrapper, y_wrapper,
+ op,
+ &left_identity
+ );
+
+ if( !mask_is_dense ) {
+ internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_MASKED_EWISEAPPLY,
+ n, sizeof( OutputType ), dense_descr, dense_mask,
+ &z, nullptr, &internal::getCoordinates( z ), nullptr,
+ &x, &mask, nullptr, nullptr,
+ &internal::getCoordinates( x ), &internal::getCoordinates( mask ),
+ nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: eWiseApply(z, mask, x, beta, "
+ << "monoid)" << std::endl;
+#endif
+ return ret;
+ }
+
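+	// unmasked operator variant with a scalar left-hand input:
+	// z[i] = alpha (op) y[i]; in the sparse case, only positions where y has
+	// an entry are written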
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const InputType1 alpha,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D1, InputType1 >::value ), "grb::eWiseApply",
+ "called with a left-hand input element type that does not match the "
+ "first domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D2, InputType2 >::value ), "grb::eWiseApply",
+ "called with a right-hand input element type that does not match the "
+ "second domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D3, OutputType >::value ), "grb::eWiseApply",
+ "called with an output element type that does not match the "
+ "third domain of the given operator" );
+#ifdef _DEBUG
+ std::cout << "In eWiseApply ([T1]<-T2<-[T3]), operator variant\n";
+#endif
+ // sanity check
+ const size_t n = internal::getCoordinates( z ).size();
+ if( internal::getCoordinates( y ).size() != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ // check if we can dispatch
+ if( static_cast< const void * >( &z ) ==
+ static_cast< const void * >( &y )
+ ) {
+ return foldr< descr >( alpha, z, op );
+ }
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func = [&z, alpha, &y, &op] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage eWiseApply(z, alpha, y, operator) in "
+ << "the range(" << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+ const Coords * const local_null_mask = nullptr;
+
+ Coords local_mask, local_x, local_y, local_z;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_y_nz = local_n;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ bool already_dense_input_y = true;
+
+ if( !already_dense_vectors ) {
+ local_z = internal::getCoordinates( z ).asyncSubset( lower_bound,
+ upper_bound );
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_input_y = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_input_y ) {
+#else
+ already_dense_input_y = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ const internal::Wrapper< true, InputType1, Coords > x_wrapper( alpha );
+ const internal::Wrapper< false, InputType2, Coords > y_wrapper( y );
+
+ if( !already_dense_vectors ) {
+ if( lower_bound == 0 ) {
+ internal::getCoordinates( z ).reset_global_nnz_counter();
+ }
+ }
+
+ // check for dense variant
+ if( (descr & descriptors::dense) || local_y_nz == local_n ) {
+ if( !already_dense_vectors ) {
+ local_z.local_assignAll( );
+ }
+
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_dense_apply_generic<
+#else
+ rc = internal::dense_apply_generic<
+#endif
+ true, false, false, false, descr, OP,
+ OutputType, InputType1, InputType2, Coords
+ >(
+ true, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_x, local_y, z,
+ x_wrapper, y_wrapper,
+ op
+ );
+ } else {
+ if( !already_dense_vectors ) {
+ local_z.local_clear();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) );
+#endif
+ }
+
+ // we are in the sparse variant
+#ifdef GRB_BOOLEAN_DISPATCHER
+			rc = internal::boolean_dispatcher_sparse_apply_generic<
+#else
+			rc = internal::sparse_apply_generic<
+#endif
+				false, false, true, false, descr, OP,
+				OutputType, bool, InputType1, InputType2, Coords
+ >(
+ true, true, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_z, local_null_mask, local_x, local_y,
+ z, null_mask, x_wrapper, y_wrapper,
+ op
+ );
+ }
+
+ if( !already_dense_vectors ) {
+ internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_EWISEAPPLY,
+ n, sizeof( OutputType ), dense_descr, true,
+ &z, nullptr, &internal::getCoordinates( z ), nullptr,
+ &y, nullptr, nullptr, nullptr,
+ &internal::getCoordinates( y ), nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: eWiseApply(z, alpha, y, "
+ << "operator)" << std::endl;
+#endif
+ return ret;
+ }
+
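+	// masked operator variant with a scalar left-hand input; selects between a
+	// mask-driven and an input-driven loop based on the local nonzero counts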
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP,
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const InputType1 alpha,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D1, InputType1 >::value ), "grb::eWiseApply",
+ "called with a left-hand input element type that does not match the "
+ "first domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D2, InputType2 >::value ), "grb::eWiseApply",
+ "called with a right-hand input element type that does not match the "
+ "second domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D3, OutputType >::value ), "grb::eWiseApply",
+ "called with an output element type that does not match the "
+ "third domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ), "grb::eWiseApply",
+ "called with an output mask element type that is not Boolean " );
+#ifdef _DEBUG
+ std::cout << "In masked eWiseApply ([T1]<-T2<-[T3], operator variant)\n";
+#endif
+ // check for empty mask
+ if( size( mask ) == 0 ) {
+ return eWiseApply< descr >( z, alpha, y, op );
+ }
+
+ // sanity check
+ const size_t n = internal::getCoordinates( z ).size();
+ if( internal::getCoordinates( y ).size() != n ) {
+ return MISMATCH;
+ }
+ if( internal::getCoordinates( mask ).size() != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+ constexpr const bool dense_mask = dense_descr &&
+ (descr & descriptors::structural) && !(descr & descriptors::invert_mask);
+
+ internal::Pipeline::stage_type func = [&z, &mask, alpha, &y, &op] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage eWiseApply(z, mask, alpha, y, "
+ << "operator) in the range(" << lower_bound << ", " << upper_bound << ")"
+ << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_mask, local_x, local_y, local_z;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_mask_nz = local_n;
+ size_t local_y_nz = local_n;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ const bool mask_is_dense = (descr & descriptors::structural) &&
+ !(descr & descriptors::invert_mask) && already_dense_vectors;
+
+ bool already_dense_mask = true;
+ bool already_dense_input_y = true;
+
+ if( !mask_is_dense ) {
+ local_z = internal::getCoordinates( z ).asyncSubset( lower_bound,
+ upper_bound );
+ if( dense_descr && local_z.nonzeroes() < local_n ) {
+ return ILLEGAL;
+ }
+ }
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_mask = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( mask ) );
+ if( !already_dense_mask ) {
+#else
+ already_dense_mask = false;
+#endif
+ local_mask = internal::getCoordinates( mask ).asyncSubset( lower_bound,
+ upper_bound );
+ local_mask_nz = local_mask.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input_y = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_input_y ) {
+#else
+ already_dense_input_y = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ const internal::Wrapper< true, InputType1, Coords > x_wrapper( alpha );
+ const internal::Wrapper< false, InputType2, Coords > y_wrapper( y );
+
+ if( !mask_is_dense ) {
+ local_z.local_clear();
+ if( lower_bound == 0 ) {
+ internal::getCoordinates( z ).reset_global_nnz_counter();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) );
+#endif
+ if( dense_descr ) {
+ pipeline.markMaybeSparseDenseDescriptorVerification(
+ &internal::getCoordinates( z ) );
+ }
+ }
+ }
+
+ if( (descr & descriptors::dense) ||
+ (local_y_nz == local_n) ||
+ local_mask_nz <= local_y_nz
+ ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_masked_apply_generic<
+#else
+ rc = internal::masked_apply_generic<
+#endif
+ true, false, false, false, descr, OP,
+ OutputType, bool, InputType1, InputType2, Coords
+ >(
+ already_dense_mask, true, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_z, local_mask, local_x, local_y,
+ z, mask, x_wrapper, y_wrapper,
+ op
+ );
+ } else {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_sparse_apply_generic<
+#else
+ rc = internal::sparse_apply_generic<
+#endif
+ true, false, true, false, descr, OP,
+ OutputType, bool, InputType1, InputType2, Coords
+ >(
+ already_dense_mask, true, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_z, &local_mask, local_x, local_y,
+ z, &mask, x_wrapper, y_wrapper,
+ op
+ );
+ }
+
+ if( !mask_is_dense ) {
+ internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_MASKED_EWISEAPPLY,
+ n, sizeof( OutputType ), dense_descr, dense_mask,
+ &z, nullptr, &internal::getCoordinates( z ), nullptr,
+ &y, &mask, nullptr, nullptr,
+ &internal::getCoordinates( y ), &internal::getCoordinates( mask ),
+ nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: eWiseApply(z, mask, alpha, y, "
+ << "operator)" << std::endl;
+#endif
+ return ret;
+ }
+
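+	// unmasked operator variant on two input vectors; a locally dense tile
+	// takes the vectorised path below, while any sparsity falls back to the
+	// generic sparse kernel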
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D1, InputType1 >::value ), "grb::eWiseApply",
+ "called with a left-hand input element type that does not match the "
+ "first domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D2, InputType2 >::value ), "grb::eWiseApply",
+ "called with a right-hand input element type that does not match the "
+ "second domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D3, OutputType >::value ), "grb::eWiseApply",
+ "called with an output element type that does not match the "
+ "third domain of the given operator" );
+#ifdef _DEBUG
+ std::cout << "In eWiseApply ([T1]<-[T2]<-[T3]), operator variant\n";
+#endif
+ // sanity check
+ const size_t n = internal::getCoordinates( z ).size();
+ if( internal::getCoordinates( x ).size() != n ||
+ internal::getCoordinates( y ).size() != n
+ ) {
+#ifdef _DEBUG
+ std::cerr << "\tinput vectors mismatch in dimensions!\n";
+#endif
+ return MISMATCH;
+ }
+
+ // check for possible shortcuts
+ // trivial dispatch
+ if( n == 0 ) {
+ return SUCCESS;
+ }
+
+ // check for possible shortcuts, after dynamic checks
+ if( getID( x ) == getID( y ) && is_idempotent< OP >::value ) {
+ return set< descr >( z, x, phase );
+ }
+ if( getID( x ) == getID( z ) ) {
+ return foldl< descr >( z, y, op, phase );
+ }
+ if( getID( y ) == getID( z ) ) {
+ return foldr< descr >( x, z, op, phase );
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func = [&z, &x, &y, &op] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage eWiseApply(z, x, y, operator) in the "
+ << "range(" << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+ const Coords * const local_null_mask = nullptr;
+
+ Coords local_x, local_y, local_z;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_x_nz = local_n;
+ size_t local_y_nz = local_n;
+ bool sparse = false;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ bool already_dense_input_x = true;
+ bool already_dense_input_y = true;
+
+ if( !already_dense_vectors ) {
+ local_z = internal::getCoordinates( z ).asyncSubset( lower_bound,
+ upper_bound );
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_input_x = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_input_x ) {
+#else
+ already_dense_input_x = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( local_x_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input_y = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_input_y ) {
+#else
+ already_dense_input_y = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+ if( local_y_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( !already_dense_vectors ) {
+ if( lower_bound == 0 ) {
+ internal::getCoordinates( z ).reset_global_nnz_counter();
+ }
+ }
+
+ if( sparse ) {
+ if( !already_dense_vectors ) {
+ local_z.local_clear();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) );
+#endif
+ }
+
+ const internal::Wrapper< false, InputType1, Coords > x_wrapper( x );
+ const internal::Wrapper< false, InputType2, Coords > y_wrapper( y );
+
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_sparse_apply_generic<
+#else
+ rc = internal::sparse_apply_generic<
+#endif
+ false, false, false, false, descr | descriptors::dense, OP,
+ OutputType, bool, InputType1, InputType2, Coords
+ >(
+ true, already_dense_input_x, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_z, local_null_mask, local_x, local_y,
+ z, null_mask, x_wrapper, y_wrapper,
+ op
+ );
+ } else {
+ if( !already_dense_vectors ) {
+ local_z.local_assignAll( );
+ }
+
+ if( upper_bound > lower_bound ) {
+ const InputType1 * __restrict__ a = internal::getRaw( x );
+ const InputType2 * __restrict__ b = internal::getRaw( y );
+ OutputType * __restrict__ c = internal::getRaw( z );
+
+ // this function is vectorised
+					op.eWiseApply( a + lower_bound, b + lower_bound, c + lower_bound, local_n );
+ }
+ }
+
+ if( !already_dense_vectors ) {
+ internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_EWISEAPPLY,
+ n, sizeof( OutputType ), dense_descr, true,
+ &z, nullptr, &internal::getCoordinates( z ), nullptr,
+ &x, &y, nullptr, nullptr,
+ &internal::getCoordinates( x ), &internal::getCoordinates( y ),
+ nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: eWiseApply(z, x, y, operator)"
+ << std::endl;
+#endif
+ return ret;
+ }
+
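+	// masked operator variant on two input vectors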
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class OP,
+ typename OutputType, typename MaskType,
+ typename InputType1, typename InputType2,
+ typename Coords
+ >
+ RC eWiseApply(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const OP &op = OP(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< OP >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D1, InputType1 >::value ), "grb::eWiseApply",
+ "called with a left-hand input element type that does not match the "
+ "first domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D2, InputType2 >::value ), "grb::eWiseApply",
+ "called with a right-hand input element type that does not match the "
+ "second domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename OP::D3, OutputType >::value ), "grb::eWiseApply",
+ "called with an output element type that does not match the "
+ "third domain of the given operator" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ), "grb::eWiseApply",
+ "called with an output mask element type that is not Boolean " );
+#ifdef _DEBUG
+ std::cout << "In masked eWiseApply ([T1]<-[T2]<-[T3], using operator)\n";
+#endif
+ // check for empty mask
+ if( size( mask ) == 0 ) {
+ return eWiseApply< descr >( z, x, y, op, phase );
+ }
+
+ // other run-time checks
+ const size_t n = internal::getCoordinates( z ).size();
+ if( internal::getCoordinates( x ).size() != n ) {
+ return MISMATCH;
+ }
+ if( internal::getCoordinates( y ).size() != n ) {
+ return MISMATCH;
+ }
+ if( internal::getCoordinates( mask ).size() != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+ constexpr const bool dense_mask = dense_descr &&
+ (descr & descriptors::structural) && !(descr & descriptors::invert_mask);
+
+ internal::Pipeline::stage_type func = [&z, &mask, &x, &y, &op] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage eWiseApply(z, mask, x, y, operator) in "
+ << "the range(" << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_mask, local_x, local_y, local_z;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_mask_nz = local_n;
+ size_t local_x_nz = local_n;
+ size_t local_y_nz = local_n;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ const bool mask_is_dense = (descr & descriptors::structural) &&
+ !(descr & descriptors::invert_mask) && already_dense_vectors;
+
+ bool already_dense_mask = true;
+ bool already_dense_input_x = true;
+ bool already_dense_input_y = true;
+
+ if( !mask_is_dense ) {
+ local_z = internal::getCoordinates( z ).asyncSubset( lower_bound,
+ upper_bound );
+ if( dense_descr && local_z.nonzeroes() < local_n ) {
+ return ILLEGAL;
+ }
+ }
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_mask = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( mask ) );
+ if( !already_dense_mask ) {
+#else
+ already_dense_mask = false;
+#endif
+ local_mask = internal::getCoordinates( mask ).asyncSubset( lower_bound,
+ upper_bound );
+ local_mask_nz = local_mask.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input_x = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_input_x ) {
+#else
+ already_dense_input_x = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input_y = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_input_y ) {
+#else
+ already_dense_input_y = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ const internal::Wrapper< false, InputType1, Coords > x_wrapper( x );
+ const internal::Wrapper< false, InputType2, Coords > y_wrapper( y );
+
+ const size_t sparse_loop = std::min( local_x_nz, local_y_nz );
+
+ if( !mask_is_dense ) {
+ local_z.local_clear();
+ if( lower_bound == 0 ) {
+ internal::getCoordinates( z ).reset_global_nnz_counter();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) );
+#endif
+ if( dense_descr ) {
+ pipeline.markMaybeSparseDenseDescriptorVerification(
+ &internal::getCoordinates( z ) );
+ }
+ }
+ }
+
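+			// heuristic: if both inputs are locally dense, or a non-inverted mask
+			// has no more nonzeroes than the sparser input, loop over the mask;
+			// otherwise loop over the sparse inputs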
+ if( (descr & descriptors::dense) ||
+ (local_x_nz == local_n && local_y_nz == local_n) ||
+ ( !(descr & descriptors::invert_mask) && sparse_loop >= local_mask_nz )
+ ) {
+ // use loop over mask
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_masked_apply_generic<
+#else
+ rc = internal::masked_apply_generic<
+#endif
+ false, false, false, false, descr, OP,
+ OutputType, bool, InputType1, InputType2, Coords
+ >(
+ already_dense_mask, already_dense_input_x, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_z, local_mask, local_x, local_y,
+ z, mask, x_wrapper, y_wrapper,
+ op
+ );
+
+ } else {
+ // use loop over sparse inputs
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_sparse_apply_generic<
+#else
+ rc = internal::sparse_apply_generic<
+#endif
+ true, false, false, false, descr, OP,
+ OutputType, bool, InputType1, InputType2, Coords
+ >(
+ already_dense_mask, already_dense_input_x, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_z, &local_mask, local_x, local_y,
+ z, &mask, x_wrapper, y_wrapper,
+ op
+ );
+ }
+
+ if( !mask_is_dense ) {
+ internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_MASKED_EWISEAPPLY,
+ n, sizeof( OutputType ), dense_descr, dense_mask,
+ &z, nullptr, &internal::getCoordinates( z ), nullptr,
+ &x, &y, &mask, nullptr,
+ &internal::getCoordinates( x ), &internal::getCoordinates( y ),
+ &internal::getCoordinates( mask ), nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: eWiseApply(z, mask, x, y, "
+ << "operator)" << std::endl;
+#endif
+ return ret;
+ }
+
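+	// the eWiseAdd family below implements no kernels of its own: every variant
+	// dispatches to (masked) foldl calls using the additive monoid of the given
+	// semiring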
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC eWiseAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, OutputType >::value ), "grb::eWiseAdd",
+ "called with an output vector with element type that does not match the "
+ "fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, InputType1 >::value ), "grb::eWiseAdd",
+ "called with a left-hand side input vector with element type that does not "
+ "match the third domain of the given semiring" );
+		NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+			std::is_same< typename Ring::D4, InputType2 >::value ), "grb::eWiseAdd",
+ "called with a right-hand side input vector with element type that does "
+ "not match the fourth domain of the given semiring" );
+#ifdef _DEBUG
+ std::cout << "eWiseAdd (nonblocking, vector <- vector + vector) dispatches to "
+ << "two folds using the additive monoid\n";
+#endif
+ RC ret = foldl< descr >( z, x, ring.getAdditiveMonoid(), phase );
+ ret = ret ? ret : foldl< descr >( z, y, ring.getAdditiveMonoid(), phase );
+ return ret;
+ }
+
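+	// a minimal usage sketch (names and the semiring choice are illustrative
+	// only, assuming the conventional plus-times semiring over doubles):
+	//
+	//   grb::Vector< double > z( n ), x( n ), y( n );
+	//   grb::Semiring<
+	//     grb::operators::add< double >, grb::operators::mul< double >,
+	//     grb::identities::zero, grb::identities::one
+	//   > ring;
+	//   grb::RC rc = grb::eWiseAdd( z, x, y, ring );
+	//
+	// under this nonblocking backend the call above merely queues two fold
+	// stages; they execute when their pipeline is forced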
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename OutputType,
+ typename Coords
+ >
+ RC eWiseAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const InputType1 alpha,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ), "grb::eWiseAdd",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ), "grb::eWiseAdd",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ), "grb::eWiseAdd",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+#ifdef _DEBUG
+ std::cout << "eWiseAdd (nonblocking, vector <- scalar + vector) dispatches to "
+ << "two folds with the additive monoid\n";
+#endif
+ RC ret = foldl< descr >( z, alpha, ring.getAdditiveMonoid(), phase );
+ ret = ret ? ret : foldl< descr >( z, y, ring.getAdditiveMonoid(), phase );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename OutputType,
+ typename Coords
+ >
+ RC eWiseAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ), "grb::eWiseAdd",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ), "grb::eWiseAdd",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ), "grb::eWiseAdd",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+#ifdef _DEBUG
+ std::cout << "eWiseAdd (nonblocking, vector <- vector + scalar) dispatches to "
+ << "two folds with the additive monoid\n";
+#endif
+ RC ret = foldl< descr >( z, x, ring.getAdditiveMonoid(), phase );
+ ret = ret ? ret : foldl< descr >( z, beta, ring.getAdditiveMonoid(), phase );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename OutputType,
+ typename Coords
+ >
+ RC eWiseAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ), "grb::eWiseAdd",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ), "grb::eWiseAdd",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ), "grb::eWiseAdd",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+#ifdef _DEBUG
+ std::cout << "eWiseAdd (nonblocking, vector <- scalar + scalar) dispatches to "
+ << "foldl with precomputed scalar and additive monoid\n";
+#endif
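+			// precompute alpha + beta once, then fold the resulting scalar into z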
+			typename Ring::D4 add;
+ (void) apply( add, alpha, beta, ring.getAdditiveOperator() );
+ return foldl< descr >( z, add, ring.getAdditiveMonoid(), phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC eWiseAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, OutputType >::value ), "grb::eWiseAdd",
+ "called with an output vector with element type that does not match the "
+ "fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, InputType1 >::value ), "grb::eWiseAdd",
+ "called with a left-hand side input vector with element type that does not "
+ "match the third domain of the given semiring" );
+		NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+			std::is_same< typename Ring::D4, InputType2 >::value ), "grb::eWiseAdd",
+ "called with a right-hand side input vector with element type that does "
+ "not match the fourth domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< MaskType, bool >::value ),
+ "grb::eWiseAdd (vector <- vector + vector, masked)",
+ "called with non-bool mask element types" );
+#ifdef _DEBUG
+ std::cout << "eWiseAdd (nonblocking, vector <- vector + vector, masked) "
+ << "dispatches to two folds using the additive monoid\n";
+#endif
+ RC ret = foldl< descr >( z, m, x, ring.getAdditiveMonoid(), phase );
+ ret = ret ? ret : foldl< descr >( z, m, y, ring.getAdditiveMonoid(), phase );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename OutputType,
+ typename MaskType,
+ typename Coords
+ >
+ RC eWiseAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const InputType1 alpha,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ), "grb::eWiseAdd",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ), "grb::eWiseAdd",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ), "grb::eWiseAdd",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< MaskType, bool >::value ),
+ "grb::eWiseAdd (vector <- scalar + vector, masked)",
+ "called with non-bool mask element types" );
+#ifdef _DEBUG
+ std::cout << "eWiseAdd (nonblocking, vector <- scalar + vector, masked) "
+ << "dispatches to two folds using the additive monoid\n";
+#endif
+ RC ret = foldl< descr >( z, m, alpha, ring.getAdditiveMonoid(), phase );
+ ret = ret ? ret : foldl< descr >( z, m, y, ring.getAdditiveMonoid(), phase );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename OutputType,
+ typename MaskType,
+ typename Coords
+ >
+ RC eWiseAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ), "grb::eWiseAdd",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ), "grb::eWiseAdd",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ), "grb::eWiseAdd",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< MaskType, bool >::value ),
+ "grb::eWiseAdd (vector <- vector + scalar, masked)",
+ "called with non-bool mask element types" );
+#ifdef _DEBUG
+ std::cout << "eWiseAdd (nonblocking, vector <- vector + scalar, masked) "
+ << "dispatches to eWiseApply using the additive monoid\n";
+#endif
+ RC ret = foldl< descr >( z, m, x, ring.getAdditiveMonoid(), phase );
+ ret = ret ? ret : foldl< descr >( z, m, beta, ring.getAdditiveMonoid(),
+ phase );
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename OutputType,
+ typename MaskType,
+ typename Coords
+ >
+ RC eWiseAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+		const Vector< MaskType, nonblocking, Coords > &m,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ), "grb::eWiseAdd",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ), "grb::eWiseAdd",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ), "grb::eWiseAdd",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< MaskType, bool >::value ),
+ "grb::eWiseAdd (vector <- scalar + scalar, masked)",
+ "called with non-bool mask element types" );
+#ifdef _DEBUG
+ std::cout << "eWiseAdd (nonblocking, vector <- scalar + scalar, masked) "
+ << "dispatches to foldl with precomputed scalar and additive monoid\n";
+#endif
+			typename Ring::D4 add;
+ (void) apply( add, alpha, beta, ring.getAdditiveOperator() );
+ return foldl< descr >( z, m, add, ring.getAdditiveMonoid(), phase );
+ }
+
+ // declare an internal version of eWiseMulAdd containing the full sparse &
+ // dense implementations
+ namespace internal {
+
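+		// mask-driven kernel: iterates over the nonzeroes of the local mask and
+		// computes z[i] += a[i] * x[i] + y[i], substituting scalars and skipping
+		// missing entries as dictated by the template flags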
+ template<
+ Descriptor descr,
+ bool a_scalar,
+ bool x_scalar,
+ bool y_scalar,
+ bool y_zero,
+#ifdef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_output,
+ bool already_dense_mask,
+ bool already_dense_input_a,
+ bool already_dense_input_x,
+ bool already_dense_input_y,
+#endif
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename Coords,
+ class Ring
+ >
+ RC sparse_eWiseMulAdd_maskDriven(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_output,
+ bool already_dense_mask,
+ bool already_dense_input_a,
+ bool already_dense_input_x,
+ bool already_dense_input_y,
+#endif
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_z,
+ const Coords &local_m,
+ const Coords &local_a,
+ const Coords &local_x,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &z_vector,
+ const Vector< MaskType, nonblocking, Coords > &m_vector,
+ const internal::Wrapper< a_scalar, InputType1, Coords > &a_wrapper,
+ const internal::Wrapper< x_scalar, InputType2, Coords > &x_wrapper,
+ const internal::Wrapper< y_scalar, InputType3, Coords > &y_wrapper,
+ const Ring &ring
+ ) {
+ static_assert( !(descr & descriptors::invert_mask),
+ "Cannot loop over mask nonzeroes if invert_mask is given. "
+ "Please submit a bug report" );
+			static_assert( !a_scalar || !x_scalar,
+				"If both a and x are scalars, this operation is a simple eWiseApply "
+				"with the additive operator of the semiring." );
+ static_assert( !y_zero || y_scalar,
+ "If y_zero is given, then y_scalar must be given also." );
+
+ OutputType * __restrict__ z = internal::getRaw( z_vector );
+ const MaskType * __restrict__ const m = internal::getRaw( m_vector );
+
+ // create local copies of the input const pointers
+ const InputType1 * __restrict__ const a = a_wrapper.getRaw();
+ const InputType2 * __restrict__ const x = x_wrapper.getRaw();
+ const InputType3 * __restrict__ const y = y_wrapper.getRaw();
+
+ const size_t local_n = upper_bound - lower_bound;
+ const size_t local_m_nz = already_dense_mask ? local_n : local_m.nonzeroes();
+
+ const size_t local_start = 0;
+ const size_t local_end = local_m_nz;
+
+ size_t k = local_start;
+
+			// sequential loop over the nonzeroes of the local mask
+ for( ; k < local_end; ++k ) {
+ const size_t index = ( already_dense_mask ? k : local_m.index( k ) ) +
+ lower_bound;
+ assert( index - lower_bound < local_n );
+ if( already_dense_mask ) {
+ if( !internal::getCoordinates( m_vector ).template mask< descr >(
+ index, m )
+ ) {
+ continue;
+ }
+ } else {
+ if( !local_m.template mask< descr >( index - lower_bound, m +
+ lower_bound )
+ ) {
+ continue;
+ }
+ }
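+				// t accumulates the multiplicative term a * x at this index; it stays
+				// at the additive identity if either factor is missing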
+ typename Ring::D3 t = ring.template getZero< typename Ring::D3 >();
+ if(
+ (
+ a_scalar || already_dense_input_a ||
+ local_a.assigned( index - lower_bound )
+ ) && (
+ x_scalar || already_dense_input_x ||
+ local_x.assigned( index - lower_bound)
+ )
+ ) {
+ const InputType1 a_p = ( a_scalar )
+ ? a_wrapper.getValue()
+ : *( a + index );
+ const InputType2 x_p = ( x_scalar )
+ ? x_wrapper.getValue()
+ : *( x + index );
+ (void) apply( t, a_p, x_p, ring.getMultiplicativeOperator() );
+ if( !y_zero && (
+ y_scalar || already_dense_input_y ||
+ local_y.assigned( index - lower_bound ) )
+ ) {
+ const InputType3 y_p = ( y_scalar )
+ ? y_wrapper.getValue()
+ : *( y + index );
+ typename Ring::D4 b;
+ (void) apply( b, t, y_p, ring.getAdditiveOperator() );
+ if( already_dense_output || local_z.assigned( index - lower_bound ) ) {
+ typename Ring::D4 out = static_cast< typename Ring::D4 >( z[ index ] );
+ (void) foldr( b, out, ring.getAdditiveOperator() );
+ z[ index ] = static_cast< OutputType >( out );
+ } else {
+ (void) local_z.assign( index - lower_bound );
+ z[ index ] = static_cast< OutputType >( b );
+ }
+ } else if( already_dense_output ||
+ local_z.assigned( index - lower_bound )
+ ) {
+ typename Ring::D4 out = static_cast< typename Ring::D4 >( z[ index ] );
+ (void) foldr( t, out, ring.getAdditiveOperator() );
+ z[ index ] = static_cast< OutputType >( out );
+ } else {
+ (void) local_z.assign( index - lower_bound );
+ z[ index ] = static_cast< OutputType >( t );
+ }
+				} else if( !y_zero && (
+					already_dense_input_y || y_scalar ||
+					local_y.assigned( index - lower_bound ) )
+				) {
+					// no multiplicative contribution here: fold in (or assign) the y term
+					const InputType3 y_p = ( y_scalar )
+						? y_wrapper.getValue()
+						: *( y + index );
+					if( already_dense_output || local_z.assigned( index - lower_bound ) ) {
+						typename Ring::D4 out = static_cast< typename Ring::D4 >( z[ index ] );
+						(void) foldr( y_p, out, ring.getAdditiveOperator() );
+						z[ index ] = static_cast< OutputType >( out );
+					} else {
+						(void) local_z.assign( index - lower_bound );
+						z[ index ] = static_cast< OutputType >( y_p );
+					}
+				}
+ }
+
+ return SUCCESS;
+ }
+
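+		// multiplication-driven kernel: first loops over the nonzeroes of a,
+		// folding the products a[i] * x[i] (or x[i] * a[i] when mulSwitched) into
+		// z, and then handles the additive y term in a second phase via the
+		// generic fold kernels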
+ template<
+ Descriptor descr,
+ bool masked,
+ bool x_scalar,
+ bool y_scalar,
+ bool y_zero,
+ bool mulSwitched,
+#ifdef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_output,
+ bool already_dense_mask,
+ bool already_dense_input_a,
+ bool already_dense_input_x,
+ bool already_dense_input_y,
+#endif
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename Coords,
+ class Ring
+ >
+ RC twoPhase_sparse_eWiseMulAdd_mulDriven(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_output,
+ bool already_dense_mask,
+ bool already_dense_input_a,
+ bool already_dense_input_x,
+ bool already_dense_input_y,
+#endif
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_z,
+ const Coords * const local_m,
+ const Coords &local_a,
+ const Coords &local_x,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &z_vector,
+ const Vector< MaskType, nonblocking, Coords > * const m_vector,
+ const Vector< InputType1, nonblocking, Coords > &a_vector,
+ const internal::Wrapper< x_scalar, InputType2, Coords > &x_wrapper,
+ const internal::Wrapper< y_scalar, InputType3, Coords > &y_wrapper,
+ const Ring &ring
+ ) {
+ OutputType * __restrict__ z = internal::getRaw( z_vector );
+ const MaskType * __restrict__ const m = masked
+ ? internal::getRaw( *m_vector )
+ : nullptr;
+ const InputType1 * __restrict__ const a = internal::getRaw( a_vector );
+
+ // create local copies of the input const pointers
+ const InputType2 * __restrict__ const x = x_wrapper.getRaw();
+
+ const size_t local_n = upper_bound - lower_bound;
+ const size_t local_a_nz = already_dense_input_a
+ ? local_n
+ : local_a.nonzeroes();
+
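+ // drive the multiplication by the nonzeroes of a: for sparse a, this
+ // skips every position where a( i ) * x( i ) cannot contribute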
+ for( size_t i = 0; i < local_a_nz; ++i ) {
+ const size_t index = ( already_dense_input_a ? i : local_a.index( i ) ) +
+ lower_bound;
+ if( masked ) {
+ if( already_dense_mask ) {
+ if( !internal::getCoordinates( *m_vector ).template mask< descr >(
+ index, m )
+ ) {
+ continue;
+ }
+ } else {
+ if( !local_m->template mask< descr >( index - lower_bound,
+ m + lower_bound )
+ ) {
+ continue;
+ }
+ }
+ }
+
+ if( x_scalar || already_dense_input_x ||
+ local_x.assigned( index - lower_bound )
+ ) {
+ typename Ring::D3 t;
+ const InputType1 a_p = *( a + index );
+ const InputType2 x_p = ( x_scalar )
+ ? x_wrapper.getValue()
+ : *( x + index );
+
+ if( mulSwitched ) {
+ (void) apply( t, x_p, a_p, ring.getMultiplicativeOperator() );
+ } else {
+ (void) apply( t, a_p, x_p, ring.getMultiplicativeOperator() );
+ }
+
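+ // note: Coords::assign both marks the local coordinate as taken and
+ // returns whether it was already assigned, so the branch below separates
+ // folding into an existing value from writing a fresh one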
+ if( already_dense_output || local_z.assign( index - lower_bound ) ) {
+ typename Ring::D4 b = static_cast< typename Ring::D4 >( z[ index ] );
+ (void) foldr( t, b, ring.getAdditiveOperator() );
+ z[ index ] = static_cast< OutputType >( b );
+ } else {
+ z[ index ] = static_cast< OutputType >(
+ static_cast< typename Ring::D4 >( t )
+ );
+ }
+ }
+ }
+
+ RC rc = SUCCESS;
+
+ // now handle addition
+ if( !y_zero ) {
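+ // the four cases below differ only in whether the output is masked and in
+ // whether y is a scalar; each dispatches to a fold kernel that adds y
+ // into z under the additive monoid of the semiring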
+ if( masked ) {
+ if( y_scalar ) {
+ rc = fold_from_scalar_to_vector_generic<
+#ifdef GRB_BOOLEAN_DISPATCHER
+ descr, true, true, true, true,
+ already_dense_output, already_dense_mask
+#else
+ descr, true, true, true, true
+#endif
+ >(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ already_dense_output, already_dense_mask,
+#endif
+ lower_bound, upper_bound, local_z, local_m,
+ z_vector, m_vector, y_wrapper.getValue(),
+ ring.getAdditiveMonoid().getOperator(), EXECUTE
+ );
+ } else {
+ rc = fold_from_vector_to_vector_generic<
+#ifdef GRB_BOOLEAN_DISPATCHER
+ descr, true, true, true, true,
+ already_dense_output, already_dense_input_y, already_dense_mask
+#else
+ descr, true, true, true, true
+#endif
+ >(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ already_dense_output, already_dense_input_y, already_dense_mask,
+#endif
+ lower_bound, upper_bound,
+ local_z, local_m, local_y,
+ z_vector, m_vector, *( y_wrapper.getPointer() ),
+ ring.getAdditiveMonoid().getOperator(), EXECUTE
+ );
+ }
+ } else {
+ if( y_scalar ) {
+ rc = fold_from_scalar_to_vector_generic<
+#ifdef GRB_BOOLEAN_DISPATCHER
+ descr, true, true, false, true,
+ already_dense_output, already_dense_mask
+#else
+ descr, true, true, false, true
+#endif
+ >(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ already_dense_output, already_dense_mask,
+#endif
+ lower_bound, upper_bound,
+ local_z, local_m,
+ z_vector, m_vector, y_wrapper.getValue(),
+ ring.getAdditiveMonoid().getOperator(), EXECUTE
+ );
+ } else {
+ rc = fold_from_vector_to_vector_generic<
+#ifdef GRB_BOOLEAN_DISPATCHER
+ descr, true, true, false, true,
+ already_dense_output, already_dense_input_y, already_dense_mask
+#else
+ descr, true, true, false, true
+#endif
+ >(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ already_dense_output, already_dense_input_y, already_dense_mask,
+#endif
+ lower_bound, upper_bound,
+ local_z, local_m, local_y,
+ z_vector, m_vector, *( y_wrapper.getPointer() ),
+ ring.getAdditiveMonoid().getOperator(), EXECUTE
+ );
+ }
+ }
+ }
+
+ // done
+ return rc;
+ }
+
+ template<
+ Descriptor descr,
+ bool a_scalar,
+ bool x_scalar,
+ bool y_scalar,
+ bool y_zero,
+ bool assign_z,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename Coords,
+ class Ring
+ >
+ RC dense_eWiseMulAdd(
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Vector< OutputType, nonblocking, Coords > &z_vector,
+ const internal::Wrapper< a_scalar, InputType1, Coords > &a_wrapper,
+ const internal::Wrapper< x_scalar, InputType2, Coords > &x_wrapper,
+ const internal::Wrapper< y_scalar, InputType3, Coords > &y_wrapper,
+ const Ring &ring = Ring()
+ ) {
+#ifdef _DEBUG
+ std::cout << "\tdense_eWiseMulAdd: loop size will be "
+ << (upper_bound - lower_bound) << " in the range(" << lower_bound << ", "
+ << upper_bound << ")\n";
+#endif
+ const size_t start = lower_bound;
+ const size_t end = upper_bound;
+
+ OutputType * __restrict__ z = internal::getRaw( z_vector );
+
+ // create local copies of the input const pointers
+ const InputType1 * __restrict__ a = a_wrapper.getRaw();
+ const InputType2 * __restrict__ x = x_wrapper.getRaw();
+ const InputType3 * __restrict__ y = y_wrapper.getRaw();
+
+ assert( z != a );
+ assert( z != x );
+ assert( z != y );
+ assert( a != x || a == nullptr );
+ assert( a != y || a == nullptr );
+ assert( x != y || x == nullptr );
+
+ // vector registers
+ typename Ring::D1 aa[ Ring::blocksize ];
+ typename Ring::D2 xx[ Ring::blocksize ];
+ typename Ring::D3 tt[ Ring::blocksize ];
+ typename Ring::D4 bb[ Ring::blocksize ];
+ typename Ring::D4 yy[ Ring::blocksize ];
+ typename Ring::D4 zz[ Ring::blocksize ];
+
+ if( a_scalar ) {
+ for( size_t b = 0; b < Ring::blocksize; ++b ) {
+ aa[ b ] = a_wrapper.getValue();
+ }
+ }
+ if( x_scalar ) {
+ for( size_t b = 0; b < Ring::blocksize; ++b ) {
+ xx[ b ] = x_wrapper.getValue();
+ }
+ }
+ if( y_scalar ) {
+ if( y_zero ) {
+ for( size_t b = 0; b < Ring::blocksize; ++b ) {
+ yy[ b ] = ring.template getZero< typename Ring::D4 >();
+ }
+ } else {
+ for( size_t b = 0; b < Ring::blocksize; ++b ) {
+ yy[ b ] = y_wrapper.getValue();
+ }
+ }
+ }
+
+ // do vectorised out-of-place operations. Allows for aligned overlap.
+ // Non-aligned overlap is not possible due to GraphBLAS semantics.
+ size_t i = start;
+ // note: read the tail code (under this while loop) comments first for
+ // greater understanding
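+ // each iteration of the below loop processes one block of Ring::blocksize
+ // elements: gather the operands into the register arrays, apply the
+ // multiplicative and additive operators element-wise, and write back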
+ while( i + Ring::blocksize <= end ) {
+#ifdef _DEBUG
+ std::cout << "\tdense_eWiseMulAdd: handling block of size "
+ << Ring::blocksize << " starting at index " << i << "\n";
+#endif
+ // read-in
+ if( !a_scalar ) {
+ for( size_t b = 0; b < Ring::blocksize; ++b ) {
+ aa[ b ] = static_cast< typename Ring::D1 >( a[ i + b ] );
+ }
+ }
+ if( !x_scalar ) {
+ for( size_t b = 0; b < Ring::blocksize; ++b ) {
+ xx[ b ] = static_cast< typename Ring::D2 >( x[ i + b ] );
+ }
+ }
+ if( !y_scalar ) {
+ for( size_t b = 0; b < Ring::blocksize; ++b ) {
+ yy[ b ] = static_cast< typename Ring::D4 >( y[ i + b ] );
+ }
+ }
+ if( !assign_z ) {
+ for( size_t b = 0; b < Ring::blocksize; ++b ) {
+ zz[ b ] = static_cast< typename Ring::D4 >( z[ i + b ] );
+ }
+ }
+
+ // operate
+ if( !y_zero ) {
+ for( size_t b = 0; b < Ring::blocksize; ++b ) {
+ apply( tt[ b ], aa[ b ], xx[ b ], ring.getMultiplicativeOperator() );
+ apply( bb[ b ], tt[ b ], yy[ b ], ring.getAdditiveOperator() );
+ }
+ } else {
+ assert( y_scalar );
+ for( size_t b = 0; b < Ring::blocksize; ++b ) {
+ apply( bb[ b ], aa[ b ], xx[ b ], ring.getMultiplicativeOperator() );
+ }
+ }
+ if( !assign_z ) {
+ for( size_t b = 0; b < Ring::blocksize; ++b ) {
+ foldr( bb[ b ], zz[ b ], ring.getAdditiveOperator() );
+ }
+ }
+
+ // write-out
+ if( assign_z ) {
+ for( size_t b = 0; b < Ring::blocksize; ++b, ++i ) {
+ z[ i ] = static_cast< OutputType >( bb[ b ] );
+ }
+ } else {
+ for( size_t b = 0; b < Ring::blocksize; ++b, ++i ) {
+ z[ i ] = static_cast< OutputType >( zz[ b ] );
+ }
+ }
+ }
+
+ // perform tail
+ if( !a_scalar ) {
+ a += i;
+ }
+ if( !x_scalar ) {
+ x += i;
+ }
+ if( !y_scalar ) {
+ y += i;
+ }
+ z += i;
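+ // a, x, y, and z now point at the first element the blocked loop left
+ // untreated; the remaining end - i < Ring::blocksize elements are handled
+ // one by one below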
+ for( ; i < end; ++i ) {
+ // do multiply
+ const typename Ring::D1 &as = ( a_scalar )
+ ? static_cast< typename Ring::D1 >( a_wrapper.getValue() )
+ : static_cast< typename Ring::D1 >( *a );
+ const typename Ring::D2 &xs = ( x_scalar )
+ ? static_cast< typename Ring::D2 >( x_wrapper.getValue() )
+ : static_cast< typename Ring::D2 >( *x );
+ typename Ring::D4 ys = ( y_scalar )
+ ? static_cast< typename Ring::D4 >( y_wrapper.getValue() )
+ : static_cast< typename Ring::D4 >( *y );
+ typename Ring::D3 ts;
+
+ if( !y_zero ) {
+ RC always_succeeds = apply( ts, as, xs, ring.getMultiplicativeOperator() );
+ assert( always_succeeds == SUCCESS );
+ always_succeeds = foldr( ts, ys, ring.getAdditiveOperator() );
+ assert( always_succeeds == SUCCESS );
+#ifdef NDEBUG
+ (void) always_succeeds;
+#endif
+ } else {
+ RC always_succeeds = apply( ys, as, xs, ring.getMultiplicativeOperator() );
+ assert( always_succeeds == SUCCESS );
+#ifdef NDEBUG
+ (void) always_succeeds;
+#endif
+ }
+
+ // write out
+ if( assign_z ) {
+ *z = static_cast< OutputType >( ys );
+ } else {
+ RC always_succeeds = foldr( ys, *z, ring.getAdditiveOperator() );
+ assert( always_succeeds == SUCCESS );
+#ifdef NDEBUG
+ (void) always_succeeds;
+#endif
+ }
+
+ // move pointers
+ if( !a_scalar ) {
+ (void)a++;
+ }
+ if( !x_scalar ) {
+ (void)x++;
+ }
+ if( !y_scalar ) {
+ (void)y++;
+ }
+ (void)z++;
+ }
+
+ // done
+ return SUCCESS;
+ }
+
+ template<
+ Descriptor descr,
+ bool masked,
+ bool a_scalar,
+ bool x_scalar,
+ bool y_scalar,
+ bool y_zero,
+ typename MaskType,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename OutputType,
+ typename Coords
+ >
+ RC eWiseMulAdd_dispatch(
+ Vector< OutputType, nonblocking, Coords > &z_vector,
+ const Vector< MaskType, nonblocking, Coords > * const m_vector,
+ const internal::Wrapper< a_scalar, InputType1, Coords > &a_wrapper,
+ const internal::Wrapper< x_scalar, InputType2, Coords > &x_wrapper,
+ const internal::Wrapper< y_scalar, InputType3, Coords > &y_wrapper,
+ const size_t n,
+ const Ring &ring
+ ) {
+ static_assert( !y_zero || y_scalar, "If y is zero, y_scalar must be true. "
+ "Triggering this assertion indicates an incorrect call to this "
+ "function; please submit a bug report" );
+#ifdef _DEBUG
+ std::cout << "\t in eWiseMulAdd_dispatch\n";
+#endif
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func =
+ [&z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, &ring] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage eWiseMulAdd_dispatch in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_z, local_m, local_a, local_x, local_y;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_z_nz = local_n;
+ size_t local_m_nz = local_n;
+ size_t local_a_nz = local_n;
+ size_t local_x_nz = local_n;
+ size_t local_y_nz = local_n;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+ bool already_dense_output = true;
+ bool already_dense_mask = true;
+ bool already_dense_input_a = true;
+ bool already_dense_input_x = true;
+ bool already_dense_input_y = true;
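+ // the above flags are refined below; a vector the pipeline already knows
+ // to be dense needs no local coordinate view, so the kernels can skip all
+ // assigned()-checks for it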
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( z_vector ) );
+ if( !already_dense_output ) {
+#else
+ already_dense_output = false;
+#endif
+ local_z = internal::getCoordinates( z_vector ).asyncSubset( lower_bound,
+ upper_bound );
+ local_z_nz = local_z.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ if( masked ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_mask = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( *m_vector ) );
+ if( !already_dense_mask ) {
+#else
+ already_dense_mask = false;
+#endif
+ local_m = internal::getCoordinates( *m_vector ).asyncSubset(
+ lower_bound, upper_bound );
+ local_m_nz = local_m.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( !a_scalar ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_input_a = pipeline.containsAlreadyDenseVector(
+ a_wrapper.getCoordinates() );
+ if( !already_dense_input_a ) {
+#else
+ already_dense_input_a = false;
+#endif
+ local_a = a_wrapper.getCoordinates()->asyncSubset( lower_bound,
+ upper_bound );
+ local_a_nz = local_a.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( !x_scalar ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_input_x = pipeline.containsAlreadyDenseVector(
+ x_wrapper.getCoordinates() );
+ if( !already_dense_input_x ) {
+#else
+ already_dense_input_x = false;
+#endif
+ local_x = x_wrapper.getCoordinates()->asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( !y_scalar ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_input_y = pipeline.containsAlreadyDenseVector(
+ y_wrapper.getCoordinates() );
+ if( !already_dense_input_y ) {
+#else
+ already_dense_input_y = false;
+#endif
+ local_y = y_wrapper.getCoordinates()->asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+ }
+
+ // check whether we are in the sparse or dense case
+ const bool mask_is_dense = !masked || (
+ (descr & descriptors::structural) &&
+ !(descr & descriptors::invert_mask) &&
+ local_m_nz == local_n
+ );
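+ // (a structural, non-inverted mask with a nonzero at every local position
+ // lets every element through, and is hence equivalent to having no mask)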
+ const size_t z_nns = local_z_nz;
+
+ // the below Boolean shall be false only if the inputs a, x, and y generate
+ // a dense output vector while the output vector itself is either empty or
+ // already fully dense; only in that case can the dense variant of the
+ // eWiseMulAdd implementations be used
+ const bool sparse = ( a_scalar ? false : ( local_a_nz < local_n ) ) ||
+ ( x_scalar ? false : ( local_x_nz < local_n ) ) ||
+ ( y_scalar ? false : ( local_y_nz < local_n ) ) ||
+ ( z_nns > 0 && z_nns < local_n ) ||
+ ( masked && !mask_is_dense );
+ assert( !(sparse && dense_descr) );
+#ifdef _DEBUG
+ std::cout << "\t\t (sparse, dense)=(" << sparse << ", " << dense_descr
+ << ")\n";
+#endif
+ // pre-assign coordinates if the output is dense but was previously empty
+ const bool assign_z = z_nns == 0 && !sparse;
+
+ if( assign_z ) {
+#ifdef _DEBUG
+ std::cout << "\t\t detected output will be dense while "
+ << "the output vector presently is completely empty. We therefore "
+ << "pre-assign all output coordinates\n";
+#endif
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#endif
+ // the result will always be dense
+ local_z.local_assignAllNotAlreadyAssigned();
+ local_z_nz = local_z.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( !dense_descr && sparse ) {
+ // the below computes loop sizes multiplied with the number of vectors that
+ // each loop needs to touch. Possible vectors are: z, m, a, x, and y.
+ const size_t mask_factor = masked ? 1 : 0;
+ const size_t mul_loop_size = ( 3 + mask_factor ) * std::min(
+ ( a_scalar ? local_n : local_a_nz ),
+ ( x_scalar ? local_n : local_x_nz )
+ ) + ( y_zero ? 0 :
+ (2 + mask_factor) * ( y_scalar ? local_n : local_y_nz )
+ );
+#ifdef _DEBUG
+ std::cout << "\t\t mul_loop_size = " << mul_loop_size << "\n";
+#endif
+
+ const size_t mask_loop_size = ( y_zero ? 4 : 5 ) * local_m_nz;
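+ // e.g., for a masked call with nnz( a ) = nnz( x ) = n / 2 and a fully
+ // dense vector y, this gives mul_loop_size = 4 * ( n / 2 ) + 3 * n = 5n,
+ // to be compared against mask_loop_size = 5 * nnz( m ) below; the
+ // mask-driven kernel hence wins whenever nnz( m ) < n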
+
+ if( masked && mask_loop_size < mul_loop_size ) {
+#ifdef _DEBUG
+ std::cout << "\t\t mask_loop_size= " << mask_loop_size << "\n";
+ std::cout << "\t\t will be driven by output mask\n";
+#endif
+
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = boolean_dispatcher_sparse_eWiseMulAdd_maskDriven<
+#else
+ rc = sparse_eWiseMulAdd_maskDriven<
+#endif
+ descr, a_scalar, x_scalar, y_scalar, y_zero
+ >(
+ already_dense_output, already_dense_mask, already_dense_input_a,
+ already_dense_input_x, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_z, local_m, local_a, local_x, local_y,
+ z_vector, *m_vector, a_wrapper, x_wrapper, y_wrapper,
+ ring
+ );
+ } else {
+#ifdef _DEBUG
+ std::cout << "\t\t will be driven by the multiplication a*x\n";
+#endif
+ static_assert( !(a_scalar && x_scalar),
+ "The case of the multiplication being between two scalars should have "
+ "been caught earlier. Please submit a bug report." );
+
+ if( a_scalar ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = boolean_dispatcher_twoPhase_sparse_eWiseMulAdd_mulDriven<
+#else
+ rc = twoPhase_sparse_eWiseMulAdd_mulDriven<
+#endif
+ descr, masked, a_scalar, y_scalar, y_zero, true
+ >(
+ already_dense_output, already_dense_mask, already_dense_input_x,
+ already_dense_input_a, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_z, &local_m, local_x, local_a, local_y,
+ z_vector, m_vector, *(x_wrapper.getPointer()), a_wrapper, y_wrapper,
+ ring
+ );
+ } else if( x_scalar ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = boolean_dispatcher_twoPhase_sparse_eWiseMulAdd_mulDriven<
+#else
+ rc = twoPhase_sparse_eWiseMulAdd_mulDriven<
+#endif
+ descr, masked, x_scalar, y_scalar, y_zero, false
+ >(
+ already_dense_output, already_dense_mask, already_dense_input_a,
+ already_dense_input_x, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_z, &local_m, local_a, local_x, local_y,
+ z_vector, m_vector, *(a_wrapper.getPointer()), x_wrapper, y_wrapper,
+ ring
+ );
+ } else if( local_a_nz <= local_x_nz ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = boolean_dispatcher_twoPhase_sparse_eWiseMulAdd_mulDriven<
+#else
+ rc = twoPhase_sparse_eWiseMulAdd_mulDriven<
+#endif
+ descr, masked, x_scalar, y_scalar, y_zero, false
+ >(
+ already_dense_output, already_dense_mask, already_dense_input_a,
+ already_dense_input_x, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_z, &local_m, local_a, local_x, local_y,
+ z_vector, m_vector, *(a_wrapper.getPointer()), x_wrapper, y_wrapper,
+ ring
+ );
+ } else {
+ assert( local_x_nz < local_a_nz );
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = boolean_dispatcher_twoPhase_sparse_eWiseMulAdd_mulDriven<
+#else
+ rc = twoPhase_sparse_eWiseMulAdd_mulDriven<
+#endif
+ descr, masked, a_scalar, y_scalar, y_zero, true
+ >(
+ already_dense_output, already_dense_mask, already_dense_input_x,
+ already_dense_input_a, already_dense_input_y,
+ lower_bound, upper_bound,
+ local_z, &local_m, local_x, local_a, local_y,
+ z_vector, m_vector, *(x_wrapper.getPointer()), a_wrapper, y_wrapper,
+ ring
+ );
+ }
+ }
+ } else {
+ // all that remains is the dense case
+ assert( a_scalar || local_a_nz == local_n );
+ assert( x_scalar || local_x_nz == local_n );
+ assert( y_scalar || local_y_nz == local_n );
+ assert( !masked || mask_is_dense );
+ assert( local_z_nz == local_n );
+#ifdef _DEBUG
+ std::cout << "\t\t will perform a dense eWiseMulAdd\n";
+#endif
+ if( assign_z ) {
+ rc = dense_eWiseMulAdd<
+ descr, a_scalar, x_scalar, y_scalar, y_zero, true
+ >(
+ lower_bound, upper_bound,
+ z_vector, a_wrapper, x_wrapper, y_wrapper,
+ ring
+ );
+ } else {
+ rc = dense_eWiseMulAdd<
+ descr, a_scalar, x_scalar, y_scalar, y_zero, false
+ >(
+ lower_bound, upper_bound,
+ z_vector, a_wrapper, x_wrapper, y_wrapper,
+ ring
+ );
+ }
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#else
+ if( !already_dense_vectors ) {
+#endif
+ internal::getCoordinates( z_vector ).asyncJoinSubset( local_z,
+ lower_bound, upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_EWISEMULADD_DISPATCH,
+ n, sizeof( OutputType ), dense_descr, true,
+ &z_vector, nullptr, &internal::getCoordinates( z_vector ), nullptr,
+ masked ? m_vector : nullptr, a_wrapper.getPointer(),
+ x_wrapper.getPointer(), y_wrapper.getPointer(),
+ masked ? &internal::getCoordinates( *m_vector ) : nullptr,
+ a_wrapper.getCoordinates(), x_wrapper.getCoordinates(),
+ y_wrapper.getCoordinates(),
+ nullptr
+ );
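+ // note: the above lambda is not executed here; it is recorded as a stage
+ // of the nonblocking pipeline, while the vectors and coordinate instances
+ // passed to addStage allow the pipeline to track data dependences between
+ // stages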
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: eWiseMulAdd_dispatch"
+ << std::endl;
+#endif
+ return ret;
+ }
+
+ } // namespace internal
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename OutputType,
+ typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const InputType1 alpha,
+ const Vector< InputType2, nonblocking, Coords > &x,
+ const Vector< InputType3, nonblocking, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a left-hand scalar alpha of an element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a right-hand vector x with an element type that does not "
+ "match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, InputType3 >::value ),
+ "grb::eWiseMulAdd",
+ "called with an additive vector y with an element type that does not "
+ "match the fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, OutputType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a result vector z with an element type that does not match "
+ "the fourth domain of the given semiring" );
+
+ // dynamic sanity checks
+ const size_t n = size( z );
+ if( size( x ) != n || size( y ) != n ) {
+ return MISMATCH;
+ }
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ // catch trivial cases
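+ // if alpha equals the additive identity then alpha * x contributes
+ // nothing, since zero annihilates under multiplication and is the
+ // identity under addition; the call hence reduces to folding y into z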
+ const InputType1 zeroIT1 = ring.template getZero< InputType1 >();
+ if( alpha == zeroIT1 ) {
+ return foldl< descr >( z, y, ring.getAdditiveMonoid() );
+ }
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+
+ const internal::Wrapper< true, InputType1, Coords > a_wrapper( alpha );
+ const internal::Wrapper< false, InputType2, Coords > x_wrapper( x );
+ const internal::Wrapper< false, InputType3, Coords > y_wrapper( y );
+
+ // sparse or dense case
+ return internal::eWiseMulAdd_dispatch<
+ descr, false, true, false, false, false,
+ bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords
+ >( z, null_mask, a_wrapper, x_wrapper, y_wrapper, n, ring );
+ }
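+ // a minimal usage sketch of the above variant, assuming a plus-times
+ // semiring over doubles: grb::eWiseMulAdd( z, 2.0, x, y, ring ) computes
+ // z( i ) = 2.0 * x( i ) + y( i ), dispatching with a_scalar = true so
+ // that alpha is broadcast via an internal::Wrapper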
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename OutputType,
+ typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< InputType1, nonblocking, Coords > &a,
+ const InputType2 chi,
+ const Vector< InputType3, nonblocking, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a left-hand scalar alpha of an element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a right-hand vector x with an element type that does not "
+ "match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, InputType3 >::value ),
+ "grb::eWiseMulAdd",
+ "called with an additive vector y with an element type that does not "
+ "match the fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, OutputType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a result vector z with an element type that does not match "
+ "the fourth domain of the given semiring" );
+
+ // dynamic sanity checks
+ const size_t n = size( z );
+ if( size( a ) != n || size( y ) != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ // catch trivial cases
+ const InputType2 zeroIT2 = ring.template getZero< InputType2 >();
+ if( chi == zeroIT2 ) {
+ return foldl< descr >( z, y, ring.getAdditiveMonoid() );
+ }
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+
+ const internal::Wrapper< false, InputType1, Coords > a_wrapper( a );
+ const internal::Wrapper< true, InputType2, Coords > x_wrapper( chi );
+ const internal::Wrapper< false, InputType3, Coords > y_wrapper( y );
+
+ // sparse or dense case
+ return internal::eWiseMulAdd_dispatch<
+ descr, false, false, true, false, false,
+ bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords
+ >( z, null_mask, a_wrapper, x_wrapper, y_wrapper, n, ring );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ bool y_zero = false,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename OutputType,
+ typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< InputType1, nonblocking, Coords > &a,
+ const Vector< InputType2, nonblocking, Coords > &x,
+ const InputType3 gamma,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a left-hand scalar alpha of an element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a right-hand vector x with an element type that does not "
+ "match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, InputType3 >::value ),
+ "grb::eWiseMulAdd",
+ "called with an additive vector y with an element type that does not "
+ "match the fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, OutputType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a result vector z with an element type that does not match "
+ "the fourth domain of the given semiring" );
+
+ // dynamic sanity checks
+ const size_t n = size( z );
+ if( size( a ) != n || size( x ) != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+
+ const internal::Wrapper< false, InputType1, Coords > a_wrapper( a );
+ const internal::Wrapper< false, InputType2, Coords > x_wrapper( x );
+ const internal::Wrapper< true, InputType3, Coords > y_wrapper( gamma );
+
+ // sparse or dense case
+ return internal::eWiseMulAdd_dispatch<
+ descr, false, false, false, true, y_zero,
+ bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords
+ >( z, null_mask, a_wrapper, x_wrapper, y_wrapper, n, ring );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ bool y_zero = false,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename OutputType,
+ typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< InputType1, nonblocking, Coords > &a,
+ const InputType2 beta,
+ const InputType3 gamma,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a left-hand scalar alpha of an element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a right-hand vector x with an element type that does not "
+ "match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, InputType3 >::value ),
+ "grb::eWiseMulAdd",
+ "called with an additive vector y with an element type that does not "
+ "match the fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, OutputType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a result vector z with an element type that does not match "
+ "the fourth domain of the given semiring" );
+
+ // dynamic sanity checks
+ const size_t n = size( z );
+ if( size( a ) != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ // catch trivial dispatches
+ const InputType2 zeroIT2 = ring.template getZero< InputType2 >();
+ if( beta == zeroIT2 ) {
+ return foldl< descr >( z, gamma, ring.getAdditiveMonoid() );
+ }
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+
+ const internal::Wrapper< false, InputType1, Coords > a_wrapper( a );
+ const internal::Wrapper< true, InputType2, Coords > x_wrapper( beta );
+ const internal::Wrapper< true, InputType3, Coords > y_wrapper( gamma );
+
+ // sparse or dense case
+ return internal::eWiseMulAdd_dispatch<
+ descr, false, false, true, true, y_zero,
+ bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords
+ >( z, null_mask, a_wrapper, x_wrapper, y_wrapper, n, ring );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ bool y_zero = false,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename OutputType,
+ typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const InputType1 alpha,
+ const Vector< InputType2, nonblocking, Coords > &x,
+ const InputType3 gamma,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a left-hand scalar alpha of an element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a right-hand vector x with an element type that does not "
+ "match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, InputType3 >::value ),
+ "grb::eWiseMulAdd",
+ "called with an additive vector y with an element type that does not "
+ "match the fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, OutputType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a result vector z with an element type that does not match "
+ "the fourth domain of the given semiring" );
+
+ // dynamic sanity checks
+ const size_t n = size( z );
+ if( size( x ) != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ // catch trivial cases
+ const InputType1 zeroIT1 = ring.template getZero< InputType1 >();
+ if( alpha == zeroIT1 ) {
+ return foldl< descr >( z, gamma, ring.getAdditiveMonoid() );
+ }
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+
+ const internal::Wrapper< true, InputType1, Coords > a_wrapper( alpha );
+ const internal::Wrapper< false, InputType2, Coords > x_wrapper( x );
+ const internal::Wrapper< true, InputType3, Coords > y_wrapper( gamma );
+
+ // sparse or dense case
+ return internal::eWiseMulAdd_dispatch<
+ descr, false, true, false, true, y_zero,
+ bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords
+ >( z, null_mask, a_wrapper, x_wrapper, y_wrapper, n, ring );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Vector< InputType3, nonblocking, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMulAdd(vector,scalar,scalar,scalar)",
+ "First domain of semiring does not match first input type" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMulAdd(vector,scalar,scalar,scalar)",
+ "Second domain of semiring does not match second input type" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, InputType3 >::value ),
+ "grb::eWiseMulAdd(vector,scalar,scalar,scalar)",
+ "Fourth domain of semiring does not match third input type" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, OutputType >::value ),
+ "grb::eWiseMulAdd(vector,scalar,scalar,scalar)",
+ "Fourth domain of semiring does not match output type" );
+#ifdef _DEBUG
+ std::cout << "eWiseMulAdd (nonblocking, vector <- scalar x scalar + vector) "
+ << "precomputes scalar multiply and dispatches to eWiseAdd (nonblocking, "
+ << "vector <- scalar + vector)\n";
+#endif
+ // dynamic checks
+ const size_t n = size( z );
+ if( size( y ) != n ) { return MISMATCH; }
+
+ typename Ring::D3 mul_result;
+ RC rc = grb::apply( mul_result, alpha, beta,
+ ring.getMultiplicativeOperator() );
+#ifdef NDEBUG
+ (void) rc;
+#else
+ assert( rc == SUCCESS );
+#endif
+ return eWiseAdd< descr >( z, mul_result, y, ring, phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const InputType3 gamma,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMulAdd(vector,scalar,scalar,scalar)",
+ "First domain of semiring does not match first input type" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMulAdd(vector,scalar,scalar,scalar)",
+ "Second domain of semiring does not match second input type" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, InputType3 >::value ),
+ "grb::eWiseMulAdd(vector,scalar,scalar,scalar)",
+ "Fourth domain of semiring does not match third input type" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, OutputType >::value ),
+ "grb::eWiseMulAdd(vector,scalar,scalar,scalar)",
+ "Fourth domain of semiring does not match output type" );
+#ifdef _DEBUG
+ std::cout << "eWiseMulAdd (nonblocking, vector <- scalar x scalar + scalar) "
+ << "precomputes scalar operations and dispatches to set (nonblocking)\n";
+#endif
+ typename Ring::D3 mul_result;
+ RC rc = grb::apply( mul_result, alpha, beta,
+ ring.getMultiplicativeOperator() );
+#ifdef NDEBUG
+ (void) rc;
+#else
+ assert( rc == SUCCESS );
+#endif
+ typename Ring::D4 add_result;
+ rc = grb::apply( add_result, mul_result, gamma, ring.getAdditiveOperator() );
+#ifdef NDEBUG
+ (void) rc;
+#else
+ assert( rc == SUCCESS );
+#endif
+ return grb::foldl< descr >( z, add_result, ring.getAdditiveMonoid(), phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename OutputType,
+ typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< InputType1, nonblocking, Coords > &a,
+ const Vector< InputType2, nonblocking, Coords > &x,
+ const Vector< InputType3, nonblocking, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a left-hand vector a with an element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a right-hand vector x with an element type that does not "
+ "match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, InputType3 >::value ),
+ "grb::eWiseMulAdd",
+ "called with an additive vector y with an element type that does not "
+ "match the fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, OutputType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a result vector z with an element type that does not match "
+ "the fourth domain of the given semiring" );
+
+ // dynamic sanity checks
+ const size_t n = size( z );
+ if( size( x ) != n || size( y ) != n || size( a ) != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ const Vector< bool, nonblocking, Coords > * const null_mask = nullptr;
+
+ const internal::Wrapper< false, InputType1, Coords > a_wrapper( a );
+ const internal::Wrapper< false, InputType2, Coords > x_wrapper( x );
+ const internal::Wrapper< false, InputType3, Coords > y_wrapper( y );
+
+ // sparse or dense case
+ return internal::eWiseMulAdd_dispatch<
+ descr, false, false, false, false, false,
+ bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords
+ >( z, null_mask, a_wrapper, x_wrapper, y_wrapper, n, ring );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename OutputType,
+ typename MaskType,
+ typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const InputType1 alpha,
+ const Vector< InputType2, nonblocking, Coords > &x,
+ const Vector< InputType3, nonblocking, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value &&
+ !grb::is_object< MaskType >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a left-hand scalar alpha of an element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a right-hand vector x with an element type that does not "
+ "match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, InputType3 >::value ),
+ "grb::eWiseMulAdd",
+ "called with an additive vector y with an element type that does not "
+ "match the fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, OutputType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a result vector z with an element type that does not match "
+ "the fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a mask vector m with a non-bool element type" );
+
+ // catch empty mask
+ if( size( m ) == 0 ) {
+ return eWiseMulAdd< descr >( z, alpha, x, y, ring, phase );
+ }
+
+ // dynamic sanity checks
+ const size_t n = size( z );
+ if( size( x ) != n || size( y ) != n || size( m ) != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ // catch trivial cases
+ const InputType1 zeroIT1 = ring.template getZero< InputType1 >();
+ if( alpha == zeroIT1 ) {
+ return foldl< descr >( z, m, y, ring.getAdditiveMonoid() );
+ }
+
+ const internal::Wrapper< true, InputType1, Coords > a_wrapper( alpha );
+ const internal::Wrapper< false, InputType2, Coords > x_wrapper( x );
+ const internal::Wrapper< false, InputType3, Coords > y_wrapper( y );
+
+ // sparse or dense case
+ return internal::eWiseMulAdd_dispatch<
+ descr, true, true, false, false, false,
+ bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords
+ >( z, &m, a_wrapper, x_wrapper, y_wrapper, n, ring );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename OutputType,
+ typename MaskType,
+ typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const Vector< InputType1, nonblocking, Coords > &a,
+ const InputType2 chi,
+ const Vector< InputType3, nonblocking, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value &&
+ !grb::is_object< MaskType >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a left-hand scalar alpha of an element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a right-hand vector x with an element type that does not "
+ "match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, InputType3 >::value ),
+ "grb::eWiseMulAdd",
+ "called with an additive vector y with an element type that does not "
+ "match the fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, OutputType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a result vector z with an element type that does not match "
+ "the fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a mask vector m with a non-bool element type" );
+
+ // catch empty mask
+ if( size( m ) == 0 ) {
+ return eWiseMulAdd< descr >( z, a, chi, y, ring, phase );
+ }
+
+ // dynamic sanity checks
+ const size_t n = size( z );
+ if( size( a ) != n || size( y ) != n || size( m ) != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ // catch trivial cases
+ const InputType2 zeroIT2 = ring.template getZero< InputType2 >();
+ if( chi == zeroIT2 ) {
+ return foldl< descr >( z, m, y, ring.getAdditiveMonoid() );
+ }
+
+ const internal::Wrapper< false, InputType1, Coords > a_wrapper( a );
+ const internal::Wrapper< true, InputType2, Coords > x_wrapper( chi );
+ const internal::Wrapper< false, InputType3, Coords > y_wrapper( y );
+
+ // sparse or dense case
+ return internal::eWiseMulAdd_dispatch<
+ descr, true, false, true, false, false,
+ bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords
+ >( z, &m, a_wrapper, x_wrapper, y_wrapper, n, ring );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ bool y_zero = false,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename OutputType,
+ typename MaskType,
+ typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const Vector< InputType1, nonblocking, Coords > &a,
+ const Vector< InputType2, nonblocking, Coords > &x,
+ const InputType3 gamma,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value &&
+ !grb::is_object< MaskType >::value, void
+ >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a left-hand scalar alpha of an element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a right-hand vector x with an element type that does not "
+ "match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, InputType3 >::value ),
+ "grb::eWiseMulAdd",
+ "called with an additive vector y with an element type that does not "
+ "match the fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, OutputType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a result vector z with an element type that does not match "
+ "the fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a mask vector m with a non-bool element type" );
+
+ // catch empty mask
+ if( size( m ) == 0 ) {
+ return eWiseMulAdd< descr, y_zero >( z, a, x, gamma, ring, phase );
+ }
+
+ // dynamic sanity checks
+ const size_t n = size( z );
+ if( size( a ) != n || size( x ) != n || size( m ) != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ const internal::Wrapper< false, InputType1, Coords > a_wrapper( a );
+ const internal::Wrapper< false, InputType2, Coords > x_wrapper( x );
+ const internal::Wrapper< true, InputType3, Coords > y_wrapper( gamma );
+
+ // sparse or dense case
+ return internal::eWiseMulAdd_dispatch<
+ descr, true, false, false, true, y_zero,
+ bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords
+ >( z, &m, a_wrapper, x_wrapper, y_wrapper, n, ring );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ bool y_zero = false,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename OutputType,
+ typename MaskType,
+ typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const Vector< InputType1, nonblocking, Coords > &a,
+ const InputType2 beta,
+ const InputType3 gamma,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value &&
+ !grb::is_object< MaskType >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a left-hand scalar alpha of an element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a right-hand vector x with an element type that does not "
+ "match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, InputType3 >::value ),
+ "grb::eWiseMulAdd",
+ "called with an additive vector y with an element type that does not "
+ "match the fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, OutputType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a result vector z with an element type that does not match "
+ "the fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a mask vector m with a non-bool element type" );
+
+ // catch empty mask
+ if( size( m ) == 0 ) {
+ return eWiseMulAdd< descr, y_zero >( z, a, beta, gamma, ring, phase );
+ }
+
+ // dynamic sanity checks
+ const size_t n = size( z );
+ if( size( a ) != n || size( m ) != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ // catch trivial dispatch
+ const InputType2 zeroIT2 = ring.template getZero< InputType2 >();
+ if( zeroIT2 == beta ) {
+#ifdef _DEBUG
+ std::cout << "eWiseMulAdd (nonblocking, masked, vector<-vector<-scalar<-"
+ << "scalar) dispatches to foldl\n";
+#endif
+ return foldl< descr >( z, m, gamma, ring.getAdditiveMonoid() );
+ }
+
+ const internal::Wrapper< false, InputType1, Coords > a_wrapper( a );
+ const internal::Wrapper< true, InputType2, Coords > x_wrapper( beta );
+ const internal::Wrapper< true, InputType3, Coords > y_wrapper( gamma );
+
+ // sparse or dense case
+ return internal::eWiseMulAdd_dispatch<
+ descr, true, false, true, true, y_zero,
+ bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords
+ >( z, &m, a_wrapper, x_wrapper, y_wrapper, n, ring );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ bool y_zero = false,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename OutputType,
+ typename MaskType,
+ typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const InputType1 alpha,
+ const Vector< InputType2, nonblocking, Coords > &x,
+ const InputType3 gamma,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value &&
+ !grb::is_object< MaskType >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a left-hand scalar alpha of an element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a right-hand vector x with an element type that does not "
+ "match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, InputType3 >::value ),
+ "grb::eWiseMulAdd",
+ "called with an additive vector y with an element type that does not "
+ "match the fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, OutputType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a result vector z with an element type that does not match "
+ "the fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a mask vector m with a non-bool element type" );
+
+ // catch empty mask
+ if( size( m ) == 0 ) {
+ return eWiseMulAdd< descr, y_zero >( z, alpha, x, gamma, ring, phase );
+ }
+
+ // dynamic sanity checks
+ const size_t n = size( z );
+ if( size( x ) != n || size( m ) != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ // catch trivial dispatch
+ const InputType1 zeroIT1 = ring.template getZero< InputType1 >();
+ if( alpha == zeroIT1 ) {
+#ifdef _DEBUG
+ std::cout << "eWiseMulAdd (nonblocking, masked, vector<-scalar<-scalar<-"
+ << "scalar) dispatches to foldl\n";
+#endif
+ return foldl< descr >( z, m, gamma, ring.getAdditiveMonoid() );
+ }
+
+ const internal::Wrapper< true, InputType1, Coords > a_wrapper( alpha );
+ const internal::Wrapper< false, InputType2, Coords > x_wrapper( x );
+ const internal::Wrapper< true, InputType3, Coords > y_wrapper( gamma );
+
+ // sparse or dense case
+ return internal::eWiseMulAdd_dispatch<
+ descr, true, true, false, true, y_zero,
+ bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords
+ >( z, &m, a_wrapper, x_wrapper, y_wrapper, n, ring );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename OutputType,
+ typename MaskType,
+ typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const Vector< InputType1, nonblocking, Coords > &a,
+ const Vector< InputType2, nonblocking, Coords > &x,
+ const Vector< InputType3, nonblocking, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value &&
+ !grb::is_object< MaskType >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a left-hand vector a with an element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMulAdd",
+ "called with a right-hand vector x with an element type that does not "
+ "match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, InputType3 >::value ),
+ "grb::eWiseMulAdd",
+ "called with an additive vector y with an element type that does not "
+ "match the fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, OutputType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a result vector z with an element type that does not match "
+ "the fourth domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a mask vector m with a non-bool element type" );
+
+ // catch empty mask
+ if( size( m ) == 0 ) {
+ return eWiseMulAdd< descr >( z, a, x, y, ring, phase );
+ }
+
+ // dynamic sanity checks
+ const size_t n = size( z );
+ if( size( x ) != n || size( y ) != n || size( a ) != n || size( m ) != n ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ const internal::Wrapper< false, InputType1, Coords > a_wrapper( a );
+ const internal::Wrapper< false, InputType2, Coords > x_wrapper( x );
+ const internal::Wrapper< false, InputType3, Coords > y_wrapper( y );
+
+ // sparse or dense case
+ return internal::eWiseMulAdd_dispatch<
+ descr, true, false, false, false, false,
+ bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords
+ >( z, &m, a_wrapper, x_wrapper, y_wrapper, n, ring );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Vector< InputType3, nonblocking, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value &&
+ !grb::is_object< MaskType >::value, void
+ >::type * const = nullptr
+ ) {
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMulAdd(vector,scalar,scalar,scalar)",
+ "First domain of semiring does not match first input type" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMulAdd(vector,scalar,scalar,scalar)",
+ "Second domain of semiring does not match second input type" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, InputType3 >::value ),
+ "grb::eWiseMulAdd(vector,scalar,scalar,scalar)",
+ "Fourth domain of semiring does not match third input type" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, OutputType >::value ),
+ "grb::eWiseMulAdd(vector,scalar,scalar,scalar)",
+ "Fourth domain of semiring does not match output type" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a mask vector with a non-bool element type" );
+#ifdef _DEBUG
+ std::cout << "eWiseMulAdd (nonblocking, vector <- scalar x scalar + vector, "
+ << "masked) precomputes scalar multiply and dispatches to eWiseAdd "
+ << "(nonblocking, vector <- scalar + vector, masked)\n";
+#endif
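+ // note (illustrative): since both alpha and beta are scalars, the
+ // multiplicative part reduces to a single application of the ring's
+ // multiplicative operator, after which this primitive dispatches to exactly
+ // eWiseAdd( z, m, alpha * beta, y ); e.g., under a plus-times semiring over
+ // doubles with alpha = 2 and beta = 3, the constant 6 takes the place of
+ // the left-hand input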
+ // dynamic checks
+ const size_t n = size( z );
+ if( size( m ) != n || size( y ) != n ) {
+ return MISMATCH;
+ }
+
+ typename Ring::D3 mul_result;
+ RC rc = grb::apply( mul_result, alpha, beta,
+ ring.getMultiplicativeOperator() );
+#ifdef NDEBUG
+ (void) rc;
+#else
+ assert( rc == SUCCESS );
+#endif
+ return grb::eWiseAdd< descr >( z, m, mul_result, y, ring, phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename Coords
+ >
+ RC eWiseMulAdd(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const InputType3 gamma,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMulAdd(vector,scalar,scalar,scalar)",
+ "First domain of semiring does not match first input type" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMulAdd(vector,scalar,scalar,scalar)",
+ "Second domain of semiring does not match second input type" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, InputType3 >::value ),
+ "grb::eWiseMulAdd(vector,scalar,scalar,scalar)",
+ "Fourth domain of semiring does not match third input type" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D4, OutputType >::value ),
+ "grb::eWiseMulAdd(vector,scalar,scalar,scalar)",
+ "Fourth domain of semiring does not match output type" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a mask vector with a non-bool element type" );
+#ifdef _DEBUG
+ std::cout << "eWiseMulAdd (nonblocking, vector <- scalar x scalar + scalar, "
+ << "masked) precomputes scalar operations and dispatches to foldl "
+ << "(nonblocking, masked)\n";
+#endif
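+ // note: all three inputs are scalars, so both semiring applications happen
+ // once below, and the only vector work left is a single masked foldl of
+ // the constant (alpha * beta) + gamma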
+ // dynamic checks
+ const size_t n = size( z );
+ if( size( m ) != n ) {
+ return MISMATCH;
+ }
+
+ typename Ring::D3 mul_result;
+ RC rc = grb::apply( mul_result, alpha, beta,
+ ring.getMultiplicativeOperator() );
+ assert( rc == SUCCESS );
+#ifdef NDEBUG
+ (void) rc;
+#endif
+ typename Ring::D4 add_result;
+ rc = grb::apply( add_result, mul_result, gamma, ring.getAdditiveOperator() );
+ assert( rc == SUCCESS );
+#ifdef NDEBUG
+ (void) rc;
+#endif
+ return grb::foldl( z, m, add_result, ring.getAdditiveMonoid(), phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename OutputType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const Ring & ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMul",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMul",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ),
+ "grb::eWiseMul",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+
+ // dynamic checks
+ const size_t n = size( z );
+ if( size( x ) != n || size( y ) != n ) {
+ return MISMATCH;
+ }
+
+ // check trivial phase
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+
+#ifdef _DEBUG
+ std::cout << "eWiseMul (nonblocking, vector <- vector x vector) dispatches "
+ << "to eWiseMulAdd (vector <- vector x vector + 0)\n";
+#endif
+ return eWiseMulAdd< descr, true >(
+ z, x, y, ring.template getZero< typename Ring::D4 >(), ring, phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename OutputType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const InputType1 alpha,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMul",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMul",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ),
+ "grb::eWiseMul",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+
+ // dynamic checks
+ const size_t n = size( z );
+ if( size( y ) != n ) { return MISMATCH; }
+
+ // check for trivial phase
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+
+ // check trivial
+ if( alpha == ring.template getZero< typename Ring::D1 >() ) {
+ return SUCCESS;
+ }
+
+#ifdef _DEBUG
+ std::cout << "eWiseMul (nonblocking, vector <- scalar x vector) dispatches "
+ << "to eWiseMulAdd (vector <- scalar x vector + 0)\n";
+#endif
+ return eWiseMulAdd< descr, true >(
+ z, alpha, y, ring.template getZero< typename Ring::D4 >(), ring, phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename OutputType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMul",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMul",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ),
+ "grb::eWiseMul",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+
+ // dynamic checks
+ const size_t n = size( z );
+ if( size( x ) != n ) {
+ return MISMATCH;
+ }
+
+ // catch trivial phase
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+
+ // check trivial
+ if( beta == ring.template getZero< typename Ring::D2 >() ) {
+ return SUCCESS;
+ }
+
+#ifdef _DEBUG
+ std::cout << "eWiseMul (nonblocking) dispatches to eWiseMulAdd with 0.0 as "
+ << "additive scalar\n";
+#endif
+
+ return eWiseMulAdd< descr, true >(
+ z, x, beta, ring.template getZero< typename Ring::D4 >(), ring, phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename OutputType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMul",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMul",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ),
+ "grb::eWiseMul",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+
+ // check for trivial phase
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+
+ // check trivial
+ if( alpha == ring.template getZero< typename Ring::D1 >() ) {
+ return SUCCESS;
+ }
+ if( beta == ring.template getZero< typename Ring::D2 >() ) {
+ return SUCCESS;
+ }
+
+#ifdef _DEBUG
+ std::cout << "eWiseMul (nonblocking) dispatches to scalar apply and foldl\n";
+#endif
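+ // e.g. (illustrative): under a plus-times semiring over doubles with
+ // alpha = 2 and beta = 3, the code below folds the constant 6 into z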
+ typename Ring::D3 temp;
+ RC always_success = apply( temp, alpha, beta,
+ ring.getMultiplicativeOperator() );
+ assert( always_success == SUCCESS );
+#ifdef NDEBUG
+ (void) always_success;
+#endif
+ return foldl< descr >( z, temp, ring.getAdditiveMonoid(), phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename OutputType,
+ typename MaskType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< MaskType >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMul",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMul",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ),
+ "grb::eWiseMul",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a mask vector with a non-bool element type" );
+
+ // check for empty mask
+ if( size( m ) == 0 ) {
+ return eWiseMul< descr >( z, x, y, ring, phase );
+ }
+
+ // dynamic checks
+ const size_t n = size( z );
+ if( size( m ) != n || size( x ) != n || size( y ) != n ) {
+ return MISMATCH;
+ }
+
+ // check trivial phase
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+
+#ifdef _DEBUG
+ std::cout << "eWiseMul (nonblocking, vector <- vector x vector, masked) "
+ << "dispatches to eWiseMulAdd (vector <- vector x vector + 0, masked)\n";
+#endif
+ return eWiseMulAdd< descr, true >(
+ z, m, x, y, ring.template getZero< typename Ring::D4 >(), ring, phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename OutputType,
+ typename MaskType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const InputType1 alpha,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< MaskType >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMul",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMul",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ),
+ "grb::eWiseMul",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a mask vector _m with a non-bool element type" );
+
+ // check for empty mask
+ if( size( m ) == 0 ) {
+ return eWiseMul< descr >( z, alpha, y, ring, phase );
+ }
+
+ // dynamic checks
+ const size_t n = size( z );
+ if( size( m ) != n || size( y ) != n ) { return MISMATCH; }
+
+ // check for trivial phase
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+
+ // check trivial
+ if( alpha == ring.template getZero< typename Ring::D1 >() ) {
+ return SUCCESS;
+ }
+
+#ifdef _DEBUG
+ std::cout << "eWiseMul (nonblocking, vector <- scalar x vector, masked) "
+ << "dispatches to eWiseMulAdd (vector <- scalar x vector + 0, masked)\n";
+#endif
+ return eWiseMulAdd< descr, true >(
+ z, m, alpha, y, ring.template getZero< typename Ring::D4 >(), ring, phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename OutputType,
+ typename MaskType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< MaskType >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMul",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMul",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ),
+ "grb::eWiseMul",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a mask vector _m with a non-bool element type" );
+
+ // check for empty mask
+ if( size( m ) == 0 ) {
+ return eWiseMul< descr >( z, x, beta, ring, phase );
+ }
+
+ // dynamic checks
+ const size_t n = size( z );
+ if( size( m ) != n || size( x ) != n ) { return MISMATCH; }
+
+ // check for trivial phase
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+
+ // check trivial
+ if( beta == ring.template getZero< typename Ring::D2 >() ) {
+ return SUCCESS;
+ }
+
+#ifdef _DEBUG
+ std::cout << "eWiseMul (nonblocking, masked) dispatches to masked "
+ << "eWiseMulAdd with 0.0 as additive scalar\n";
+#endif
+ return eWiseMulAdd< descr, true >(
+ z, m, x, beta, ring.template getZero< typename Ring::D4 >(), ring, phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename InputType1,
+ typename InputType2,
+ typename OutputType,
+ typename MaskType,
+ typename Coords
+ >
+ RC eWiseMul(
+ Vector< OutputType, nonblocking, Coords > &z,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const InputType1 alpha,
+ const InputType2 beta,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< MaskType >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D1, InputType1 >::value ),
+ "grb::eWiseMul",
+ "called with a left-hand side input vector with element type that does not "
+ "match the first domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D2, InputType2 >::value ),
+ "grb::eWiseMul",
+ "called with a right-hand side input vector with element type that does "
+ "not match the second domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Ring::D3, OutputType >::value ),
+ "grb::eWiseMul",
+ "called with an output vector with element type that does not match the "
+ "third domain of the given semiring" );
+ NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ),
+ "grb::eWiseMulAdd",
+ "called with a mask vector _m with a non-bool element type" );
+
+ // check for empty mask
+ if( size( m ) == 0 ) {
+ return eWiseMul< descr >( z, alpha, beta, ring, phase );
+ }
+
+ // dynamic checks
+ const size_t n = size( z );
+ if( size( m ) != n ) { return MISMATCH; }
+
+ // check for trivial phase
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+
+ // check trivial
+ if( alpha == ring.template getZero< typename Ring::D1 >() ) {
+ return SUCCESS;
+ }
+ if( beta == ring.template getZero< typename Ring::D2 >() ) {
+ return SUCCESS;
+ }
+
+#ifdef _DEBUG
+ std::cout << "eWiseMul (nonblocking, masked) dispatches to masked foldl\n";
+#endif
+ typename Ring::D3 temp;
+ const RC always_success = apply( temp, alpha, beta,
+ ring.getMultiplicativeOperator() );
+ assert( always_success == SUCCESS );
+#ifdef NDEBUG
+ (void) always_success;
+#endif
+ return foldl< descr >( z, m, temp, ring.getAdditiveMonoid(), EXECUTE );
+ }
+
+ // internal namespace for implementation of grb::dot
+ namespace internal {
+
+ template<
+ Descriptor descr,
+#ifdef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_input_x,
+ bool already_dense_input_y,
+#endif
+ class AddMonoid,
+ class AnyOp,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC sparse_dot_generic(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_input_x,
+ bool already_dense_input_y,
+#endif
+ typename AddMonoid::D3 &thread_local_output,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ const Coords &local_x,
+ const Coords &local_y,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const size_t local_nz,
+ const AddMonoid &addMonoid,
+ const AnyOp &anyOp
+ ) {
+#ifdef _DEBUG
+ std::cout << "\t\t in sparse variant, nonzero range " << lower_bound << "--"
+ << upper_bound << ", blocksize " << AnyOp::blocksize << "\n";
+#else
+ (void) upper_bound;
+#endif
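+ // the hot loop below processes AnyOp::blocksize elements at a time: it
+ // first computes a boolean mask recording which enumerated nonzeroes also
+ // have an assigned counterpart in the other vector, then performs masked
+ // loads and masked multiplications, and finally reduces the block into the
+ // thread-local accumulator; a scalar loop handles any remainder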
+
+ // get raw alias
+ const InputType1 * __restrict__ a = internal::getRaw( x );
+ const InputType2 * __restrict__ b = internal::getRaw( y );
+
+ size_t i = 0;
+ if( local_nz > 0 ) {
+ while( i + AnyOp::blocksize < local_nz ) {
+ // declare buffers
+ static_assert( AnyOp::blocksize > 0,
+ "Configuration error: vectorisation blocksize set to 0!" );
+ typename AnyOp::D1 xx[ AnyOp::blocksize ];
+ typename AnyOp::D2 yy[ AnyOp::blocksize ];
+ typename AnyOp::D3 zz[ AnyOp::blocksize ];
+ bool mask[ AnyOp::blocksize ];
+
+ // prepare registers
+ for( size_t k = 0; k < AnyOp::blocksize; ++k, ++i ) {
+ mask[ k ] = already_dense_input_x ||
+ local_x.assigned( already_dense_input_y ? i : local_y.index( i ) );
+ }
+
+ // rewind
+ i -= AnyOp::blocksize;
+
+ // do masked load
+ for( size_t k = 0; k < AnyOp::blocksize; ++k, ++i ) {
+ if( mask[ k ] ) {
+ xx[ k ] = static_cast< typename AnyOp::D1 >(
+ a[ ( already_dense_input_y ? i : local_y.index( i ) ) + lower_bound ] );
+ yy[ k ] = static_cast< typename AnyOp::D2 >(
+ b[ ( already_dense_input_y ? i : local_y.index( i ) ) + lower_bound ] );
+ }
+ }
+
+ // perform element-wise multiplication
+ if( internal::maybe_noop< AnyOp >::value ) {
+ // we are forced to first initialise zz before doing masked apply
+ for( size_t k = 0; k < AnyOp::blocksize; ++k ) {
+ zz[ k ] = addMonoid.template getIdentity< typename AnyOp::D3 >();
+ }
+ for( size_t k = 0; k < AnyOp::blocksize; ++k ) {
+ if( mask[ k ] ) {
+ // xx and yy cannot be used uninitialised here: had they been, the mask
+ // would have been false; zz was initialised just above
+ GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED
+ apply( zz[ k ], xx[ k ], yy[ k ], anyOp );
+ GRB_UTIL_RESTORE_WARNINGS
+ }
+ }
+ } else {
+ // if apply surely initialises zz, we could use a blend-like op
+ for( size_t k = 0; k < AnyOp::blocksize; ++k ) {
+ if( mask[ k ] ) {
+ apply( zz[ k ], xx[ k ], yy[ k ], anyOp );
+ } else {
+ zz[ k ] = addMonoid.template getIdentity< typename AnyOp::D3 >();
+ }
+ }
+ }
+
+ // perform reduction into output element
+ addMonoid.getOperator().foldlArray( thread_local_output, zz,
+ AnyOp::blocksize );
+ //^--> note that this foldl operates on raw arrays,
+ // and thus should not be mistaken with a foldl
+ // on a grb::Vector.
+ }
+
+ // perform element-by-element updates for remainder (if any)
+ for( ; i < local_nz; ++i ) {
+ typename AddMonoid::D3 temp =
+ addMonoid.template getIdentity< typename AddMonoid::D3 >();
+ const size_t index = ( already_dense_input_y ? i : local_y.index( i ) ) +
+ lower_bound;
+ if( already_dense_input_x || local_x.assigned( index - lower_bound ) ) {
+ apply( temp, a[ index ], b[ index ], anyOp );
+ foldr( temp, thread_local_output, addMonoid.getOperator() );
+ }
+ }
+ }
+
+ return SUCCESS;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class AddMonoid,
+ class AnyOp,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC dot_generic(
+ OutputType &z,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const AddMonoid &addMonoid,
+ const AnyOp &anyOp,
+ const Phase &phase
+ ) {
+ const size_t n = internal::getCoordinates( x ).size();
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ RC ret = SUCCESS;
+
+ const size_t start = 0;
+ const size_t end = n;
+
+ if( end > start ) {
+
+ typename AddMonoid::D3 reduced =
+ addMonoid.template getIdentity< typename AddMonoid::D3 >();
+
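+ // one partial result per hardware thread, strided by the cache line size
+ // so that concurrent updates do not incur false sharing; note that the
+ // runtime-sized array below relies on the compiler supporting VLAs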
+ size_t reduced_size = sysconf( _SC_NPROCESSORS_ONLN ) *
+ config::CACHE_LINE_SIZE::value();
+ typename AddMonoid::D3 array_reduced[ reduced_size ];
+
+ for(
+ size_t i = 0;
+ i < reduced_size;
+ i += config::CACHE_LINE_SIZE::value()
+ ) {
+ array_reduced[ i ] =
+ addMonoid.template getIdentity< typename AddMonoid::D3 >();
+ }
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func =
+ [&x, &y, &addMonoid, &anyOp, &array_reduced] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage dot-generic in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_x, local_y;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_x_nz = local_n;
+ size_t local_y_nz = local_n;
+ bool sparse = false;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+ bool already_dense_input_x = true;
+ bool already_dense_input_y = true;
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_input_x = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_input_x ) {
+#else
+ already_dense_input_x = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset(
+ lower_bound, upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( local_x_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input_y = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_input_y ) {
+#else
+ already_dense_input_y = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+ if( local_y_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ unsigned int thread_id =
+ omp_get_thread_num() * config::CACHE_LINE_SIZE::value();
+
+ if( sparse ) {
+ if( local_x_nz < local_y_nz ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_sparse_dot_generic<
+#else
+ rc = internal::sparse_dot_generic<
+#endif
+ descr, AddMonoid, AnyOp, InputType1, InputType2, Coords
+ >(
+ already_dense_input_y, already_dense_input_x,
+ array_reduced[ thread_id ],
+ lower_bound, upper_bound,
+ local_y, local_x,
+ x, y,
+ local_x_nz,
+ addMonoid, anyOp
+ );
+ } else {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_sparse_dot_generic<
+#else
+ rc = internal::sparse_dot_generic<
+#endif
+ descr, AddMonoid, AnyOp, InputType1, InputType2, Coords
+ >(
+ already_dense_input_x, already_dense_input_y,
+ array_reduced[ thread_id ],
+ lower_bound, upper_bound,
+ local_x, local_y, x, y, local_y_nz,
+ addMonoid, anyOp
+ );
+ }
+ } else {
+ // get raw alias
+ const InputType1 * __restrict__ a = internal::getRaw( x );
+ const InputType2 * __restrict__ b = internal::getRaw( y );
+
+ size_t i = lower_bound;
+ if( upper_bound > lower_bound ) {
+ while( i + AnyOp::blocksize < upper_bound ) {
+ // declare buffers
+ static_assert( AnyOp::blocksize > 0,
+ "Configuration error: vectorisation blocksize set to 0!" );
+
+ typename AnyOp::D1 xx[ AnyOp::blocksize ];
+ typename AnyOp::D2 yy[ AnyOp::blocksize ];
+ typename AnyOp::D3 zz[ AnyOp::blocksize ];
+
+ // prepare registers
+ for( size_t k = 0; k < AnyOp::blocksize; ++k ) {
+ xx[ k ] = static_cast< typename AnyOp::D1 >( a[ i ] );
+ yy[ k ] = static_cast< typename AnyOp::D2 >( b[ i++ ] );
+ }
+
+ // perform element-wise multiplication
+ if( internal::maybe_noop< AnyOp >::value ) {
+ for( size_t k = 0; k < AnyOp::blocksize; ++k ) {
+ zz[ k ] = addMonoid.template getIdentity< typename AnyOp::D3 >();
+ }
+ }
+ for( size_t k = 0; k < AnyOp::blocksize; ++k ) {
+ apply( zz[ k ], xx[ k ], yy[ k ], anyOp );
+ }
+
+ // perform reduction into output element
+ addMonoid.getOperator().foldlArray( array_reduced[ thread_id ], zz,
+ AnyOp::blocksize );
+ //^--> note that this foldl operates on raw arrays,
+ // and thus should not be mistaken with a foldl
+ // on a grb::Vector.
+#ifdef _DEBUG
+ std::cout << "\t\t " << ( i - AnyOp::blocksize ) << "--" << i << ": "
+ << "running reduction = " << array_reduced[ thread_id ] << "\n";
+#endif
+ }
+
+ // perform element-by-element updates for remainder (if any)
+ for( ; i < upper_bound; ++i ) {
+ OutputType temp = addMonoid.template getIdentity< OutputType >();
+ apply( temp, a[ i ], b[ i ], anyOp );
+ foldr( temp, array_reduced[ thread_id ], addMonoid.getOperator() );
+ }
+ }
+ }
+
+ // the local coordinates for the input vectors have not been updated as
+ // they are read-only therefore, we don't need to invoke asyncJoinSubset;
+ // the output is a scalar
+ return rc;
+ };
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: dot-generic" << std::endl;
+#endif
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_DOT_GENERIC,
+ end, sizeof( OutputType ), dense_descr, true,
+ nullptr, nullptr, nullptr, nullptr,
+ &x, &y, nullptr, nullptr,
+ &internal::getCoordinates( x ), &internal::getCoordinates( y ),
+ nullptr, nullptr,
+ nullptr
+ );
+
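+ // the thread-local partial results are combined below; this presumably
+ // relies on a stage with scalar output (such as BLAS1_DOT_GENERIC)
+ // triggering execution of the pipeline at the point it is added, an
+ // assumption inferred from array_reduced being consumed immediately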
+ for(
+ size_t i = 0;
+ i < reduced_size;
+ i += config::CACHE_LINE_SIZE::value()
+ ) {
+ foldl( reduced, array_reduced[ i ], addMonoid.getOperator() );
+ }
+
+ // write back result
+ z = static_cast< OutputType >( reduced );
+ } else {
+ // n == 0: there is nothing to reduce, and the output scalar retains its
+ // initial value (this case is covered by the unit tests)
+ }
+
+#ifdef _DEBUG
+ std::cout << "\t returning " << z << "\n";
+#endif
+ // done!
+ return ret;
+ }
+
+ } // namespace internal
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class AddMonoid,
+ class AnyOp,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC dot(
+ OutputType &z,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const AddMonoid &addMonoid = AddMonoid(),
+ const AnyOp &anyOp = AnyOp(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< AddMonoid >::value &&
+ grb::is_operator< AnyOp >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType1, typename AnyOp::D1 >::value ), "grb::dot",
+ "called with a left-hand vector value type that does not match the first "
+ "domain of the given multiplicative operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType2, typename AnyOp::D2 >::value ), "grb::dot",
+ "called with a right-hand vector value type that does not match the second "
+ "domain of the given multiplicative operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename AddMonoid::D3, typename AnyOp::D1 >::value ),
+ "grb::dot",
+ "called with a multiplicative operator output domain that does not match "
+ "the first domain of the given additive operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< OutputType, typename AddMonoid::D2 >::value ), "grb::dot",
+ "called with an output vector value type that does not match the second "
+ "domain of the given additive operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename AddMonoid::D3, typename AddMonoid::D2 >::value ),
+ "grb::dot",
+ "called with an additive operator whose output domain does not match its "
+ "second input domain" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< OutputType, typename AddMonoid::D3 >::value ), "grb::dot",
+ "called with an output vector value type that does not match the third "
+ "domain of the given additive operator" );
+
+#ifdef _DEBUG
+ std::cout << "In grb::dot (nonblocking). "
+ << "I/O scalar on input reads " << z << "\n";
+#endif
+
+ // dynamic sanity check
+ const size_t n = internal::getCoordinates( y ).size();
+ if( internal::getCoordinates( x ).size() != n ) {
+ return MISMATCH;
+ }
+
+#ifdef _DEBUG
+ std::cout << "\t dynamic checks pass\n";
+#endif
+
+ // the dot product is computed out-of-place; a separate temporary is needed
+ // because the computation may be multi-threaded
+ OutputType oop = addMonoid.template getIdentity< OutputType >();
+
+ RC ret = SUCCESS;
+
+ ret = internal::dot_generic< descr >( oop, x, y, addMonoid, anyOp, phase );
+
+ // fold out-of-place dot product into existing input, and exit
+#ifdef _DEBUG
+ std::cout << "\t dot_generic returned " << oop << ", "
+ << "which will be folded into " << z << " "
+ << "using the additive monoid\n";
+#endif
+ ret = ret ? ret : foldl( z, oop, addMonoid.getOperator() );
+#ifdef _DEBUG
+ std::cout << "\t returning " << z << "\n";
+#endif
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename IOType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC dot(
+ IOType &x,
+ const Vector< InputType1, nonblocking, Coords > &left,
+ const Vector< InputType2, nonblocking, Coords > &right,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< IOType >::value &&
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "In grb::dot (nonblocking, semiring version)\n"
+ << "\t dispatches to monoid-operator version\n";
+#endif
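+ // usage sketch (illustrative only), assuming a plus-times semiring over
+ // doubles named ring:
+ //   double alpha = 0.0;
+ //   grb::dot( alpha, x, y, ring );
+ // accumulates the inner product of x and y into alpha; as per the
+ // monoid-operator variant above, the result is folded into the prior value
+ // of the output scalar rather than overwriting it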
+ return grb::dot< descr >( x, left, right, ring.getAdditiveMonoid(),
+ ring.getMultiplicativeOperator(), phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename Func,
+ typename DataType,
+ typename Coords
+ >
+ RC eWiseMap( const Func f, Vector< DataType, nonblocking, Coords > &x ) {
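+ // note: f is applied in-place to the values of x only; the coordinates are
+ // never modified, so the sparsity structure of x is preserved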
+
+ RC ret = SUCCESS;
+
+ const size_t n = internal::getCoordinates( x ).size();
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func = [f, &x] (
+ internal::Pipeline &pipeline, const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage eWiseMap(f, x) in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_x;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_x_nz = local_n;
+ bool sparse = false;
+
+ bool already_dense_input_x = true;
+
+ if( !dense_descr ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_input_x = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_input_x ) {
+#else
+ already_dense_input_x = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( local_x_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( sparse ) {
+ // the sparse case is possible only when the local coordinates are already
+ // initialized
+ assert( already_dense_input_x == false );
+ for( size_t k = 0; k < local_x_nz; ++k ) {
+ DataType &xval = internal::getRaw( x )[ local_x.index( k ) + lower_bound ];
+ xval = f( xval );
+ }
+ } else {
+ for( size_t i = lower_bound; i < upper_bound; ++i ) {
+ DataType &xval = internal::getRaw( x )[ i ];
+ xval = f( xval );
+ }
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_EWISEMAP,
+ n, sizeof( DataType ), dense_descr, true,
+ &x, nullptr, &internal::getCoordinates( x ), nullptr,
+ nullptr, nullptr, nullptr, nullptr,
+ nullptr, nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: eWiseMap(f, x)" << std::endl;
+#endif
+ return ret;
+ }
+
+ namespace internal {
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename Func,
+ typename DataType1,
+ typename DataType2,
+ typename Coords,
+ typename... Args
+ >
+ RC eWiseLambda_helper(
+ std::vector< const void * > all_vectors_ptr,
+ size_t maximum_data_type_size,
+ const Func f,
+ const Vector< DataType1, nonblocking, Coords > &x,
+ const Vector< DataType2, nonblocking, Coords > &y,
+ Args const &... args
+ ) {
+ // catch mismatch
+ if( size( x ) != size( y ) ) {
+ return MISMATCH;
+ }
+
+ all_vectors_ptr.push_back( &y );
+ maximum_data_type_size = std::max( maximum_data_type_size, sizeof( DataType2 ) );
+
+ // continue
+ return eWiseLambda_helper( all_vectors_ptr, maximum_data_type_size, f, x,
+ args... );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename Func,
+ typename DataType,
+ typename Coords
+ >
+ RC eWiseLambda_helper(
+ std::vector< const void * > all_vectors_ptr,
+ size_t maximum_data_type_size,
+ const Func f,
+ const Vector< DataType, nonblocking, Coords > &x
+ ) {
+ // all pointers, except one, have been stored, and the last one will be
+ // stored by the normal eWiseLambda
+ return eWiseLambda< descr, Func, DataType, Coords >( f, x, all_vectors_ptr,
+ maximum_data_type_size );
+ }
+ } // namespace internal
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename Func,
+ typename DataType1,
+ typename DataType2,
+ typename Coords,
+ typename... Args
+ >
+ RC eWiseLambda(
+ const Func f,
+ const Vector< DataType1, nonblocking, Coords > &x,
+ const Vector< DataType2, nonblocking, Coords > &y,
+ Args const &... args
+ ) {
+
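+ // usage sketch (illustrative only): every vector the lambda accesses must
+ // be passed explicitly so that the pipeline can track it, e.g.
+ //   grb::eWiseLambda( [&]( const size_t i ) {
+ //       z[ i ] += x[ i ] * y[ i ];
+ //     }, z, x, y );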
+ // create an empty vector to store pointers for all vectors passed to
+ // eWiseLambda
+ std::vector< const void * > all_vectors_ptr;
+
+ // invoke the helper function to store the pointers
+ return internal::eWiseLambda_helper( all_vectors_ptr, 0, f, x, y, args...);
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename Func,
+ typename DataType,
+ typename Coords
+ >
+ RC eWiseLambda(
+ const Func f,
+ const Vector< DataType, nonblocking, Coords > &x,
+ std::vector< const void * > all_vectors_ptr = std::vector< const void *>(),
+ size_t maximum_data_type_size = 0
+ ) {
+#ifdef _DEBUG
+ std::cout << "Info: entering eWiseLambda function on vectors.\n";
+#endif
+
+ all_vectors_ptr.push_back( &x );
+ maximum_data_type_size =
+ std::max( maximum_data_type_size, sizeof( DataType ) );
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func = [f, &x] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage eWiseLambda in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ Coords local_x;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_x_nz = local_n;
+ bool sparse = false;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ bool already_dense_output = true;
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_output ) {
+#else
+ already_dense_output = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( local_x_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( sparse ) {
+ if( already_dense_output ) {
+ for( size_t k = 0; k < local_x_nz; ++k ) {
+ f( k + lower_bound );
+ }
+ } else {
+ for( size_t k = 0; k < local_x_nz; ++k ) {
+ const size_t i = local_x.index( k ) + lower_bound;
+ f( i );
+ }
+ }
+ } else {
+ for( size_t i = lower_bound; i < upper_bound; ++i ) {
+ f( i );
+ }
+ }
+
+ // the local coordinates of the vectors accessed by eWiseLambda cannot
+ // change; therefore, we need not invoke asyncJoinSubset for any of them
+
+ return SUCCESS;
+ };
+
+ // eWiseLambda is a special case: we do not know which of the accessed
+ // vectors are read-only, and hence assume that all of them may be written
+ // to. The sparsity structures, however, cannot change; i.e., the
+ // coordinates of each vector cannot be updated. The coordinates of x are
+ // passed only to determine the loop size.
+ ret = ret ? ret : internal::le.addeWiseLambdaStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_EWISELAMBDA,
+ internal::getCoordinates( x ).size(), maximum_data_type_size, dense_descr,
+ all_vectors_ptr, &internal::getCoordinates( x )
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: eWiseLambda" << std::endl;
+#endif
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename InputType,
+ typename IOType,
+ typename MaskType,
+ typename Coords
+ >
+ RC foldl(
+ IOType &x,
+ const Vector< InputType, nonblocking, Coords > &y,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Monoid &monoid = Monoid(),
+ const typename std::enable_if< !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ grb::is_monoid< Monoid >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "foldl: IOType <- [InputType] with a monoid called. "
+ << "Array has size " << size( y ) << " with " << nnz( y ) << " nonzeroes. "
+ << "It has a mask of size " << size( mask ) << " with " << nnz( mask )
+ << " nonzeroes.\n";
+#endif
+
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< IOType, InputType >::value ), "grb::foldl",
+ "called with a scalar IO type that does not match the input vector type" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType, typename Monoid::D1 >::value ), "grb::foldl",
+ "called with an input vector value type that does not match the first "
+ "domain of the given monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType, typename Monoid::D2 >::value ), "grb::foldl",
+ "called with an input vector type that does not match the second domain of "
+ "the given monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType, typename Monoid::D3 >::value ), "grb::foldl",
+ "called with an input vector type that does not match the third domain of "
+ "the given monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< bool, MaskType >::value ), "grb::foldl",
+ "called with a vector mask type that is not boolean" );
+
+ if( size( mask ) > 0 ) {
+ return internal::template fold_from_vector_to_scalar_generic<
+ descr, true, true
+ >( x, y, mask, monoid );
+ } else {
+ return internal::template fold_from_vector_to_scalar_generic<
+ descr, false, true
+ >( x, y, mask, monoid );
+ }
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Monoid,
+ typename IOType,
+ typename InputType,
+ typename Coords
+ >
+ RC foldl(
+ IOType &x,
+ const Vector< InputType, nonblocking, Coords > &y,
+ const Monoid &monoid = Monoid(),
+ const typename std::enable_if<
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType >::value &&
+ grb::is_monoid< Monoid >::value, void
+ >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "foldl: IOType <- [InputType] with a monoid called. "
+ << "Array has size " << size( y ) << " with " << nnz( y ) << " nonzeroes. "
+ << "It has no mask.\n";
+#endif
+
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< IOType, InputType >::value ), "grb::reduce",
+ "called with a scalar IO type that does not match the input vector type" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType, typename Monoid::D1 >::value ), "grb::reduce",
+ "called with an input vector value type that does not match the first "
+ "domain of the given monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType, typename Monoid::D2 >::value ), "grb::reduce",
+ "called with an input vector type that does not match the second domain of "
+ "the given monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType, typename Monoid::D3 >::value ), "grb::reduce",
+ "called with an input vector type that does not match the third domain of "
+ "the given monoid" );
+
+ // do reduction
+ Vector< bool, nonblocking, Coords > empty_mask( 0 );
+ return internal::template fold_from_vector_to_scalar_generic<
+ descr, false, true
+ >( x, y, empty_mask, monoid );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename T,
+ typename U,
+ typename Coords
+ >
+ RC zip(
+ Vector< std::pair< T, U >, nonblocking, Coords > &z,
+ const Vector< T, nonblocking, Coords > &x,
+ const Vector< U, nonblocking, Coords > &y,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< T >::value &&
+ !grb::is_object< U >::value,
+ void >::type * const = nullptr
+ ) {
+ const size_t n = size( z );
+ if( n != size( x ) ) {
+ return MISMATCH;
+ }
+ if( n != size( y ) ) {
+ return MISMATCH;
+ }
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ const T * const x_raw = internal::getRaw( x );
+ const U * const y_raw = internal::getRaw( y );
+ std::pair< T, U > * z_raw = internal::getRaw( z );
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func = [&z, x_raw, y_raw, z_raw] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tExecution of stage zip(z, x, y) in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_z;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ bool already_dense_output = true;
+#else
+ (void) pipeline;
+#endif
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( z ) );
+ if( !dense_descr && !already_dense_output ) {
+#else
+ if( !dense_descr ) {
+#endif
+ local_z = internal::getCoordinates( z ).asyncSubset( lower_bound,
+ upper_bound );
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !dense_descr && !already_dense_output ) {
+#else
+ if( !dense_descr ) {
+#endif
+ // the result will always be dense
+ local_z.local_assignAllNotAlreadyAssigned();
+ }
+
+ for( size_t i = lower_bound; i < upper_bound; ++i ) {
+ z_raw[ i ].first = x_raw[ i ];
+ z_raw[ i ].second = y_raw[ i ];
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !dense_descr && !already_dense_output ) {
+#else
+ if( !dense_descr ) {
+#endif
+ internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_ZIP,
+ internal::getCoordinates( x ).size(), sizeof( T ) + sizeof( U ),
+ dense_descr, true,
+ &z, nullptr, &internal::getCoordinates( z ), nullptr,
+ &x, &y, nullptr, nullptr,
+ &internal::getCoordinates( x ), &internal::getCoordinates( y ),
+ nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: zip(z, x, y)" << std::endl;
+#endif
+ return ret;
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename T,
+ typename U,
+ typename Coords
+ >
+ RC unzip(
+ Vector< T, nonblocking, Coords > &x,
+ Vector< U, nonblocking, Coords > &y,
+ const Vector< std::pair< T, U >, nonblocking, Coords > &in,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< T >::value &&
+ !grb::is_object< U >::value,
+ void >::type * const = nullptr
+ ) {
+ const size_t n = size( in );
+ if( n != size( x ) ) {
+ return MISMATCH;
+ }
+ if( n != size( y ) ) {
+ return MISMATCH;
+ }
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
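+ // note: inside the pipeline stage below, every index in the range is
+ // written and all coordinates are assigned, so x and y become dense
+ // regardless of the sparsity of in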
+
+ T * const x_raw = internal::getRaw( x );
+ U * const y_raw = internal::getRaw( y );
+ const std::pair< T, U > * in_raw = internal::getRaw( in );
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func = [&x, &y, x_raw, y_raw, in_raw] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage unzip(x, y, in) in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_x, local_y;
+
+ bool already_dense_output_x = true;
+ bool already_dense_output_y = true;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output_x = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !dense_descr && !already_dense_output_x ) {
+#else
+ if( !dense_descr ) {
+ already_dense_output_x = false;
+#endif
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x.local_assignAllNotAlreadyAssigned();
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output_y = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !dense_descr && !already_dense_output_y ) {
+#else
+ if( !dense_descr ) {
+ already_dense_output_y = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y.local_assignAllNotAlreadyAssigned();
+ }
+
+ for( size_t i = lower_bound; i < upper_bound; ++i ) {
+ x_raw[ i ] = in_raw[ i ].first;
+ y_raw[ i ] = in_raw[ i ].second;
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !dense_descr && !already_dense_output_x ) {
+#else
+ if( !dense_descr ) {
+#endif
+ internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound,
+ upper_bound );
+ }
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !dense_descr && !already_dense_output_y ) {
+#else
+ if( !dense_descr ) {
+#endif
+ internal::getCoordinates( y ).asyncJoinSubset( local_y, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS1_UNZIP,
+ internal::getCoordinates( x ).size(), std::max( sizeof( T ), sizeof( U ) ),
+ dense_descr, true,
+ &x, &y,
+ &internal::getCoordinates( x ), &internal::getCoordinates( y ),
+ &in, nullptr, nullptr, nullptr,
+ &internal::getCoordinates( in ), nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: unzip(x, y, in)" << std::endl;
+#endif
+ return SUCCESS;
+ }
+
+/** @} */
+// ^-- ends BLAS-1 NB module
+
+} // end namespace ``grb''
+
+#undef NO_CAST_ASSERT
+#undef NO_CAST_OP_ASSERT
+
+#endif // end `_H_GRB_NONBLOCKING_BLAS1'
+
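A usage sketch of the two primitives defined above. The snippet is illustrative and not part of this patch; it assumes the public ALP/GraphBLAS API (grb::set, grb::zip, grb::unzip) with the nonblocking backend configured as the default:

    #include <utility>
    #include <graphblas.hpp>

    grb::RC zipRoundTrip( const size_t n ) {
        grb::Vector< double > x( n ), y( n );
        grb::Vector< std::pair< double, double > > z( n );
        grb::RC rc = grb::set( x, 1.5 );        // x = 1.5 everywhere
        rc = rc ? rc : grb::set( y, 2.5 );      // y = 2.5 everywhere
        // fuse x and y into one vector of pairs; under the nonblocking
        // backend this only records a pipeline stage
        rc = rc ? rc : grb::zip( z, x, y );
        // split the pairs again; this stage may execute fused with the above
        rc = rc ? rc : grb::unzip( x, y, z );
        return rc;
    }
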
diff --git a/include/graphblas/nonblocking/blas2.hpp b/include/graphblas/nonblocking/blas2.hpp
new file mode 100644
index 000000000..47501eacd
--- /dev/null
+++ b/include/graphblas/nonblocking/blas2.hpp
@@ -0,0 +1,1559 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Defines the nonblocking level-2 primitives
+ *
+ * @author Aristeidis Mastoras
+ * @date 16th of May, 2022
+ */
+
+#ifndef _H_GRB_NONBLOCKING_BLAS2
+#define _H_GRB_NONBLOCKING_BLAS2
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include "coordinates.hpp"
+#include "forward.hpp"
+#include "matrix.hpp"
+#include "vector.hpp"
+#include "lazy_evaluation.hpp"
+#include "boolean_dispatcher_blas2.hpp"
+
+#ifdef _DEBUG
+#include "spmd.hpp"
+#endif
+
+#define NO_CAST_ASSERT( x, y, z ) \
+ static_assert( x, \
+ "\n\n" \
+ "****************************************************************" \
+ "****************************************************************" \
+ "**************************************\n" \
+ "* ERROR | " y " " z ".\n" \
+ "****************************************************************" \
+ "****************************************************************" \
+ "**************************************\n" \
+ "* Possible fix 1 | Remove no_casting from the template " \
+ "parameters in this call to " y ".\n" \
+ "* Possible fix 2 | Provide objects with element types or " \
+ "domains that match the expected type.\n" \
+ "****************************************************************" \
+ "****************************************************************" \
+ "**************************************\n" );
+
+
+namespace grb {
+
+ namespace internal {
+
+ extern LazyEvaluation le;
+ }
+}
+
+namespace grb {
+
+ /**
+ * \addtogroup nonblocking
+ * @{
+ */
+
+ // put the generic mxv implementation in an internal namespace
+ namespace internal {
+
+ template<
+ bool output_dense,
+ bool left_handed,
+ class AdditiveMonoid,
+ class Multiplication,
+ template< typename > class One,
+ typename IOType,
+ typename InputType,
+ typename SourceType,
+ typename Coords
+ >
+ class addIdentityDuringMV<
+ nonblocking, true, output_dense, left_handed,
+ AdditiveMonoid, Multiplication, One,
+ IOType, InputType, SourceType, Coords
+ > {
+
+ public:
+
+ static void apply(
+ Vector< IOType, nonblocking, Coords > &destination_vector,
+ IOType * __restrict__ const &destination,
+ const size_t &destination_range,
+ const size_t &source_index,
+ const AdditiveMonoid &add,
+ const Multiplication &mul,
+ const SourceType &input_element,
+ const std::function< size_t( size_t ) > &src_local_to_global,
+ const std::function< size_t( size_t ) > &dst_global_to_local
+ ) {
+ // intentionally a no-op: this specialisation leaves the destination
+ // untouched
+ }
+ };
+
+ template<
+ Descriptor descr,
+ bool masked,
+ bool input_masked,
+ bool left_handed,
+ template< typename > class One,
+#ifdef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_destination_vector,
+ bool already_dense_mask_vector,
+#endif
+ class AdditiveMonoid,
+ class Multiplication,
+ typename IOType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename InputType4,
+ typename Coords,
+ typename RowColType,
+ typename NonzeroType
+ >
+ inline void vxm_inner_kernel_gather(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_destination_vector,
+ bool already_dense_mask_vector,
+#endif
+ RC &rc,
+ const size_t lower_bound,
+ Coords &local_destination_vector,
+ const Coords &local_mask_vector,
+ Vector< IOType, nonblocking, Coords > &destination_vector,
+ IOType &destination_element,
+ const size_t &destination_index,
+ const Vector< InputType1, nonblocking, Coords > &source_vector,
+ const InputType1 * __restrict__ const &source,
+ const size_t &source_range,
+ const internal::Compressed_Storage<
+ InputType2, RowColType, NonzeroType
+ > &matrix,
+ const Vector< InputType3, nonblocking, Coords > &mask_vector,
+ const InputType3 * __restrict__ const &mask,
+ const Vector< InputType4, nonblocking, Coords > &source_mask_vector,
+ const InputType4 * __restrict__ const &source_mask,
+ const AdditiveMonoid &add,
+ const Multiplication &mul,
+ const std::function< size_t( size_t ) > &src_local_to_global,
+ const std::function< size_t( size_t ) > &src_global_to_local,
+ const std::function< size_t( size_t ) > &dst_local_to_global
+ ) {
+#ifndef _DEBUG
+ (void) destination_vector;
+#endif
+ constexpr bool add_identity = descr & descriptors::add_identity;
+ constexpr bool dense_hint = descr & descriptors::dense;
+ constexpr bool explicit_zero = descr & descriptors::explicit_zero;
+#ifdef _DEBUG
+ constexpr bool use_index = descr & descriptors::use_index;
+#endif
+ assert( rc == SUCCESS );
+
+ // check whether we should compute output here
+ if( masked ) {
+ if( already_dense_mask_vector ) {
+ if( !internal::getCoordinates( mask_vector ).template
+ mask< descr >( destination_index, mask )
+ ) {
+#ifdef _DEBUG
+ std::cout << "Masks says to skip processing destination index " <<
+ destination_index << "\n";
+#endif
+ return;
+ }
+ } else {
+ if( !local_mask_vector.template
+ mask< descr >( destination_index - lower_bound, mask )
+ ) {
+#ifdef _DEBUG
+ std::cout << "Masks says to skip processing destination index " <<
+ destination_index << "\n";
+#endif
+ return;
+ }
+ }
+ }
+
+ // take shortcut, if possible
+ if( grb::has_immutable_nonzeroes< AdditiveMonoid >::value && (
+ already_dense_destination_vector ||
+ local_destination_vector.assigned( destination_index - lower_bound )
+ ) && destination_element != add.template getIdentity< IOType >()
+ ) {
+ return;
+ }
+
+ // start output
+ typename AdditiveMonoid::D3 output =
+ add.template getIdentity< typename AdditiveMonoid::D3 >();
+ bool set = false;
+
+ // if we need to add identity, do so first:
+ if( add_identity ) {
+ const size_t id_location = src_global_to_local( dst_local_to_global(
+ destination_index ) );
+ // the SpMV primitive may access non-local elements, and thus referring to
+ // the input vector via local coordinates would be incorrect. The input
+ // vector of an SpMV cannot be updated, i.e., written, by another primitive
+ // executed in the same pipeline as the current SpMV; therefore, in the
+ // current design, it is safe to use global coordinates for the input
+ // vector
+ if( ( !input_masked ||
+ internal::getCoordinates( source_mask_vector ).template
+ mask< descr >( id_location, source_mask )
+ ) && id_location < source_range
+ ) {
+ if( dense_hint || internal::getCoordinates( source_vector ).assigned( id_location ) ) {
+ typename AdditiveMonoid::D1 temp;
+ internal::CopyOrApplyWithIdentity<
+ !left_handed, typename AdditiveMonoid::D1, InputType1, One
+ >::set( temp, source_vector[ id_location ], mul );
+ internal::CopyOrApplyWithIdentity<
+ false, typename AdditiveMonoid::D3, typename AdditiveMonoid::D1,
+ AdditiveMonoid::template Identity
+ >::set( output, temp, add );
+ set = true;
+ }
+ }
+ }
+
+ // handle row or column at destination_index
+ // NOTE: this could be parallelised, but doing so will probably only slow
+ // things down
+#ifdef _DEBUG
+ std::cout << "vxm_gather: processing destination index " << destination_index << " / "
+ << internal::getCoordinates( destination_vector ).size()
+ << ". Input matrix has " << ( matrix.col_start[ destination_index + 1 ] -
+ matrix.col_start[ destination_index ] ) << " nonzeroes.\n";
+#endif
+ for(
+ size_t k = matrix.col_start[ destination_index ];
+ rc == SUCCESS &&
+ k < static_cast< size_t >( matrix.col_start[ destination_index + 1 ] );
+ ++k
+ ) {
+ // declare multiplication output field
+ typename Multiplication::D3 result =
+ add.template getIdentity< typename AdditiveMonoid::D3 >();
+ // get source index
+ const size_t source_index = matrix.row_index[ k ];
+ // check mask
+ if( input_masked &&
+ !internal::getCoordinates( source_mask_vector ).template
+ mask< descr >( source_index, source_mask )
+ ) {
+#ifdef _DEBUG
+ std::cout << "\t vxm_gather: skipping source index " << source_index
+ << " due to input mask\n";
+#endif
+ continue;
+ }
+ // check for sparsity at source
+ if( !dense_hint ) {
+ if( !internal::getCoordinates( source_vector ).assigned( source_index ) ) {
+#ifdef _DEBUG
+ std::cout << "\t vxm_gather: Skipping out of computation with source "
+ << "index " << source_index << " since it does not contain a nonzero\n";
+#endif
+ continue;
+ }
+ }
+ // get nonzero
+ typedef typename std::conditional<
+ left_handed,
+ typename Multiplication::D2,
+ typename Multiplication::D1
+ >::type RingNonzeroType;
+ const RingNonzeroType nonzero =
+ matrix.template getValue( k, One< RingNonzeroType >::value() );
+#ifdef _DEBUG
+ std::cout << "\t vxm_gather: interpreted nonzero is " << nonzero << ", "
+ << "which is the " << k << "-th nonzero and has source index "
+ << source_index << "\n";
+#endif
+ // check if we use source element or whether we use its index value instead
+ typedef typename std::conditional<
+ left_handed,
+ typename Multiplication::D1,
+ typename Multiplication::D2
+ >::type SourceType;
+ const SourceType apply_source = internal::ValueOrIndex<
+ descr, SourceType, InputType1
+ >::getFromArray( source, src_local_to_global, source_index );
+#ifdef _DEBUG
+ if( use_index ) {
+ std::cout << "\t vxm_gather (use_index descriptor): apply( output, matrix "
+ << "nonzero, vector nonzero, * ) = apply( ";
+ } else {
+ std::cout << "\t vxm_gather: apply( output, matrix nonzero, vector "
+ << "nonzero, * ) = apply( ";
+ }
+ std::cout << " output, " << nonzero << ", " << source << ", * )\n";
+#endif
+ //multiply
+ internal::leftOrRightHandedMul<
+ left_handed, typename Multiplication::D3,
+ SourceType, RingNonzeroType, Multiplication
+ >::mul( result, apply_source, nonzero, mul );
+#ifdef _DEBUG
+ std::cout << "\t vxm_gather: output (this nonzero) = " << result << "\n";
+#endif
+
+ // accumulate
+#ifdef _DEBUG
+ std::cout << "\t vxm_gather: foldr( " << result << ", " << output
+ << ", + );\n";
+#endif
+ rc = foldr( result, output, add.getOperator() );
+#ifdef _DEBUG
+ std::cout << "\t vxm_gather: output (sum at destination) = " << output
+ << "\n";
+#endif
+ set = true;
+
+ // sanity check (but apply cannot fail)
+ assert( rc == SUCCESS );
+ }
+
+#ifdef _DEBUG
+ if( set ) {
+ std::cout << "\t vxm_gather: local contribution to this output element at "
+ << "index " << destination_index << " will be " << output << " "
+ << "and this corresponds to an explicitly set nonzero.\n";
+ } else {
+ std::cout << "\t vxm_gather: local contribution to this output element at "
+ << "index " << destination_index << " will be " << output << " and this "
+ << "is an unset value.\n";
+ if( already_dense_destination_vector ||
+ local_destination_vector.assigned( destination_index - lower_bound )
+ ) {
+ std::cout << "\t(old value " << destination_element << " will remain "
+ << "unmodified.)\n";
+ } else {
+ std::cout << "\t(no old value existed so the output vector will remain "
+ << "unset at this index.)\n";
+ }
+ }
+#endif
+ // finally, accumulate in output
+ if( explicit_zero || set ) {
+#ifdef _DEBUG
+ std::cout << "\taccumulating " << output << " into output vector...\n";
+#endif
+ if( already_dense_destination_vector ||
+ local_destination_vector.assign( destination_index - lower_bound )
+ ) {
+#ifdef _DEBUG
+ std::cout << "\tfoldl( " << destination_element << ", " << output << ", "
+ << "add.getOperator() );, destination_element = ";
+#endif
+ rc = foldl( destination_element, output, add.getOperator() );
+#ifdef _DEBUG
+ std::cout << destination_element << "\n";
+#endif
+ } else {
+#ifdef _DEBUG
+ std::cout << "\toutput vector element was previously not set. Old "
+ << "(possibly uninitialised value) " << destination_element << " will "
+ << "now be set to " << output << ", result (after, possibly, casting): ";
+#endif
+ destination_element = static_cast< IOType >( output );
+#ifdef _DEBUG
+ std::cout << destination_element << "\n";
+#endif
+ }
+ }
+ }
+
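Stripped of masks, descriptors, and pipeline bookkeeping, the gather kernel above computes, for a single destination index, a dot product of the input vector with one compressed row or column. A minimal standalone sketch of that access pattern (hypothetical names, plain plus-times arithmetic):

    #include <cstddef>
    #include <vector>

    // plain CRS storage: col_start has nrows+1 entries; row_index and
    // values hold the nonzeroes of each row contiguously
    struct CRS {
        std::vector< size_t > col_start;
        std::vector< size_t > row_index;
        std::vector< double > values;
    };

    // gather-style SpMV for one output index i: returns sum_k A(i,k) * x[k]
    double gatherRow( const CRS &A, const std::vector< double > &x,
        const size_t i
    ) {
        double out = 0.0; // additive identity
        for( size_t k = A.col_start[ i ]; k < A.col_start[ i + 1 ]; ++k ) {
            out += A.values[ k ] * x[ A.row_index[ k ] ];
        }
        return out;
    }
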
+ template<
+ Descriptor descr,
+ bool masked,
+ bool input_masked,
+ bool left_handed,
+ bool using_semiring,
+ template< typename > class One,
+ class AdditiveMonoid,
+ class Multiplication,
+ typename IOType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename InputType4,
+ typename RIT,
+ typename CIT,
+ typename NIT,
+ typename Coords
+ >
+ RC vxm_generic(
+ Vector< IOType, nonblocking, Coords > &u,
+ const Vector< InputType3, nonblocking, Coords > &mask,
+ const Vector< InputType1, nonblocking, Coords > &v,
+ const Vector< InputType4, nonblocking, Coords > &v_mask,
+ const Matrix< InputType2, nonblocking, RIT, CIT, NIT > &A,
+ const AdditiveMonoid &add,
+ const Multiplication &mul,
+ const Phase &phase,
+ const std::function< size_t( size_t ) > row_l2g,
+ const std::function< size_t( size_t ) > row_g2l,
+ const std::function< size_t( size_t ) > col_l2g,
+ const std::function< size_t( size_t ) > col_g2l
+ ) {
+ // type sanity checking
+ NO_CAST_ASSERT( ( descr > internal::MAX_DESCRIPTOR_VALUE ||
+ !(descr & descriptors::no_casting) ||
+ std::is_same< InputType3, bool >::value
+ ), "vxm (any variant)",
+ "Mask type is not boolean" );
+ NO_CAST_ASSERT( ( descr > internal::MAX_DESCRIPTOR_VALUE ||
+ !(descr & descriptors::no_casting) ||
+ !left_handed ||
+ std::is_same< InputType1, typename Multiplication::D1 >::value
+ ), "vxm (any variant)",
+ "Input vector type does not match multiplicative operator first "
+ "input domain" );
+ NO_CAST_ASSERT( ( descr > internal::MAX_DESCRIPTOR_VALUE ||
+ !(descr & descriptors::no_casting) ||
+ left_handed ||
+ std::is_same< InputType2, typename Multiplication::D1 >::value
+ ), "vxm (any variant)",
+ "Input vector type does not match multiplicative operator second "
+ "input domain" );
+ NO_CAST_ASSERT( ( descr > internal::MAX_DESCRIPTOR_VALUE ||
+ !(descr & descriptors::no_casting) ||
+ !left_handed ||
+ std::is_same< InputType2, typename Multiplication::D2 >::value
+ ), "vxm (any variant)",
+ "Input matrix type does not match multiplicative operator second "
+ "input domain" );
+ NO_CAST_ASSERT( ( descr > internal::MAX_DESCRIPTOR_VALUE ||
+ !(descr & descriptors::no_casting) ||
+ left_handed ||
+ std::is_same< InputType1, typename Multiplication::D2 >::value
+ ), "vxm (any variant)",
+ "Input matrix type does not match multiplicative operator first "
+ "input domain" );
+
+ RC ret = SUCCESS;
+
+#ifdef _DEBUG
+ const auto s = spmd< nonblocking >::pid();
+ std::cout << s << ": nonblocking vxm called with a "
+ << descriptors::toString( descr ) << "\n";
+#endif
+
+ // get input and output vector sizes
+ const size_t m = internal::getCoordinates( u ).size();
+ const size_t n = internal::getCoordinates( v ).size();
+
+ // get whether the matrix should be transposed prior to execution of this
+ // vector-times-matrix operation
+ constexpr bool transposed = descr & descriptors::transpose_matrix;
+
+ // check for dimension mismatch
+ if( ( transposed && ( n != ncols( A ) || m != nrows( A ) ) )
+ || ( !transposed && ( n != nrows( A ) || m != ncols( A ) ) ) ) {
+#ifdef _DEBUG
+ std::cout << "Mismatch of columns ( " << n << " vs. " << ncols( A )
+ << " ) or rows ( " << m << " vs. " << nrows( A ) << " ) with "
+ << "transposed value " << ((int)transposed) << "\n";
+#endif
+ return MISMATCH;
+ }
+
+ // check density
+ if( descr & descriptors::dense ) {
+ // it's safe to check the number of nonzeroes for the input vector and its
+ // mask since both of them are read-only in the current design for
+ // nonblocking execution
+ if( nnz( v ) < size( v ) ) {
+#ifdef _DEBUG
+ std::cout << "\t Dense descriptor given but input vector was sparse\n";
+#endif
+ return ILLEGAL;
+ }
+ if( size( v_mask ) > 0 && nnz( v_mask ) < size( v_mask ) ) {
+#ifdef _DEBUG
+ std::cout << "\t Dense descriptor given but input mask has sparse "
+ << "structure\n";
+#endif
+ return ILLEGAL;
+ }
+ }
+
+ // check mask
+ if( masked ) {
+ if( (transposed && internal::getCoordinates( mask ).size() != nrows( A ) ) ||
+ ( !transposed && internal::getCoordinates( mask ).size() != ncols( A ) )
+ ) {
+#ifdef _DEBUG
+ std::cout << "Mismatch of mask size ( "
+ << internal::getCoordinates( mask ).size() << " ) versus matrix rows "
+ << "or columns ( " << nrows( A ) << " or " << ncols( A ) << " with "
+ << "transposed value " << ((int)transposed) << "\n";
+#endif
+ return MISMATCH;
+ }
+ }
+
+ // handle resize phase
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+
+ // get raw pointers
+ assert( phase == EXECUTE );
+ const InputType1 * __restrict__ const x = internal::getRaw( v );
+ const InputType3 * __restrict__ const z = internal::getRaw( mask );
+ const InputType4 * __restrict__ const vm = internal::getRaw( v_mask );
+ IOType * __restrict__ const y = internal::getRaw( u );
+
+ // check for illegal arguments
+ if( !(descr & descriptors::safe_overlap) &&
+ reinterpret_cast< const void * >( y ) ==
+ reinterpret_cast< const void * >( x )
+ ) {
+ std::cerr << "Warning: grb::internal::vxm_generic called with overlapping "
+ << "input and output vectors.\n";
+ return OVERLAP;
+ }
+ if( masked && (reinterpret_cast< const void * >( y ) ==
+ reinterpret_cast< const void * >( z ))
+ ) {
+ std::cerr << "Warning: grb::internal::vxm_generic called with overlapping "
+ << "mask and output vectors.\n";
+ return OVERLAP;
+ }
+
+#ifdef _DEBUG
+ std::cout << s << ": performing SpMV / SpMSpV using an " << nrows( A )
+ << " by " << ncols( A ) << " matrix holding " << nnz( A )
+ << " nonzeroes.\n";
+#endif
+
+ // in the current design for nonblocking execution, the input vectors of
+ // vxm_generic cannot be overwritten by another stage of the same pipeline;
+ // it is therefore safe to rely on the global coordinates of the input
+ // vectors, as they are read-only. This property is of special importance
+ // when handling matrices of size "m" x "n", since the mismatch between "m"
+ // and "n" requires special handling for the local coordinates of the input
+ // vectors. The current design relies on the size of the output vector,
+ // which should match the sizes of all other vectors in the pipeline; the
+ // size of the input vector does not have to match the size of the other
+ // vectors as long as the input vectors are read-only
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func = [
+ &u, &mask, &v, &v_mask, &A, &add, &mul,
+ row_l2g, row_g2l, col_l2g, col_g2l,
+ y, x, z, vm
+#ifdef _DEBUG
+ , s
+#endif
+ ] (
+ internal::Pipeline &pipeline,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage vxm_generic in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ (void) pipeline;
+
+ RC rc = SUCCESS;
+
+ Coords local_u, local_mask;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_mask_nz = local_n;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ bool already_dense_output = true;
+ bool already_dense_output_mask = true;
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( u ) );
+ if( !already_dense_output ) {
+#else
+ already_dense_output = false;
+#endif
+ local_u = internal::getCoordinates( u ).asyncSubset( lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ if( masked ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_output_mask = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( mask ) );
+ if( !already_dense_output_mask ) {
+#else
+ already_dense_output_mask = false;
+#endif
+ local_mask = internal::getCoordinates( mask ).asyncSubset( lower_bound,
+ upper_bound );
+ local_mask_nz = local_mask.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+ }
+
+ // check if transpose is required
+ if( descr & descriptors::transpose_matrix ) {
+ // start compute u=vA^T
+#ifdef _DEBUG
+ std::cout << s << ": in u=vA^T=Av variant\n";
+#endif
+
+ // start u=vA^T using CRS
+ // matrix = &(A.CRS);
+ // TODO internal issue #193
+ if( !masked || (descr & descriptors::invert_mask) ) {
+ // loop over all columns of the input matrix (can be done in parallel):
+#ifdef _DEBUG
+ std::cout << s << ": in full CRS variant (gather)\n";
+#endif
+
+ for( size_t i = lower_bound; i < upper_bound; i++ ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ boolean_dispatcher_vxm_inner_kernel_gather<
+#else
+ vxm_inner_kernel_gather<
+#endif
+ descr, masked, input_masked, left_handed, One
+ >(
+ already_dense_output, already_dense_output_mask,
+ rc, lower_bound, local_u, local_mask,
+ u, y[ i ], i, v, x, nrows( A ), internal::getCRS( A ),
+ mask, z, v_mask, vm, add, mul,
+ row_l2g, col_l2g, col_g2l
+ );
+ }
+
+ } else {
+#ifdef _DEBUG
+ std::cout << s << ": in masked CRS variant (gather). Mask has "
+ << local_mask_nz << " nonzeroes and size " << local_n << ":\n";
+ for( size_t k = 0; k < local_mask_nz; ++k ) {
+ std::cout << " "
+ << ( ( already_dense_output_mask ? k : local_mask.index( k ) ) +
+ lower_bound );
+ }
+ std::cout << "\n";
+#endif
+ assert( masked );
+
+ for( size_t k = 0; k < local_mask_nz; ++k ) {
+ const size_t i =
+ ( already_dense_output_mask ? k : local_mask.index( k ) ) +
+ lower_bound;
+ assert( i < nrows(A) );
+
+#ifdef GRB_BOOLEAN_DISPATCHER
+ boolean_dispatcher_vxm_inner_kernel_gather<
+#else
+ vxm_inner_kernel_gather<
+#endif
+ descr, false, input_masked, left_handed, One
+ >(
+ already_dense_output, already_dense_output_mask,
+ rc, lower_bound, local_u, local_mask,
+ u, y[ i ], i, v, x, nrows( A ), internal::getCRS( A ),
+ mask, z, v_mask, vm, add, mul,
+ row_l2g, col_l2g, col_g2l
+ );
+ }
+ }
+ // end compute u=vA^T
+ } else {
+#ifdef _DEBUG
+ std::cout << s << ": in u=vA=A^Tv variant\n";
+#endif
+ // start u=vA using CCS
+#ifdef _DEBUG
+ std::cout << s << ": in column-major vector times matrix variant (u=vA)\n"
+ << "\t(this variant relies on the gathering inner kernel)\n";
+#endif
+
+ // if not transposed, then CCS is the data structure to go:
+ // TODO internal issue #193
+ if( !masked || (descr & descriptors::invert_mask) ) {
+#ifdef _DEBUG
+ std::cout << s << ": loop over all input matrix columns\n";
+#endif
+
+ for( size_t j = lower_bound; j < upper_bound; j++ ) {
+#ifdef GRB_BOOLEAN_DISPATCHER
+ boolean_dispatcher_vxm_inner_kernel_gather<
+#else
+ vxm_inner_kernel_gather<
+#endif
+ descr, masked, input_masked, left_handed, One
+ >(
+ already_dense_output, already_dense_output_mask,
+ rc, lower_bound, local_u, local_mask,
+ u, y[ j ], j, v, x, nrows( A ), internal::getCCS( A ),
+ mask, z, v_mask, vm, add, mul,
+ row_l2g, row_g2l, col_l2g
+ );
+ }
+ } else {
+ // loop only over the nonzero masks (can still be done in parallel!)
+#ifdef _DEBUG
+ std::cout << s << ": loop over mask indices\n";
+#endif
+ assert( masked );
+
+ for( size_t k = 0; k < local_mask_nz; ++k ) {
+ const size_t j =
+ ( already_dense_output_mask ? k : local_mask.index( k ) ) + lower_bound;
+#ifdef GRB_BOOLEAN_DISPATCHER
+ boolean_dispatcher_vxm_inner_kernel_gather<
+#else
+ vxm_inner_kernel_gather<
+#endif
+ descr, masked, input_masked, left_handed, One
+ >(
+ already_dense_output, already_dense_output_mask,
+ rc, lower_bound, local_u, local_mask,
+ u, y[ j ], j, v, x, nrows( A ), internal::getCCS( A ),
+ mask, z, v_mask, vm, add, mul,
+ row_l2g, row_g2l, col_l2g
+ );
+ }
+ }
+ // end computing u=vA
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ if( !already_dense_output ) {
+#else
+ if( !already_dense_vectors ) {
+#endif
+ internal::getCoordinates( u ).asyncJoinSubset( local_u, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ // since the local coordinates are never used for the input vector and the
+ // input mask, they are registered only to verify legal usage of the dense
+ // descriptor
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::BLAS2_VXM_GENERIC,
+ size( u ), sizeof( IOType ), dense_descr, true,
+ &u, nullptr, &internal::getCoordinates( u ), nullptr,
+ &v,
+ masked ? &mask : nullptr,
+ input_masked ? &v_mask : nullptr,
+ nullptr,
+ &internal::getCoordinates( v ),
+ masked ? &internal::getCoordinates( mask ) : nullptr,
+ input_masked ? &internal::getCoordinates( v_mask ) : nullptr,
+ nullptr,
+ &A
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: vxm_generic" << std::endl;
+#endif
+
+#ifdef _DEBUG
+ std::cout << s << ": exiting SpMV / SpMSpV.\n" << std::flush;
+#endif
+ return ret;
+ }
+
+ } // namespace internal
+
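Note that the stage constructed above does not execute immediately: the nonblocking backend records it and later runs all stages of a pipeline tile by tile, so that consecutive operations reuse cached data. A highly simplified sketch of this execution model follows; it is not the actual internal::Pipeline or LazyEvaluation API, only an illustration of the idea:

    #include <algorithm>
    #include <cstddef>
    #include <functional>
    #include <vector>

    // a stage processes one tile, i.e., the index range [lower, upper)
    using Stage = std::function< void( size_t lower, size_t upper ) >;

    struct ToyPipeline {
        std::vector< Stage > stages;
        // run all recorded stages tile by tile rather than stage by stage
        // over the full vectors, keeping each tile in cache across stages
        void execute( const size_t n, const size_t tile ) {
            for( size_t lo = 0; lo < n; lo += tile ) {
                const size_t hi = std::min( lo + tile, n );
                for( auto &stage : stages ) { stage( lo, hi ); }
            }
            stages.clear();
        }
    };
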
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename IOType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename RIT,
+ typename CIT,
+ typename NIT,
+ typename Coords
+ >
+ RC vxm(
+ Vector< IOType, nonblocking, Coords > &u,
+ const Vector< InputType3, nonblocking, Coords > &mask,
+ const Vector< InputType1, nonblocking, Coords > &v,
+ const Matrix< InputType2, nonblocking, RIT, CIT, NIT > &A,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+ const Vector< bool, nonblocking, Coords > empty_mask( 0 );
+ return vxm< descr, true, false >( u, mask, v, empty_mask, A, ring, phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class AdditiveMonoid,
+ class MultiplicativeOperator,
+ typename IOType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename RIT,
+ typename CIT,
+ typename NIT,
+ typename Coords
+ >
+ RC vxm(
+ Vector< IOType, nonblocking, Coords > &u,
+ const Vector< InputType3, nonblocking, Coords > &mask,
+ const Vector< InputType1, nonblocking, Coords > &v,
+ const Matrix< InputType2, nonblocking, RIT, CIT, NIT > &A,
+ const AdditiveMonoid &add = AdditiveMonoid(),
+ const MultiplicativeOperator &mul = MultiplicativeOperator(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_monoid< AdditiveMonoid >::value &&
+ grb::is_operator< MultiplicativeOperator >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ !std::is_same< InputType2, void >::value,
+ void >::type * const = nullptr
+ ) {
+ const grb::Vector< bool, nonblocking, Coords > empty_mask( 0 );
+ return vxm< descr, true, false >( u, mask, v, empty_mask, A, add, mul,
+ phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ bool output_may_be_masked = true,
+ bool input_may_be_masked = true,
+ class Ring,
+ typename IOType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename InputType4,
+ typename RIT,
+ typename CIT,
+ typename NIT,
+ typename Coords
+ >
+ RC vxm(
+ Vector< IOType, nonblocking, Coords > &u,
+ const Vector< InputType3, nonblocking, Coords > &mask,
+ const Vector< InputType1, nonblocking, Coords > &v,
+ const Vector< InputType4, nonblocking, Coords > &v_mask,
+ const Matrix< InputType2, nonblocking, RIT, CIT, NIT > &A,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ constexpr bool left_sided = true;
+ if( output_may_be_masked && size( v_mask ) == 0 && size( mask ) > 0 ) {
+
+ return internal::vxm_generic<
+ descr, true, false, left_sided, true, Ring::template One
+ >(
+ u, mask, v, v_mask, A,
+ ring.getAdditiveMonoid(), ring.getMultiplicativeOperator(),
+ phase,
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ }
+ );
+ } else if( input_may_be_masked && size( mask ) == 0 && size( v_mask ) > 0 ) {
+ return internal::vxm_generic<
+ descr, false, true, left_sided, true, Ring::template One
+ >(
+ u, mask, v, v_mask, A,
+ ring.getAdditiveMonoid(), ring.getMultiplicativeOperator(),
+ phase,
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ }
+ );
+ } else if( output_may_be_masked && input_may_be_masked && size( mask ) > 0 &&
+ size( v_mask ) > 0
+ ) {
+ return internal::vxm_generic<
+ descr, true, true, left_sided, true, Ring::template One
+ >(
+ u, mask, v, v_mask, A,
+ ring.getAdditiveMonoid(), ring.getMultiplicativeOperator(),
+ phase,
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ }
+ );
+ } else {
+ assert( size( mask ) == 0 );
+ assert( size( v_mask ) == 0 );
+ return internal::vxm_generic<
+ descr, false, false, left_sided, true, Ring::template One
+ >(
+ u, mask, v, v_mask, A,
+ ring.getAdditiveMonoid(), ring.getMultiplicativeOperator(),
+ phase,
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ }
+ );
+ }
+ }
+
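An illustrative call of the masked variant above using the standard plus-times semiring. The semiring and container types follow the public ALP API; the snippet itself is not part of this patch:

    #include <graphblas.hpp>

    grb::RC maskedVxm(
        grb::Vector< double > &u, const grb::Vector< bool > &mask,
        const grb::Vector< double > &v, const grb::Matrix< double > &A
    ) {
        grb::Semiring<
            grb::operators::add< double >, grb::operators::mul< double >,
            grb::identities::zero, grb::identities::one
        > plusTimes;
        // u = vA, writing only where the mask evaluates true; under the
        // nonblocking backend this records a stage for later execution
        return grb::vxm( u, mask, v, A, plusTimes );
    }
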
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename Coords,
+ typename RIT,
+ typename CIT,
+ typename NIT,
+ typename IOType = typename Ring::D4,
+ typename InputType1 = typename Ring::D1,
+ typename InputType2 = typename Ring::D2
+ >
+ RC vxm(
+ Vector< IOType, nonblocking, Coords > &u,
+ const Vector< InputType1, nonblocking, Coords > &v,
+ const Matrix< InputType2, nonblocking, RIT, CIT, NIT > &A,
+ const Ring &ring = Ring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+ const Vector< bool, nonblocking, Coords > empty_mask( 0 );
+ return vxm< descr, false, false >( u, empty_mask, v, empty_mask, A, ring,
+ phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class AdditiveMonoid,
+ class MultiplicativeOperator,
+ typename IOType,
+ typename InputType1,
+ typename InputType2,
+ typename RIT,
+ typename CIT,
+ typename NIT,
+ typename Coords
+ >
+ RC vxm(
+ Vector< IOType, nonblocking, Coords > &u,
+ const Vector< InputType1, nonblocking, Coords > &v,
+ const Matrix< InputType2, nonblocking, RIT, CIT, NIT > &A,
+ const AdditiveMonoid &add = AdditiveMonoid(),
+ const MultiplicativeOperator &mul = MultiplicativeOperator(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_monoid< AdditiveMonoid >::value &&
+ grb::is_operator< MultiplicativeOperator >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !std::is_same< InputType2, void >::value,
+ void >::type * const = nullptr
+ ) {
+ const Vector< bool, nonblocking, Coords > empty_mask( 0 );
+ return vxm< descr, false, false >( u, empty_mask, v, empty_mask, A, add, mul,
+ phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename Coords,
+ typename RIT,
+ typename CIT,
+ typename NIT,
+ typename IOType = typename Ring::D4,
+ typename InputType1 = typename Ring::D1,
+ typename InputType2 = typename Ring::D2,
+ typename InputType3 = bool
+ >
+ RC mxv(
+ Vector< IOType, nonblocking, Coords > &u,
+ const Vector< InputType3, nonblocking, Coords > &mask,
+ const Matrix< InputType2, nonblocking, RIT, CIT, NIT > &A,
+ const Vector< InputType1, nonblocking, Coords > &v,
+ const Ring &ring,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_semiring< Ring >::value, void
+ >::type * const = nullptr
+ ) {
+ const Vector< bool, nonblocking, Coords > empty_mask( 0 );
+ return mxv< descr, true, false >( u, mask, A, v, empty_mask, ring, phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ bool output_may_be_masked = true,
+ bool input_may_be_masked = true,
+ class Ring,
+ typename IOType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename InputType4,
+ typename RIT,
+ typename CIT,
+ typename NIT,
+ typename Coords
+ >
+ RC mxv(
+ Vector< IOType, nonblocking, Coords > &u,
+ const Vector< InputType3, nonblocking, Coords > &mask,
+ const Matrix< InputType2, nonblocking, RIT, CIT, NIT > &A,
+ const Vector< InputType1, nonblocking, Coords > &v,
+ const Vector< InputType4, nonblocking, Coords > &v_mask,
+ const Ring &ring,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ constexpr Descriptor new_descr = descr ^ descriptors::transpose_matrix;
+ constexpr bool left_sided = false;
+ if( output_may_be_masked && ( size( v_mask ) == 0 && size( mask ) > 0 ) ) {
+
+ return internal::vxm_generic<
+ new_descr, true, false, left_sided, true, Ring::template One
+ >(
+ u, mask, v, v_mask, A,
+ ring.getAdditiveMonoid(), ring.getMultiplicativeOperator(),
+ phase,
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ }
+ );
+ } else if( input_may_be_masked && ( size( mask ) == 0 &&
+ size( v_mask ) > 0 )
+ ) {
+ return internal::vxm_generic<
+ new_descr, false, true, left_sided, true, Ring::template One
+ >(
+ u, mask, v, v_mask, A,
+ ring.getAdditiveMonoid(), ring.getMultiplicativeOperator(),
+ phase,
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ }
+ );
+ } else if( output_may_be_masked && input_may_be_masked && size( mask ) > 0 &&
+ size( v_mask ) > 0
+ ) {
+ return internal::vxm_generic<
+ new_descr, true, true, left_sided, true, Ring::template One
+ >(
+ u, mask, v, v_mask, A,
+ ring.getAdditiveMonoid(), ring.getMultiplicativeOperator(),
+ phase,
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ }
+ );
+ } else {
+ assert( size( mask ) == 0 );
+ assert( size( v_mask ) == 0 );
+ return internal::vxm_generic<
+ new_descr, false, false, left_sided, true, Ring::template One
+ >(
+ u, mask, v, v_mask, A,
+ ring.getAdditiveMonoid(), ring.getMultiplicativeOperator(),
+ phase,
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ }
+ );
+ }
+ }
+
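The XOR with descriptors::transpose_matrix above is what maps mxv onto the vxm kernel: an mxv over A equals a vxm over the transpose of A, so the kernel toggles the transpose bit rather than forcing it on; toggling also honours a user-requested transpose by cancelling it out. A small illustration, assuming (as in ALP) that grb::Descriptor is an integral bitmask type:

    #include <graphblas.hpp>

    constexpr grb::Descriptor d0 = grb::descriptors::no_operation;
    // toggling once sets the bit: mxv( u, A, v ) delegates to a vxm over A^T
    constexpr grb::Descriptor d1 = d0 ^ grb::descriptors::transpose_matrix;
    // toggling twice restores the original descriptor
    static_assert( (d1 ^ grb::descriptors::transpose_matrix) == d0,
        "transpose bit must round-trip" );
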
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Ring,
+ typename Coords,
+ typename RIT,
+ typename CIT,
+ typename NIT,
+ typename IOType = typename Ring::D4,
+ typename InputType1 = typename Ring::D1,
+ typename InputType2 = typename Ring::D2
+ >
+ RC mxv(
+ Vector< IOType, nonblocking, Coords > &u,
+ const Matrix< InputType2, nonblocking, RIT, CIT, NIT > &A,
+ const Vector< InputType1, nonblocking, Coords > &v,
+ const Ring &ring,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_semiring< Ring >::value,
+ void >::type * const = nullptr
+ ) {
+ const Vector< bool, nonblocking, Coords > empty_mask( 0 );
+ return mxv< descr, false, false >( u, empty_mask, A, v, empty_mask, ring,
+ phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class AdditiveMonoid,
+ class MultiplicativeOperator,
+ typename IOType,
+ typename InputType1,
+ typename InputType2,
+ typename RIT,
+ typename CIT,
+ typename NIT,
+ typename Coords
+ >
+ RC mxv(
+ Vector< IOType, nonblocking, Coords > &u,
+ const Matrix< InputType2, nonblocking, RIT, CIT, NIT > &A,
+ const Vector< InputType1, nonblocking, Coords > &v,
+ const AdditiveMonoid &add = AdditiveMonoid(),
+ const MultiplicativeOperator &mul = MultiplicativeOperator(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_monoid< AdditiveMonoid >::value &&
+ grb::is_operator< MultiplicativeOperator >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !std::is_same< InputType2, void >::value,
+ void >::type * const = nullptr
+ ) {
+ const Vector< bool, nonblocking, Coords > empty_mask( 0 );
+ return mxv< descr, false, false >( u, empty_mask, A, v, empty_mask, add, mul,
+ phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ bool output_may_be_masked = true,
+ bool input_may_be_masked = true,
+ class AdditiveMonoid,
+ class MultiplicativeOperator,
+ typename IOType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename InputType4,
+ typename RIT,
+ typename CIT,
+ typename NIT,
+ typename Coords
+ >
+ RC vxm(
+ Vector< IOType, nonblocking, Coords > &u,
+ const Vector< InputType3, nonblocking, Coords > &mask,
+ const Vector< InputType1, nonblocking, Coords > &v,
+ const Vector< InputType4, nonblocking, Coords > &v_mask,
+ const Matrix< InputType2, nonblocking, RIT, CIT, NIT > &A,
+ const AdditiveMonoid &add = AdditiveMonoid(),
+ const MultiplicativeOperator &mul = MultiplicativeOperator(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_monoid< AdditiveMonoid >::value &&
+ grb::is_operator< MultiplicativeOperator >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ !grb::is_object< InputType4 >::value &&
+ !std::is_same< InputType2, void >::value,
+ void >::type * const = nullptr
+ ) {
+ static_assert( !(descr & descriptors::add_identity), "Cannot add an "
+ "identity if no concept of `one' is known. Suggested fix: use a semiring "
+ "instead." );
+ constexpr bool left_sided = true;
+ if( output_may_be_masked && size( v_mask ) == 0 && size( mask ) > 0 ) {
+ return internal::vxm_generic<
+ descr, true, false, left_sided, false, AdditiveMonoid::template Identity
+ >(
+ u, mask, v, v_mask, A, add, mul, phase,
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ }
+ );
+ } else if( input_may_be_masked && size( v_mask ) > 0 && size( mask ) == 0 ) {
+ return internal::vxm_generic<
+ descr, false, true, left_sided, false, AdditiveMonoid::template Identity
+ >(
+ u, mask, v, v_mask, A, add, mul, phase,
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ }
+ );
+ } else if( output_may_be_masked && input_may_be_masked && size( mask ) > 0 &&
+ size( v_mask ) > 0
+ ) {
+ return internal::vxm_generic<
+ descr, true, true, left_sided, false, AdditiveMonoid::template Identity
+ >(
+ u, mask, v, v_mask, A, add, mul, phase,
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ }
+ );
+ } else {
+ assert( size( mask ) == 0 );
+ assert( size( v_mask ) == 0 );
+ return internal::vxm_generic<
+ descr, false, false, left_sided, false, AdditiveMonoid::template Identity
+ >(
+ u, mask, v, v_mask, A, add, mul, phase,
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ }
+ );
+ }
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ bool output_may_be_masked = true,
+ bool input_may_be_masked = true,
+ class AdditiveMonoid,
+ class MultiplicativeOperator,
+ typename IOType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename InputType4,
+ typename RIT,
+ typename CIT,
+ typename NIT,
+ typename Coords
+ >
+ RC mxv(
+ Vector< IOType, nonblocking, Coords > &u,
+ const Vector< InputType3, nonblocking, Coords > &mask,
+ const Matrix< InputType2, nonblocking, RIT, CIT, NIT > &A,
+ const Vector< InputType1, nonblocking, Coords > &v,
+ const Vector< InputType4, nonblocking, Coords > &v_mask,
+ const AdditiveMonoid &add = AdditiveMonoid(),
+ const MultiplicativeOperator &mul = MultiplicativeOperator(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_monoid< AdditiveMonoid >::value &&
+ grb::is_operator< MultiplicativeOperator >::value &&
+ !grb::is_object< IOType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< InputType3 >::value &&
+ !grb::is_object< InputType4 >::value &&
+ !std::is_same< InputType2, void >::value,
+ void >::type * const = nullptr
+ ) {
+ static_assert( !(descr & descriptors::add_identity), "Cannot add an identity "
+ "if no concept of `1' is known. Suggested fix: use a semiring "
+ "instead." );
+ constexpr Descriptor new_descr = descr ^ descriptors::transpose_matrix;
+ constexpr bool left_sided = false;
+ if( output_may_be_masked && size( v_mask ) == 0 && size( mask ) > 0 ) {
+ return internal::vxm_generic<
+ new_descr, true, false, left_sided, false, AdditiveMonoid::template Identity
+ >(
+ u, mask, v, v_mask, A, add, mul, phase,
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ }
+ );
+ } else if( input_may_be_masked && size( mask ) == 0 &&
+ size( v_mask ) > 0
+ ) {
+ return internal::vxm_generic<
+ new_descr, false, true, left_sided, false, AdditiveMonoid::template Identity
+ >(
+ u, mask, v, v_mask, A, add, mul, phase,
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ }
+ );
+ } else if( output_may_be_masked && input_may_be_masked && size( mask ) > 0 &&
+ size( v_mask ) > 0
+ ) {
+ return internal::vxm_generic<
+ new_descr, true, true, left_sided, false, AdditiveMonoid::template Identity
+ >(
+ u, mask, v, v_mask, A, add, mul, phase,
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ }
+ );
+ } else {
+ assert( size( mask ) == 0 );
+ assert( size( v_mask ) == 0 );
+ return internal::vxm_generic<
+ new_descr, false, false, left_sided, false, AdditiveMonoid::template Identity
+ >(
+ u, mask, v, v_mask, A, add, mul, phase,
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ },
+ []( const size_t i ) {
+ return i;
+ }
+ );
+ }
+ }
+
+ template<
+ class ActiveDistribution,
+ typename Func,
+ typename DataType,
+ typename RIT,
+ typename CIT,
+ typename NIT
+ >
+ RC eWiseLambda(
+ const Func f,
+ const Matrix< DataType, nonblocking, RIT, CIT, NIT > &A,
+ const size_t s,
+ const size_t P
+ ) {
+ if( internal::NONBLOCKING::warn_if_not_native &&
+ config::PIPELINE::warn_if_not_native
+ ) {
+ std::cerr << "Warning: eWiseLambda (nonblocking, matrix variant) currently "
+ << "delegates to a blocking implementation.\n"
+ << " Further similar such warnings will be suppressed.\n";
+ internal::NONBLOCKING::warn_if_not_native = false;
+ }
+
+ // nonblocking execution is not supported
+ // first, execute any computation that is not completed
+ internal::le.execution();
+
+ // second, delegate to the reference backend
+ return eWiseLambda< ActiveDistribution, Func, DataType, RIT, CIT, NIT >(
+ f, internal::getRefMatrix( A ), s, P );
+ }
+
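A sketch of a typical call to the matrix eWiseLambda above, scaling every stored nonzero in place. Illustrative only; as the implementation shows, on the nonblocking backend this first flushes any pending pipeline and then delegates to the blocking variant:

    #include <graphblas.hpp>

    grb::RC scaleNonzeroes( grb::Matrix< double > &A, const double alpha ) {
        // the lambda receives the row index, column index, and a mutable
        // reference to the nonzero value
        return grb::eWiseLambda(
            [alpha]( const size_t i, const size_t j, double &v ) {
                (void) i; (void) j;
                v *= alpha;
            },
            A
        );
    }
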
+ template<
+ typename Func,
+ typename DataType1,
+ typename RIT,
+ typename CIT,
+ typename NIT,
+ typename DataType2,
+ typename Coords,
+ typename... Args
+ >
+ RC eWiseLambda(
+ const Func f,
+ const Matrix< DataType1, nonblocking, RIT, CIT, NIT > &A,
+ const Vector< DataType2, nonblocking, Coords > &x,
+ Args... args
+ ) {
+ // do size checking
+ if( !( size( x ) == nrows( A ) || size( x ) == ncols( A ) ) ) {
+ std::cerr << "Mismatching dimensions: given vector of size " << size( x )
+ << " has nothing to do with either matrix dimension (" << nrows( A )
+ << " nor " << ncols( A ) << ").\n";
+ return MISMATCH;
+ }
+
+ return eWiseLambda( f, A, args... );
+ }
+
+ /** @} */
+
+} // namespace grb
+
+#undef NO_CAST_ASSERT
+
+#endif // end _H_GRB_NONBLOCKING_BLAS2
+
diff --git a/include/graphblas/nonblocking/blas3.hpp b/include/graphblas/nonblocking/blas3.hpp
new file mode 100644
index 000000000..02afce1d6
--- /dev/null
+++ b/include/graphblas/nonblocking/blas3.hpp
@@ -0,0 +1,595 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Implements the level-3 primitives for the nonblocking backend
+ *
+ * @author Aristeidis Mastoras
+ * @date 16th of May, 2022
+ */
+
+#ifndef _H_GRB_NONBLOCKING_BLAS3
+#define _H_GRB_NONBLOCKING_BLAS3
+
+#include <type_traits> // for std::enable_if
+
+#include
+#include
+
+#include "io.hpp"
+#include "matrix.hpp"
+
+#include
+
+#define NO_CAST_ASSERT( x, y, z ) \
+ static_assert( x, \
+ "\n\n" \
+ "********************************************************************" \
+ "********************************************************************" \
+ "******************************\n" \
+ "* ERROR | " y " " z ".\n" \
+ "********************************************************************" \
+ "********************************************************************" \
+ "******************************\n" \
+ "* Possible fix 1 | Remove no_casting from the template parameters " \
+ "in this call to " y ".\n" \
+ "* Possible fix 2 | For all mismatches in the domains of input " \
+ "parameters and the semiring domains, as specified in the " \
+ "documentation of the function " y ", supply a container argument of " \
+ "the expected type instead.\n" \
+ "* Possible fix 3 | Provide a compatible semiring where all domains " \
+ "match those of the container arguments, as specified in the " \
+ "documentation of the function " y ".\n" \
+ "********************************************************************" \
+ "********************************************************************" \
+ "******************************\n" );
+
+
+namespace grb {
+
+ namespace internal {
+
+ extern LazyEvaluation le;
+
+ }
+
+}
+
+namespace grb {
+
+ namespace internal {
+
+ template<
+ bool allow_void,
+ Descriptor descr,
+ class MulMonoid,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename RIT,
+ typename CIT,
+ typename NIT,
+ class Operator,
+ class Monoid
+ >
+ RC mxm_generic(
+ Matrix< OutputType, nonblocking, RIT, CIT, NIT > &C,
+ const Matrix< InputType1, nonblocking, RIT, CIT, NIT > &A,
+ const Matrix< InputType2, nonblocking, RIT, CIT, NIT > &B,
+ const Operator &oper,
+ const Monoid &monoid,
+ const MulMonoid &mulMonoid,
+ const Phase &phase,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< Operator >::value &&
+ grb::is_monoid< Monoid >::value,
+ void >::type * const = nullptr
+ ) {
+ // nonblocking execution is not supported
+ // first, execute any computation that is not completed
+ le.execution();
+
+ // second, delegate to the reference backend
+ return mxm_generic<
+ allow_void, descr, MulMonoid, OutputType,
+ InputType1, InputType2, RIT, CIT, NIT, Operator, Monoid
+ >(
+ getRefMatrix( C ), getRefMatrix( A ), getRefMatrix( B ),
+ oper, monoid, mulMonoid, phase
+ );
+ }
+
+ } // end namespace grb::internal
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename RIT,
+ typename CIT,
+ typename NIT,
+ class Semiring
+ >
+ RC mxm(
+ Matrix< OutputType, nonblocking, RIT, CIT, NIT > &C,
+ const Matrix< InputType1, nonblocking, RIT, CIT, NIT > &A,
+ const Matrix< InputType2, nonblocking, RIT, CIT, NIT > &B,
+ const Semiring &ring = Semiring(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_semiring< Semiring >::value,
+ void >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Semiring::D1, InputType1 >::value
+ ), "grb::mxm",
+ "called with a prefactor input matrix A that does not match the first "
+ "domain of the given operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Semiring::D2, InputType2 >::value ), "grb::mxm",
+ "called with a postfactor input matrix B that does not match the "
+ "second domain of the given operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Semiring::D4, OutputType >::value
+ ), "grb::mxm",
+ "called with an output matrix C that does not match the output domain "
+ "of the given operator" );
+
+#ifdef _DEBUG
+ std::cout << "In grb::mxm (nonblocking, unmasked, semiring)\n";
+#endif
+
+ if( internal::NONBLOCKING::warn_if_not_native &&
+ config::PIPELINE::warn_if_not_native
+ ) {
+ std::cerr << "Warning: mxm (nonblocking, unmasked, semiring) currently "
+ << "delegates to a blocking implementation\n"
+ << " Further similar such warnings will be suppressed.\n";
+ internal::NONBLOCKING::warn_if_not_native = false;
+ }
+
+ return internal::mxm_generic< true, descr >(
+ C, A, B,
+ ring.getMultiplicativeOperator(),
+ ring.getAdditiveMonoid(),
+ ring.getMultiplicativeMonoid(),
+ phase
+ );
+ }
+
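An illustrative use of the semiring mxm above, computing C = AA. It also shows the two-phase idiom in which a RESIZE call reserves output capacity before the EXECUTE call computes the product; the snippet is not part of this patch:

    #include <graphblas.hpp>

    grb::RC squareMatrix(
        grb::Matrix< double > &C, const grb::Matrix< double > &A
    ) {
        grb::Semiring<
            grb::operators::add< double >, grb::operators::mul< double >,
            grb::identities::zero, grb::identities::one
        > plusTimes;
        // phase 1: compute the required capacity of C
        grb::RC rc = grb::mxm( C, A, A, plusTimes, grb::RESIZE );
        // phase 2: perform the actual multiplication
        return rc ? rc : grb::mxm( C, A, A, plusTimes, grb::EXECUTE );
    }
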
+ template<
+ Descriptor descr = grb::descriptors::no_operation,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename RIT,
+ typename CIT,
+ typename NIT,
+ class Operator,
+ class Monoid
+ >
+ RC mxm(
+ Matrix< OutputType, nonblocking, RIT, CIT, NIT > &C,
+ const Matrix< InputType1, nonblocking, RIT, CIT, NIT > &A,
+ const Matrix< InputType2, nonblocking, RIT, CIT, NIT > &B,
+ const Monoid &addM,
+ const Operator &mulOp,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< Operator >::value &&
+ grb::is_monoid< Monoid >::value,
+ void >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Operator::D1, InputType1 >::value
+ ), "grb::mxm",
+ "called with a prefactor input matrix A that does not match the first "
+ "domain of the given multiplication operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Operator::D2, InputType2 >::value
+ ), "grb::mxm",
+ "called with a postfactor input matrix B that does not match the first "
+ "domain of the given multiplication operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Operator::D3, OutputType >::value ),
+ "grb::mxm",
+ "called with an output matrix C that does not match the output domain "
+ "of the given multiplication operator" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D1, typename Operator::D3 >::value
+ ), "grb::mxm",
+ "the output domain of the multiplication operator does not match the "
+ "first domain of the given addition monoid" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D2, OutputType >::value
+ ), "grb::mxm",
+ "the second domain of the given addition monoid does not match the "
+ "type of the output matrix C" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< typename Monoid::D3, OutputType >::value
+ ), "grb::mxm",
+ "the output type of the given addition monoid does not match the type "
+ "of the output matrix C" );
+ static_assert( ( !(
+ std::is_same< InputType1, void >::value ||
+ std::is_same< InputType2, void >::value
+ ) ),
+ "grb::mxm: the operator-monoid version of mxm cannot be used if either "
+ "of the input matrices is a pattern matrix (of type void)" );
+
+ if( internal::NONBLOCKING::warn_if_not_native &&
+ config::PIPELINE::warn_if_not_native
+ ) {
+ std::cerr << "Warning: mxm (nonblocking, unmasked, monoid-op) currently "
+ << "delegates to a blocking implementation\n"
+ << " Further similar such warnings will be suppressed.\n";
+ internal::NONBLOCKING::warn_if_not_native = false;
+ }
+
+ return internal::mxm_generic< false, descr >(
+ C, A, B, mulOp, addM, Monoid(), phase
+ );
+ }
+
+ namespace internal {
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ bool matrix_is_void,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename Coords
+ >
+ RC matrix_zip_generic(
+ Matrix< OutputType, nonblocking > &A,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const Vector< InputType3, nonblocking, Coords > &z,
+ const Phase &phase
+ ) {
+ if( internal::NONBLOCKING::warn_if_not_native &&
+ config::PIPELINE::warn_if_not_native
+ ) {
+ std::cerr << "Warning: zip (matrix<-vector<-vector<-vector, nonblocking) "
+ << "currently delegates to a blocking implementation.\n"
+ << " Further similar such warnings will be suppressed.\n";
+ internal::NONBLOCKING::warn_if_not_native = false;
+ }
+
+ // nonblocking execution is not supported
+ // first, execute any computation that is not completed
+ le.execution();
+
+ // second, delegate to the reference backend
+ return matrix_zip_generic<
+ descr, matrix_is_void,
+ OutputType, InputType1, InputType2, InputType3,
+ Coords
+ >(
+ getRefMatrix( A ), getRefVector( x ), getRefVector( y ), getRefVector( z ),
+ phase
+ );
+ }
+
+ } // namespace internal
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename Coords
+ >
+ RC zip(
+ Matrix< OutputType, nonblocking > &A,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const Vector< InputType3, nonblocking, Coords > &z,
+ const Phase &phase = EXECUTE
+ ) {
+ static_assert( !(descr & descriptors::no_casting) ||
+ std::is_integral< InputType1 >::value,
+ "grb::zip (two vectors to matrix) called "
+ "using non-integral left-hand vector elements" );
+ static_assert( !(descr & descriptors::no_casting) ||
+ std::is_integral< InputType2 >::value,
+ "grb::zip (two vectors to matrix) called "
+ "using non-integral right-hand vector elements" );
+ static_assert( !(descr & descriptors::no_casting) ||
+ std::is_same< OutputType, InputType3 >::value,
+ "grb::zip (two vectors to matrix) called "
+ "with differing vector nonzero and output matrix domains" );
+
+ const size_t n = grb::size( x );
+ const size_t nz = grb::nnz( x );
+ const RC ret = grb::clear( A );
+ if( ret != SUCCESS ) {
+ return ret;
+ }
+ if( n != grb::size( y ) ) {
+ return MISMATCH;
+ }
+ if( n != grb::size( z ) ) {
+ return MISMATCH;
+ }
+ if( nz != grb::nnz( y ) ) {
+ return ILLEGAL;
+ }
+ if( nz != grb::nnz( z ) ) {
+ return ILLEGAL;
+ }
+
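+ // the output matrix stores values, hence matrix_is_void is set to false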
+ return internal::matrix_zip_generic< descr, false >( A, x, y, z, phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC zip(
+ Matrix< void, nonblocking > &A,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const Phase &phase = EXECUTE
+ ) {
+ static_assert( !(descr & descriptors::no_casting) ||
+ std::is_integral< InputType1 >::value,
+ "grb::zip (two vectors to void matrix) called using non-integral "
+ "left-hand vector elements" );
+ static_assert( !(descr & descriptors::no_casting) ||
+ std::is_integral< InputType2 >::value,
+ "grb::zip (two vectors to void matrix) called using non-integral "
+ "right-hand vector elements" );
+
+ const size_t n = grb::size( x );
+ const size_t nz = grb::nnz( x );
+ const RC ret = grb::clear( A );
+ if( ret != SUCCESS ) {
+ return ret;
+ }
+ if( n != grb::size( y ) ) {
+ return MISMATCH;
+ }
+ if( nz != grb::nnz( y ) ) {
+ return ILLEGAL;
+ }
+
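+ // the output is a pattern matrix, so there is no value vector to zip;
+ // x is passed a second time only to satisfy the generic signature and
+ // is ignored since matrix_is_void is set to true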
+ return internal::matrix_zip_generic< descr, true >( A, x, y, x, phase );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType1,
+ typename InputType2,
+ typename OutputType,
+ typename Coords,
+ class Operator
+ >
+ RC outer(
+ Matrix< OutputType, nonblocking > &A,
+ const Vector< InputType1, nonblocking, Coords > &u,
+ const Vector< InputType2, nonblocking, Coords > &v,
+ const Operator &mul = Operator(),
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ grb::is_operator< Operator >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ !grb::is_object< OutputType >::value,
+ void
+ >::type * const = nullptr
+ ) {
+ if( internal::NONBLOCKING::warn_if_not_native &&
+ config::PIPELINE::warn_if_not_native
+ ) {
+ std::cerr << "Warning: outer (nonblocking) currently delegates to a "
+ << "blocking implementation.\n"
+ << " Further similar such warnings will be suppressed.\n";
+ internal::NONBLOCKING::warn_if_not_native = false;
+ }
+
+ // nonblocking execution is not supported
+ // first, execute any computation that is not completed
+ internal::le.execution();
+
+ // second, delegate to the reference backend
+ return outer<
+ descr, InputType1, InputType2, OutputType, Coords, Operator
+ >(
+ internal::getRefMatrix( A ),
+ internal::getRefVector( u ), internal::getRefVector( v ),
+ mul, phase
+ );
+ }
+
+ namespace internal {
+
+ template<
+ bool allow_void,
+ Descriptor descr,
+ class MulMonoid,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ class Operator
+ >
+ RC eWiseApply_matrix_generic(
+ Matrix< OutputType, nonblocking > &C,
+ const Matrix< InputType1, nonblocking > &A,
+ const Matrix< InputType2, nonblocking > &B,
+ const Operator &oper,
+ const MulMonoid &mulMonoid,
+ const Phase &phase,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< Operator >::value,
+ void >::type * const = nullptr
+ ) {
+ if( internal::NONBLOCKING::warn_if_not_native &&
+ config::PIPELINE::warn_if_not_native
+ ) {
+ std::cerr << "Warning: eWiseApply (nonblocking) currently delegates to a "
+ << "blocking implementation.\n"
+ << " Further similar such warnings will be suppressed.\n";
+ internal::NONBLOCKING::warn_if_not_native = false;
+ }
+
+ // nonblocking execution is not supported
+ // first, execute any computation that is not completed
+ le.execution();
+
+ // second, delegate to the reference backend
+ return eWiseApply_matrix_generic<
+ allow_void, descr, MulMonoid, OutputType, InputType1, InputType2, Operator
+ >(
+ getRefMatrix( C ), getRefMatrix( A ), getRefMatrix( B ),
+ oper, mulMonoid, phase
+ );
+ }
+
+ } // namespace internal
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ class MulMonoid
+ >
+ RC eWiseApply(
+ Matrix< OutputType, nonblocking > &C,
+ const Matrix< InputType1, nonblocking > &A,
+ const Matrix< InputType2, nonblocking > &B,
+ const MulMonoid &mulmono,
+ const Phase phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_monoid< MulMonoid >::value,
+ void >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) ||
+ std::is_same< typename MulMonoid::D1, InputType1 >::value ),
+ "grb::eWiseApply (nonblocking, matrix <- matrix x matrix, monoid)",
+ "called with a prefactor input matrix A that does not match the first "
+ "domain of the monoid operator"
+ );
+ NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) ||
+ std::is_same< typename MulMonoid::D2, InputType2 >::value ),
+ "grb::eWiseApply (nonblocking, matrix <- matrix x matrix, monoid)",
+ "called with a postfactor input matrix B that does not match the "
+ "second domain of the monoid operator"
+ );
+ NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) ||
+ std::is_same< typename MulMonoid::D3, OutputType >::value ),
+ "grb::eWiseApply (nonblocking, matrix <- matrix x matrix, monoid)",
+ "called with an output matrix C that does not match the output domain "
+ "of the monoid operator"
+ );
+
+#ifdef _DEBUG
+ std::cout << "In grb::eWiseApply_matrix_generic (nonblocking, monoid)\n";
+#endif
+
+ return internal::eWiseApply_matrix_generic< true, descr >(
+ C, A, B, mulmono.getOperator(), mulmono, phase
+ );
+ }
+
+ template<
+ Descriptor descr = grb::descriptors::no_operation,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ class Operator
+ >
+ RC eWiseApply(
+ Matrix< OutputType, nonblocking > &C,
+ const Matrix< InputType1, nonblocking > &A,
+ const Matrix< InputType2, nonblocking > &B,
+ const Operator &mulOp,
+ const Phase phase = EXECUTE,
+ const typename std::enable_if< !grb::is_object< OutputType >::value &&
+ !grb::is_object< InputType1 >::value &&
+ !grb::is_object< InputType2 >::value &&
+ grb::is_operator< Operator >::value,
+ void >::type * const = nullptr
+ ) {
+ // static checks
+ NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) ||
+ std::is_same< typename Operator::D1, InputType1 >::value ),
+ "grb::eWiseApply (nonblocking, matrix <- matrix x matrix, operator)",
+ "called with a prefactor input matrix A that does not match the first "
+ "domain of the given multiplication operator"
+ );
+ NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) ||
+ std::is_same< typename Operator::D2, InputType2 >::value ),
+ "grb::eWiseApply (nonblocking, matrix <- matrix x matrix, operator)",
+ "called with a postfactor input matrix B that does not match the first "
+ "domain of the given multiplication operator"
+ );
+ NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) ||
+ std::is_same< typename Operator::D3, OutputType >::value ),
+ "grb::eWiseApply (nonblocking, matrix <- matrix x matrix, operator)",
+ "called with an output matrix C that does not match the output domain "
+ "of the given multiplication operator"
+ );
+ static_assert( ( !(
+ std::is_same< InputType1, void >::value ||
+ std::is_same< InputType2, void >::value )
+ ), "grb::eWiseApply (nonblocking, matrix <- matrix x matrix, operator): "
+ "the operator version of eWiseApply cannot be used if either of the "
+ "input matrices is a pattern matrix (of type void)"
+ );
+
+ typename grb::Monoid<
+ grb::operators::mul< double >,
+ grb::identities::one
+ > dummyMonoid;
+ return internal::eWiseApply_matrix_generic< false, descr >(
+ C, A, B, mulOp, dummyMonoid, phase
+ );
+ }
+
+} // namespace grb
+
+#undef NO_CAST_ASSERT
+
+#endif // ``_H_GRB_NONBLOCKING_BLAS3''
+
diff --git a/include/graphblas/nonblocking/boolean_dispatcher_blas1.hpp b/include/graphblas/nonblocking/boolean_dispatcher_blas1.hpp
new file mode 100644
index 000000000..16fc60a8d
--- /dev/null
+++ b/include/graphblas/nonblocking/boolean_dispatcher_blas1.hpp
@@ -0,0 +1,1744 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Dispatcher functions for the level-1 primitives.
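+ *
+ * Each boolean_dispatcher_* function translates the run-time density
+ * flags of its operands into the compile-time boolean template
+ * parameters of the corresponding level-1 kernel by branching over all
+ * combinations. The kernels' hot loops thus remain free of run-time
+ * density checks, at the cost of one kernel instantiation per
+ * combination of flags.
+ *
+ * As a minimal sketch of the pattern, for a hypothetical kernel \a foo
+ * that is not part of this header:
+ *
+ * \code
+ * template< bool dense >
+ * RC foo();
+ *
+ * RC boolean_dispatcher_foo( const bool dense ) {
+ *     return dense ? foo< true >() : foo< false >();
+ * }
+ * \endcode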
+ *
+ * @author Aristeidis Mastoras
+ * @date 24th of October, 2022
+ */
+
+#ifndef _H_GRB_NONBLOCKING_BOOLEAN_DISPATCHER_BLAS1
+#define _H_GRB_NONBLOCKING_BOOLEAN_DISPATCHER_BLAS1
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "coordinates.hpp"
+#include "vector.hpp"
+#include "lazy_evaluation.hpp"
+#include "vector_wrapper.hpp"
+
+
+namespace grb {
+
+ namespace internal {
+
+ template<
+ Descriptor descr,
+ bool masked,
+ bool left,
+ bool already_dense_input_to_fold,
+ bool already_dense_mask,
+ class Monoid,
+ typename InputType,
+ typename MaskType,
+ class Coords
+ >
+ RC fold_from_vector_to_scalar_vectorDriven(
+ typename Monoid::D3 &thread_local_output,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ const Coords &local_to_fold,
+ const Coords &local_mask,
+ const Vector< InputType, nonblocking, Coords > &to_fold,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Monoid &monoid
+ );
+
+ template<
+ Descriptor descr,
+ bool masked,
+ bool left,
+ class Monoid,
+ typename InputType,
+ typename MaskType,
+ class Coords
+ >
+ RC boolean_dispatcher_fold_from_vector_to_scalar_vectorDriven(
+ const bool already_dense_input_to_fold,
+ const bool already_dense_mask,
+ typename Monoid::D3 &thread_local_output,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ const Coords &local_to_fold,
+ const Coords &local_mask,
+ const Vector< InputType, nonblocking, Coords > &to_fold,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Monoid &monoid
+ ) {
+ if( already_dense_input_to_fold ) {
+ if( already_dense_mask ) {
+ return internal::fold_from_vector_to_scalar_vectorDriven<
+ descr, masked, left, true, true
+ >(
+ thread_local_output, lower_bound, upper_bound,
+ local_to_fold, local_mask, to_fold, mask, monoid
+ );
+ } else {
+ return internal::fold_from_vector_to_scalar_vectorDriven<
+ descr, masked, left, true, false
+ >(
+ thread_local_output, lower_bound, upper_bound,
+ local_to_fold, local_mask, to_fold, mask, monoid
+ );
+ }
+ } else {
+ if( already_dense_mask ) {
+ return internal::fold_from_vector_to_scalar_vectorDriven<
+ descr, masked, left, false, true
+ >(
+ thread_local_output, lower_bound, upper_bound,
+ local_to_fold, local_mask, to_fold, mask, monoid
+ );
+ } else {
+ return internal::fold_from_vector_to_scalar_vectorDriven<
+ descr, masked, left, false, false
+ >(
+ thread_local_output, lower_bound, upper_bound,
+ local_to_fold, local_mask, to_fold, mask, monoid
+ );
+ }
+ }
+ }
+
+ template<
+ Descriptor descr,
+ bool left,
+ bool already_dense_input_to_fold,
+ bool already_dense_mask,
+ class Monoid,
+ typename InputType,
+ typename MaskType,
+ class Coords
+ >
+ RC fold_from_vector_to_scalar_maskDriven(
+ typename Monoid::D3 &thread_local_output,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ const Coords &local_to_fold,
+ const Coords &local_mask,
+ const Vector< InputType, nonblocking, Coords > &to_fold,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Monoid &monoid
+ );
+
+ template<
+ Descriptor descr,
+ bool left,
+ class Monoid,
+ typename InputType,
+ typename MaskType,
+ class Coords
+ >
+ RC boolean_dispatcher_fold_from_vector_to_scalar_maskDriven(
+ const bool already_dense_input_to_fold,
+ const bool already_dense_mask,
+ typename Monoid::D3 &thread_local_output,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ const Coords &local_to_fold,
+ const Coords &local_mask,
+ const Vector< InputType, nonblocking, Coords > &to_fold,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Monoid &monoid
+ ) {
+ if( already_dense_input_to_fold ) {
+ if( already_dense_mask ) {
+ return internal::fold_from_vector_to_scalar_maskDriven<
+ descr, left, true, true
+ >(
+ thread_local_output, lower_bound, upper_bound,
+ local_to_fold, local_mask, to_fold, mask, monoid
+ );
+ } else {
+ return internal::fold_from_vector_to_scalar_maskDriven<
+ descr, left, true, false
+ >(
+ thread_local_output, lower_bound, upper_bound,
+ local_to_fold, local_mask, to_fold, mask, monoid
+ );
+ }
+ } else {
+ if( already_dense_mask ) {
+ return internal::fold_from_vector_to_scalar_maskDriven<
+ descr, left, false, true
+ >(
+ thread_local_output, lower_bound, upper_bound,
+ local_to_fold, local_mask, to_fold, mask, monoid
+ );
+ } else {
+ return internal::fold_from_vector_to_scalar_maskDriven<
+ descr, left, false, false
+ >(
+ thread_local_output, lower_bound, upper_bound,
+ local_to_fold, local_mask, to_fold, mask, monoid
+ );
+ }
+ }
+ }
+
+ template<
+ Descriptor descr,
+ bool masked,
+ bool left,
+ bool already_dense_input_to_fold,
+ bool already_dense_mask,
+ class Monoid,
+ typename InputType,
+ typename MaskType,
+ class Coords
+ >
+ RC fold_from_vector_to_scalar_fullLoopSparse(
+ typename Monoid::D3 &thread_local_output,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ const Coords &local_to_fold,
+ const Coords &local_mask,
+ const Vector< InputType, nonblocking, Coords > &to_fold,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Monoid &monoid
+ );
+
+ template<
+ Descriptor descr,
+ bool masked,
+ bool left,
+ class Monoid,
+ typename InputType,
+ typename MaskType,
+ class Coords
+ >
+ RC boolean_dispatcher_fold_from_vector_to_scalar_fullLoopSparse(
+ const bool already_dense_input_to_fold,
+ const bool already_dense_mask,
+ typename Monoid::D3 &thread_local_output,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ const Coords &local_to_fold,
+ const Coords &local_mask,
+ const Vector< InputType, nonblocking, Coords > &to_fold,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Monoid &monoid
+ ) {
+ if( already_dense_input_to_fold ) {
+ if( already_dense_mask ) {
+ return internal::fold_from_vector_to_scalar_fullLoopSparse<
+ descr, masked, left, true, true
+ >(
+ thread_local_output, lower_bound, upper_bound,
+ local_to_fold, local_mask, to_fold, mask, monoid
+ );
+ } else {
+ return internal::fold_from_vector_to_scalar_fullLoopSparse<
+ descr, masked, left, true, false
+ >(
+ thread_local_output, lower_bound, upper_bound,
+ local_to_fold, local_mask, to_fold, mask, monoid
+ );
+ }
+ } else {
+ if( already_dense_mask ) {
+ return internal::fold_from_vector_to_scalar_fullLoopSparse<
+ descr, masked, left, false, true
+ >(
+ thread_local_output, lower_bound, upper_bound,
+ local_to_fold, local_mask, to_fold, mask, monoid
+ );
+ } else {
+ return internal::fold_from_vector_to_scalar_fullLoopSparse<
+ descr, masked, left, false, false
+ >(
+ thread_local_output, lower_bound, upper_bound,
+ local_to_fold, local_mask, to_fold, mask, monoid
+ );
+ }
+ }
+ }
+
+ template<
+ Descriptor descr,
+ bool left,
+ bool sparse,
+ bool masked,
+ bool monoid,
+ bool already_dense_output,
+ bool already_dense_mask,
+ typename MaskType,
+ typename IOType,
+ typename InputType,
+ typename Coords,
+ class OP
+ >
+ RC fold_from_scalar_to_vector_generic(
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_vector,
+ const Coords * const local_mask_ptr,
+ Vector< IOType, nonblocking, Coords > &vector,
+ const Vector< MaskType, nonblocking, Coords > * const mask,
+ const InputType &scalar,
+ const OP &op,
+ const Phase &phase
+ );
+
+ template<
+ Descriptor descr,
+ bool left,
+ bool sparse,
+ bool masked,
+ bool monoid,
+ typename MaskType,
+ typename IOType,
+ typename InputType,
+ typename Coords,
+ class OP
+ >
+ RC boolean_dispatcher_fold_from_scalar_to_vector_generic(
+ const bool already_dense_output,
+ const bool already_dense_mask,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_vector,
+ const Coords * const local_mask_ptr,
+ Vector< IOType, nonblocking, Coords > &vector,
+ const Vector< MaskType, nonblocking, Coords > * const mask,
+ const InputType &scalar,
+ const OP &op,
+ const Phase &phase
+ ) {
+ if( already_dense_output ) {
+ if( already_dense_mask ) {
+ return internal::fold_from_scalar_to_vector_generic<
+ descr, left, sparse, masked, monoid,
+ true, true
+ >(
+ lower_bound, upper_bound, local_vector, local_mask_ptr,
+ vector, mask, scalar, op, phase
+ );
+ } else {
+ return internal::fold_from_scalar_to_vector_generic<
+ descr, left, sparse, masked, monoid,
+ true, false
+ >(
+ lower_bound, upper_bound, local_vector, local_mask_ptr,
+ vector, mask, scalar, op, phase
+ );
+ }
+ } else {
+ if( already_dense_mask ) {
+ return internal::fold_from_scalar_to_vector_generic<
+ descr, left, sparse, masked, monoid,
+ false, true
+ >(
+ lower_bound, upper_bound, local_vector, local_mask_ptr,
+ vector, mask, scalar, op, phase
+ );
+ } else {
+ return internal::fold_from_scalar_to_vector_generic<
+ descr, left, sparse, masked, monoid,
+ false, false
+ >(
+ lower_bound, upper_bound, local_vector, local_mask_ptr,
+ vector, mask, scalar, op, phase
+ );
+ }
+ }
+ }
+
+ template<
+ Descriptor descr,
+ bool left,
+ bool sparse,
+ bool masked,
+ bool monoid,
+ bool already_dense_output,
+ bool already_dense_input_to_fold,
+ bool already_dense_mask,
+ typename MaskType,
+ typename IOType,
+ typename IType,
+ typename Coords,
+ class OP
+ >
+ RC fold_from_vector_to_vector_generic(
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_fold_into,
+ const Coords * const local_m_ptr,
+ const Coords &local_to_fold,
+ Vector< IOType, nonblocking, Coords > &fold_into,
+ const Vector< MaskType, nonblocking, Coords > * const m,
+ const Vector< IType, nonblocking, Coords > &to_fold,
+ const OP &op,
+ const Phase phase
+ );
+
+ template<
+ Descriptor descr,
+ bool left,
+ bool sparse,
+ bool masked,
+ bool monoid,
+ typename MaskType,
+ typename IOType,
+ typename IType,
+ typename Coords,
+ class OP
+ >
+ RC boolean_dispatcher_fold_from_vector_to_vector_generic(
+ const bool already_dense_output,
+ const bool already_dense_input_to_fold,
+ const bool already_dense_mask,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_fold_into,
+ const Coords * const local_m_ptr,
+ const Coords &local_to_fold,
+ Vector< IOType, nonblocking, Coords > &fold_into,
+ const Vector< MaskType, nonblocking, Coords > * const m,
+ const Vector< IType, nonblocking, Coords > &to_fold,
+ const OP &op,
+ const Phase phase
+ ) {
+ if( already_dense_output ) {
+ if( already_dense_input_to_fold ) {
+ if( already_dense_mask ) {
+ return internal::fold_from_vector_to_vector_generic<
+ descr, left, sparse, masked, monoid,
+ true, true, true
+ >(
+ lower_bound, upper_bound, local_fold_into, local_m_ptr,
+ local_to_fold, fold_into, m, to_fold, op, phase
+ );
+ } else {
+ return internal::fold_from_vector_to_vector_generic<
+ descr, left, sparse, masked, monoid,
+ true, true, false
+ >(
+ lower_bound, upper_bound, local_fold_into, local_m_ptr,
+ local_to_fold, fold_into, m, to_fold, op, phase
+ );
+ }
+ } else {
+ if( already_dense_mask ) {
+ return internal::fold_from_vector_to_vector_generic<
+ descr, left, sparse, masked, monoid,
+ true, false, true
+ >(
+ lower_bound, upper_bound, local_fold_into, local_m_ptr,
+ local_to_fold, fold_into, m, to_fold, op, phase
+ );
+ } else {
+ return internal::fold_from_vector_to_vector_generic<
+ descr, left, sparse, masked, monoid,
+ true, false, false
+ >(
+ lower_bound, upper_bound, local_fold_into, local_m_ptr,
+ local_to_fold, fold_into, m, to_fold, op, phase
+ );
+ }
+ }
+ } else {
+ if( already_dense_input_to_fold ) {
+ if( already_dense_mask ) {
+ return internal::fold_from_vector_to_vector_generic<
+ descr, left, sparse, masked, monoid,
+ false, true, true
+ >(
+ lower_bound, upper_bound, local_fold_into, local_m_ptr,
+ local_to_fold, fold_into, m, to_fold, op, phase
+ );
+ } else {
+ return internal::fold_from_vector_to_vector_generic<
+ descr, left, sparse, masked, monoid,
+ false, true, false
+ >(
+ lower_bound, upper_bound, local_fold_into, local_m_ptr,
+ local_to_fold, fold_into, m, to_fold, op, phase
+ );
+ }
+ } else {
+ if( already_dense_mask ) {
+ return internal::fold_from_vector_to_vector_generic<
+ descr, left, sparse, masked, monoid,
+ false, false, true
+ >(
+ lower_bound, upper_bound, local_fold_into, local_m_ptr,
+ local_to_fold, fold_into, m, to_fold, op, phase
+ );
+ } else {
+ return internal::fold_from_vector_to_vector_generic<
+ descr, left, sparse, masked, monoid,
+ false, false, false
+ >(
+ lower_bound, upper_bound, local_fold_into, local_m_ptr,
+ local_to_fold, fold_into, m, to_fold, op, phase
+ );
+ }
+ }
+ }
+ }
+
+ template<
+ bool left_scalar,
+ bool right_scalar,
+ bool left_sparse,
+ bool right_sparse,
+ Descriptor descr,
+ class OP,
+ bool already_dense_input_x,
+ bool already_dense_input_y,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC dense_apply_generic(
+ const size_t lower_bound,
+ const size_t upper_bound,
+ const Coords &local_x,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &z_vector,
+ const internal::Wrapper< left_scalar, InputType1, Coords > x_wrapper,
+ const internal::Wrapper< right_scalar, InputType2, Coords > y_wrapper,
+ const OP &op
+ );
+
+ template<
+ bool left_scalar,
+ bool right_scalar,
+ bool left_sparse,
+ bool right_sparse,
+ Descriptor descr,
+ class OP,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC boolean_dispatcher_dense_apply_generic(
+ const bool already_dense_input_x,
+ const bool already_dense_input_y,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ const Coords &local_x,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &z_vector,
+ const internal::Wrapper< left_scalar, InputType1, Coords > x_wrapper,
+ const internal::Wrapper< right_scalar, InputType2, Coords > y_wrapper,
+ const OP &op
+ ) {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::dense_apply_generic<
+ left_scalar, right_scalar, left_sparse, right_sparse, descr, OP,
+ true, true
+ >(
+ lower_bound, upper_bound,
+ local_x, local_y, z_vector, x_wrapper, y_wrapper, op
+ );
+ } else {
+ return internal::dense_apply_generic<
+ left_scalar, right_scalar, left_sparse, right_sparse, descr, OP,
+ true, false
+ >(
+ lower_bound, upper_bound,
+ local_x, local_y, z_vector, x_wrapper, y_wrapper, op
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::dense_apply_generic<
+ left_scalar, right_scalar, left_sparse, right_sparse, descr, OP,
+ false, true
+ >(
+ lower_bound, upper_bound,
+ local_x, local_y, z_vector, x_wrapper, y_wrapper, op
+ );
+ } else {
+ return internal::dense_apply_generic<
+ left_scalar, right_scalar, left_sparse, right_sparse, descr, OP,
+ false, false
+ >(
+ lower_bound, upper_bound,
+ local_x, local_y, z_vector, x_wrapper, y_wrapper, op
+ );
+ }
+ }
+ }
+
+ template<
+ bool masked,
+ bool monoid,
+ bool x_scalar,
+ bool y_scalar,
+ Descriptor descr,
+ class OP,
+ bool already_dense_mask,
+ bool already_dense_input_x,
+ bool already_dense_input_y,
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC sparse_apply_generic(
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_z,
+ const Coords * const local_mask_ptr,
+ const Coords &local_x,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &z_vector,
+ const Vector< MaskType, nonblocking, Coords > * const mask_vector,
+ const internal::Wrapper< x_scalar, InputType1, Coords > x_wrapper,
+ const internal::Wrapper< y_scalar, InputType2, Coords > y_wrapper,
+ const OP &op
+ );
+
+ template<
+ bool masked,
+ bool monoid,
+ bool x_scalar,
+ bool y_scalar,
+ Descriptor descr,
+ class OP,
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC boolean_dispatcher_sparse_apply_generic(
+ const bool already_dense_mask,
+ const bool already_dense_input_x,
+ const bool already_dense_input_y,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_z,
+ const Coords * const local_mask_ptr,
+ const Coords &local_x,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &z_vector,
+ const Vector< MaskType, nonblocking, Coords > * const mask_vector,
+ const internal::Wrapper< x_scalar, InputType1, Coords > x_wrapper,
+ const internal::Wrapper< y_scalar, InputType2, Coords > y_wrapper,
+ const OP &op
+ ) {
+ if( already_dense_mask ) {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::sparse_apply_generic<
+ masked, monoid, x_scalar, y_scalar, descr, OP,
+ true, true, true
+ > (
+ lower_bound, upper_bound, local_z, local_mask_ptr, local_x, local_y,
+ z_vector, mask_vector, x_wrapper, y_wrapper, op
+ );
+ } else {
+ return internal::sparse_apply_generic<
+ masked, monoid, x_scalar, y_scalar, descr, OP,
+ true, true, false
+ > (
+ lower_bound, upper_bound, local_z, local_mask_ptr, local_x, local_y,
+ z_vector, mask_vector, x_wrapper, y_wrapper, op
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::sparse_apply_generic<
+ masked, monoid, x_scalar, y_scalar, descr, OP,
+ true, false, true
+ > (
+ lower_bound, upper_bound,
+ local_z, local_mask_ptr, local_x, local_y,
+ z_vector, mask_vector, x_wrapper, y_wrapper, op
+ );
+ } else {
+ return internal::sparse_apply_generic<
+ masked, monoid, x_scalar, y_scalar, descr, OP,
+ true, false, false
+ > (
+ lower_bound, upper_bound, local_z, local_mask_ptr, local_x, local_y,
+ z_vector, mask_vector, x_wrapper, y_wrapper, op
+ );
+ }
+ }
+ } else {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::sparse_apply_generic<
+ masked, monoid, x_scalar, y_scalar, descr, OP,
+ false, true, true
+ > (
+ lower_bound, upper_bound,
+ local_z, local_mask_ptr, local_x, local_y,
+ z_vector, mask_vector, x_wrapper, y_wrapper, op
+ );
+ } else {
+ return internal::sparse_apply_generic<
+ masked, monoid, x_scalar, y_scalar, descr, OP,
+ false, true, false
+ > (
+ lower_bound, upper_bound, local_z, local_mask_ptr, local_x, local_y,
+ z_vector, mask_vector, x_wrapper, y_wrapper, op
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::sparse_apply_generic<
+ masked, monoid, x_scalar, y_scalar, descr, OP,
+ false, false, true
+ > (
+ lower_bound, upper_bound, local_z, local_mask_ptr, local_x, local_y,
+ z_vector, mask_vector, x_wrapper, y_wrapper, op
+ );
+ } else {
+ return internal::sparse_apply_generic<
+ masked, monoid, x_scalar, y_scalar, descr, OP,
+ false, false, false
+ > (
+ lower_bound, upper_bound, local_z, local_mask_ptr, local_x, local_y,
+ z_vector, mask_vector, x_wrapper, y_wrapper, op
+ );
+ }
+ }
+ }
+ }
+
+ template<
+ bool left_scalar,
+ bool right_scalar,
+ bool left_sparse,
+ bool right_sparse,
+ Descriptor descr,
+ class OP,
+ bool already_dense_mask,
+ bool already_dense_input_x,
+ bool already_dense_input_y,
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC masked_apply_generic(
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_z,
+ const Coords &local_mask,
+ const Coords &local_x,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &z_vector,
+ const Vector< MaskType, nonblocking, Coords > &mask_vector,
+ const internal::Wrapper< left_scalar, InputType1, Coords > x_wrapper,
+ const internal::Wrapper< right_scalar, InputType2, Coords > y_wrapper,
+ const OP &op,
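+ // when compiled for the boolean dispatcher, the identities are always
+ // supplied explicitly by the dispatcher, hence no default arguments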
+#ifdef GRB_BOOLEAN_DISPATCHER
+ const InputType1 * const left_identity,
+ const InputType2 * const right_identity
+#else
+ const InputType1 * const left_identity = nullptr,
+ const InputType2 * const right_identity = nullptr
+#endif
+ );
+
+ template<
+ bool left_scalar,
+ bool right_scalar,
+ bool left_sparse,
+ bool right_sparse,
+ Descriptor descr,
+ class OP,
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC boolean_dispatcher_masked_apply_generic(
+ const bool already_dense_mask,
+ const bool already_dense_input_x,
+ const bool already_dense_input_y,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_z,
+ const Coords &local_mask,
+ const Coords &local_x,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &z_vector,
+ const Vector< MaskType, nonblocking, Coords > &mask_vector,
+ const internal::Wrapper< left_scalar, InputType1, Coords > x_wrapper,
+ const internal::Wrapper< right_scalar, InputType2, Coords > y_wrapper,
+ const OP &op,
+ const InputType1 * const left_identity = nullptr,
+ const InputType2 * const right_identity = nullptr
+ ) {
+ if( already_dense_mask ) {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::masked_apply_generic<
+ left_scalar, right_scalar, left_sparse, right_sparse, descr, OP,
+ true, true, true
+ >(
+ lower_bound, upper_bound, local_z, local_mask, local_x, local_y,
+ z_vector, mask_vector, x_wrapper, y_wrapper,
+ op, left_identity, right_identity
+ );
+ } else {
+ return internal::masked_apply_generic<
+ left_scalar, right_scalar, left_sparse, right_sparse, descr, OP,
+ true, true, false
+ >(
+ lower_bound, upper_bound, local_z, local_mask, local_x, local_y,
+ z_vector, mask_vector, x_wrapper, y_wrapper,
+ op, left_identity, right_identity
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::masked_apply_generic<
+ left_scalar, right_scalar, left_sparse, right_sparse, descr, OP,
+ true, false, true
+ >(
+ lower_bound, upper_bound, local_z, local_mask, local_x, local_y,
+ z_vector, mask_vector, x_wrapper, y_wrapper,
+ op, left_identity, right_identity
+ );
+ } else {
+ return internal::masked_apply_generic<
+ left_scalar, right_scalar, left_sparse, right_sparse, descr, OP,
+ true, false, false
+ >(
+ lower_bound, upper_bound, local_z, local_mask, local_x, local_y,
+ z_vector, mask_vector, x_wrapper, y_wrapper,
+ op, left_identity, right_identity
+ );
+ }
+ }
+ } else {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::masked_apply_generic<
+ left_scalar, right_scalar, left_sparse, right_sparse, descr, OP,
+ false, true, true
+ >(
+ lower_bound, upper_bound, local_z, local_mask, local_x, local_y,
+ z_vector, mask_vector, x_wrapper, y_wrapper,
+ op, left_identity, right_identity
+ );
+ } else {
+ return internal::masked_apply_generic<
+ left_scalar, right_scalar, left_sparse, right_sparse, descr, OP,
+ false, true, false
+ >(
+ lower_bound, upper_bound, local_z, local_mask, local_x, local_y,
+ z_vector, mask_vector, x_wrapper, y_wrapper,
+ op, left_identity, right_identity
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::masked_apply_generic<
+ left_scalar, right_scalar, left_sparse, right_sparse, descr, OP,
+ false, false, true
+ >(
+ lower_bound, upper_bound, local_z, local_mask, local_x, local_y,
+ z_vector, mask_vector, x_wrapper, y_wrapper,
+ op, left_identity, right_identity
+ );
+ } else {
+ return internal::masked_apply_generic<
+ left_scalar, right_scalar, left_sparse, right_sparse, descr, OP,
+ false, false, false
+ >(
+ lower_bound, upper_bound, local_z, local_mask, local_x, local_y,
+ z_vector, mask_vector, x_wrapper, y_wrapper,
+ op, left_identity, right_identity
+ );
+ }
+ }
+ }
+ }
+
+ template<
+ Descriptor descr,
+ bool a_scalar,
+ bool x_scalar,
+ bool y_scalar,
+ bool y_zero,
+ bool already_dense_output,
+ bool already_dense_mask,
+ bool already_dense_input_a,
+ bool already_dense_input_x,
+ bool already_dense_input_y,
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename Coords,
+ class Ring
+ >
+ RC sparse_eWiseMulAdd_maskDriven(
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_z,
+ const Coords &local_m,
+ const Coords &local_a,
+ const Coords &local_x,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &z_vector,
+ const Vector< MaskType, nonblocking, Coords > &m_vector,
+ const internal::Wrapper< a_scalar, InputType1, Coords > &a_wrapper,
+ const internal::Wrapper< x_scalar, InputType2, Coords > &x_wrapper,
+ const internal::Wrapper< y_scalar, InputType3, Coords > &y_wrapper,
+ const Ring &ring
+ );
+
+ template<
+ Descriptor descr,
+ bool a_scalar,
+ bool x_scalar,
+ bool y_scalar,
+ bool y_zero,
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename Coords,
+ class Ring
+ >
+ RC boolean_dispatcher_sparse_eWiseMulAdd_maskDriven(
+ const bool already_dense_output,
+ const bool already_dense_mask,
+ const bool already_dense_input_a,
+ const bool already_dense_input_x,
+ const bool already_dense_input_y,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_z,
+ const Coords &local_m,
+ const Coords &local_a,
+ const Coords &local_x,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &z_vector,
+ const Vector< MaskType, nonblocking, Coords > &m_vector,
+ const internal::Wrapper< a_scalar, InputType1, Coords > &a_wrapper,
+ const internal::Wrapper< x_scalar, InputType2, Coords > &x_wrapper,
+ const internal::Wrapper< y_scalar, InputType3, Coords > &y_wrapper,
+ const Ring &ring
+ ) {
+ if( already_dense_output ) {
+ if( already_dense_mask ) {
+ if( already_dense_input_a ) {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ true, true, true, true, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ true, true, true, true, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ true, true, true, false, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ true, true, true, false, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ }
+ }
+ } else {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ true, true, false, true, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ true, true, false, true, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ true, true, false, false, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ true, true, false, false, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ }
+ }
+ }
+ } else {
+ if( already_dense_input_a ) {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ true, false, true, true, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ true, false, true, true, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ true, false, true, false, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ true, false, true, false, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ }
+ }
+ } else {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ true, false, false, true, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ true, false, false, true, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ true, false, false, false, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ true, false, false, false, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ }
+ }
+ }
+ }
+ } else {
+ if( already_dense_mask ) {
+ if( already_dense_input_a ) {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ false, true, true, true, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ false, true, true, true, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ false, true, true, false, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ false, true, true, false, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ }
+ }
+ } else {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ false, true, false, true, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ false, true, false, true, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ false, true, false, false, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ false, true, false, false, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ }
+ }
+ }
+ } else {
+ if( already_dense_input_a ) {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ false, false, true, true, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ false, false, true, true, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ false, false, true, false, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ false, false, true, false, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ }
+ }
+ } else {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ false, false, false, true, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ false, false, false, true, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ false, false, false, false, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::sparse_eWiseMulAdd_maskDriven<
+ descr, a_scalar, x_scalar, y_scalar, y_zero,
+ false, false, false, false, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring
+ );
+ }
+ }
+ }
+ }
+ }
+ }
+
+ template<
+ Descriptor descr,
+ bool masked,
+ bool x_scalar,
+ bool y_scalar,
+ bool y_zero,
+ bool mulSwitched,
+ bool already_dense_output,
+ bool already_dense_mask,
+ bool already_dense_input_a,
+ bool already_dense_input_x,
+ bool already_dense_input_y,
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename Coords,
+ class Ring
+ >
+ RC twoPhase_sparse_eWiseMulAdd_mulDriven(
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_z,
+ const Coords * const local_m,
+ const Coords &local_a,
+ const Coords &local_x,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &z_vector,
+ const Vector< MaskType, nonblocking, Coords > * const m_vector,
+ const Vector< InputType1, nonblocking, Coords > &a_vector,
+ const internal::Wrapper< x_scalar, InputType2, Coords > &x_wrapper,
+ const internal::Wrapper< y_scalar, InputType3, Coords > &y_wrapper,
+ const Ring &ring
+ );
+
+ template<
+ Descriptor descr,
+ bool masked,
+ bool x_scalar,
+ bool y_scalar,
+ bool y_zero,
+ bool mulSwitched,
+ typename OutputType,
+ typename MaskType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename Coords,
+ class Ring
+ >
+ RC boolean_dispatcher_twoPhase_sparse_eWiseMulAdd_mulDriven(
+ const bool already_dense_output,
+ const bool already_dense_mask,
+ const bool already_dense_input_a,
+ const bool already_dense_input_x,
+ const bool already_dense_input_y,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_z,
+ const Coords * const local_m,
+ const Coords &local_a,
+ const Coords &local_x,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &z_vector,
+ const Vector< MaskType, nonblocking, Coords > * const m_vector,
+ const Vector< InputType1, nonblocking, Coords > &a_vector,
+ const internal::Wrapper< x_scalar, InputType2, Coords > &x_wrapper,
+ const internal::Wrapper< y_scalar, InputType3, Coords > &y_wrapper,
+ const Ring &ring = Ring()
+ ) {
+ if( already_dense_output ) {
+ if( already_dense_mask ) {
+ if( already_dense_input_a ) {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ true, true, true, true, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ true, true, true, true, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ true, true, true, false, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ true, true, true, false, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ }
+ }
+ } else {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ true, true, false, true, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ true, true, false, true, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ true, true, false, false, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ true, true, false, false, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ }
+ }
+ }
+ } else {
+ if( already_dense_input_a ) {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ true, false, true, true, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ true, false, true, true, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ true, false, true, false, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ true, false, true, false, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ }
+ }
+ } else {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ true, false, false, true, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ true, false, false, true, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ true, false, false, false, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ true, false, false, false, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ }
+ }
+ }
+ }
+ } else {
+ if( already_dense_mask ) {
+ if( already_dense_input_a ) {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ false, true, true, true, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ false, true, true, true, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ false, true, true, false, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ false, true, true, false, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ }
+ }
+ } else {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ false, true, false, true, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ false, true, false, true, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ false, true, false, false, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ false, true, false, false, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ }
+ }
+ }
+ } else {
+ if( already_dense_input_a ) {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ false, false, true, true, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ false, false, true, true, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ false, false, true, false, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ false, false, true, false, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ }
+ }
+ } else {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ false, false, false, true, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ false, false, false, true, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ false, false, false, false, true
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ } else {
+ return internal::twoPhase_sparse_eWiseMulAdd_mulDriven<
+ descr, masked, x_scalar, y_scalar, y_zero, mulSwitched,
+ false, false, false, false, false
+ >(
+ lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y,
+ z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring
+ );
+ }
+ }
+ }
+ }
+ }
+ }
+
+ template<
+ Descriptor descr,
+ bool already_dense_input_x,
+ bool already_dense_input_y,
+ class AddMonoid,
+ class AnyOp,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC sparse_dot_generic(
+ typename AddMonoid::D3 &thread_local_output,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ const Coords &local_x,
+ const Coords &local_y,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const size_t local_nz,
+ const AddMonoid &addMonoid,
+ const AnyOp &anyOp
+ );
+
+ template<
+ Descriptor descr,
+ class AddMonoid,
+ class AnyOp,
+ typename InputType1,
+ typename InputType2,
+ typename Coords
+ >
+ RC boolean_dispatcher_sparse_dot_generic(
+ const bool already_dense_input_x,
+ const bool already_dense_input_y,
+ typename AddMonoid::D3 &thread_local_output,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ const Coords &local_x,
+ const Coords &local_y,
+ const Vector< InputType1, nonblocking, Coords > &x,
+ const Vector< InputType2, nonblocking, Coords > &y,
+ const size_t local_nz,
+ const AddMonoid &addMonoid,
+ const AnyOp &anyOp
+ ) {
+ if( already_dense_input_x ) {
+ if( already_dense_input_y ) {
+ return internal::sparse_dot_generic<
+ descr, true, true
+ >(
+ thread_local_output, lower_bound, upper_bound, local_x, local_y,
+ x, y, local_nz, addMonoid, anyOp
+ );
+ } else {
+ return internal::sparse_dot_generic<
+ descr, true, false
+ >(
+ thread_local_output, lower_bound, upper_bound, local_x, local_y,
+ x, y, local_nz, addMonoid, anyOp
+ );
+ }
+ } else {
+ if( already_dense_input_y ) {
+ return internal::sparse_dot_generic<
+ descr, false, true
+ >(
+ thread_local_output, lower_bound, upper_bound, local_x, local_y,
+ x, y, local_nz, addMonoid, anyOp
+ );
+ } else {
+ return internal::sparse_dot_generic<
+ descr, false, false
+ >(
+ thread_local_output, lower_bound, upper_bound, local_x, local_y,
+ x, y, local_nz, addMonoid, anyOp
+ );
+ }
+ }
+ }
+
+ } // end namespace ``internal''
+
+} // end namespace ``grb''
+
+#endif
+
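The boolean_dispatcher_* routines above all follow one pattern: run-time booleans (for instance, whether a given coordinates structure is already dense) are translated into compile-time template arguments by exhaustive branching, so that the underlying kernels can specialise on each flag combination. Below is a minimal, self-contained sketch of the pattern, assuming a hypothetical one-flag kernel; none of these names are part of the ALP sources.

```cpp
#include <iostream>

// hypothetical kernel templated on one compile-time flag; the dispatchers in
// this file forward several such flags per primitive
template< bool already_dense >
int kernel( const int x ) {
	// this branch is resolved at compile time within each instantiation
	return already_dense ? x : 2 * x;
}

// run-time boolean to compile-time template argument, mirroring the
// structure of boolean_dispatcher_sparse_dot_generic above
int boolean_dispatcher_kernel( const bool already_dense, const int x ) {
	if( already_dense ) {
		return kernel< true >( x );
	} else {
		return kernel< false >( x );
	}
}

int main() {
	std::cout << boolean_dispatcher_kernel( true, 3 ) << " "
		<< boolean_dispatcher_kernel( false, 3 ) << std::endl;
	return 0;
}
```

Each added flag doubles the number of instantiations, which is why a dispatcher over five density flags needs thirty-two leaf calls in total across its nesting.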
diff --git a/include/graphblas/nonblocking/boolean_dispatcher_blas2.hpp b/include/graphblas/nonblocking/boolean_dispatcher_blas2.hpp
new file mode 100644
index 000000000..9897a2b0d
--- /dev/null
+++ b/include/graphblas/nonblocking/boolean_dispatcher_blas2.hpp
@@ -0,0 +1,190 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Dispatchers for the level-2 primitives
+ *
+ * @author Aristeidis Mastoras
+ * @date 24th of October, 2022
+ */
+
+#ifndef _H_GRB_NONBLOCKING_BOOLEAN_DISPATCHER_BLAS2
+#define _H_GRB_NONBLOCKING_BOOLEAN_DISPATCHER_BLAS2
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "coordinates.hpp"
+#include "vector.hpp"
+#include "lazy_evaluation.hpp"
+
+
+namespace grb {
+
+ namespace internal {
+
+ template<
+ Descriptor descr,
+ bool masked,
+ bool input_masked,
+ bool left_handed,
+ template< typename > class One,
+ bool already_dense_destination_vector,
+ bool already_dense_mask_vector,
+ class AdditiveMonoid,
+ class Multiplication,
+ typename IOType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename InputType4,
+ typename Coords,
+ typename RowColType,
+ typename NonzeroType
+ >
+ inline void vxm_inner_kernel_gather(
+ RC &rc,
+ const size_t lower_bound,
+ Coords &local_destination_vector,
+ const Coords &local_mask_vector,
+ Vector< IOType, nonblocking, Coords > &destination_vector,
+ IOType &destination_element,
+ const size_t &destination_index,
+ const Vector< InputType1, nonblocking, Coords > &source_vector,
+ const InputType1 * __restrict__ const &source,
+ const size_t &source_range,
+ const internal::Compressed_Storage<
+ InputType2, RowColType, NonzeroType
+ > &matrix,
+ const Vector< InputType3, nonblocking, Coords > &mask_vector,
+ const InputType3 * __restrict__ const &mask,
+ const Vector< InputType4, nonblocking, Coords > &source_mask_vector,
+ const InputType4 * __restrict__ const &source_mask,
+ const AdditiveMonoid &add,
+ const Multiplication &mul,
+ const std::function< size_t( size_t ) > &src_local_to_global,
+ const std::function< size_t( size_t ) > &src_global_to_local,
+ const std::function< size_t( size_t ) > &dst_local_to_global
+ );
+
+ template<
+ Descriptor descr,
+ bool masked,
+ bool input_masked,
+ bool left_handed,
+ template< typename > class One,
+ class AdditiveMonoid,
+ class Multiplication,
+ typename IOType,
+ typename InputType1,
+ typename InputType2,
+ typename InputType3,
+ typename InputType4,
+ typename Coords,
+ typename RowColType,
+ typename NonzeroType
+ >
+ inline void boolean_dispatcher_vxm_inner_kernel_gather(
+ const bool already_dense_destination_vector,
+ const bool already_dense_mask_vector,
+ RC &rc,
+ const size_t lower_bound,
+ Coords &local_destination_vector,
+ const Coords &local_mask_vector,
+ Vector< IOType, nonblocking, Coords > &destination_vector,
+ IOType &destination_element,
+ const size_t &destination_index,
+ const Vector< InputType1, nonblocking, Coords > &source_vector,
+ const InputType1 * __restrict__ const &source,
+ const size_t &source_range,
+ const internal::Compressed_Storage<
+ InputType2, RowColType, NonzeroType
+ > &matrix,
+ const Vector< InputType3, nonblocking, Coords > &mask_vector,
+ const InputType3 * __restrict__ const &mask,
+ const Vector< InputType4, nonblocking, Coords > &source_mask_vector,
+ const InputType4 * __restrict__ const &source_mask,
+ const AdditiveMonoid &add,
+ const Multiplication &mul,
+ const std::function< size_t( size_t ) > &src_local_to_global,
+ const std::function< size_t( size_t ) > &src_global_to_local,
+ const std::function< size_t( size_t ) > &dst_local_to_global
+ ) {
+ if( already_dense_destination_vector ) {
+ if( already_dense_mask_vector ) {
+ return internal::vxm_inner_kernel_gather<
+ descr, masked, input_masked, left_handed, One,
+ true, true
+ >(
+ rc, lower_bound, local_destination_vector, local_mask_vector,
+ destination_vector, destination_element, destination_index,
+ source_vector, source, source_range, matrix, mask_vector, mask,
+ source_mask_vector, source_mask, add, mul,
+ src_local_to_global, src_global_to_local, dst_local_to_global
+ );
+ } else {
+ return internal::vxm_inner_kernel_gather<
+ descr, masked, input_masked, left_handed, One,
+ true, false
+ >(
+ rc, lower_bound, local_destination_vector, local_mask_vector,
+ destination_vector, destination_element, destination_index,
+ source_vector, source, source_range, matrix, mask_vector, mask,
+ source_mask_vector, source_mask, add, mul,
+ src_local_to_global, src_global_to_local, dst_local_to_global
+ );
+ }
+ } else {
+ if( already_dense_mask_vector ) {
+ return internal::vxm_inner_kernel_gather<
+ descr, masked, input_masked, left_handed, One,
+ false, true
+ >(
+ rc, lower_bound, local_destination_vector, local_mask_vector,
+ destination_vector, destination_element, destination_index,
+ source_vector, source, source_range, matrix, mask_vector, mask,
+ source_mask_vector, source_mask, add, mul,
+ src_local_to_global, src_global_to_local, dst_local_to_global
+ );
+ } else {
+ return internal::vxm_inner_kernel_gather<
+ descr, masked, input_masked, left_handed, One,
+ false, false
+ >(
+ rc, lower_bound, local_destination_vector, local_mask_vector,
+ destination_vector, destination_element, destination_index,
+ source_vector, source, source_range, matrix, mask_vector, mask,
+ source_mask_vector, source_mask, add, mul,
+ src_local_to_global, src_global_to_local, dst_local_to_global
+ );
+ }
+ }
+ }
+
+ } // end namespace ``internal''
+
+} // end namespace ``grb''
+
+#endif
+
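The gather kernel above also threads three std::function index maps (src_local_to_global, src_global_to_local, dst_local_to_global) through every call, so one kernel body can serve different index spaces. A hedged sketch of such translation functors, with illustrative values only; in the single-process nonblocking backend these maps are effectively identities:

```cpp
#include <cstddef>
#include <functional>
#include <iostream>

int main() {
	// single user process: no re-mapping, so the offset is zero; distributed
	// backends would substitute real distribution maps here
	const size_t offset = 0;
	const std::function< size_t( size_t ) > local_to_global =
		[ offset ]( const size_t i ) { return i + offset; };
	const std::function< size_t( size_t ) > global_to_local =
		[ offset ]( const size_t i ) { return i - offset; };

	// round-trip: local index 7 maps to global and back to local 7
	std::cout << local_to_global( 7 ) << " "
		<< global_to_local( local_to_global( 7 ) ) << std::endl;
	return 0;
}
```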
diff --git a/include/graphblas/nonblocking/boolean_dispatcher_io.hpp b/include/graphblas/nonblocking/boolean_dispatcher_io.hpp
new file mode 100644
index 000000000..528d2cf4c
--- /dev/null
+++ b/include/graphblas/nonblocking/boolean_dispatcher_io.hpp
@@ -0,0 +1,361 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Dispatchers for the nonblocking i/o primitives.
+ *
+ * @author Aristeidis Mastoras
+ * @date 24th of October, 2022
+ */
+
+#ifndef _H_GRB_NONBLOCKING_BOOLEAN_DISPATCHER_IO
+#define _H_GRB_NONBLOCKING_BOOLEAN_DISPATCHER_IO
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "coordinates.hpp"
+#include "vector.hpp"
+#include "lazy_evaluation.hpp"
+
+
+namespace grb {
+
+ namespace internal {
+
+ template<
+ Descriptor descr,
+ bool loop_over_vector_length,
+ bool already_dense_mask,
+ bool mask_is_dense,
+ typename DataType,
+ typename MaskType,
+ typename T,
+ typename Coords
+ >
+ RC masked_set(
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_x,
+ const Coords &local_mask,
+ Vector< DataType, nonblocking, Coords > &x,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const T val
+ );
+
+ template<
+ Descriptor descr,
+ typename DataType,
+ typename MaskType,
+ typename T,
+ typename Coords
+ >
+ RC boolean_dispatcher_masked_set(
+ const bool loop_over_vector_length,
+ const bool already_dense_mask,
+ const bool mask_is_dense,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_x,
+ const Coords &local_mask,
+ Vector< DataType, nonblocking, Coords > &x,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const T val
+ ) {
+ if( loop_over_vector_length ) {
+ if( already_dense_mask ) {
+ if( mask_is_dense ) {
+ return internal::masked_set<
+ descr, true, true, true
+ >( lower_bound, upper_bound, local_x, local_mask, x, m, val );
+ } else {
+ return internal::masked_set<
+ descr, true, true, false
+ >( lower_bound, upper_bound, local_x, local_mask, x, m, val );
+ }
+ } else {
+ if( mask_is_dense ) {
+ return internal::masked_set<
+ descr, true, false, true
+ >( lower_bound, upper_bound, local_x, local_mask, x, m, val );
+ } else {
+ return internal::masked_set<
+ descr, true, false, false
+ >( lower_bound, upper_bound, local_x, local_mask, x, m, val );
+ }
+ }
+ } else {
+ if( already_dense_mask ) {
+ if( mask_is_dense ) {
+ return internal::masked_set<
+ descr, false, true, true
+ >( lower_bound, upper_bound, local_x, local_mask, x, m, val );
+ } else {
+ return internal::masked_set<
+ descr, false, true, false
+ >( lower_bound, upper_bound, local_x, local_mask, x, m, val );
+ }
+ } else {
+ if( mask_is_dense ) {
+ return internal::masked_set<
+ descr, false, false, true
+ >( lower_bound, upper_bound, local_x, local_mask, x, m, val );
+ } else {
+ return internal::masked_set<
+ descr, false, false, false
+ >( lower_bound, upper_bound, local_x, local_mask, x, m, val );
+ }
+ }
+ }
+ }
+
+ template<
+ Descriptor descr,
+ bool out_is_void,
+ bool in_is_void,
+ bool sparse,
+ bool already_dense_vectors,
+ bool already_dense_input,
+ typename OutputType,
+ typename InputType,
+ typename Coords
+ >
+ RC set_generic(
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_x,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &x,
+ const Vector< InputType, nonblocking, Coords > &y
+ );
+
+ template< Descriptor descr,
+ bool out_is_void,
+ bool in_is_void,
+ bool sparse,
+ typename OutputType,
+ typename InputType,
+ typename Coords
+ >
+ RC boolean_dispatcher_set_generic(
+ const bool already_dense_vectors,
+ const bool already_dense_input,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_x,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &x,
+ const Vector< InputType, nonblocking, Coords > &y
+ ) {
+ if( already_dense_vectors ) {
+ if( already_dense_input ) {
+ return internal::set_generic<
+ descr, out_is_void, in_is_void, sparse,
+ true, true
+ >( lower_bound, upper_bound, local_x, local_y, x, y );
+ } else {
+ return internal::set_generic<
+ descr, out_is_void, in_is_void, sparse,
+ true, false
+ >( lower_bound, upper_bound, local_x, local_y, x, y );
+ }
+ } else {
+ if( already_dense_input ) {
+ return internal::set_generic<
+ descr, out_is_void, in_is_void, sparse,
+ false, true
+ >( lower_bound, upper_bound, local_x, local_y, x, y );
+ } else {
+ return internal::set_generic<
+ descr, out_is_void, in_is_void, sparse,
+ false, false
+ >( lower_bound, upper_bound, local_x, local_y, x, y );
+ }
+ }
+ }
+
+ template<
+ Descriptor descr,
+ bool out_is_void,
+ bool in_is_void,
+ bool loop_over_y,
+ bool already_dense_input_y,
+ bool already_dense_mask,
+ bool mask_is_dense,
+ typename OutputType,
+ typename MaskType,
+ typename InputType,
+ typename Coords
+ >
+ RC masked_set(
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_x,
+ const Coords &local_mask,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &x,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Vector< InputType, nonblocking, Coords > &y
+ );
+
+ template<
+ Descriptor descr,
+ bool out_is_void,
+ bool in_is_void,
+ typename OutputType,
+ typename MaskType,
+ typename InputType,
+ typename Coords
+ >
+ RC boolean_dispatcher_masked_set(
+ const bool loop_over_y,
+ const bool already_dense_input_y,
+ const bool already_dense_mask,
+ const bool mask_is_dense,
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_x,
+ const Coords &local_mask,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &x,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Vector< InputType, nonblocking, Coords > &y
+ ) {
+ if( loop_over_y ) {
+ if( already_dense_input_y ) {
+ if( already_dense_mask ) {
+ if( mask_is_dense ) {
+ return internal::masked_set<
+ descr, out_is_void, in_is_void,
+ true, true, true, true
+ >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y );
+ } else {
+ return internal::masked_set<
+ descr, out_is_void, in_is_void,
+ true, true, true, false
+ >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y );
+ }
+ } else {
+ if( mask_is_dense ) {
+ return internal::masked_set<
+ descr, out_is_void, in_is_void,
+ true, true, false, true
+ >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y );
+ } else {
+ return internal::masked_set<
+ descr, out_is_void, in_is_void,
+ true, true, false, false
+ >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y );
+ }
+ }
+ } else {
+ if( already_dense_mask ) {
+ if( mask_is_dense ) {
+ return internal::masked_set<
+ descr, out_is_void, in_is_void,
+ true, false, true, true
+ >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y );
+ } else {
+ return internal::masked_set<
+ descr, out_is_void, in_is_void,
+ true, false, true, false
+ >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y );
+ }
+ } else {
+ if( mask_is_dense ) {
+ return internal::masked_set<
+ descr, out_is_void, in_is_void,
+ true, false, false, true
+ >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y );
+ } else {
+ return internal::masked_set<
+ descr, out_is_void, in_is_void,
+ true, false, false, false
+ >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y );
+ }
+ }
+ }
+ } else {
+ if( already_dense_input_y ) {
+ if( already_dense_mask ) {
+ if( mask_is_dense ) {
+ return internal::masked_set<
+ descr, out_is_void, in_is_void,
+ false, true, true, true
+ >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y );
+ } else {
+ return internal::masked_set<
+ descr, out_is_void, in_is_void,
+ false, true, true, false
+ >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y );
+ }
+ } else {
+ if( mask_is_dense ) {
+ return internal::masked_set<
+ descr, out_is_void, in_is_void,
+ false, true, false, true
+ >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y );
+ } else {
+ return internal::masked_set<
+ descr, out_is_void, in_is_void,
+ false, true, false, false
+ >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y );
+ }
+ }
+ } else {
+ if( already_dense_mask ) {
+ if( mask_is_dense ) {
+ return internal::masked_set<
+ descr, out_is_void, in_is_void,
+ false, false, true, true
+ >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y );
+ } else {
+ return internal::masked_set<
+ descr, out_is_void, in_is_void,
+ false, false, true, false
+ >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y );
+ }
+ } else {
+ if( mask_is_dense ) {
+ return internal::masked_set<
+ descr, out_is_void, in_is_void,
+ false, false, false, true
+ >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y );
+ } else {
+ return internal::masked_set<
+ descr, out_is_void, in_is_void,
+ false, false, false, false
+ >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y );
+ }
+ }
+ }
+ }
+ }
+
+ } // end namespace ``internal''
+
+} // end namespace ``grb''
+
+#endif
+
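Why go through this much branching at all? The dispatch happens once per pipeline stage rather than once per element: with the boolean lifted to a template parameter, the compiler can fold the flag check out of the hot loop in every instantiation. A small stand-alone illustration of the effect, using hypothetical names:

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

// with use_mask as a template parameter, the condition below folds to a
// constant inside each instantiation and the per-element branch disappears
template< bool use_mask >
size_t count_entries( const std::vector< bool > &mask ) {
	size_t count = 0;
	for( size_t i = 0; i < mask.size(); ++i ) {
		if( !use_mask || mask[ i ] ) {
			++count;
		}
	}
	return count;
}

int main() {
	const std::vector< bool > mask = { true, false, true, true };
	const bool use_mask = true; // run-time value, e.g., derived from a descriptor
	const size_t out = use_mask
		? count_entries< true >( mask )
		: count_entries< false >( mask );
	std::cout << out << std::endl; // prints 3
	return 0;
}
```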
diff --git a/include/graphblas/nonblocking/collectives.hpp b/include/graphblas/nonblocking/collectives.hpp
new file mode 100644
index 000000000..b78c6a6e9
--- /dev/null
+++ b/include/graphblas/nonblocking/collectives.hpp
@@ -0,0 +1,91 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Collectives implementation for the nonblocking backend.
+ *
+ * @author Aristeidis Mastoras
+ * @date 16th of May, 2022
+ */
+
+#ifndef _H_GRB_NONBLOCKING_COLL
+#define _H_GRB_NONBLOCKING_COLL
+
+#include
+
+#include
+#include
+#include
+#include
+
+
+namespace grb {
+
+ /** The collectives class is based on that of the reference backend */
+ template<>
+ class collectives< nonblocking > {
+
+ private:
+
+ /** Disallow instantiation of this class. */
+ collectives() {}
+
+
+ public:
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Operator, typename IOType
+ >
+ static RC allreduce( IOType &inout, const Operator op = Operator() ) {
+ return collectives< reference >::allreduce< descr, Operator, IOType >(
+ inout, op );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ class Operator, typename IOType
+ >
+ static RC reduce(
+ IOType &inout, const size_t root = 0, const Operator op = Operator()
+ ) {
+ return collectives< reference >::reduce< descr, Operator, IOType >( inout,
+ root, op );
+ }
+
+ template< typename IOType >
+ static RC broadcast( IOType &inout, const size_t root = 0 ) {
+ return collectives< reference >::broadcast< IOType >( inout, root );
+ }
+
+ template< Descriptor descr = descriptors::no_operation, typename IOType >
+ static RC broadcast(
+ IOType * inout, const size_t size,
+ const size_t root = 0
+ ) {
+ return collectives< reference >::broadcast< descr, IOType >( inout, size,
+ root );
+ }
+
+ }; // end class ``collectives< nonblocking >''
+
+} // namespace grb
+
+#endif // end ``_H_GRB_NONBLOCKING_COLL''
+
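Since every method of collectives< nonblocking > simply forwards to the reference backend, user code is unchanged with respect to the reference case. A usage sketch, assuming an ALP installation that provides graphblas.hpp; the operator choice is illustrative:

```cpp
#include <graphblas.hpp>

// sum a scalar over all user processes; the nonblocking backend supports
// exactly one process, so this is a no-op in effect, but the call still
// forwards to the reference implementation
grb::RC sum_over_processes( double &inout ) {
	return grb::collectives< grb::nonblocking >::allreduce<
		grb::descriptors::no_operation,
		grb::operators::add< double >
	>( inout );
}
```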
diff --git a/include/graphblas/nonblocking/config.hpp b/include/graphblas/nonblocking/config.hpp
new file mode 100644
index 000000000..1ea6e4ab3
--- /dev/null
+++ b/include/graphblas/nonblocking/config.hpp
@@ -0,0 +1,205 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Configuration settings for the nonblocking backend.
+ *
+ * @author Aristeidis Mastoras
+ * @date 16th of May, 2022
+ */
+
+#ifndef _H_GRB_NONBLOCKING_CONFIG
+#define _H_GRB_NONBLOCKING_CONFIG
+
+#include
+#include
+
+
+namespace grb {
+
+ /**
+ * \defgroup nonblockingConfig Nonblocking backend configuration
+ *
+ * \ingroup config
+ *
+ * All configuration parameters for the #grb::nonblocking backend.
+ *
+ * @{
+ */
+
+ namespace config {
+
+ /**
+ * Configuration parameters relating to the pipeline data structure.
+ *
+ * \ingroup nonblockingConfig
+ */
+ class PIPELINE {
+
+ public:
+
+ /**
+ * How many independent pipelines any ALP algorithm may concurrently expose.
+ *
+ * The number of pipelines could exceed this maximum number. If this
+ * happens, and if #grb::config::PIPELINE::warn_if_exceeded is configured
+ * true, a warning will be output to the standard error stream.
+ */
+ static constexpr const size_t max_pipelines = 4;
+
+ /**
+ * Pipelines are constructed with default space for this many containers.
+ *
+ * The default is such that each underlying set used by the pipeline
+ * representation takes less than one kB space.
+ *
+ * Pipelines could exceed this maximum number of containers. If this
+ * happens, and if #grb::config::PIPELINE::warn_if_exceeded is configured
+ * true, a warning will be output to the standard error stream.
+ */
+ static constexpr const size_t max_containers = 16;
+
+ /**
+ * Pipelines are constructed with default space for this many stages.
+ *
+ * Pipelines could exceed this number of stages. If this happens, and if
+ * #grb::config::PIPELINE::warn_if_exceeded is configured true, a
+ * warning will be output to the standard error stream.
+ */
+ static constexpr const size_t max_depth = 16;
+
+ /**
+ * Pipelines are constructed with default space for this many tiles.
+ *
+ * Pipelines could exceed this number of tiles. If this happens, and if
+ * #grb::config::PIPELINE::warn_if_exceeded is configured true, a
+ * warning will be output to the standard error stream.
+ */
+ static constexpr const size_t max_tiles = 1 << 16;
+
+ /**
+ * Emit a warning to the standard error stream if the default pipeline
+ * capacities are exceeded.
+ */
+ static constexpr const bool warn_if_exceeded = true;
+
+ /**
+ * When true, calling a fake nonblocking primitive for the first time
+ * will emit a warning to the standard error stream.
+ */
+ static constexpr const bool warn_if_not_native = true;
+
+ };
+
+ /**
+ * Configuration parameters relating to the analytic model employed by the
+ * nonblocking backend.
+ *
+ * \ingroup nonblockingConfig
+ */
+ class ANALYTIC_MODEL {
+
+ public:
+
+ /**
+ * The minimum tile size that may be automatically selected by the analytic
+ * model.
+ *
+ * A tile size that is set manually may be smaller than MIN_TILE_SIZE.
+ */
+ static constexpr const size_t MIN_TILE_SIZE = 512;
+
+ /**
+ * The L1 cache size is assumed to be a bit smaller than the actual size to
+ * account for any data that may be stored in the cache but are not
+ * considered by the analytic model, e.g., matrices in the current design.
+ */
+ static constexpr const double L1_CACHE_USAGE_PERCENTAGE = 0.98;
+
+ };
+
+ /**
+ * Implementation-dependent configuration parameters for the \a nonblocking
+ * backend.
+ *
+ * \note The user documentation only specifies the fields that under some
+ * circumstances may benefit from a user adapting it. For viewing all
+ * fields, please see the developer documentation.
+ *
+ * \note Adapting the fields should be done with care and may require
+ * re-compilation and re-installation of the ALP framework.
+ *
+ * \ingroup nonblockingConfig
+ *
+ * @see grb::config::IMPLEMENTATION
+ */
+ template<>
+ class IMPLEMENTATION< nonblocking > {
+
+ public:
+
+ /**
+ * A private memory segment shall never be accessed by threads other than
+ * the thread that allocates it. Therefore we choose aligned mode here.
+ */
+ static constexpr ALLOC_MODE defaultAllocMode() {
+ return ALLOC_MODE::ALIGNED;
+ }
+
+ /**
+ * For the nonblocking backend, a shared memory segment should use
+ * interleaved alloc so that any thread has uniform access on average.
+ */
+ static constexpr ALLOC_MODE sharedAllocMode() {
+ return ALLOC_MODE::INTERLEAVED;
+ }
+
+ /**
+ * \internal
+ * By default, use the coordinates of the selected backend.
+ *
+ * \note This is an extension that may, at some later stage, be used for
+ * composability with the #grb::bsp1d and #grb::hybrid backends.
+ * \endinternal
+ */
+ static constexpr Backend coordinatesBackend() {
+ return nonblocking;
+ }
+
+ /**
+ * \internal
+ * Whether the backend has vector capacities always fixed to their
+ * defaults.
+ * \endinternal
+ */
+ static constexpr bool fixedVectorCapacities() {
+ return true;
+ }
+
+ };
+
+ } // namespace config
+
+ /** @} */
+
+} // namespace grb
+
+#endif // end ``_H_GRB_NONBLOCKING_CONFIG''
+
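For illustration, a sketch of how an automatic tile-size selection might honour MIN_TILE_SIZE; note the constant only bounds automatic selection since, as documented above, manually set tile sizes may be smaller. The demo struct mirrors the constants above and is not the ALP class itself:

```cpp
#include <cstddef>
#include <iostream>

// demo constants mirroring grb::config::ANALYTIC_MODEL above
struct AnalyticModelDemo {
	static constexpr size_t MIN_TILE_SIZE = 512;
	static constexpr double L1_CACHE_USAGE_PERCENTAGE = 0.98;
};

// clamp an automatically selected tile size against the configured minimum;
// manually selected tile sizes bypass this clamp, per the documentation above
constexpr size_t effectiveTileSize( const size_t automatic_choice ) {
	return automatic_choice < AnalyticModelDemo::MIN_TILE_SIZE
		? AnalyticModelDemo::MIN_TILE_SIZE
		: automatic_choice;
}

int main() {
	static_assert( effectiveTileSize( 100 ) == 512, "clamped to the minimum" );
	// usable L1 budget under the configured percentage, for a 32 KiB cache
	const double budget = 32768 * AnalyticModelDemo::L1_CACHE_USAGE_PERCENTAGE;
	std::cout << effectiveTileSize( 100000 ) << " " << budget << std::endl;
	return 0;
}
```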
diff --git a/include/graphblas/nonblocking/coordinates.hpp b/include/graphblas/nonblocking/coordinates.hpp
new file mode 100644
index 000000000..bcb4cf42a
--- /dev/null
+++ b/include/graphblas/nonblocking/coordinates.hpp
@@ -0,0 +1,701 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Coordinates for the nonblocking backend
+ *
+ * @author Aristeidis Mastoras
+ * @date 16th of May, 2022
+ */
+
+#ifndef _H_GRB_NONBLOCKING_COORDINATES
+#define _H_GRB_NONBLOCKING_COORDINATES
+
+#include <stdexcept> //std::runtime_error
+#include
+#if defined _DEBUG && ! defined NDEBUG
+ #include
+#endif
+
+#include <cstddef> //size_t
+#include
+
+#include
+#include
+#include
+
+#include
+
+#include
+
+#include
+
+#include
+#include
+
+
+namespace grb {
+
+ namespace internal {
+
+ /**
+ * The Coordinates class is based on that of the reference backend.
+ * A set of new methods is added to handle local coordinates used
+ * by the nonblocking backend. The bufferSize method used by the
+ * Matrix class relies on parbufSize and prefixbufSize that have
+ * their own implementation for the nonblocking backend.
+ */
+ template<>
+ class Coordinates< nonblocking > {
+
+ public:
+
+ typedef typename config::VectorIndexType StackType;
+
+ typedef bool ArrayType;
+
+
+ private:
+
+ bool * __restrict__ _assigned;
+
+ StackType * __restrict__ _stack;
+
+ StackType * __restrict__ _buffer;
+
+ size_t _n;
+
+ size_t _cap;
+
+ size_t _buf;
+
+ // pointers to the data of the local coordinates mechanism
+ std::vector< config::VectorIndexType * > local_buffer;
+ config::VectorIndexType * __restrict__ local_new_nnzs;
+ config::VectorIndexType * __restrict__ pref_sum;
+
+ // the analytic model used during the execution of a pipeline
+ AnalyticModel analytic_model;
+
+
+ public:
+
+ static inline size_t arraySize( const size_t dim ) noexcept {
+ if( dim == 0 ) {
+ return 0;
+ }
+ return ( dim + 1 ) * sizeof( ArrayType );
+ }
+
+ static inline size_t stackSize( const size_t dim ) noexcept {
+ if( dim == 0 ) {
+ return 0;
+ }
+ return ( dim + 1 ) * sizeof( StackType );
+ }
+
+ static inline size_t prefixbufSize() noexcept {
+ int P = 1;
+ return ( P + 1 ) * sizeof( StackType );
+ }
+
+ static inline size_t parbufSize( const size_t n ) noexcept {
+ return internal::NONBLOCKING::vectorBufferSize( n ) * sizeof( StackType );
+ }
+
+ static inline size_t bufferSize( const size_t dim ) noexcept {
+ size_t ret = stackSize( dim );
+ ret += parbufSize( dim );
+ ret += prefixbufSize();
+ return ret;
+ }
+
+ inline Coordinates() noexcept :
+ _assigned( nullptr ), _stack( nullptr ), _buffer( nullptr ),
+ _n( 0 ), _cap( 0 ), _buf( 0 )
+ {}
+
+ inline Coordinates( Coordinates< nonblocking > &&x ) noexcept :
+ _assigned( x._assigned ), _stack( x._stack ), _buffer( x._buffer ),
+ _n( x._n ), _cap( x._cap ), _buf( x._buf )
+ {
+ x._assigned = nullptr;
+ x._stack = nullptr;
+ x._buffer = nullptr;
+ x._n = x._cap = x._buf = 0;
+ }
+
+ inline Coordinates( const Coordinates< nonblocking > &x ) noexcept :
+ _assigned( x._assigned ), _stack( x._stack ), _buffer( x._buffer ),
+ _n( x._n ), _cap( x._cap ), _buf( x._buf )
+ {
+ assert( this != &x );
+ }
+
+ inline Coordinates< nonblocking > & operator=(
+ const Coordinates< nonblocking > &other
+ ) {
+ Coordinates replace( other );
+ *this = std::move( replace );
+ return *this;
+ }
+
+ inline Coordinates< nonblocking > & operator=(
+ Coordinates< nonblocking > &&x
+ ) noexcept {
+ assert( this != &x );
+ _assigned = x._assigned;
+ _stack = x._stack;
+ _buffer = x._buffer;
+ _n = x._n;
+ _cap = x._cap;
+ _buf = x._buf;
+ x._assigned = nullptr;
+ x._stack = x._buffer = nullptr;
+ x._n = x._cap = x._buf = 0;
+ return *this;
+ }
+
+ inline ~Coordinates() noexcept {
+ // done (the #_assigned and #_stack memory
+ // blocks are not managed by this class)
+ }
+
+ void set(
+ void * const arr, bool arr_initialized,
+ void * const buf, const size_t dim, bool parallel = true
+ ) noexcept {
+ // catch trivial case
+ if( arr == nullptr || buf == nullptr ) {
+ assert( arr == nullptr );
+ assert( buf == nullptr );
+ assert( dim == 0 );
+ _assigned = nullptr;
+ _stack = nullptr;
+ _buffer = nullptr;
+ _n = 0;
+ _cap = 0;
+ _buf = 0;
+ return;
+ }
+
+ // _assigned has no alignment issues, take directly from input buffer
+ assert( reinterpret_cast< uintptr_t >( arr ) % sizeof( bool ) == 0 );
+ _assigned = static_cast< bool * >( arr );
+ // ...but _stack does have potential alignment issues:
+ char * buf_raw = static_cast< char * >( buf );
+ constexpr const size_t size = sizeof( StackType );
+ const size_t mod = reinterpret_cast< uintptr_t >( buf_raw ) % size;
+ if( mod != 0 ) {
+ buf_raw += size - mod;
+ }
+ _stack = reinterpret_cast< StackType * >( buf_raw );
+ // no alignment issues between stack and buffer, so just shift by dim:
+ _buffer = _stack + dim;
+ // initialise
+ _n = 0;
+ _cap = dim;
+ _buf = internal::NONBLOCKING::vectorBufferSize( _cap );
+
+ // and initialise _assigned (but only if necessary)
+ if( dim > 0 && !arr_initialized ) {
+ if( parallel ) {
+ #pragma omp parallel
+ {
+ size_t start, end;
+ config::OMP::localRange( start, end, 0, dim );
+ for( size_t i = start; i < end; ++i ) {
+ _assigned[ i ] = false;
+ }
+ }
+ } else {
+ for( size_t i = 0; i < dim; ++i ) {
+ _assigned[ i ] = false;
+ }
+ }
+ }
+ }
+
+ inline bool assign( const size_t i ) noexcept {
+ if( _n == _cap ) {
+ return true;
+ }
+ if( !_assigned[ i ] ) {
+ _assigned[ i ] = true;
+ const size_t newSize = _n + 1;
+ assert( _n <= _cap );
+ assert( newSize <= _cap );
+ _stack[ _n ] = i;
+ _n = newSize;
+ return false;
+ } else {
+ return true;
+ }
+ }
+
+ template< bool maybe_invalid = false >
+ inline void local_assignAll( ) noexcept {
+ if( maybe_invalid || _n != _cap ) {
+ if( _assigned != nullptr ) {
+ assert( _stack != nullptr );
+ assert( maybe_invalid || _n < _cap );
+ assert( !maybe_invalid || _n <= _cap );
+ _n = _cap;
+
+ for( size_t i = 0; i < _n; ++i ) {
+ _assigned[ i ] = true;
+ _stack[ i ] = i;
+ }
+ }
+ }
+
+ // the counter of initial nonzeroes in the local stack is stored in the
+ // buffer immediately before the local stack
+ StackType * __restrict__ local_nnzs = _stack - 1;
+
+ // the counter for the local stack must be set to zero such that the number
+ // of new nonzeroes is set to _n by asyncJoinSubset, and joinSubset then
+ // updates the global stack based on the local_new_nnzs counter; the
+ // global stack has become empty and _assigned = false, so the local
+ // coordinates of this tile must be added to the global stack from scratch,
+ // regardless of whether this tile was already dense, as it is hard to
+ // know which part of the global stack contains the coordinates of this
+ // tile
+ *local_nnzs = 0;
+ }
+
+ template< bool maybe_invalid = false >
+ inline void local_assignAllNotAlreadyAssigned( ) noexcept {
+ if( maybe_invalid || _n != _cap ) {
+ if( _assigned != nullptr ) {
+ assert( _stack != nullptr );
+ assert( maybe_invalid || _n < _cap );
+ assert( !maybe_invalid || _n <= _cap );
+
+ // search for the elements that are not already assigned and add them to
+ // the local stack, such that joinSubset adds to the global stack only
+ // those elements that were not already assigned
+ for( size_t i = 0; i < _cap; ++i ) {
+ if( !_assigned[ i ] ) {
+ _assigned[ i ] = true;
+ _stack[ _n++ ] = i;
+ }
+ }
+
+ assert( _n == _cap );
+ }
+ }
+ }
+
+ inline void clear() noexcept {
+
+ if( _n == _cap ) {
+#ifndef NDEBUG
+ if( _assigned == nullptr && _cap > 0 ) {
+ const bool dense_coordinates_may_not_call_clear = false;
+ assert( dense_coordinates_may_not_call_clear );
+ }
+#endif
+
+ #pragma omp parallel for schedule( dynamic, config::CACHE_LINE_SIZE::value() )
+ for( size_t i = 0; i < _cap; ++i ) {
+ _assigned[ i ] = false;
+ }
+ } else {
+ if( _n < config::OMP::minLoopSize() ) {
+ for( size_t k = 0; k < _n; ++k ) {
+ _assigned[ _stack[ k ] ] = false;
+ }
+ } else {
+ #pragma omp parallel for schedule( dynamic, config::CACHE_LINE_SIZE::value() )
+ for( size_t k = 0; k < _n; ++k ) {
+ _assigned[ _stack[ k ] ] = false;
+ }
+ }
+ }
+ _n = 0;
+ }
+
+ inline void local_clear() noexcept {
+
+ if( _n == _cap ) {
+#ifndef NDEBUG
+ if( _assigned == nullptr && _cap > 0 ) {
+ const bool dense_coordinates_may_not_call_clear = false;
+ assert( dense_coordinates_may_not_call_clear );
+ }
+#endif
+
+ for( size_t i = 0; i < _cap; ++i ) {
+ _assigned[ i ] = false;
+ }
+ } else {
+ for( size_t k = 0; k < _n; ++k ) {
+ _assigned[ _stack[ k ] ] = false;
+ }
+ }
+ _n = 0;
+
+ // the counter of initial nonzeroes in the local stack is stored in the
+ // buffer immediately before the local stack
+ StackType * __restrict__ local_nnzs = _stack - 1;
+
+ // the counter for the local stack must be set to zero such that any newly
+ // assigned element will be written to the global stack
+ *local_nnzs = 0;
+ }
+
+ inline void reset_global_nnz_counter() noexcept {
+ _n = 0;
+ }
+
+ inline bool isEmpty() const noexcept {
+ return _n == 0;
+ }
+
+ inline bool isDense() const noexcept {
+ return _n == _cap;
+ }
+
+ inline size_t size() const noexcept {
+ return _cap;
+ }
+
+ inline bool assigned( const size_t i ) const noexcept {
+ assert( i < _cap );
+ return _n == _cap || _assigned[ i ];
+ }
+
+ template< Descriptor descr, typename T >
+ inline bool mask( const size_t i, const T * const val ) const noexcept {
+ assert( i < _cap );
+ return utils::interpretMask< descr >( assigned( i ), val, i );
+ }
+
+ inline size_t nonzeroes() const noexcept {
+ assert( _n <= _cap );
+ return _n;
+ }
+
+ inline size_t index( const size_t k ) const noexcept {
+ assert( k < _n );
+ return isDense() ? k : _stack[ k ];
+ }
+
+ void localCoordinatesInit( const AnalyticModel &am ) {
+
+ analytic_model = am;
+
+ const size_t nthreads = analytic_model.getNumThreads();
+ const size_t tile_size = analytic_model.getTileSize();
+ const size_t num_tiles = analytic_model.getNumTiles();
+
+ assert( num_tiles > 0 );
+ assert( num_tiles <= internal::NONBLOCKING::maxBufferTiles( _cap ) );
+ assert( _buf >= 4 * num_tiles );
+
+ local_buffer.resize( analytic_model.getNumTiles() );
+
+ #pragma omp parallel for schedule(dynamic) num_threads(nthreads)
+ for( size_t tile_id = 0; tile_id < num_tiles; ++tile_id ) {
+ local_buffer[ tile_id ] = _buffer + tile_id * ( tile_size + 1 );
+ }
+
+ local_new_nnzs = _buffer + num_tiles * ( tile_size + 1 );
+ pref_sum = _buffer + num_tiles * ( tile_size + 2 );
+ }
+
+ /**
+ * Initialises a Coordinates instance that refers to a subset of this
+ * coordinates instance. Multiple disjoint subsets may be retrieved
+ * and concurrently updated, up to the maximum number of tiles given by
+ * #internal::NONBLOCKING::maxBufferTiles().
+ *
+ * Subsets must be contiguous. If one thread calls this function, all
+ * other threads must make a matching call.
+ *
+ * @param[in] lower_bound The start index of the contiguous subset
+ * (inclusive).
+ * @param[in] upper_bound The end index of the contiguous subset
+ * (exclusive).
+ */
+ void asyncSubsetInit(
+ const size_t lower_bound,
+ const size_t upper_bound
+ ) noexcept {
+ if( _cap == 0 ) {
+ return;
+ }
+
+ const size_t tile_id = lower_bound / analytic_model.getTileSize();
+
+ config::VectorIndexType *local_nnzs = local_buffer[ tile_id ];
+ config::VectorIndexType *local_stack = local_buffer[ tile_id ] + 1;
+
+ *local_nnzs = 0;
+ if( upper_bound - lower_bound < _n ) {
+ for( size_t i = lower_bound; i < upper_bound; ++i ) {
+ if( _assigned[ i ] ) {
+ local_stack[ (*local_nnzs)++ ] = i - lower_bound;
+ }
+ }
+ } else {
+ for( size_t i = 0; i < _n; ++i ) {
+ const size_t k = _stack[ i ];
+ if( lower_bound <= k && k < upper_bound ) {
+ assert( _assigned[ k ] );
+ local_stack[ (*local_nnzs)++ ] = k - lower_bound;
+ }
+ }
+ }
+
+ // the number of new nonzeroes is initialized here
+ local_new_nnzs[ tile_id ] = 0;
+ }
+
+ /**
+ * Retrieves a subset coordinate instance that was previously initialised
+ * using a call to #asyncSubsetInit.
+ *
+ * @returns A Coordinates instance that only supports sequential
+ * (synchronous) updates as well as all queries.
+ */
+ Coordinates< nonblocking > asyncSubset(
+ const size_t lower_bound, const size_t upper_bound
+ ) const noexcept {
+ assert( _cap > 0 );
+
+ const size_t tile_id = lower_bound / analytic_model.getTileSize();
+
+ config::VectorIndexType *local_nnzs = local_buffer[ tile_id ];
+ config::VectorIndexType *local_stack = local_buffer[ tile_id ] + 1;
+
+ Coordinates< nonblocking > ret;
+ assert( upper_bound - lower_bound <= analytic_model.getTileSize() );
+
+ ret.set( _assigned + lower_bound, true, local_stack,
+ upper_bound - lower_bound, false );
+
+ // the number of new nonzeroes is used to determine the total number
+ // of nonzeroes for the given local coordinates, since some of the
+ // nonzeroes are already written to the local stack
+ ret._n = (*local_nnzs) + local_new_nnzs[ tile_id ];
+ assert( ret._n <= ret._cap );
+
+ ret._buf = 0;
+
+ return ret;
+ }
+
+ /**
+ * Saves the state of a subset Coordinates instance. Can be retrieved later
+ * once again via a call to #asyncSubset. New nonzeroes will be committed
+ * to the global coordinate structure via a call to #joinSubset, which will
+ * furthermore set the related tile to inactive.
+ */
+ void asyncJoinSubset(
+ const Coordinates< nonblocking > &subset,
+ const size_t lower_bound, const size_t upper_bound
+ ) {
+ assert( _cap > 0 );
+
+ (void) upper_bound;
+
+ const size_t tile_id = lower_bound / analytic_model.getTileSize();
+
+ config::VectorIndexType *local_nnzs = local_buffer[ tile_id ];
+
+ assert( subset._n <= subset._cap );
+ assert( (*local_nnzs) <= subset._cap );
+
+ local_new_nnzs[ tile_id ] = subset._n - (*local_nnzs);
+ }
+
+ bool newNonZeroes() const {
+
+ if( _cap == 0 ) {
+ return false;
+ }
+
+ const size_t num_tiles = analytic_model.getNumTiles();
+
+ for( size_t i = 0; i < num_tiles; i++ ) {
+ if( local_new_nnzs[ i ] > 0 ) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ void prefixSumComputation() {
+
+ const size_t num_tiles = analytic_model.getNumTiles();
+
+ // takes into account the size of the data accessed in each iteration of
+ // the prefix-sum computation, which is used to determine the number of
+ // parallel tasks such that the data of each parallel task fit in the L1
+ // cache
+ constexpr size_t size_of_data = sizeof( pref_sum[0] ) +
+ sizeof( local_new_nnzs[0] );
+
+ // make use of the analytic model to estimate a proper number of threads
+ // and a tile size
+ AnalyticModel am( size_of_data, num_tiles, 1 );
+
+ const size_t nthreads = am.getNumThreads();
+ const size_t prefix_sum_tile_size = am.getTileSize();
+ const size_t prefix_sum_num_tiles = am.getNumTiles();
+
+ // make a run-time decision between the sequential and the parallel
+ // prefix-sum implementations; the sequential implementation is more
+ // efficient for a small number of tiles
+ if( num_tiles < prefix_sum_tile_size ) {
+ // sequential computation of the prefix sum
+ pref_sum[ 0 ] = _n + local_new_nnzs[ 0 ];
+ for( size_t i = 1; i < num_tiles; i++ ) {
+ pref_sum[ i ] = pref_sum[ i - 1 ] + local_new_nnzs[ i ];
+ }
+ } else {
+ // parallel computation of the prefix sum
+ size_t local_prefix_sum[ prefix_sum_num_tiles ];
+
+ #pragma omp parallel num_threads(nthreads)
+ {
+ #pragma omp for
+ for( size_t id = 0; id < prefix_sum_num_tiles; id++ ) {
+
+ size_t lower, upper;
+ config::OMP::localRange( lower, upper, 0, num_tiles,
+ prefix_sum_tile_size, id, prefix_sum_num_tiles );
+
+ // the number of threads used for parallel computation must not exceed
+ // num_tiles, otherwise the code below results in data races
+ assert( id <= num_tiles );
+ assert( id < prefix_sum_num_tiles - 1 || upper == num_tiles );
+ assert( lower <= upper );
+ assert( upper <= num_tiles );
+
+ pref_sum[ lower ] = local_new_nnzs[ lower ];
+ for( size_t i = lower + 1; i < upper; i++ ) {
+ pref_sum[ i ] = pref_sum[ i - 1 ] + local_new_nnzs[ i ];
+ }
+
+ // each thread stores the prefix sum of its last element in
+ // local_prefix_sum; the memory location is indexed by the thread's
+ // identifier to avoid data races
+ local_prefix_sum[ id ] = pref_sum[ upper - 1 ];
+ }
+
+ // here, there is an implicit barrier that ensures all threads have
+ // already written the local prefix sum for each parallel task
+
+ // a single thread computes the prefix sum for the last element of each
+ // thread
+ #pragma omp single
+ {
+ for( size_t i = 1; i < prefix_sum_num_tiles; i++ ) {
+ local_prefix_sum[ i ] += local_prefix_sum[ i - 1 ];
+ }
+ }
+
+ #pragma omp for
+ for( size_t id = 0; id < prefix_sum_num_tiles; id++ ) {
+
+ size_t lower, upper;
+ config::OMP::localRange( lower, upper, 0, num_tiles,
+ prefix_sum_tile_size, id, prefix_sum_num_tiles );
+
+ // the first thread (id=0) needs to add only the number of nonzeroes (_n)
+ const size_t acc = _n + ( ( id > 0 ) ? local_prefix_sum[ id - 1 ] : 0 );
+ for( size_t i = lower; i < upper; i++ ) {
+ pref_sum[ i ] += acc;
+ }
+ }
+ }
+
+#ifdef _DEBUG
+ // ensures that the parallel implementation computes the same result
+ // as the following sequential implementation
+ size_t seq_offsets[ num_tiles ];
+ seq_offsets[ 0 ] = _n + local_new_nnzs[ 0 ];
+ for( size_t i = 1; i < num_tiles; i++ ) {
+ seq_offsets[ i ] = seq_offsets[ i - 1 ] + local_new_nnzs[ i ];
+ }
+
+ for( size_t i = 0; i < num_tiles; i++ ) {
+ assert( seq_offsets[i] == pref_sum[i] );
+ }
+#endif
+ }
+
+ // a single thread updates the number of nonzeroes;
+ // the last element of pref_sum already includes
+ // the current number of nonzeroes _n, which was added earlier
+ _n = pref_sum[ num_tiles - 1 ];
+ }
+
+ /**
+ * Takes a currently active subset and commits it to the global storage.
+ * After completion the given active tile will be marked inactive.
+ */
+ void joinSubset( const size_t lower_bound, const size_t upper_bound ) {
+ if( _cap == 0 ) {
+ return;
+ }
+#ifdef NDEBUG
+ ( void )upper_bound;
+#endif
+ const size_t tile_id = lower_bound / analytic_model.getTileSize();
+
+ config::VectorIndexType *local_nnzs = local_buffer[ tile_id ];
+ config::VectorIndexType *local_stack = local_buffer[ tile_id ] + 1;
+
+ const size_t local_stack_start = *local_nnzs;
+ const size_t local_stack_end = *local_nnzs + local_new_nnzs[ tile_id ];
+ assert( local_stack_start <= local_stack_end );
+
+ size_t pos = pref_sum[ tile_id ] - local_new_nnzs[ tile_id ];
+
+ for( size_t k = local_stack_start; k < local_stack_end; ++k ) {
+ const size_t local_index = local_stack[ k ];
+ const size_t global_index = local_index + lower_bound;
+
+ assert( global_index >= lower_bound );
+ assert( global_index < upper_bound );
+ assert( _assigned[ global_index ] );
+ assert( pos < _cap );
+
+ _stack[ pos++ ] = global_index;
+ }
+
+ local_new_nnzs[ tile_id ] = 0;
+ }
+ };
+
+ } // namespace internal
+
+} // namespace grb
+
+#endif // end ``_H_GRB_NONBLOCKING_COORDINATES''
+
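The interplay between prefixSumComputation and joinSubset is easiest to see on a small example: the prefix sum over local_new_nnzs, seeded with the current global nonzero count _n, hands each tile a private, contention-free write window into the global stack. A self-contained re-enactment of that offset arithmetic, with made-up counts:

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
	const size_t n = 5; // current global nonzero count (_n)
	const std::vector< size_t > local_new_nnzs = { 2, 0, 3, 1 };

	// sequential variant of the prefix sum in prefixSumComputation above
	std::vector< size_t > pref_sum( local_new_nnzs.size() );
	pref_sum[ 0 ] = n + local_new_nnzs[ 0 ];
	for( size_t i = 1; i < pref_sum.size(); ++i ) {
		pref_sum[ i ] = pref_sum[ i - 1 ] + local_new_nnzs[ i ];
	}

	// each tile writes its new entries starting at
	// pref_sum[ tile ] - local_new_nnzs[ tile ], exactly as joinSubset does
	for( size_t tile = 0; tile < pref_sum.size(); ++tile ) {
		const size_t start = pref_sum[ tile ] - local_new_nnzs[ tile ];
		std::cout << "tile " << tile << " writes " << local_new_nnzs[ tile ]
			<< " entries from global stack position " << start << "\n";
	}

	// the new global nonzero count equals the last prefix-sum entry
	std::cout << "new _n = " << pref_sum.back() << std::endl;
	return 0;
}
```

With these counts the write windows are [5,7), [7,7), [7,10), and [10,11): disjoint by construction, so all tiles may commit concurrently.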
diff --git a/include/graphblas/nonblocking/exec.hpp b/include/graphblas/nonblocking/exec.hpp
new file mode 100644
index 000000000..09f679526
--- /dev/null
+++ b/include/graphblas/nonblocking/exec.hpp
@@ -0,0 +1,104 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Implements the launcher for the nonblocking backend.
+ *
+ * @author Aristeidis Mastoras
+ * @date 16th of May, 2022
+ */
+
+#ifndef _H_GRB_NONBLOCKING_EXEC
+#define _H_GRB_NONBLOCKING_EXEC
+
+#include
+#include
+
+#include "init.hpp"
+
+
+namespace grb {
+
+ /** The Launcher class is based on that of the reference backend */
+ template< EXEC_MODE mode >
+ class Launcher< mode, nonblocking > {
+
+ private:
+
+ Launcher< mode, reference > ref;
+
+ public:
+
+ /**
+ * This implementation only accepts a single user process. It ignores
+ * \a hostname and \a port.
+ */
+ Launcher(
+ const size_t process_id = 0,
+ const size_t nprocs = 1,
+ const std::string hostname = "localhost",
+ const std::string port = "0"
+ ) {
+ // ignore hostname and port
+ (void) hostname;
+ (void) port;
+ // sanity checks
+ if( nprocs != 1 ) {
+ throw std::invalid_argument( "Total number of user processes must be "
+ "exactly one when using the nonblocking implementation."
+ );
+ }
+ if( process_id != 0 ) {
+ throw std::invalid_argument( "Process ID must always be zero in the "
+ "nonblocking implementation."
+ );
+ }
+ }
+
+ /** No implementation notes. */
+ ~Launcher() {}
+
+ /** exec is based on that of the reference backend */
+ template< typename U >
+ RC exec(
+ void ( *grb_program )( const void *, const size_t, U & ),
+ const void * data_in, const size_t in_size,
+ U &data_out, const bool broadcast = false
+ ) const {
+ return ref.exec( grb_program, data_in, in_size, data_out, broadcast );
+ }
+
+ /** exec is based on that of the reference backend */
+ template< typename T, typename U >
+ RC exec(
+ void ( *grb_program )( const T &, U & ),
+ const T &data_in, U &data_out,
+ const bool broadcast = false
+ ) {
+ return ref.exec( grb_program, data_in, data_out, broadcast );
+ }
+
+ /** finalize is based on that of the reference backend */
+ grb::RC finalize() { return ref.finalize(); }
+ };
+
+} // namespace grb
+
+#endif // end ``_H_GRB_NONBLOCKING_EXEC''
+
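A hedged usage sketch for the Launcher specialisation above, following the reference-backend launch protocol; it assumes an ALP installation and uses the AUTOMATIC execution mode:

```cpp
#include <graphblas.hpp>
#include <iostream>

// an ALP program: the input is passed to the (single) user process and the
// output carries the program's return code
void my_grb_program( const size_t &in, grb::RC &out ) {
	(void) in;
	out = grb::SUCCESS;
}

int main() {
	grb::Launcher< grb::AUTOMATIC, grb::nonblocking > launcher;
	grb::RC out = grb::PANIC;
	const size_t in = 0;
	if( launcher.exec( &my_grb_program, in, out, true ) != grb::SUCCESS ) {
		std::cerr << "launch failed" << std::endl;
		return 1;
	}
	return out == grb::SUCCESS ? 0 : 2;
}
```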
diff --git a/include/graphblas/nonblocking/forward.hpp b/include/graphblas/nonblocking/forward.hpp
new file mode 100644
index 000000000..0baeee5be
--- /dev/null
+++ b/include/graphblas/nonblocking/forward.hpp
@@ -0,0 +1,51 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Forward declarations required by the nonblocking backend.
+ *
+ * @author Aristeidis Mastoras
+ * @date 16th of May, 2022
+ */
+
+#ifndef _H_GRB_NONBLOCKING_FORWARD
+#define _H_GRB_NONBLOCKING_FORWARD
+
+
+namespace grb {
+
+ // The eWiseLambda is a friend of Matrix but is defined in blas2. It is
+ // therefore forward-declared here, and this forward-declaration file is
+ // included from both matrix.hpp and blas2.hpp.
+ template<
+ class ActiveDistribution = internal::Distribution< nonblocking >,
+ typename Func, typename DataType,
+ typename RIT, typename CIT, typename NIT
+ >
+ RC eWiseLambda(
+ const Func f,
+ const Matrix< DataType, nonblocking, RIT, CIT, NIT > &A,
+ const size_t s = 0, const size_t P = 1
+ );
+ // end eWiseLambda declarations
+
+} // namespace grb
+
+#endif // end ``_H_GRB_NONBLOCKING_FORWARD''
+
diff --git a/include/graphblas/nonblocking/init.hpp b/include/graphblas/nonblocking/init.hpp
new file mode 100644
index 000000000..e01b17e70
--- /dev/null
+++ b/include/graphblas/nonblocking/init.hpp
@@ -0,0 +1,147 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Provides the initialisation and finalisation routines for the nonblocking
+ * backend.
+ *
+ * @author Aristeidis Mastoras
+ * @date 16th of May, 2022
+ */
+
+#ifndef _H_GRB_NONBLOCKING_INIT
+#define _H_GRB_NONBLOCKING_INIT
+
+#include
+#include
+
+
+namespace grb {
+
+ template<>
+ RC init< nonblocking >( const size_t, const size_t, void * const );
+
+ template<>
+ RC finalize< nonblocking >();
+
+ namespace internal {
+
+ /** Internal state of the nonblocking backend. */
+ class NONBLOCKING {
+
+ friend RC init< nonblocking >( const size_t, const size_t, void * const );
+
+ private:
+
+ /**
+ * Determines whether the tile size is automatically selected by the
+ * analytic model or whether it is manually selected by the user with the
+ * environment variable GRB_NONBLOCKING_TILE_SIZE.
+ */
+ static bool manual_tile_size;
+
+ /**
+ * The tile size that is manually selected by the user and is initialized in
+ * init.cpp. This variable is only set when the GRB_NONBLOCKING_TILE_SIZE
+ * environment variable is defined, and if so, this variable equals its
+ * content.
+ */
+ static size_t manual_fixed_tile_size;
+
+ /**
+ * The maximum number of threads available in the system that may be set
+ * with the environment variable OMP_NUM_THREADS.
+ */
+ static size_t num_threads;
+
+
+ public:
+
+ /**
+ * When true, calling a fake nonblocking primitive for the first time
+ * will emit a warning to the standard error stream.
+ */
+ static bool warn_if_not_native;
+
+ /**
+ * The number of individual buffers that a vector should be able to
+ * concurrently maintain.
+ *
+ * @param[in] n The vector size.
+ *
+ * @returns The number of individual buffers that should be supported.
+ */
+ static inline size_t maxBufferTiles( const size_t n ) {
+ return n;
+ }
+
+ /**
+ * Helper function that computes the effective buffer size for a vector
+ * of \a n elements by taking into account the space required for storing
+ * the local stack size, the number of new nonzeroes, and the offset used
+ * for the prefix-sum algorithm.
+ *
+ * @param[in] n The size of the vector.
+ *
+ * The maximum number of tiles that need be supported is derived from \a n
+ * via #maxBufferTiles.
+ *
+ * @returns The buffer size given the vector size, maximum number of
+ * tiles, and the requested configuration.
+ */
+ static inline size_t vectorBufferSize( const size_t n ) {
+ const size_t T = maxBufferTiles( n );
+ size_t ret = n;
+
+ // +1 for storing the local stack size
+ // +1 for storing the number of new nonzeroes
+ // +1 for storing the offset used for the prefix-sum algorithm
+ ret += 3 * T;
+ ret = std::max( 4 * T, ret );
+
+ return ret;
+ }
+
+ /**
+ * Whether the tile size is manually set by the user or not.
+ */
+ static bool isManualTileSize() {
+ return manual_tile_size;
+ }
+
+ /**
+ * The tile size that is manually selected by the user.
+ */
+ static size_t manualFixedTileSize() {
+ return manual_fixed_tile_size;
+ }
+
+ /**
+ * The maximum number of threads available in the system.
+ */
+ static size_t numThreads() {
+ return num_threads;
+ }
+
+ };
+
+ } // namespace internal
+
+} // namespace grb
+
+#endif // end ``_H_GRB_NONBLOCKING_INIT''
+
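Concretely, since maxBufferTiles( n ) returns n, vectorBufferSize reserves n + 3n = 4n entries; the std::max guard is redundant for this choice of maxBufferTiles, though it protects a future variant that returns fewer tiles. A stand-alone re-derivation; the demo function mirrors, but is not, the ALP one:

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>

// mirrors vectorBufferSize above, with maxBufferTiles( n ) == n inlined
static size_t vectorBufferSizeDemo( const size_t n ) {
	const size_t T = n; // maxBufferTiles( n )
	size_t ret = n;
	// +3T: local stack size, new-nonzero count, prefix-sum offset per tile
	ret += 3 * T;
	ret = std::max( 4 * T, ret );
	return ret;
}

int main() {
	for( const size_t n : { 0, 1, 1000 } ) {
		std::cout << n << " -> " << vectorBufferSizeDemo( n ) << "\n";
	}
	return 0;
}
```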
diff --git a/include/graphblas/nonblocking/io.hpp b/include/graphblas/nonblocking/io.hpp
new file mode 100644
index 000000000..44b7f3a4d
--- /dev/null
+++ b/include/graphblas/nonblocking/io.hpp
@@ -0,0 +1,1350 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Provides the I/O primitives for the nonblocking backend.
+ *
+ * @author Aristeidis Mastoras
+ * @date 16th of May, 2022
+ */
+
+#ifndef _H_GRB_NONBLOCKING_IO
+#define _H_GRB_NONBLOCKING_IO
+
+#include
+#include
+#include
+
+#include "lazy_evaluation.hpp"
+#include "boolean_dispatcher_io.hpp"
+
+#define NO_CAST_ASSERT( x, y, z ) \
+ static_assert( x, \
+ "\n\n" \
+ "********************************************************************" \
+ "********************************************************************" \
+ "******************************\n" \
+ "* ERROR | " y " " z ".\n" \
+ "********************************************************************" \
+ "********************************************************************" \
+ "******************************\n" \
+ "* Possible fix 1 | Remove no_casting from the template parameters " \
+ "in this call to " y ".\n" \
+ "* Possible fix 2 | Provide a value input iterator with element " \
+ "types that match the output vector element type.\n" \
+ "* Possible fix 3 | If applicable, provide an index input iterator " \
+ "with element types that are integral.\n" \
+ "********************************************************************" \
+ "********************************************************************" \
+ "******************************\n" );
+
+
+namespace grb {
+
+ namespace internal {
+
+ extern LazyEvaluation le;
+
+ }
+
+}
+
+namespace grb {
+
+ /**
+ * \defgroup IO Data Ingestion -- nonblocking backend
+ * @{
+ */
+
+ template< typename InputType, typename RIT, typename CIT, typename NIT >
+ uintptr_t getID( const Matrix< InputType, nonblocking, RIT, CIT, NIT > &A ) {
+ return getID( internal::getRefMatrix( A ) );
+ }
+
+ template< typename DataType, typename Coords >
+ size_t size( const Vector< DataType, nonblocking, Coords > &x ) noexcept {
+ return internal::getCoordinates( x ).size();
+ }
+
+ template< typename InputType, typename RIT, typename CIT, typename NIT >
+ size_t nrows(
+ const Matrix< InputType, nonblocking, RIT, CIT, NIT > &A
+ ) noexcept {
+ return nrows( internal::getRefMatrix( A ) );
+ }
+
+ template< typename InputType, typename RIT, typename CIT, typename NIT >
+ size_t ncols(
+ const Matrix< InputType, nonblocking, RIT, CIT, NIT > &A
+ ) noexcept {
+ return ncols( internal::getRefMatrix( A ) );
+ }
+
+ template< typename DataType, typename Coords >
+ size_t nnz( const Vector< DataType, nonblocking, Coords > &x ) noexcept {
+ internal::le.execution( &x );
+ return internal::getCoordinates( x ).nonzeroes();
+ }
+
+ template< typename InputType, typename RIT, typename CIT, typename NIT >
+ size_t nnz(
+ const Matrix< InputType, nonblocking, RIT, CIT, NIT > &A
+ ) noexcept {
+ return nnz( internal::getRefMatrix( A ) );
+ }
+
+ template< typename DataType, typename Coords >
+ size_t capacity( const Vector< DataType, nonblocking, Coords > &x ) noexcept {
+ return internal::getCoordinates( x ).size();
+ }
+
+ template< typename DataType, typename RIT, typename CIT, typename NIT >
+ size_t capacity(
+ const Matrix< DataType, nonblocking, RIT, CIT, NIT > &A
+ ) noexcept {
+ return capacity( internal::getRefMatrix( A ) );
+ }
+
+ template< typename DataType, typename Coords >
+ RC clear( Vector< DataType, nonblocking, Coords > &x ) noexcept {
+ internal::le.execution( &x );
+ internal::getCoordinates( x ).clear();
+ return SUCCESS;
+ }
+
+ template< typename InputType, typename RIT, typename CIT, typename NIT >
+ RC clear(
+ Matrix< InputType, nonblocking, RIT, CIT, NIT > &A
+ ) noexcept {
+ return clear( internal::getRefMatrix( A ) );
+ }
+
+ template<
+ typename InputType,
+ typename Coords
+ >
+ RC resize(
+ Vector< InputType, nonblocking, Coords > &x,
+ const size_t new_nz
+ ) noexcept {
+ internal::le.execution( &x );
+#ifdef _DEBUG
+ std::cerr << "In grb::resize (vector, nonblocking)\n";
+#endif
+		// this check cannot wait until after the mismatch check below, as the
+		// specification defines that a zero-capacity request is always legal
+ if( new_nz == 0 ) {
+ return grb::clear( x );
+ }
+
+ // check if we have a mismatch
+ if( new_nz > grb::size( x ) ) {
+#ifdef _DEBUG
+ std::cerr << "\t requested capacity of " << new_nz << ", "
+ << "expected a value smaller than or equal to "
+ << size( x ) << "\n";
+#endif
+ return ILLEGAL;
+ }
+
+ // in the nonblocking implementation, vectors are of static size
+ // so this function immediately succeeds. However, all existing contents
+ // must be removed
+ return grb::clear( x );
+ }
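+
+	// A sketch of the resulting semantics (illustrative only):
+	//
+	//   grb::Vector< double, grb::nonblocking > x( 100 );
+	//   grb::resize( x, 150 ); // returns ILLEGAL: exceeds size( x )
+	//   grb::resize( x, 50 );  // returns SUCCESS and clears x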
+
+ template<
+ typename InputType,
+ typename RIT,
+ typename CIT,
+ typename NIT
+ >
+ RC resize(
+ Matrix< InputType, nonblocking, RIT, CIT, NIT > &A,
+ const size_t new_nz
+ ) noexcept {
+ return resize( internal::getRefMatrix( A ), new_nz );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename DataType,
+ typename T,
+ typename Coords
+ >
+ RC set(
+ Vector< DataType, nonblocking, Coords > &x,
+ const T val,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< DataType >::value &&
+ !grb::is_object< T >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< DataType, T >::value
+ ), "grb::set (Vector, unmasked)",
+ "called with a value type that does not match that of the given vector"
+ );
+
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ RC ret = SUCCESS;
+
+ // pre-cast value to be copied
+ const DataType toCopy = static_cast< DataType >( val );
+ DataType * const raw = internal::getRaw( x );
+ const size_t n = internal::getCoordinates( x ).size();
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func = [&x, toCopy, raw] (
+ internal::Pipeline &pipeline,
+ size_t lower_bound, size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage set(x, val) in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ bool already_dense_output = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( x ) );
+ if( !already_dense_output ) {
+#endif
+ Coords local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+
+ local_x.local_assignAllNotAlreadyAssigned();
+ assert( local_x.nonzeroes() == local_x.size() );
+
+ internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ for( size_t i = lower_bound; i < upper_bound; i++ ) {
+ raw[ i ] = internal::template ValueOrIndex<
+ descr, DataType, DataType
+ >::getFromScalar( toCopy, i );
+ }
+
+ return SUCCESS;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::IO_SET_SCALAR,
+ n, sizeof( DataType ), dense_descr, true,
+ &x, nullptr,
+ &internal::getCoordinates( x ), nullptr,
+ nullptr, nullptr, nullptr, nullptr,
+ nullptr, nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: SET(x, val)" << std::endl;
+#endif
+ return ret;
+ }
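+
+	// Usage sketch (illustrative, with an assumed vector x of size n): the
+	// call below only registers a pipeline stage; the actual writes happen
+	// once the pipeline is forced, e.g., by grb::nnz( x ) or grb::wait( x ).
+	//
+	//   grb::Vector< double, grb::nonblocking > x( n );
+	//   grb::RC rc = grb::set( x, 3.14 ); // stage added, not yet executed
+	//   const size_t k = grb::nnz( x );   // forces execution; k == n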
+
+ namespace internal {
+
+ template<
+ Descriptor descr,
+#ifdef GRB_BOOLEAN_DISPATCHER
+ bool loop_over_vector_length,
+ bool already_dense_mask,
+ bool mask_is_dense,
+#endif
+ typename DataType,
+ typename MaskType,
+ typename T,
+ typename Coords
+ >
+ RC masked_set(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ bool loop_over_vector_length,
+ bool already_dense_mask,
+ bool mask_is_dense,
+#endif
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_x,
+ const Coords &local_mask,
+ Vector< DataType, nonblocking, Coords > &x,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const T val
+ ) {
+ // pre-cast value to be copied
+ const DataType toCopy = static_cast< DataType >( val );
+
+ DataType * const raw = internal::getRaw( x );
+ const MaskType * const m_p = internal::getRaw( m );
+
+#ifdef _DEBUG
+ if( loop_over_vector_length ) {
+ std::cout << "\t using loop of size n (the vector length)\n";
+ } else {
+ std::cout << "\t using loop of size nz (the number of nonzeroes in the "
+ << "vector)\n";
+ }
+#endif
+
+ const size_t local_n = upper_bound - lower_bound;
+ const size_t local_mask_nz = already_dense_mask
+ ? local_n
+ : local_mask.nonzeroes();
+
+ const size_t local_size_n = loop_over_vector_length
+ ? local_x.size()
+ : local_mask_nz;
+
+ for( size_t k = 0; k < local_size_n; ++k ) {
+
+ const size_t index = ( ( loop_over_vector_length || already_dense_mask )
+ ? k
+ : local_mask.index( k ) ) + lower_bound;
+ assert( index < internal::getCoordinates( x ).size() );
+ if( already_dense_mask ) {
+ if( !internal::getCoordinates( m ).template mask< descr >( index, m_p ) ) {
+ continue;
+ }
+ } else {
+ if( !local_mask.template mask< descr >(
+ index - lower_bound, m_p + lower_bound
+ ) ) {
+ continue;
+ }
+ }
+ if( !mask_is_dense ) {
+ (void) local_x.assign( index - lower_bound );
+ }
+ raw[ index ] = internal::ValueOrIndex<
+ descr, DataType, DataType
+ >::getFromScalar( toCopy, index );
+ }
+
+ return SUCCESS;
+ }
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename DataType,
+ typename MaskType,
+ typename T,
+ typename Coords
+ >
+ RC set(
+ Vector< DataType, nonblocking, Coords > &x,
+ const Vector< MaskType, nonblocking, Coords > &m,
+ const T val,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< DataType >::value && !grb::is_object< T >::value,
+ void >::type * const = nullptr
+ ) {
+#ifdef _DEBUG
+ std::cout << "In grb::set (vector-to-value, masked)\n";
+#endif
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< DataType, T >::value ), "grb::set (Vector to scalar, masked)",
+ "called with a value type that does not match that of the given "
+ "vector"
+ );
+
+ // catch empty mask
+ if( size( m ) == 0 ) {
+ return set< descr >( x, val, phase );
+ }
+
+ // dynamic sanity checks
+ const size_t sizex = size( x );
+ if( sizex != size( m ) ) {
+ return MISMATCH;
+ }
+
+ // handle trivial resize
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+ constexpr const bool dense_mask = dense_descr &&
+ (descr & descriptors::structural) &&
+ !(descr & descriptors::invert_mask);
+
+		// capture the masked set as a lazily-executed pipeline stage
+ internal::Pipeline::stage_type func = [&x, &m, val] (
+ internal::Pipeline &pipeline,
+ size_t lower_bound, size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage set(x, m, val) in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ (void) pipeline;
+
+ Coords local_mask, local_x;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_x_nz = local_n;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ bool already_dense_mask = true;
+
+ const bool mask_is_dense = (descr & descriptors::structural) &&
+ !(descr & descriptors::invert_mask) && already_dense_vectors;
+
+ // for out-of-place operations with a mask and a scalar input, whether the
+ // output is dense or not depends on the mask
+ if( !mask_is_dense ) {
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( dense_descr && local_x_nz < local_n ) {
+ return ILLEGAL;
+ }
+ }
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_mask = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( m ) );
+ if( !already_dense_mask ) {
+#else
+ already_dense_mask = false;
+#endif
+ local_mask = internal::getCoordinates( m ).asyncSubset( lower_bound,
+ upper_bound );
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( !mask_is_dense ) {
+ local_x.local_clear();
+ if( lower_bound == 0 ) {
+ internal::getCoordinates( x ).reset_global_nnz_counter();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ pipeline.markMaybeSparseVector( &internal::getCoordinates( x ) );
+#endif
+ if( dense_descr ) {
+ pipeline.markMaybeSparseDenseDescriptorVerification(
+ &internal::getCoordinates( x ) );
+ }
+ }
+ }
+
+ const bool loop_over_vector_length = ( descr & descriptors::invert_mask ) ||
+ ( 4 * local_mask.nonzeroes() > 3 * local_mask.size() );
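+			// (assumed rationale: iterate over the full local range whenever the
+			// mask is inverted or more than three-quarters dense, since a direct
+			// loop is then cheaper than indirecting through the mask's index list)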
+
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_masked_set<
+#else
+ rc = internal::masked_set<
+#endif
+ descr, DataType, MaskType, T, Coords
+ >(
+ loop_over_vector_length,
+ already_dense_mask, mask_is_dense,
+ lower_bound, upper_bound,
+ local_x, local_mask, x, m, val
+ );
+
+ if( !mask_is_dense ) {
+ internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::IO_SET_MASKED_SCALAR,
+ sizex, sizeof( DataType ),
+ dense_descr, dense_mask,
+ &x, nullptr, &internal::getCoordinates( x ), nullptr,
+ &m, nullptr, nullptr, nullptr,
+ &internal::getCoordinates( m ), nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: set(x, m, val)" << std::endl;
+#endif
+ return ret;
+ }
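+
+	// Usage sketch (illustrative only) of the masked variant:
+	//
+	//   grb::Vector< double, grb::nonblocking > x( n );
+	//   grb::Vector< bool, grb::nonblocking > m( n );
+	//   grb::setElement( m, true, 0 );
+	//   grb::set( x, m, 2.0 ); // lazily writes x[ 0 ] only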
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename DataType,
+ typename T,
+ typename Coords
+ >
+ RC setElement(
+ Vector< DataType, nonblocking, Coords > &x,
+ const T val,
+ const size_t i,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< DataType >::value &&
+ !grb::is_object< T >::value, void
+ >::type * const = nullptr
+ ) {
+ internal::le.execution( &x );
+
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< DataType, T >::value ),
+ "grb::set (Vector, at index)",
+ "called with a value type that does not match that of the given "
+ "vector"
+ );
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+ assert( phase == EXECUTE );
+
+ // dynamic sanity checks
+ if( i >= size( x ) ) {
+ return MISMATCH;
+ }
+ if( (descr & descriptors::dense) && nnz( x ) < size( x ) ) {
+ return ILLEGAL;
+ }
+
+ // do set
+ (void)internal::getCoordinates( x ).assign( i );
+ internal::getRaw( x )[ i ] = static_cast< DataType >( val );
+
+#ifdef _DEBUG
+ std::cout << "setElement (nonblocking) set index " << i << " to value "
+ << internal::getRaw( x )[ i ] << "\n";
+#endif
+ return SUCCESS;
+ }
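+
+	// Usage sketch (illustrative only): unlike the lazily-executed set,
+	// setElement first forces any pending pipeline on x and then writes
+	// eagerly:
+	//
+	//   grb::Vector< double, grb::nonblocking > x( 10 );
+	//   grb::RC rc = grb::setElement( x, 3.14, 7 ); // x[ 7 ] = 3.14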
+
+ namespace internal {
+
+ template<
+ Descriptor descr,
+ bool out_is_void,
+ bool in_is_void,
+ bool sparse,
+#ifdef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_vectors,
+ bool already_dense_input,
+#endif
+ typename OutputType,
+ typename InputType,
+ typename Coords
+ >
+ RC set_generic(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ bool already_dense_vectors,
+ bool already_dense_input,
+#endif
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_x,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &x,
+ const Vector< InputType, nonblocking, Coords > &y
+ ) {
+ const size_t local_n = upper_bound - lower_bound;
+ const size_t local_y_nz = already_dense_input
+ ? local_n
+ : local_y.nonzeroes();
+
+ OutputType * __restrict__ const dst = internal::getRaw( x );
+ const InputType * __restrict__ const src = internal::getRaw( y );
+
+ if( sparse ) {
+ if( src == nullptr && dst == nullptr ) {
+ for( size_t i = 0; i < local_y_nz; ++i ) {
+ const size_t index = ( already_dense_input ) ? i : local_y.index( i );
+ if( !already_dense_vectors ) {
+ (void) local_x.assign( index );
+ }
+ }
+ } else {
+#ifndef NDEBUG
+ if( src == nullptr ) {
+ assert( dst == nullptr );
+ }
+#endif
+ for( size_t i = 0; i < local_y_nz; ++i ) {
+ const size_t index = ( ( already_dense_input )
+ ? i
+ : local_y.index( i ) ) + lower_bound;
+ if( !already_dense_vectors ) {
+ (void) local_x.assign( index - lower_bound );
+ }
+ if( !out_is_void && !in_is_void ) {
+ dst[ index ] = internal::setIndexOrValue< descr, OutputType >( index,
+ src[ index ] );
+ }
+ }
+ }
+ } else {
+ if( !( src == nullptr && dst == nullptr ) ) {
+#ifndef NDEBUG
+ if( src == nullptr ) {
+ assert( dst == nullptr );
+ }
+#endif
+ for( size_t i = lower_bound; i < upper_bound; ++i ) {
+ if( !out_is_void && !in_is_void ) {
+ dst[ i ] = src[ i ];
+ }
+ }
+ }
+ }
+
+ return SUCCESS;
+ }
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType,
+ typename InputType,
+ typename Coords
+ >
+ RC set(
+ Vector< OutputType, nonblocking, Coords > &x,
+ const Vector< InputType, nonblocking, Coords > &y,
+ const Phase &phase = EXECUTE
+ ) {
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< OutputType, InputType >::value ),
+ "grb::copy (Vector)",
+ "called with vector parameters whose element data types do not match"
+ );
+		constexpr bool out_is_void = std::is_void< OutputType >::value;
+		constexpr bool in_is_void = std::is_void< InputType >::value;
+		static_assert( !in_is_void || out_is_void,
+			"grb::set (nonblocking, vector <- vector): "
+			"if input is void, then the output must be also" );
+		static_assert( !(descr & descriptors::use_index) || !out_is_void,
+			"grb::set (nonblocking, vector <- vector): "
+			"use_index descriptor cannot be set if output vector is void" );
+
+		// get length
+ const size_t n = internal::getCoordinates( y ).size();
+ // check contract
+ if( n != size( x ) ) {
+ return MISMATCH;
+ }
+ if( n == 0 ) {
+ return SUCCESS;
+ }
+ if( getID( x ) == getID( y ) ) {
+ return ILLEGAL;
+ }
+
+ // on resize
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+
+ // on execute
+ assert( phase == EXECUTE );
+
+ RC ret = SUCCESS;
+
+ constexpr bool dense_descr = descr & descriptors::dense;
+
+ internal::Pipeline::stage_type func = [&x, &y] (
+ internal::Pipeline &pipeline,
+ size_t lower_bound, size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage set(x, y) in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_x, local_y;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_y_nz = local_n;
+ bool sparse = false;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ (void) pipeline;
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ bool already_dense_input = true;
+
+ if( !already_dense_vectors ) {
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_input = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_input ) {
+#else
+ already_dense_input = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+ if( local_y_nz < local_n ) {
+ sparse = true;
+ }
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( !already_dense_vectors ) {
+ if( lower_bound == 0 ) {
+ internal::getCoordinates( x ).reset_global_nnz_counter();
+ }
+ }
+
+ if( sparse ) {
+ // this primitive is out-of-place, thus make the output empty
+ if( !already_dense_vectors ) {
+ local_x.local_clear();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ pipeline.markMaybeSparseVector( &internal::getCoordinates( x ) );
+#endif
+ }
+
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_set_generic<
+#else
+ rc = internal::set_generic<
+#endif
+ descr, out_is_void, in_is_void, true
+ >(
+ already_dense_vectors, already_dense_input,
+ lower_bound, upper_bound,
+ local_x, local_y, x, y
+ );
+ } else {
+ if( !already_dense_vectors ) {
+ local_x.local_assignAll();
+ }
+
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_set_generic<
+#else
+ rc = internal::set_generic<
+#endif
+ descr, out_is_void, in_is_void, false
+ >(
+ already_dense_vectors, already_dense_input,
+ lower_bound, upper_bound,
+ local_x, local_y, x, y
+ );
+ }
+
+ if( !already_dense_vectors ) {
+ internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::IO_SET_VECTOR,
+ n, sizeof( OutputType ), dense_descr, true,
+ &x, nullptr, &internal::getCoordinates( x ), nullptr,
+ &y, nullptr, nullptr, nullptr,
+ &internal::getCoordinates( y ), nullptr, nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: set(x, y)" << std::endl;
+#endif
+ return ret;
+ }
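+
+	// Usage sketch (illustrative only) of this out-of-place lazy copy:
+	//
+	//   grb::Vector< double, grb::nonblocking > x( n ), y( n );
+	//   grb::set( y, 1.0 ); // stage: y <- 1.0
+	//   grb::set( x, y );   // stage: x <- y; may fuse into the same pipeline
+	//   grb::set( x, x );   // returns ILLEGAL: input aliases output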
+
+ namespace internal {
+
+ template<
+ Descriptor descr,
+ bool out_is_void,
+ bool in_is_void,
+#ifdef GRB_BOOLEAN_DISPATCHER
+ bool loop_over_y,
+ bool already_dense_input_y,
+ bool already_dense_mask,
+ bool mask_is_dense,
+#endif
+ typename OutputType,
+ typename MaskType,
+ typename InputType,
+ typename Coords
+ >
+ RC masked_set(
+#ifndef GRB_BOOLEAN_DISPATCHER
+ bool loop_over_y,
+ bool already_dense_input_y,
+ bool already_dense_mask,
+ bool mask_is_dense,
+#endif
+ const size_t lower_bound,
+ const size_t upper_bound,
+ Coords &local_x,
+ const Coords &local_mask,
+ const Coords &local_y,
+ Vector< OutputType, nonblocking, Coords > &x,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Vector< InputType, nonblocking, Coords > &y
+ ) {
+ const size_t local_n = upper_bound - lower_bound;
+ const size_t local_y_nz = already_dense_input_y
+ ? local_n
+ : local_y.nonzeroes();
+ const size_t local_mask_nz = already_dense_mask
+ ? local_n
+ : local_mask.nonzeroes();
+
+ const size_t n = loop_over_y ? local_y_nz : local_mask_nz;
+
+ for( size_t k = 0; k < n; ++k ) {
+ const size_t i = ( loop_over_y
+ ? ( already_dense_input_y ? k : local_y.index( k ) )
+ : ( already_dense_mask ? k : local_mask.index( k ) )
+ ) + lower_bound;
+ if( already_dense_mask ) {
+ if( !internal::getCoordinates( mask ).template mask< descr >(
+ i, internal::getRaw( mask )
+ ) ) {
+ continue;
+ }
+ } else {
+ if( !local_mask.template mask< descr >(
+ i - lower_bound, internal::getRaw( mask ) + lower_bound
+ ) ) {
+ continue;
+ }
+ }
+ if( loop_over_y || already_dense_input_y ||
+ local_y.assigned( i - lower_bound )
+ ) {
+ if( !out_is_void && !in_is_void ) {
+ if( !mask_is_dense ) {
+ (void) local_x.assign( i - lower_bound );
+ }
+ internal::getRaw( x )[ i ] = internal::ValueOrIndex<
+ descr, OutputType, InputType
+ >::getFromArray(
+ internal::getRaw( y ),
+ [] (const size_t i) {return i;},
+ i
+ );
+ }
+ }
+ }
+
+ return SUCCESS;
+ }
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType,
+ typename MaskType,
+ typename InputType,
+ typename Coords
+ >
+ RC set(
+ Vector< OutputType, nonblocking, Coords > &x,
+ const Vector< MaskType, nonblocking, Coords > &mask,
+ const Vector< InputType, nonblocking, Coords > &y,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< MaskType >::value &&
+ !grb::is_object< InputType >::value,
+ void >::type * const = nullptr
+ ) {
+ // static sanity checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< OutputType, InputType >::value ),
+ "grb::set (Vector)",
+ "called with vector parameters whose element data types do not match" );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< MaskType, bool >::value ),
+ "grb::set (Vector)",
+ "called with non-bool mask element types" );
+ constexpr bool out_is_void = std::is_void< OutputType >::value;
+		constexpr bool in_is_void = std::is_void< InputType >::value;
+ static_assert( !in_is_void || out_is_void,
+ "grb::set (nonblocking, vector <- vector, masked): "
+ "if input is void, then the output must be also" );
+ static_assert( !(descr & descriptors::use_index) || !out_is_void,
+ "grb::set (nonblocking, vector <- vector, masked): "
+ "use_index descriptor cannot be set if output vector is void" );
+
+ // catch contract violations
+ const size_t size = grb::size( y );
+ if( size != grb::size( x ) ) {
+ return MISMATCH;
+ }
+ if( size == 0 ) {
+ return SUCCESS;
+ }
+ if( getID( x ) == getID( y ) ) {
+ return ILLEGAL;
+ }
+
+ // delegate if possible
+ if( grb::size( mask ) == 0 ) {
+			return set< descr >( x, y, phase );
+ }
+
+ // additional contract check
+ if( size != grb::size( mask ) ) {
+ return MISMATCH;
+ }
+
+ // on resize
+ if( phase == RESIZE ) {
+ return SUCCESS;
+ }
+
+ // on execute
+ assert( phase == EXECUTE );
+
+ RC ret = SUCCESS;
+
+ constexpr const bool dense_descr = descr & descriptors::dense;
+ constexpr const bool dense_mask = dense_descr &&
+ (descr & descriptors::structural) &&
+ !(descr & descriptors::invert_mask);
+
+ internal::Pipeline::stage_type func = [&x, &mask, &y] (
+ internal::Pipeline &pipeline,
+ size_t lower_bound, size_t upper_bound
+ ) {
+#ifdef _NONBLOCKING_DEBUG
+ #pragma omp critical
+ std::cout << "\t\tExecution of stage set(x, mask, y) in the range("
+ << lower_bound << ", " << upper_bound << ")" << std::endl;
+#endif
+ RC rc = SUCCESS;
+
+ Coords local_mask, local_x, local_y;
+ const size_t local_n = upper_bound - lower_bound;
+ size_t local_mask_nz = local_n;
+ size_t local_x_nz = local_n;
+ size_t local_y_nz = local_n;
+
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ const bool already_dense_vectors = dense_descr ||
+ pipeline.allAlreadyDenseVectors();
+#else
+ constexpr const bool already_dense_vectors = dense_descr;
+#endif
+
+ bool already_dense_mask = true;
+ bool already_dense_input_y = true;
+
+ // make the vector empty unless the dense descriptor is provided
+ constexpr const bool mask_is_dense = (descr & descriptors::structural) &&
+ !(descr & descriptors::invert_mask) && already_dense_vectors;
+
+ if( !mask_is_dense ) {
+ local_x = internal::getCoordinates( x ).asyncSubset( lower_bound,
+ upper_bound );
+ local_x_nz = local_x.nonzeroes();
+ if( dense_descr && local_x_nz < local_n ) {
+ return ILLEGAL;
+ }
+ }
+
+ if( !already_dense_vectors ) {
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ already_dense_mask = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( mask ) );
+ if( !already_dense_mask ) {
+#else
+ already_dense_mask = false;
+#endif
+ local_mask = internal::getCoordinates( mask ).asyncSubset( lower_bound,
+ upper_bound );
+ local_mask_nz = local_mask.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+
+ already_dense_input_y = pipeline.containsAlreadyDenseVector(
+ &internal::getCoordinates( y ) );
+ if( !already_dense_input_y ) {
+#else
+ already_dense_input_y = false;
+#endif
+ local_y = internal::getCoordinates( y ).asyncSubset( lower_bound,
+ upper_bound );
+ local_y_nz = local_y.nonzeroes();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ }
+#endif
+ }
+
+ if( !mask_is_dense ) {
+ local_x.local_clear();
+ if( lower_bound == 0 ) {
+ internal::getCoordinates( x ).reset_global_nnz_counter();
+#ifdef GRB_ALREADY_DENSE_OPTIMIZATION
+ pipeline.markMaybeSparseVector( &internal::getCoordinates( x ) );
+#endif
+ if( dense_descr ) {
+ pipeline.markMaybeSparseDenseDescriptorVerification(
+ &internal::getCoordinates( x ) );
+ }
+ }
+ }
+
+ // choose optimal loop size
+ const bool loop_over_y = (descr & descriptors::invert_mask) ||
+ ( local_y_nz < local_mask_nz );
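+			// (assumed rationale: enumerate whichever sparsity structure is
+			// smaller; under an inverted mask the mask's nonzero list cannot
+			// enumerate the surviving outputs, so loop over y instead)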
+
+#ifdef GRB_BOOLEAN_DISPATCHER
+ rc = internal::boolean_dispatcher_masked_set<
+#else
+ rc = internal::masked_set<
+#endif
+ descr, out_is_void, in_is_void
+ >(
+ loop_over_y,
+ already_dense_input_y, already_dense_mask, mask_is_dense,
+ lower_bound, upper_bound,
+ local_x, local_mask, local_y,
+ x, mask, y
+ );
+
+ if( !mask_is_dense ) {
+ internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound,
+ upper_bound );
+ }
+
+ return rc;
+ };
+
+ ret = ret ? ret : internal::le.addStage(
+ std::move( func ),
+ internal::Opcode::IO_SET_MASKED_VECTOR,
+ size, sizeof( OutputType ), dense_descr, dense_mask,
+ &x, nullptr, &internal::getCoordinates( x ), nullptr,
+ &mask, &y, nullptr, nullptr,
+ &internal::getCoordinates( mask ), &internal::getCoordinates( y ),
+ nullptr, nullptr,
+ nullptr
+ );
+
+#ifdef _NONBLOCKING_DEBUG
+ std::cout << "\t\tStage added to a pipeline: set(x, mask, y)" << std::endl;
+#endif
+ return ret;
+ }
+
+ namespace internal {
+
+ template<
+ bool A_is_mask,
+ Descriptor descr,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2 = const OutputType
+ >
+ RC set(
+ Matrix< OutputType, nonblocking > &C,
+ const Matrix< InputType1, nonblocking > &A,
+ const InputType2 * __restrict__ id = nullptr
+ ) noexcept {
+ if( internal::NONBLOCKING::warn_if_not_native &&
+ config::PIPELINE::warn_if_not_native
+ ) {
+ std::cerr << "Warning: set (matrix copy, nonblocking) currently delegates "
+ << "to a blocking implementation.\n"
+ << " Further similar such warnings will be suppressed.\n";
+ internal::NONBLOCKING::warn_if_not_native = false;
+ }
+
+ // nonblocking execution is not supported
+ // first, execute any computation that is not completed
+ grb::internal::le.execution();
+
+ // second, delegate to the reference backend
+ return set< A_is_mask, descr, OutputType, InputType1, InputType2 >(
+ internal::getRefMatrix( C ), internal::getRefMatrix( A ), id );
+ }
+
+	} // end namespace grb::internal
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType,
+ typename InputType
+ >
+ RC set(
+ Matrix< OutputType, nonblocking > &C,
+ const Matrix< InputType, nonblocking > &A,
+ const Phase &phase = EXECUTE
+ ) noexcept {
+ static_assert( std::is_same< OutputType, void >::value ||
+ !std::is_same< InputType, void >::value,
+ "grb::set cannot interpret an input pattern matrix without a "
+ "semiring or a monoid. This interpretation is needed for "
+ "writing the non-pattern matrix output. Possible solutions: 1) "
+ "use a (monoid-based) foldl / foldr, 2) use a masked set, or "
+ "3) change the output of grb::set to a pattern matrix also." );
+#ifdef _DEBUG
+ std::cout << "Called grb::set (matrix-to-matrix, nonblocking)" << std::endl;
+#endif
+ // static checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType, OutputType >::value
+ ), "grb::set",
+ "called with non-matching value types" );
+
+ // dynamic checks
+ assert( phase != TRY );
+
+ // delegate
+ if( phase == RESIZE ) {
+ return resize( C, nnz( A ) );
+ } else {
+ assert( phase == EXECUTE );
+ return internal::set< false, descr >( C, A );
+ }
+ }
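+
+	// Usage sketch (illustrative only) of the two-phase protocol:
+	//
+	//   grb::Matrix< double, grb::nonblocking > A( n, n ), C( n, n );
+	//   grb::RC rc = grb::set( C, A, grb::RESIZE ); // ensure capacity
+	//   rc = rc ? rc : grb::set( C, A );            // EXECUTE performs the copy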
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType,
+ typename InputType1,
+ typename InputType2
+ >
+ RC set(
+ Matrix< OutputType, nonblocking > &C,
+ const Matrix< InputType1, nonblocking > &A,
+ const InputType2 &val,
+ const Phase &phase = EXECUTE
+ ) noexcept {
+ static_assert( !std::is_same< OutputType, void >::value,
+ "internal::grb::set (masked set to value): cannot have a pattern "
+ "matrix as output" );
+#ifdef _DEBUG
+ std::cout << "Called grb::set (matrix-to-value-masked, nonblocking)\n";
+#endif
+ // static checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType2, OutputType >::value
+ ), "grb::set",
+ "called with non-matching value types"
+ );
+
+ // dynamic checks
+ assert( phase != TRY );
+
+ // delegate
+ if( phase == RESIZE ) {
+ return resize( C, nnz( A ) );
+ } else {
+ assert( phase == EXECUTE );
+ if( std::is_same< OutputType, void >::value ) {
+ return internal::set< false, descr >( C, A );
+ } else {
+ return internal::set< true, descr >( C, A, &val );
+ }
+ }
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType,
+ typename fwd_iterator,
+ typename Coords,
+ class Dup = operators::right_assign< InputType >
+ >
+ RC buildVector(
+ Vector< InputType, nonblocking, Coords > &x,
+ fwd_iterator start,
+ const fwd_iterator end,
+ const IOMode mode,
+ const Dup &dup = Dup()
+ ) {
+ return buildVector< descr, InputType, fwd_iterator, Coords, Dup >(
+ internal::getRefVector( x ), start, end, mode, dup );
+ }
+
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType,
+ typename fwd_iterator1,
+ typename fwd_iterator2,
+ typename Coords,
+ class Dup = operators::right_assign< InputType >
+ >
+ RC buildVector(
+ Vector< InputType, nonblocking, Coords > &x,
+ fwd_iterator1 ind_start,
+ const fwd_iterator1 ind_end,
+ fwd_iterator2 val_start,
+ const fwd_iterator2 val_end,
+ const IOMode mode,
+ const Dup &dup = Dup()
+ ) {
+ internal::le.execution( &x );
+ return buildVector<
+ descr, InputType, fwd_iterator1, fwd_iterator2, Coords, Dup
+ >(
+ internal::getRefVector( x ), ind_start, ind_end, val_start, val_end,
+ mode, dup
+ );
+ }
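+
+	// Usage sketch (illustrative only): ingesting index/value pairs. This
+	// overload first forces any pending pipeline on x, then delegates to
+	// the reference backend:
+	//
+	//   std::vector< size_t > is { 0, 2, 4 };
+	//   std::vector< double > vs { 1.0, 2.0, 3.0 };
+	//   grb::buildVector( x, is.begin(), is.end(),
+	//     vs.begin(), vs.end(), grb::SEQUENTIAL );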
+
+ /** buildMatrixUnique is based on that of the reference backend */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename InputType,
+ typename RIT,
+ typename CIT,
+ typename NIT,
+ typename fwd_iterator
+ >
+ RC buildMatrixUnique(
+ Matrix< InputType, nonblocking, RIT, CIT, NIT > &A,
+ fwd_iterator start,
+ const fwd_iterator end,
+ const IOMode mode
+ ) {
+ return buildMatrixUnique<
+ descr, InputType, RIT, CIT, NIT, fwd_iterator
+ >( internal::getRefMatrix(A), start, end, mode );
+ }
+
+ template<
+ typename InputType,
+ typename Coords
+ >
+ uintptr_t getID( const Vector< InputType, nonblocking, Coords > &x ) {
+ return getID( internal::getRefVector( x ) );
+ }
+
+ template<>
+ RC wait< nonblocking >();
+
+ /** \internal Dispatch to base wait implementation */
+ template<
+ typename InputType,
+ typename Coords,
+ typename ... Args
+ >
+ RC wait(
+ const Vector< InputType, nonblocking, Coords > &x,
+ const Args &... args
+ ) {
+ RC ret = internal::le.execution( &x );
+ if( ret != SUCCESS ) {
+ return ret;
+ }
+ return wait( args... );
+ }
+
+ template<
+ typename InputType,
+ typename Coords
+ >
+ RC wait( const Vector< InputType, nonblocking, Coords > &x ) {
+ return internal::le.execution( &x );
+ }
+
+ /** \internal Dispatch to base wait implementation */
+ template<
+ typename InputType,
+ typename... Args
+ >
+ RC wait(
+ const Matrix< InputType, nonblocking > &A,
+ const Args &... args
+ ) {
+ (void) A;
+		// TODO: matrices are currently read-only, so no action is required;
+		//       once the level-3 primitives are implemented, the pipeline
+		//       should be executed here, as is done for vectors
+ return wait( args... );
+ }
+
+ template< typename InputType >
+ RC wait( const Matrix< InputType, nonblocking > &A ) {
+ (void) A;
+		// TODO: matrices are currently read-only, so no action is required;
+		//       once the level-3 primitives are implemented, the pipeline
+		//       should be executed here, as is done for vectors
+		return SUCCESS;
+ }
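+
+	// Usage sketch (illustrative only, assuming nonblocking is the selected
+	// default backend):
+	//
+	//   grb::set( x, 1.5 );           // lazily staged
+	//   grb::RC rc = grb::wait( x );  // runs the pipeline that writes x
+	//   rc = rc ? rc : grb::wait();   // runs all remaining pipelines via
+	//                                 // the wait< nonblocking > specialisation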
+
+ /** @} */
+
+} // namespace grb
+
+#undef NO_CAST_ASSERT
+
+#endif // end ``_H_GRB_NONBLOCKING_IO''
+
diff --git a/include/graphblas/nonblocking/lazy_evaluation.hpp b/include/graphblas/nonblocking/lazy_evaluation.hpp
new file mode 100644
index 000000000..426f530fb
--- /dev/null
+++ b/include/graphblas/nonblocking/lazy_evaluation.hpp
@@ -0,0 +1,178 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Supporting constructs for lazy evaluation.
+ *
+ * @author Aristeidis Mastoras
+ * @date 16th of May, 2022
+ */
+
+#ifndef _H_GRB_NONBLOCKING_LAZY_EVALUATION
+#define _H_GRB_NONBLOCKING_LAZY_EVALUATION
+
+#include
+
+#include "coordinates.hpp"
+#include "pipeline.hpp"
+
+
+namespace grb {
+
+ namespace internal {
+
+ /**
+ * Stores ALP primitives as stages in a set of pipelines maintained by this
+ * class.
+ */
+ class LazyEvaluation {
+
+ private:
+
+ /** Multiple pipelines may be maintained at any time. */
+ std::vector< Pipeline > pipelines;
+
+ /** Stores the pipelines that share data with the new stage. */
+ std::vector< std::vector< Pipeline >::iterator > shared_data_pipelines;
+
+ /**
+ * Makes sure any warnings related to exceeding the initial number of
+ * pipelines are printed only once.
+ */
+ bool warn_if_exceeded;
+
+ /**
+			 * Checks whether the number of pipelines in use has exceeded the
+			 * initial number of pipelines.
+ *
+ * The initial number is configurable via the following configuration
+ * field: #grb::config::PIPELINE::max_pipelines.
+ */
+ void checkIfExceeded() noexcept;
+
+
+ public:
+
+ /** Default constructor. */
+ LazyEvaluation();
+
+ /**
+ * Adds a stage to an automatically determined pipeline.
+ *
+ * The following parameters are mandatory:
+ *
+ * @param[in] func The function to be added.
+ * @param[in] opcode The corresponding opcode.
+ * @param[in] n The pipeline size.
+ * @param[in] data_type_size The output byte size.
+ * @param[in] dense_descr Whether the op is dense.
+ * @param[in] dense_mask Whether the mask is dense.
+ *
+			 * The following parameters are optional and may be nullptr when
+ * not required:
+ *
+ * @param[out] output_container_ptr Pointer to the output container.
+ * @param[out] output_aux_container_ptr Pointer to another output.
+ * @param[out] coor_output_ptr Pointer to the coordinates that
+ * correspond to
+ * \a output_container_ptr
+ * @param[out] coor_output_aux_ptr Pointer to the coordinates that
+ * correspond to
+ * \a output_aux_container_ptr
+ * @param[in] input_a_ptr Pointer to a first input container.
+ * @param[in] input_b_ptr Pointer to a second such container.
+ * @param[in] input_c_ptr Pointer to a third such container.
+ * @param[in] input_d_ptr Pointer to a fourth such container.
+ * @param[in] coor_a_ptr Pointer to coordinates that
+ * correspond to \a input_a_ptr.
+ * @param[in] coor_b_ptr Pointer to coordinates that
+ * correspond to \a input_b_ptr.
+ * @param[in] coor_c_ptr Pointer to coordinates that
+ * correspond to \a input_c_ptr.
+ * @param[in] coor_d_ptr Pointer to coordinates that
+ * correspond to \a input_d_ptr.
+ * @param[in] input_matrix Pointer to an input matrix.
+ */
+ RC addStage(
+ const Pipeline::stage_type &&func,
+ const Opcode opcode,
+ const size_t n,
+ const size_t data_type_size,
+ const bool dense_descr,
+ const bool dense_mask,
+ void * const output_container_ptr,
+ void * const output_aux_container_ptr,
+ Coordinates< nonblocking > * const coor_output_ptr,
+ Coordinates< nonblocking > * const coor_output_aux_ptr,
+ const void * const input_a_ptr,
+ const void * const input_b_ptr,
+ const void * const input_c_ptr,
+ const void * const input_d_ptr,
+ const Coordinates< nonblocking > * const coor_a_ptr,
+ const Coordinates< nonblocking > * const coor_b_ptr,
+ const Coordinates< nonblocking > * const coor_c_ptr,
+ const Coordinates< nonblocking > * const coor_d_ptr,
+ const void * const input_matrix
+ );
+
+ /**
+ * Adds an eWiseLambda stage to an automatically-determined pipeline.
+ *
+ * The following parameters are mandatory:
+ *
+ * @param[in] func The function to be added.
+ * @param[in] opcode The corresponding opcode.
+ * @param[in] n The pipeline size.
+ * @param[in] data_type_size The output byte size.
+ * @param[in] dense_descr Whether the op is dense.
+ * @param[in] all_containers_ptr A container of all ALP containers that the
+ * \a func reads \em or writes
+ * @param[in] coor_a_ptr A container of all coordinates that
+ * correspond to those in
+ * \a all_containers_ptr
+ */
+ RC addeWiseLambdaStage(
+ const Pipeline::stage_type &&func,
+ const Opcode opcode,
+ const size_t n,
+ const size_t data_type_size,
+ const bool dense_descr,
+ std::vector< const void * > all_containers_ptr,
+ const Coordinates< nonblocking > * const coor_a_ptr
+ );
+
+ /**
+ * Executes the pipeline necessary to generate the output of the given
+ * \a container.
+ */
+ RC execution( const void *container );
+
+ /**
+ * Executes all pipelines.
+ */
+ RC execution();
+
+ }; // end class LazyEvaluation
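+
+		// An assumed sketch of the intended control flow around this class:
+		// each nonblocking primitive calls addStage() with its lambda and the
+		// containers plus coordinates it touches, allowing the stage to be
+		// routed into a pipeline that shares data with it; execution( &c )
+		// later runs exactly the pipeline that produces container c, while
+		// execution() drains all pipelines.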
+
+ } // end namespace internal
+
+} // end namespace grb
+
+#endif // end ``_H_GRB_NONBLOCKING_LAZY_EVALUATION''
+
diff --git a/include/graphblas/nonblocking/matrix.hpp b/include/graphblas/nonblocking/matrix.hpp
new file mode 100644
index 000000000..b13a8c2be
--- /dev/null
+++ b/include/graphblas/nonblocking/matrix.hpp
@@ -0,0 +1,595 @@
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Provides the nonblocking matrix container.
+ *
+ * @author Aristeidis Mastoras
+ * @date 16th of May, 2022
+ */
+
+#ifndef _H_GRB_NONBLOCKING_MATRIX
+#define _H_GRB_NONBLOCKING_MATRIX
+
+#include <sstream> // std::stringstream
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include