Backends:
- CPU: aligned memory allocation
- GPU: CUDA
Algorithms:
- BLAS (partial)
  - CPU: handmade, Eigen, cblas
  - GPU: cuBLAS
Parallelism:
- CPU: OpenMP, sequential (see the CPU sketch after the CUDA examples below)
- GPU: CUDA, Thrust
Python bindings
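Creating a non-owning view over existing CPU memory: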
shape_t<keep_dim, 3, 2> shape(4, 3, 2);  // 4 x 3 x 2
int p[24];
for (int i = 0; i < 24; ++i) {
  p[i] = i + 1;
}
auto view = make_view<device::cpu>(p, shape);  // non-owning view over p
auto value0 = view(0, 0, 0);                   // == 1
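Allocating an owning buffer and accessing the same memory through its view (row-major layout):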
auto buf = make_buffer<int>(4, 3, 2);  // owning 4 x 3 x 2 buffer
auto view = buf.view();
for (int i = 0; i < 4; ++i) {
  for (int j = 0; j < 3; ++j) {
    for (int k = 0; k < 2; ++k) {
      // Row-major layout: element (i, j, k) sits at offset i * 6 + j * 2 + k.
      buf.data()[i * 6 + j * 2 + k] = i * 6 + j * 2 + k + 1;
      EXPECT_EQ(view(i, j, k), i * 6 + j * 2 + k + 1);
    }
  }
}
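Views are iterable; the loop below checks only the first slice along the leading dimension and then breaks: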
for (const auto &vi : view) {
  for (int j = 0; j < 3; ++j) {
    for (int k = 0; k < 2; ++k) {
      EXPECT_EQ(view(0, j, k), j * 2 + k + 1);
    }
  }
  break;
}
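On the GPU, a buffer can be allocated on the CUDA device and written by an ordinary kernel: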
__global__ void set_value(float *ptr, int size) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < size) {
    ptr[idx] = static_cast<float>(idx);
  }
}

// Create a 10 x 4 buffer on the CUDA device.
auto buf = make_buffer<float, device::cuda>(shape_t<-1, 4>(10, 4));
auto view = buf.view();
// Launch one block per element, one thread per block.
set_value<<<view.size(), 1>>>(buf.data(), buf.size());
// Parallelism: run a device lambda over every index in the view's shape.
par::cuda parfor;
parfor.run(view.shape(), [view] __device__(auto idx) {
  auto [i, j] = idx;
  printf("Lambda view[%d, %d] = %f\n", i, j, view(i, j));
});
// The iteration shape need not match the view's rank: a 4-D shape with
// trailing extents of 1 still addresses the same 2-D view.
parfor.run(dshape<4>(10, 4, 1, 1), [view] __device__(auto idx) {
  auto [i, j, k, l] = idx;
  printf("Lambda view[%d, %d, %d, %d] = %f\n", i, j, k, l, view(i, j));
});
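The feature list also mentions OpenMP and sequential execution on the CPU side. The sketch below is a guess at what the CPU counterpart could look like, assuming a par::openmp policy that mirrors par::cuda's run(shape, lambda) interface; the policy name and signature are assumptions, not confirmed API.

// Hypothetical CPU parallel-for, assuming par::openmp mirrors par::cuda
// (the policy name and run() signature are assumptions).
auto cpu_buf = make_buffer<float>(10, 4);
auto cpu_view = cpu_buf.view();
par::openmp cpu_parfor;
cpu_parfor.run(cpu_view.shape(), [cpu_view](auto idx) {
  auto [i, j] = idx;
  printf("CPU view[%d, %d] = %f\n", i, j, cpu_view(i, j));
});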