diff --git a/CHANGELOG.md b/CHANGELOG.md index dcbe0bba5..60cc6c5ad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,10 @@ Full documentation for rocSOLVER is available at the [rocSOLVER documentation](h ## (Unreleased) rocSOLVER ### Added + +* Hybrid computation support for existing routines: + - STERF + ### Changed ### Removed ### Optimized @@ -22,7 +26,7 @@ Full documentation for rocSOLVER is available at the [rocSOLVER documentation](h * Algorithm selection mechanism for hybrid computation * Hybrid computation support for existing routines: - BDSQR - - GESVD + - GESVD ### Optimized diff --git a/clients/common/auxiliary/testing_sterf.hpp b/clients/common/auxiliary/testing_sterf.hpp index 7585337f5..bc4e35ff4 100644 --- a/clients/common/auxiliary/testing_sterf.hpp +++ b/clients/common/auxiliary/testing_sterf.hpp @@ -213,6 +213,19 @@ void testing_sterf(Arguments& argus) rocblas_int hot_calls = argus.iters; + if(argus.alg_mode) + { + EXPECT_ROCBLAS_STATUS( + rocsolver_set_alg_mode(handle, rocsolver_function_sterf, rocsolver_alg_mode_hybrid), + rocblas_status_success); + + rocsolver_alg_mode alg_mode; + EXPECT_ROCBLAS_STATUS(rocsolver_get_alg_mode(handle, rocsolver_function_sterf, &alg_mode), + rocblas_status_success); + + EXPECT_EQ(alg_mode, rocsolver_alg_mode_hybrid); + } + // check non-supported values // N/A diff --git a/clients/common/lapack/testing_syev_heev.hpp b/clients/common/lapack/testing_syev_heev.hpp index 00928585f..e50af29b4 100644 --- a/clients/common/lapack/testing_syev_heev.hpp +++ b/clients/common/lapack/testing_syev_heev.hpp @@ -398,6 +398,19 @@ void testing_syev_heev(Arguments& argus) rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; + if(argus.alg_mode) + { + EXPECT_ROCBLAS_STATUS( + rocsolver_set_alg_mode(handle, rocsolver_function_sterf, rocsolver_alg_mode_hybrid), + rocblas_status_success); + + rocsolver_alg_mode alg_mode; + EXPECT_ROCBLAS_STATUS(rocsolver_get_alg_mode(handle, rocsolver_function_sterf, &alg_mode), + rocblas_status_success); + + EXPECT_EQ(alg_mode, rocsolver_alg_mode_hybrid); + } + // check non-supported values if(uplo == rocblas_fill_full || evect == rocblas_evect_tridiagonal) { diff --git a/clients/gtest/auxiliary/sterf_gtest.cpp b/clients/gtest/auxiliary/sterf_gtest.cpp index 9ad0cc949..b266f8727 100644 --- a/clients/gtest/auxiliary/sterf_gtest.cpp +++ b/clients/gtest/auxiliary/sterf_gtest.cpp @@ -65,7 +65,8 @@ Arguments sterf_setup_arguments(sterf_tuple tup) return arg; } -class STERF : public ::TestWithParam<sterf_tuple> +template <int MODE> +class STERF_BASE : public ::TestWithParam<sterf_tuple> { protected: void TearDown() override { @@ -77,6 +78,7 @@ void run_tests() { Arguments arg = sterf_setup_arguments(GetParam()); + arg.alg_mode = MODE; if(arg.peek<rocblas_int>("n") == 0) testing_sterf_bad_arg<T>(); @@ -85,6 +87,14 @@ } }; +class STERF : public STERF_BASE<0> +{ +}; + +class STERF_HYBRID : public STERF_BASE<1> +{ +}; + // non-batch tests TEST_P(STERF, __float) @@ -97,6 +107,20 @@ run_tests<float>(); } TEST_P(STERF, __double) { run_tests<double>(); } +TEST_P(STERF_HYBRID, __float) +{ + run_tests<float>(); +} + +TEST_P(STERF_HYBRID, __double) +{ + run_tests<double>(); +} + INSTANTIATE_TEST_SUITE_P(daily_lapack, STERF, ValuesIn(large_matrix_size_range)); INSTANTIATE_TEST_SUITE_P(checkin_lapack, STERF, ValuesIn(matrix_size_range)); + +INSTANTIATE_TEST_SUITE_P(daily_lapack, STERF_HYBRID, ValuesIn(large_matrix_size_range)); + +INSTANTIATE_TEST_SUITE_P(checkin_lapack, STERF_HYBRID, ValuesIn(matrix_size_range));
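// ---------------------------------------------------------------------------
// Aside: a minimal, self-contained sketch of the fixture pattern introduced
// above, written against plain Google Test. All names here (ModeBase,
// ModeDefault, ModeHybrid) are hypothetical illustrations, not part of this
// patch. The idea: a non-type template parameter fixes the algorithm mode
// once, so identical TEST_P bodies and size ranges can be instantiated for
// both the default and the hybrid configuration.
// ---------------------------------------------------------------------------
#include <gtest/gtest.h>

template <int MODE>
class ModeBase : public ::testing::TestWithParam<int>
{
protected:
    void run_tests()
    {
        int n = GetParam();
        // In the real suites, MODE is forwarded into Arguments::alg_mode
        // before dispatching to the testing_* helper.
        EXPECT_GE(n, 0) << "mode " << MODE;
    }
};

class ModeDefault : public ModeBase<0>
{
};

class ModeHybrid : public ModeBase<1>
{
};

TEST_P(ModeDefault, sizes)
{
    run_tests();
}

TEST_P(ModeHybrid, sizes)
{
    run_tests();
}

INSTANTIATE_TEST_SUITE_P(checkin, ModeDefault, ::testing::Values(0, 16, 64));
INSTANTIATE_TEST_SUITE_P(checkin, ModeHybrid, ::testing::Values(0, 16, 64));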
diff --git a/clients/gtest/lapack/syev_heev_gtest.cpp b/clients/gtest/lapack/syev_heev_gtest.cpp index 82779d976..f5f5c7829 100644 --- a/clients/gtest/lapack/syev_heev_gtest.cpp +++ b/clients/gtest/lapack/syev_heev_gtest.cpp @@ -81,6 +81,7 @@ Arguments syev_heev_setup_arguments(syev_heev_tuple tup) return arg; } +template <int MODE> class SYEV_HEEV : public ::TestWithParam<syev_heev_tuple> { protected: @@ -93,6 +94,7 @@ class SYEV_HEEV : public ::TestWithParam<syev_heev_tuple> void run_tests() { Arguments arg = syev_heev_setup_arguments(GetParam()); + arg.alg_mode = MODE; if(arg.peek<rocblas_int>("n") == 0 && arg.peek<char>("evect") == 'N' && arg.peek<char>("uplo") == 'L') @@ -103,11 +105,19 @@ class SYEV_HEEV : public ::TestWithParam<syev_heev_tuple> } }; -class SYEV : public SYEV_HEEV +class SYEV : public SYEV_HEEV<0> { }; -class HEEV : public SYEV_HEEV +class HEEV : public SYEV_HEEV<0> +{ +}; + +class SYEV_HYBRID : public SYEV_HEEV<1> +{ +}; + +class HEEV_HYBRID : public SYEV_HEEV<1> { }; @@ -133,6 +143,26 @@ TEST_P(HEEV, __double_complex) run_tests<false, false, rocblas_double_complex>(); } +TEST_P(SYEV_HYBRID, __float) +{ + run_tests<false, false, float>(); +} + +TEST_P(SYEV_HYBRID, __double) +{ + run_tests<false, false, double>(); +} + +TEST_P(HEEV_HYBRID, __float_complex) +{ + run_tests<false, false, rocblas_float_complex>(); +} + +TEST_P(HEEV_HYBRID, __double_complex) +{ + run_tests<false, false, rocblas_double_complex>(); +} + // batched tests TEST_P(SYEV, batched__float) @@ -155,6 +185,26 @@ TEST_P(HEEV, batched__double_complex) run_tests<true, true, rocblas_double_complex>(); } +TEST_P(SYEV_HYBRID, batched__float) +{ + run_tests<true, true, float>(); +} + +TEST_P(SYEV_HYBRID, batched__double) +{ + run_tests<true, true, double>(); +} + +TEST_P(HEEV_HYBRID, batched__float_complex) +{ + run_tests<true, true, rocblas_float_complex>(); +} + +TEST_P(HEEV_HYBRID, batched__double_complex) +{ + run_tests<true, true, rocblas_double_complex>(); +} + // strided_batched tests TEST_P(SYEV, strided_batched__float) @@ -177,13 +227,49 @@ TEST_P(HEEV, strided_batched__double_complex) run_tests<false, true, rocblas_double_complex>(); } +TEST_P(SYEV_HYBRID, strided_batched__float) +{ + run_tests<false, true, float>(); +} + +TEST_P(SYEV_HYBRID, strided_batched__double) +{ + run_tests<false, true, double>(); +} + +TEST_P(HEEV_HYBRID, strided_batched__float_complex) +{ + run_tests<false, true, rocblas_float_complex>(); +} + +TEST_P(HEEV_HYBRID, strided_batched__double_complex) +{ + run_tests<false, true, rocblas_double_complex>(); +} + // daily_lapack tests normal execution with medium to large sizes INSTANTIATE_TEST_SUITE_P(daily_lapack, SYEV, Combine(ValuesIn(large_size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, HEEV, Combine(ValuesIn(large_size_range), ValuesIn(op_range))); +INSTANTIATE_TEST_SUITE_P(daily_lapack, + SYEV_HYBRID, + Combine(ValuesIn(large_size_range), ValuesIn(op_range))); + +INSTANTIATE_TEST_SUITE_P(daily_lapack, + HEEV_HYBRID, + Combine(ValuesIn(large_size_range), ValuesIn(op_range))); + // checkin_lapack tests normal execution with small sizes, invalid sizes, // quick returns, and corner cases INSTANTIATE_TEST_SUITE_P(checkin_lapack, SYEV, Combine(ValuesIn(size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, HEEV, Combine(ValuesIn(size_range), ValuesIn(op_range))); + +INSTANTIATE_TEST_SUITE_P(checkin_lapack, + SYEV_HYBRID, + Combine(ValuesIn(size_range), ValuesIn(op_range))); + +INSTANTIATE_TEST_SUITE_P(checkin_lapack, + HEEV_HYBRID, + Combine(ValuesIn(size_range), ValuesIn(op_range))); diff --git a/library/include/rocsolver/rocsolver-extra-types.h b/library/include/rocsolver/rocsolver-extra-types.h index 3335f8f3d..fd6e1ec82 100644 --- a/library/include/rocsolver/rocsolver-extra-types.h +++ b/library/include/rocsolver/rocsolver-extra-types.h @@ -188,6 +188,7 @@ typedef enum rocsolver_function_ { rocsolver_function_bdsqr = 401, rocsolver_function_gesvd = 402, + rocsolver_function_sterf = 403, } rocsolver_function; #endif /* ROCSOLVER_EXTRA_TYPES_H */
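// ---------------------------------------------------------------------------
// Aside: a hedged usage sketch of the new rocsolver_function_sterf enum value.
// This illustrates how a client would opt in to the hybrid STERF path through
// the algorithm-mode API this patch extends; it is an illustration, not code
// from the patch. The device buffers d_D, d_E, d_info are assumed to be
// allocated and populated by the caller.
// ---------------------------------------------------------------------------
#include <rocblas/rocblas.h>
#include <rocsolver/rocsolver.h>

rocblas_status sterf_with_hybrid_mode(rocblas_handle handle,
                                      rocblas_int n,
                                      double* d_D, // diagonal, device memory
                                      double* d_E, // off-diagonal, device memory
                                      rocblas_int* d_info)
{
    // Request the hybrid (CPU+GPU) algorithm for subsequent STERF calls on
    // this handle; rocsolver_alg_mode_gpu restores the default behavior.
    rocblas_status st
        = rocsolver_set_alg_mode(handle, rocsolver_function_sterf, rocsolver_alg_mode_hybrid);
    if(st != rocblas_status_success)
        return st;

    // Optional read-back, mirroring the round trip the tests above verify.
    rocsolver_alg_mode mode;
    st = rocsolver_get_alg_mode(handle, rocsolver_function_sterf, &mode);
    if(st != rocblas_status_success || mode != rocsolver_alg_mode_hybrid)
        return rocblas_status_internal_error;

    return rocsolver_dsterf(handle, n, d_D, d_E, d_info);
}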
diff --git a/library/include/rocsolver/rocsolver-functions.h b/library/include/rocsolver/rocsolver-functions.h index bd9a0592d..12c6ed69e 100644 --- a/library/include/rocsolver/rocsolver-functions.h +++ b/library/include/rocsolver/rocsolver-functions.h @@ -4005,6 +4005,10 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zbdsqr(rocblas_handle handle, The matrix is not represented explicitly, but rather as the array of diagonal elements D and the array of symmetric off-diagonal elements E. + \note + A hybrid (CPU+GPU) approach is available for STERF, primarily intended for + homogeneous architectures. Use \ref rocsolver_set_alg_mode to enable it. + @param[in] handle rocblas_handle. @param[in] diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp index 394aa2339..21e56b410 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp @@ -35,6 +35,7 @@ #include "lapack_host_functions.hpp" #include "rocauxiliary_lasr.hpp" +#include "rocsolver_hybrid_array.hpp" ROCSOLVER_BEGIN_NAMESPACE @@ -1341,15 +1342,15 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, const rocblas_stride strideD, S* E, const rocblas_stride strideE, - W1 V_arg, + W1 V, const I shiftV, const I ldv, const rocblas_stride strideV, - W2 U_arg, + W2 U, const I shiftU, const I ldu, const rocblas_stride strideU, - W3 C_arg, + W3 C, const I shiftC, const I ldc, const rocblas_stride strideC, @@ -1358,203 +1359,48 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, I* splits_map, S* work) { - // ------------------------------- - // lambda expression as helper - // ------------------------------- - auto is_device_pointer = [](void* ptr) -> bool { - hipPointerAttribute_t dev_attributes; - if(ptr == nullptr) - { - return (false); - } - - auto istat = hipPointerGetAttributes(&dev_attributes, ptr); - if(istat != hipSuccess) - { - std::cout << "is_device_pointer: istat = " << istat << " " << hipGetErrorName(istat) - << std::endl; - } - assert(istat == hipSuccess); - return (dev_attributes.type == hipMemoryTypeDevice); - }; - - // ------------------------- - // copy D into hD, E into hE - // ------------------------- hipStream_t stream; rocblas_get_stream(handle, &stream); - W1 V = V_arg; - W2 U = U_arg; - W3 C = C_arg; - - // handle batch case with array of pointers on device - std::vector<T*> Vp_array(batch_count); - std::vector<T*> Up_array(batch_count); - std::vector<T*> Cp_array(batch_count); - + // ----------------------------------- + // transfer arrays from device to host + // ----------------------------------- + rocsolver_hybrid_array hD; + rocsolver_hybrid_array hE; + rocsolver_hybrid_array hV; + rocsolver_hybrid_array hU; + rocsolver_hybrid_array hC; + rocsolver_hybrid_array hInfo; + + ROCBLAS_CHECK(hD.init_async(n, D, strideD, batch_count, stream)); + ROCBLAS_CHECK(hE.init_async(n - 1, E, strideE, batch_count, stream)); if(nv > 0) - { - bool const is_device_V_arg = is_device_pointer((void*)V_arg); - if(is_device_V_arg) - { - // note "T *" and "T * const" may be considered different types - bool constexpr is_array_of_device_pointers - = !(std::is_same<W1, T*>::value || std::is_same<W1, T* const>::value); - bool constexpr need_copy_W1 = is_array_of_device_pointers; - if constexpr(need_copy_W1) - { - size_t const nbytes = sizeof(T*) * batch_count; - void* const dst = (void*)&(Vp_array[0]); - void* const src = (void*)V_arg; - HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, hipMemcpyDefault, stream)); - 
HIP_CHECK(hipStreamSynchronize(stream)); - V = &(Vp_array[0]); - } - } - } - + ROCBLAS_CHECK(hV.init_pointers_only(V + shiftV, strideV, batch_count, stream)); if(nu > 0) - { - bool const is_device_U_arg = is_device_pointer((void*)U_arg); - if(is_device_U_arg) - { - bool constexpr is_array_of_device_pointers - = !(std::is_same<W2, T*>::value || std::is_same<W2, T* const>::value); - bool constexpr need_copy_W2 = is_array_of_device_pointers; - if constexpr(need_copy_W2) - { - size_t const nbytes = sizeof(T*) * batch_count; - void* const dst = (void*)&(Up_array[0]); - void* const src = (void*)U_arg; - HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, hipMemcpyDefault, stream)); - HIP_CHECK(hipStreamSynchronize(stream)); - U = &(Up_array[0]); - } - } - } - + ROCBLAS_CHECK(hU.init_pointers_only(U + shiftU, strideU, batch_count, stream)); if(nc > 0) - { - bool const is_device_C_arg = is_device_pointer((void*)C_arg); - if(is_device_C_arg) - { - bool constexpr is_array_of_device_pointers - = !(std::is_same<W3, T*>::value || std::is_same<W3, T* const>::value); - bool constexpr need_copy_W3 = is_array_of_device_pointers; - if constexpr(need_copy_W3) - { - size_t const nbytes = sizeof(T*) * batch_count; - void* const dst = (void*)&(Cp_array[0]); - void* const src = (void*)C_arg; - HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, hipMemcpyDefault, stream)); - HIP_CHECK(hipStreamSynchronize(stream)); - C = &(Cp_array[0]); - } - } - } - - S* hD = nullptr; - S* hE = nullptr; - - size_t const E_size = (n - 1); - HIP_CHECK(hipHostMalloc(&hD, (sizeof(S) * std::max(1, batch_count)) * n)); - HIP_CHECK(hipHostMalloc(&hE, (sizeof(S) * std::max(1, batch_count)) * E_size)); - - // ---------------------------------------------------- - // copy info_array[] on device to linfo_array[] on host - // ---------------------------------------------------- - I* linfo_array = nullptr; - HIP_CHECK(hipHostMalloc(&linfo_array, sizeof(I) * std::max(1, batch_count))); - { - void* const dst = &(linfo_array[0]); - void* const src = &(info_array[0]); - size_t const nbytes = sizeof(I) * batch_count; - hipMemcpyKind const kind = hipMemcpyDeviceToHost; - - HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, kind, stream)); - } + ROCBLAS_CHECK(hC.init_pointers_only(C + shiftC, strideC, batch_count, stream)); + ROCBLAS_CHECK(hInfo.init_async(1, info_array, 1, batch_count, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); S* hwork = nullptr; HIP_CHECK(hipHostMalloc(&hwork, sizeof(S) * (4 * n))); + S* dwork = nullptr; + HIP_CHECK(hipMalloc(&dwork, sizeof(S) * (4 * n))); - // ------------------------------------------------- - // transfer arrays D(:) and E(:) from Device to Host - // ------------------------------------------------- - bool const use_single_copy_for_D = (batch_count == 1) || (strideD == n); - if(use_single_copy_for_D) - { - void* const dst = (void*)&(hD[0]); - void* const src = (void*)&(D[0]); - size_t const sizeBytes = sizeof(S) * n * batch_count; - hipMemcpyKind const kind = hipMemcpyDeviceToHost; - - HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); - } - else - { - for(I bid = 0; bid < batch_count; bid++) - { - void* const dst = (void*)&(hD[bid * n]); - void* const src = (void*)&(D[bid * strideD]); - size_t const sizeBytes = sizeof(S) * n; - hipMemcpyKind const kind = hipMemcpyDeviceToHost; - HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); - } - } - - { - for(I bid = 0; bid < batch_count; bid++) - { - void* const dst = (void*)&(hE[bid * E_size]); - void* const src = (void*)&(E[bid * strideE]); - size_t const sizeBytes = sizeof(S) * E_size; - hipMemcpyKind 
const kind = hipMemcpyDeviceToHost; - HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); - } - } - - // ------------------------------------------------- + // -------------------------------------- // Execute for each instance in the batch - // ------------------------------------------------- - HIP_CHECK(hipStreamSynchronize(stream)); - - S* dwork_ = nullptr; - HIP_CHECK(hipMalloc(&dwork_, sizeof(S) * (4 * n))); - + // -------------------------------------- for(I bid = 0; bid < batch_count; bid++) { - if(linfo_array[bid] != 0) - { + if(hInfo[bid][0] != 0) continue; - } char uplo = (uplo_in == rocblas_fill_lower) ? 'L' : 'U'; - S* d_ = &(hD[bid * n]); - S* e_ = &(hE[bid * E_size]); - - T* v_ = (nv > 0) ? load_ptr_batch(V, bid, shiftV, strideV) : nullptr; - T* u_ = (nu > 0) ? load_ptr_batch(U, bid, shiftU, strideU) : nullptr; - T* c_ = (nc > 0) ? load_ptr_batch(C, bid, shiftC, strideC) : nullptr; - S* work_ = &(hwork[0]); - I info = 0; - I nru = nu; - I ncc = nc; - - // NOTE: lapack dbdsqr() accepts "VT" and "ldvt" for transpose of V - // as input variable however, rocsolver bdsqr() accepts variable called "V" and "ldv" - // but it is actually holding "VT" - T* vt_ = v_; - I ldvt = ldv; - - I nrv = n; - I ncvt = nv; - bool const values_only = (ncvt == 0) && (nru == 0) && (ncc == 0); - - bdsqr_single_template(handle, uplo, n, ncvt, nru, ncc, d_, e_, vt_, ldvt, u_, ldu, - c_, ldc, work_, info, dwork_, stream); + bdsqr_single_template(handle, uplo, n, nv, nu, nc, hD[bid], hE[bid], hV[bid], ldv, + hU[bid], ldu, hC[bid], ldc, hwork, info, dwork, stream); if(info == 0) { @@ -1563,80 +1409,31 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, S const zero = S(0); for(I i = 0; i < (n - 1); i++) { - e_[i] = zero; + hE[bid][i] = zero; } } - if(linfo_array[bid] == 0) + if(hInfo[bid][0] == 0) { - linfo_array[bid] = info; + hInfo[bid][0] = info; } } // end for bid - // ------------------------------------------------- - // transfer arrays D(:) and E(:) from host to device - // ------------------------------------------------- - if(use_single_copy_for_D) - { - void* const src = (void*)&(hD[0]); - void* const dst = (void*)D; - size_t const sizeBytes = sizeof(S) * n * batch_count; - hipMemcpyKind const kind = hipMemcpyHostToDevice; - - HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); - } - else - { - for(I bid = 0; bid < batch_count; bid++) - { - void* const src = (void*)&(hD[bid * n]); - void* const dst = (void*)(D + bid * strideD); - size_t const sizeBytes = sizeof(S) * n; - hipMemcpyKind const kind = hipMemcpyHostToDevice; - HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); - } - } - - { - for(I bid = 0; bid < batch_count; bid++) - { - void* const src = (void*)&(hE[bid * E_size]); - void* const dst = (void*)&(E[bid * strideE]); - size_t const sizeBytes = sizeof(S) * E_size; - hipMemcpyKind const kind = hipMemcpyHostToDevice; - HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); - } - } - - // ------------------------------------------------------ - // copy linfo_array[] from host to info_array[] on device - // ------------------------------------------------------ - { - void* const src = (void*)&(linfo_array[0]); - void* const dst = (void*)&(info_array[0]); - size_t const nbytes = sizeof(I) * batch_count; - hipMemcpyKind const kind = hipMemcpyHostToDevice; - - HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, kind, stream)); - } - + // ----------------------------------- + // transfer arrays from host to device + // 
----------------------------------- + ROCBLAS_CHECK(hD.write_to_device_async(stream)); + ROCBLAS_CHECK(hE.write_to_device_async(stream)); + ROCBLAS_CHECK(hInfo.write_to_device_async(stream)); HIP_CHECK(hipStreamSynchronize(stream)); // ---------------------- // free allocated storage // ---------------------- HIP_CHECK(hipHostFree(hwork)); - hwork = nullptr; - HIP_CHECK(hipHostFree(hD)); - hD = nullptr; - HIP_CHECK(hipHostFree(hE)); - hE = nullptr; - HIP_CHECK(hipFree(dwork_)); - dwork_ = nullptr; - HIP_CHECK(hipHostFree(linfo_array)); - linfo_array = nullptr; - - return (rocblas_status_success); + HIP_CHECK(hipFree(dwork)); + + return rocblas_status_success; } ROCSOLVER_END_NAMESPACE diff --git a/library/src/auxiliary/rocauxiliary_sterf.hpp b/library/src/auxiliary/rocauxiliary_sterf.hpp index 10922e16b..338ec2327 100644 --- a/library/src/auxiliary/rocauxiliary_sterf.hpp +++ b/library/src/auxiliary/rocauxiliary_sterf.hpp @@ -35,6 +35,7 @@ #include "lapack_device_functions.hpp" #include "rocblas.hpp" #include "rocsolver/rocsolver.h" +#include "rocsolver_hybrid_array.hpp" ROCSOLVER_BEGIN_NAMESPACE @@ -46,21 +47,16 @@ ROCSOLVER_BEGIN_NAMESPACE /** STERF_SQ_E squares the elements of E **/ template <typename T> -__device__ void sterf_sq_e(const rocblas_int start, const rocblas_int end, T* E) +__device__ __host__ void sterf_sq_e(const rocblas_int start, const rocblas_int end, T* E) { for(int i = start; i < end; i++) E[i] = E[i] * E[i]; } -/** STERF_KERNEL implements the main loop of the sterf algorithm - to compute the eigenvalues of a symmetric tridiagonal matrix given by D - and E **/ template <typename T> -ROCSOLVER_KERNEL void sterf_kernel(const rocblas_int n, - T* DD, - const rocblas_stride strideD, - T* EE, - const rocblas_stride strideE, +__device__ __host__ void run_sterf(const rocblas_int n, + T* D, + T* E, rocblas_int* info, rocblas_int* stack, const rocblas_int max_iters, @@ -68,11 +64,6 @@ const T ssfmin, const T ssfmax) { - rocblas_int bid = hipBlockIdx_x; - - T* D = DD + (bid * strideD); - T* E = EE + (bid * strideE); - rocblas_int m, l, lsv, lend, lendsv; rocblas_int l1 = 0; rocblas_int iters = 0; @@ -85,7 +76,7 @@ E[l1 - 1] = 0; for(m = l1; m < n - 1; m++) { - if(abs(E[m]) <= sqrt(abs(D[m])) * sqrt(abs(D[m + 1])) * eps) + if(std::abs(E[m]) <= std::sqrt(std::abs(D[m])) * std::sqrt(std::abs(D[m + 1])) * eps) { E[m] = 0; break; } @@ -109,7 +100,7 @@ sterf_sq_e(l, lend, E); // Choose iteration type (QL or QR) - if(abs(D[lend]) < abs(D[l])) + if(std::abs(D[lend]) < std::abs(D[l])) { lend = lsv; l = lendsv; } @@ -122,7 +113,7 @@ { // Find small subdiagonal element for(m = l; m <= lend - 1; m++) - if(abs(E[m]) <= eps * eps * abs(D[m] * D[m + 1])) + if(std::abs(E[m]) <= eps * eps * std::abs(D[m] * D[m + 1])) break; if(m < lend) @@ -137,7 +128,7 @@ { // Use lae2 to compute 2x2 eigenvalues. Using rte, rt1, rt2. 
T rte, rt1, rt2; - rte = sqrt(E[l]); + rte = std::sqrt(E[l]); lae2(D[l], rte, D[l + 1], rt1, rt2); D[l] = rt1; D[l + 1] = rt2; @@ -153,12 +144,12 @@ T sigma, gamma, r, rte, c, s; // Form shift - rte = sqrt(E[l]); + rte = std::sqrt(E[l]); sigma = (D[l + 1] - p) / (2 * rte); if(sigma >= 0) - r = abs(sqrt(1 + sigma * sigma)); + r = std::abs(std::sqrt(1 + sigma * sigma)); else - r = -abs(sqrt(1 + sigma * sigma)); + r = -std::abs(std::sqrt(1 + sigma * sigma)); sigma = p - (rte / (sigma + r)); c = 1; @@ -198,7 +189,7 @@ { // Find small subdiagonal element for(m = l; m >= lend + 1; m--) - if(abs(E[m - 1]) <= eps * eps * abs(D[m] * D[m - 1])) + if(std::abs(E[m - 1]) <= eps * eps * std::abs(D[m] * D[m - 1])) break; if(m > lend) @@ -213,7 +204,7 @@ { // Use lae2 to compute 2x2 eigenvalues. Using rte, rt1, rt2. T rte, rt1, rt2; - rte = sqrt(E[l - 1]); + rte = std::sqrt(E[l - 1]); lae2(D[l], rte, D[l - 1], rt1, rt2); D[l] = rt1; D[l - 1] = rt2; @@ -229,12 +220,12 @@ T sigma, gamma, r, rte, c, s; // Form shift. Using rte, r, c, s. - rte = sqrt(E[l - 1]); + rte = std::sqrt(E[l - 1]); sigma = (D[l - 1] - p) / (2 * rte); if(sigma >= 0) - r = abs(sqrt(1 + sigma * sigma)); + r = std::abs(std::sqrt(1 + sigma * sigma)); else - r = -abs(sqrt(1 + sigma * sigma)); + r = -std::abs(std::sqrt(1 + sigma * sigma)); sigma = p - (rte / (sigma + r)); c = 1; @@ -277,14 +268,14 @@ // Check for convergence for(int i = 0; i < n - 1; i++) if(E[i] != 0) - info[bid]++; + (*info)++; // Sort eigenvalues /** (TODO: the quick-sort method implemented in lasrt_increasing fails for some cases. Substituting it here with a simple sorting algorithm. If more performance is required in the future, lasrt_increasing should be debugged or another quick-sort method could be implemented) **/ - //lasrt_increasing(n, D, stack + bid * (2 * 32)); + //lasrt_increasing(n, D, stack); for(int ii = 1; ii < n; ii++) { @@ -307,6 +298,32 @@ } }
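// ---------------------------------------------------------------------------
// Aside: the hybrid paths in this patch stage device data on the host through
// rocsolver_hybrid_array (defined in rocsolver_hybrid_array.hpp, which is not
// shown in this diff). The simplified stand-in below is an assumption-based
// sketch of that pattern for strided arrays only, just to illustrate the
// interface used here: init_async copies device data into pinned host memory,
// operator[] exposes a per-batch host pointer for the CPU loop, and
// write_to_device_async copies results back. All names are hypothetical.
// ---------------------------------------------------------------------------
template <typename T>
struct hybrid_stage_sketch
{
    T* hbuf = nullptr; // pinned host staging buffer
    T* dsrc = nullptr; // strided device source/destination
    rocblas_int dim = 0;
    rocblas_stride stride = 0;
    rocblas_int batch_count = 0;

    rocblas_status init_async(rocblas_int dim_in, T* dsrc_in, rocblas_stride stride_in,
                              rocblas_int bc_in, hipStream_t stream)
    {
        dim = dim_in;
        dsrc = dsrc_in;
        stride = stride_in;
        batch_count = bc_in;
        if(hipHostMalloc(&hbuf, sizeof(T) * size_t(dim) * batch_count) != hipSuccess)
            return rocblas_status_memory_error;
        // stage each batch instance device -> host
        for(rocblas_int b = 0; b < batch_count; b++)
            if(hipMemcpyAsync(hbuf + b * size_t(dim), dsrc + b * stride, sizeof(T) * dim,
                              hipMemcpyDeviceToHost, stream)
               != hipSuccess)
                return rocblas_status_internal_error;
        return rocblas_status_success;
    }

    // host-side view of batch instance b (valid once the stream is synchronized)
    T* operator[](rocblas_int b)
    {
        return hbuf + b * size_t(dim);
    }

    rocblas_status write_to_device_async(hipStream_t stream)
    {
        // write each batch instance host -> device
        for(rocblas_int b = 0; b < batch_count; b++)
            if(hipMemcpyAsync(dsrc + b * stride, hbuf + b * size_t(dim), sizeof(T) * dim,
                              hipMemcpyHostToDevice, stream)
               != hipSuccess)
                return rocblas_status_internal_error;
        return rocblas_status_success;
    }

    ~hybrid_stage_sketch()
    {
        if(hbuf != nullptr)
            hipHostFree(hbuf);
    }
};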
+/** STERF_KERNEL implements the main loop of the sterf algorithm to compute the eigenvalues of a symmetric tridiagonal matrix given by D and E **/ +template <typename T> +ROCSOLVER_KERNEL void sterf_kernel(const rocblas_int n, + T* DD, + const rocblas_stride strideD, + T* EE, + const rocblas_stride strideE, + rocblas_int* infoA, + rocblas_int* stackA, + const rocblas_int max_iters, + const T eps, + const T ssfmin, + const T ssfmax) +{ + rocblas_int bid = hipBlockIdx_x; + + T* D = DD + (bid * strideD); + T* E = EE + (bid * strideE); + rocblas_int* info = infoA + bid; + rocblas_int* stack = stackA + bid * (2 * 32); + + run_sterf(n, D, E, info, stack, max_iters, eps, ssfmin, ssfmax); +} + template <typename T> void rocsolver_sterf_getMemorySize(const rocblas_int n, const rocblas_int batch_count, @@ -368,6 +385,9 @@ rocblas_status rocsolver_sterf_template(rocblas_handle handle, hipStream_t stream; rocblas_get_stream(handle, &stream); + rocsolver_alg_mode alg_mode; + ROCBLAS_CHECK(rocsolver_get_alg_mode(handle, rocsolver_function_sterf, &alg_mode)); + rocblas_int blocksReset = (batch_count - 1) / BS1 + 1; dim3 gridReset(blocksReset, 1, 1); dim3 threads(BS1, 1, 1); @@ -385,8 +405,33 @@ rocblas_status rocsolver_sterf_template(rocblas_handle handle, ssfmin = sqrt(ssfmin) / (eps * eps); ssfmax = sqrt(ssfmax) / T(3.0); - ROCSOLVER_LAUNCH_KERNEL(sterf_kernel<T>, dim3(batch_count), dim3(1), 0, stream, n, D + shiftD, - strideD, E + shiftE, strideE, info, stack, 30 * n, eps, ssfmin, ssfmax); + if(alg_mode == rocsolver_alg_mode_hybrid) + { + rocsolver_hybrid_array hD; + rocsolver_hybrid_array hE; + rocsolver_hybrid_array hInfo; + + ROCBLAS_CHECK(hD.init_async(n, D + shiftD, strideD, batch_count, stream)); + ROCBLAS_CHECK(hE.init_async(n - 1, E + shiftE, strideE, batch_count, stream)); + ROCBLAS_CHECK(hInfo.init_async(1, info, 1, batch_count, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + + for(rocblas_int b = 0; b < batch_count; b++) + { + run_sterf(n, hD[b], hE[b], hInfo[b], nullptr, 30 * n, eps, ssfmin, ssfmax); + } + + ROCBLAS_CHECK(hD.write_to_device_async(stream)); + ROCBLAS_CHECK(hE.write_to_device_async(stream)); + ROCBLAS_CHECK(hInfo.write_to_device_async(stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + } + else + { + ROCSOLVER_LAUNCH_KERNEL(sterf_kernel<T>, dim3(batch_count), dim3(1), 0, stream, n, + D + shiftD, strideD, E + shiftE, strideE, info, stack, 30 * n, eps, + ssfmin, ssfmax); + } return rocblas_status_success; } diff --git a/library/src/common/rocsolver_handle.cpp b/library/src/common/rocsolver_handle.cpp index 949d60fd2..b3680e5b7 100644 --- a/library/src/common/rocsolver_handle.cpp +++ b/library/src/common/rocsolver_handle.cpp @@ -66,6 +66,12 @@ rocblas_status rocsolver_set_alg_mode_impl(rocblas_handle handle, handle_data->bdsqr_mode = mode; return rocblas_status_success; } + case rocsolver_function_sterf: + if(mode == rocsolver_alg_mode_gpu || mode == rocsolver_alg_mode_hybrid) + { + handle_data->sterf_mode = mode; + return rocblas_status_success; + } } return rocblas_status_invalid_value; @@ -95,6 +101,7 @@ rocblas_status rocsolver_get_alg_mode_impl(rocblas_handle handle, { case rocsolver_function_gesvd: case rocsolver_function_bdsqr: *mode = 
handle_data->bdsqr_mode; break; + case rocsolver_function_sterf: *mode = handle_data->sterf_mode; break; default: return rocblas_status_invalid_value; } } diff --git a/library/src/include/lapack_device_functions.hpp b/library/src/include/lapack_device_functions.hpp index 6e4760c44..e766596ef 100644 --- a/library/src/include/lapack_device_functions.hpp +++ b/library/src/include/lapack_device_functions.hpp @@ -363,7 +363,7 @@ __device__ void lasr(const rocblas_side side, [ a b ] [ b c ] **/ template , int> = 0> -__device__ void lae2(T& a, T& b, T& c, T& rt1, T& rt2) +__device__ __host__ void lae2(T& a, T& b, T& c, T& rt1, T& rt2) { T sm = a + c; T adf = abs(a - c); diff --git a/library/src/include/lib_device_helpers.hpp b/library/src/include/lib_device_helpers.hpp index dc011f03a..08155d0e0 100644 --- a/library/src/include/lib_device_helpers.hpp +++ b/library/src/include/lib_device_helpers.hpp @@ -98,7 +98,7 @@ __device__ void /** FIND_MAX_TRIDIAG finds the element with the largest magnitude in the tridiagonal matrix **/ template -__device__ T find_max_tridiag(const rocblas_int start, const rocblas_int end, T* D, T* E) +__device__ __host__ T find_max_tridiag(const rocblas_int start, const rocblas_int end, T* D, T* E) { T anorm = abs(D[end]); for(int i = start; i < end; i++) @@ -109,7 +109,8 @@ __device__ T find_max_tridiag(const rocblas_int start, const rocblas_int end, T* /** SCALE_TRIDIAG scales the elements of the tridiagonal matrix by a given scale factor **/ template -__device__ void scale_tridiag(const rocblas_int start, const rocblas_int end, T* D, T* E, T scale) +__device__ __host__ void + scale_tridiag(const rocblas_int start, const rocblas_int end, T* D, T* E, T scale) { D[end] *= scale; for(int i = start; i < end; i++) @@ -119,1122 +120,1124 @@ __device__ void scale_tridiag(const rocblas_int start, const rocblas_int end, T* } } -// ********************************************************** -// GPU kernels that are used by many rocsolver functions -// ********************************************************** - -enum copymat_direction -{ - copymat_to_buffer, - copymat_from_buffer -}; - -/** A mask that is always true. Typically used to make the mask optional, - by acting as the default when no other mask is provided. 
**/ -struct no_mask -{ - __device__ constexpr bool operator[](rocblas_int) const noexcept - { - return true; - } -}; - -/** An mask defined by an integer array (e.g., the info array) **/ -struct info_mask +template +__device__ static void shell_sort_ascending(const I n, S* a, I* map = nullptr) { - enum mask_transform + // ----------------------------------------------- + // Sort array a[0...(n-1)] and generate permutation vector + // in map[] if map[] is available + // Note: performs in a single thread block + // ----------------------------------------------- + if((n <= 0) || (a == nullptr)) { - none, - negate + return; }; - explicit constexpr info_mask(rocblas_int* mask, mask_transform transform = none) noexcept - : m_mask(mask) - , m_negate(transform == negate) - { - } - - __device__ constexpr bool operator[](rocblas_int idx) const noexcept - { - return m_negate ^ !!m_mask[idx]; - } - - rocblas_int* m_mask; - bool m_negate; -}; - -/** COPY_MAT copies m-by-n array A into buffer if copymat_to_buffer, or buffer into A if copymat_from_buffer - An optional mask can be provided to limit the copy to selected matricies in the batch - If uplo = rocblas_fill_upper, only the upper triangular part is copied - If uplo = rocblas_fill_lower, only the lower triangular part is copied **/ -template -ROCSOLVER_KERNEL void copy_mat(copymat_direction direction, - const rocblas_int m, - const rocblas_int n, - U A, - const rocblas_int shiftA, - const rocblas_int lda, - const rocblas_stride strideA, - T* buffer, - const Mask mask = no_mask{}, - const rocblas_fill uplo = rocblas_fill_full, - const rocblas_diagonal diag = rocblas_diagonal_non_unit) -{ - const auto b = hipBlockIdx_z; - const auto j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; - const auto i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - - const rocblas_int ldb = m; - const rocblas_stride strideB = rocblas_stride(ldb) * n; - - const bool copy = mask[b]; - - const bool full = (uplo == rocblas_fill_full); - const bool upper = (uplo == rocblas_fill_upper); - const bool lower = (uplo == rocblas_fill_lower); - const bool cdiag = (diag == rocblas_diagonal_non_unit); - - if(i < m && j < n && copy) - { - if(full || (upper && j > i) || (lower && i > j) || (cdiag && i == j)) - { - T* Ap = load_ptr_batch(A, b, shiftA, strideA); - T* Bp = &buffer[b * strideB]; + bool const is_root_thread + = (hipThreadIdx_x == 0) && (hipThreadIdx_y == 0) && (hipThreadIdx_z == 0); - if(direction == copymat_to_buffer) - Bp[i + j * ldb] = Ap[i + j * lda]; - else // direction == copymat_from_buffer - Ap[i + j * lda] = Bp[i + j * ldb]; - } - } -} + int constexpr ngaps = 8; + int const gaps[ngaps] = {701, 301, 132, 57, 23, 10, 4, 1}; // # Ciura gap sequence -/** COPY_MAT copies m-by-n array A into B - An optional mask can be provided to limit the copy to selected matricies in the batch - If uplo = rocblas_fill_upper, only the upper triangular part is copied - If uplo = rocblas_fill_lower, only the lower triangular part is copied **/ -template -ROCSOLVER_KERNEL void copy_mat(const rocblas_int m, - const rocblas_int n, - U1 A, - const rocblas_int shiftA, - const rocblas_int lda, - const rocblas_stride strideA, - U2 B, - const rocblas_int shiftB, - const rocblas_int ldb, - const rocblas_stride strideB, - const Mask mask = no_mask{}, - const rocblas_fill uplo = rocblas_fill_full, - const rocblas_diagonal diag = rocblas_diagonal_non_unit) -{ - const auto b = hipBlockIdx_z; - const auto j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; - const auto i = hipBlockIdx_x * 
hipBlockDim_x + hipThreadIdx_x; + auto const k_start = hipThreadIdx_x + hipThreadIdx_y * hipBlockDim_x + + hipThreadIdx_z * (hipBlockDim_x * hipBlockDim_y); - const bool copy = mask[b]; + auto const k_inc = (hipBlockDim_x * hipBlockDim_y) * hipBlockDim_z; - const bool full = (uplo == rocblas_fill_full); - const bool upper = (uplo == rocblas_fill_upper); - const bool lower = (uplo == rocblas_fill_lower); - const bool cdiag = (diag == rocblas_diagonal_non_unit); + bool const has_map = (map != nullptr); - if(i < m && j < n && copy) + __syncthreads(); + if(has_map) { - if(full || (upper && j > i) || (lower && i > j) || (cdiag && i == j)) + for(auto k = k_start; k < n; k += k_inc) { - T* Ap = load_ptr_batch(A, b, shiftA, strideA); - T* Bp = load_ptr_batch(B, b, shiftB, strideB); - - Bp[i + j * ldb] = Ap[i + j * lda]; - } - } -} - -/** COPY_MAT copies m-by-n array A into buffer if copymat_to_buffer, or buffer into A if copymat_from_buffer - If uplo = rocblas_fill_upper, only the upper triangular part is copied - If uplo = rocblas_fill_lower, only the lower triangular part is copied - Only valid when A is complex and buffer real. If REAL, only works with real part of A; - if !REAL only works with imaginary part of A**/ -template , int> = 0> -ROCSOLVER_KERNEL void copy_mat(copymat_direction direction, - const rocblas_int m, - const rocblas_int n, - U A, - const rocblas_int shiftA, - const rocblas_int lda, - const rocblas_stride strideA, - S* buffer, - const rocblas_fill uplo = rocblas_fill_full, - const rocblas_diagonal diag = rocblas_diagonal_non_unit) -{ - const auto b = hipBlockIdx_z; - const auto j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; - const auto i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - - const rocblas_int ldb = m; - const rocblas_stride strideB = rocblas_stride(ldb) * n; - - const bool lower = (uplo == rocblas_fill_lower); - const bool full = (uplo == rocblas_fill_full); - const bool upper = (uplo == rocblas_fill_upper); - const bool cdiag = (diag == rocblas_diagonal_non_unit); + map[k] = k; + }; + }; + __syncthreads(); - if(i < m && j < n) + if(is_root_thread) { - if(full || (upper && j > i) || (lower && i > j) || (cdiag && i == j)) + for(auto igap = 0; igap < ngaps; igap++) { - T* Ap = load_ptr_batch(A, b, shiftA, strideA); - S* Bp = &buffer[b * strideB]; - - if(direction == copymat_to_buffer) - Bp[i + j * ldb] = REAL ? Ap[i + j * lda].real() : Ap[i + j * lda].imag(); - else if(REAL) - Ap[i + j * lda] = rocblas_complex_num(Bp[i + j * ldb], Ap[i + j * lda].imag()); - else - Ap[i + j * lda] = rocblas_complex_num(Ap[i + j * lda].real(), Bp[i + j * ldb]); - } - } -} - -/** COPY_TRANS_MAT copies m-by-n array A into B and transpose it depending on the value of trans. 
- An optional mask can be provided to limit the copy to selected matricies in the batch - If uplo = rocblas_fill_upper, only the upper triangular part is copied - If uplo = rocblas_fill_lower, only the lower triangular part is copied **/ -template -ROCSOLVER_KERNEL void copy_trans_mat(const rocblas_operation trans, - const rocblas_int m, - const rocblas_int n, - T1* A, - const rocblas_int shiftA, - const rocblas_int lda, - const rocblas_stride strideA, - T2* B, - const rocblas_int shiftB, - const rocblas_int ldb, - const rocblas_stride strideB, - const Mask mask = no_mask{}, - const rocblas_fill uplo = rocblas_fill_full, - const rocblas_diagonal diag = rocblas_diagonal_non_unit) -{ - const auto b = hipBlockIdx_z; - const auto j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; - const auto i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - - const bool copy = mask[b]; - - const bool full = (uplo == rocblas_fill_full); - const bool upper = (uplo == rocblas_fill_upper); - const bool lower = (uplo == rocblas_fill_lower); - const bool cdiag = (diag == rocblas_diagonal_non_unit); + auto const gap = gaps[igap]; - if(i < m && j < n && copy) - { - if(full || (upper && j > i) || (lower && i > j) || (cdiag && i == j)) - { - T1* Ap = load_ptr_batch(A, b, shiftA, strideA); - T2* Bp = load_ptr_batch(B, b, shiftB, strideB); + for(I i = gap; i < n; i += 1) + { + { + S const temp = a[i]; + auto const itemp = (has_map) ? map[i] : 0; - if(trans == rocblas_operation_conjugate_transpose) - Bp[j + i * ldb] = T2(conj(Ap[i + j * lda])); - else if(trans == rocblas_operation_transpose) - Bp[j + i * ldb] = T2(Ap[i + j * lda]); - else - Bp[i + j * ldb] = T2(Ap[i + j * lda]); - } - } -} + auto j = i; -template -ROCSOLVER_KERNEL void init_ident(const rocblas_int m, - const rocblas_int n, - U A, - const rocblas_int shiftA, - const rocblas_int lda, - const rocblas_stride strideA) -{ - const auto i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - const auto j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; - const auto b = hipBlockIdx_z; + while((j >= gap) && (a[j - gap] > temp)) + { + a[j] = a[j - gap]; + if(has_map) + { + map[j] = map[j - gap]; + }; + j -= gap; + }; - if(i < m && j < n) + a[j] = temp; + if(has_map) + { + map[j] = itemp; + }; + }; + }; + }; + }; + __syncthreads(); +#ifdef NDEBUG +#else + if(n >= 2) { - T* a = load_ptr_batch(A, b, shiftA, strideA); - - if(i == j) - a[i + j * lda] = 1.0; - else - a[i + j * lda] = 0.0; - } + for(auto k = k_start; k < (n - 1); k += k_inc) + { + assert(a[k] <= a[k + 1]); + }; + }; + __syncthreads(); +#endif } -template -ROCSOLVER_KERNEL void reset_info(T* info, const I n, U val, I incr = 0) +template +__device__ static void shell_sort_descending(const I n, S* a, I* map = nullptr) { - I idx = hipBlockIdx_x * static_cast(hipBlockDim_x) + hipThreadIdx_x; + // ----------------------------------------------- + // Sort array a[0...(n-1)] and generate permutation vector + // in map[] if map[] is available + // Note: performs in a single thread block + // ----------------------------------------------- - if(idx < n) - info[idx] = T(val) + incr * idx; -} + if((n <= 0) || (a == nullptr)) + { + return; + }; -template -ROCSOLVER_KERNEL void reset_batch_info(U info, const rocblas_stride stride, const I n, S val) -{ - I idx = hipBlockIdx_x * static_cast(hipBlockDim_x) + hipThreadIdx_x; - I b = hipBlockIdx_y; + bool const is_root_thread + = (hipThreadIdx_x == 0) && (hipThreadIdx_y == 0) && (hipThreadIdx_z == 0); - T* inf = load_ptr_batch(info, b, 0, stride); - if(idx < n) - 
inf[idx] = T(val); -} + int constexpr ngaps = 8; + int const gaps[ngaps] = {701, 301, 132, 57, 23, 10, 4, 1}; // # Ciura gap sequence -template -ROCSOLVER_KERNEL void get_array(T** out, T* in, rocblas_stride stride, I batch) -{ - I b = hipBlockIdx_x * static_cast(hipBlockDim_x) + hipThreadIdx_x; + auto const k_start = hipThreadIdx_x + hipThreadIdx_y * hipBlockDim_x + + hipThreadIdx_z * (hipBlockDim_x * hipBlockDim_y); - if(b < batch) - out[b] = in + b * stride; -} + auto const k_inc = (hipBlockDim_x * hipBlockDim_y) * hipBlockDim_z; -template -ROCSOLVER_KERNEL void shift_array(T** out, U in, rocblas_stride shift, I batch) -{ - I b = hipBlockIdx_x * static_cast(hipBlockDim_x) + hipThreadIdx_x; + bool const has_map = (map != nullptr); - if(b < batch) - out[b] = in[b] + shift; -} + __syncthreads(); + if(has_map) + { + for(auto k = k_start; k < n; k += k_inc) + { + map[k] = k; + }; + }; + __syncthreads(); -template -ROCSOLVER_KERNEL void subtract_tau(const rocblas_int i, - const rocblas_int j, - U A, - const rocblas_int shiftA, - const rocblas_int lda, - const rocblas_stride strideA, - T* ipiv, - const rocblas_stride strideP) -{ - const auto b = hipBlockIdx_x; - T* Ap = load_ptr_batch(A, b, shiftA, strideA); - T* tau = ipiv + b * strideP; + if(is_root_thread) + { + for(auto igap = 0; igap < ngaps; igap++) + { + auto const gap = gaps[igap]; - T t = -(*tau); - *tau = t; - Ap[i + j * lda] = 1.0 + t; -} + for(rocblas_int i = gap; i < n; i += 1) + { + { + S const temp = a[i]; + auto const itemp = (has_map) ? map[i] : 0; -template -ROCSOLVER_KERNEL void restau(const rocblas_int k, T* ipiv, const rocblas_stride strideP) -{ - const auto blocksizex = hipBlockDim_x; - const auto b = hipBlockIdx_y; - T* tau = ipiv + b * strideP; - const auto i = hipBlockIdx_x * blocksizex + hipThreadIdx_x; + auto j = i; - if(i < k) - tau[i] = -tau[i]; + while((j >= gap) && (a[j - gap] < temp)) + { + a[j] = a[j - gap]; + if(has_map) + { + map[j] = map[j - gap]; + }; + j -= gap; + }; + + a[j] = temp; + if(has_map) + { + map[j] = itemp; + }; + }; + }; + }; + }; + __syncthreads(); +#ifdef NDEBUG +#else + if(n >= 2) + { + for(auto k = k_start; k < (n - 1); k += k_inc) + { + assert(a[k] >= a[k + 1]); + }; + }; + __syncthreads(); +#endif } -template || rocblas_is_complex, int> = 0> -ROCSOLVER_KERNEL void set_diag(S* D, - const rocblas_stride shiftd, - const rocblas_stride strided, - U A, - const rocblas_stride shifta, - const I lda, - const rocblas_stride stridea, - const I n, - bool set_one) +template +__device__ static void shell_sort(const I n, S* a, I* map = nullptr, const bool is_ascending = true) { - I b = hipBlockIdx_x; - I i = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; - I j = i + i * lda; - - S* d = load_ptr_batch(D, b, shiftd, strided); - T* a = load_ptr_batch(A, b, shifta, stridea); + if((n <= 0) || (a == nullptr)) + { + return; + }; - if(i < n) + if(is_ascending) { - d[i] = a[j]; - a[j] = set_one ? 
T(1) : a[j]; + shell_sort_ascending(n, a, map); } + else + { + shell_sort_descending(n, a, map); + }; } -template && !rocblas_is_complex, int> = 0> -ROCSOLVER_KERNEL void set_diag(S* D, - const rocblas_stride shiftd, - const rocblas_stride strided, - U A, - const rocblas_stride shifta, - const I lda, - const rocblas_stride stridea, - const I n, - bool set_one) +template +__device__ static void selection_sort_ascending(const I n, S* D, I* map = nullptr) { - I b = hipBlockIdx_x; - I i = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; - I j = i + i * lda; + // --------------------------------------------------- + // Sort entries in D[0...(n-1)] + // generates permutation vector in map[] if map[] is available + // Note: performs in a single thread block + // --------------------------------------------------- + if((n <= 0) || (D == nullptr)) + { + return; + }; - S* d = load_ptr_batch(D, b, shiftd, strided); - T* a = load_ptr_batch(A, b, shifta, stridea); + auto const tid = hipThreadIdx_x + hipThreadIdx_y * hipBlockDim_x + + hipThreadIdx_z * (hipBlockDim_x * hipBlockDim_y); - if(i < n) - { - d[i] = a[j].real(); - a[j] = set_one ? T(1) : a[j]; - } -} + auto const nthreads = (hipBlockDim_x * hipBlockDim_y) * hipBlockDim_z; -template -ROCSOLVER_KERNEL void restore_diag(S* D, - const rocblas_stride shiftd, - const rocblas_stride strided, - U A, - const rocblas_stride shifta, - const I lda, - const rocblas_stride stridea, - const I n) -{ - I b = hipBlockIdx_x; - I i = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; - I j = i + i * lda; + auto const k_start = tid; + auto const k_inc = nthreads; + bool const is_root_thread = (tid == 0); - S* d = load_ptr_batch(D, b, shiftd, strided); - T* a = load_ptr_batch(A, b, shifta, stridea); + bool const has_map = (map != nullptr); - if(i < n) - a[j] = d[i]; -} + __syncthreads(); -/** SET_ZERO inserts zeros in all the entries of a m-by-n matrix A. - If uplo = lower, the lower triangular part of A is kept unchanged. - If uplo = upper, the upper triangular part of A is kept unchanged **/ -template -ROCSOLVER_KERNEL void set_zero(const rocblas_int m, - const rocblas_int n, - U A, - const rocblas_int shiftA, - const rocblas_int lda, - const rocblas_stride strideA, - const rocblas_fill uplo = rocblas_fill_full) -{ - const auto b = hipBlockIdx_z; - const auto i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - const auto j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; - const bool lower = (uplo == rocblas_fill_lower); - const bool full = (uplo == rocblas_fill_full); - const bool upper = (uplo == rocblas_fill_upper); + if(has_map) + { + for(auto k = k_start; k < n; k += k_inc) + { + map[k] = k; + }; + }; + + __syncthreads(); - if(i < m && j < n) + if(is_root_thread) { - if(full || (lower && j > i) || (upper && i > j)) + for(I ii = 1; ii < n; ii++) { - T* Ap = load_ptr_batch(A, b, shiftA, strideA); - Ap[i + j * lda] = 0.0; + auto l = ii - 1; + auto m = l; + auto p = D[l]; + auto ip = (has_map) ? 
map[l] : 0; + + for(auto j = ii; j < n; j++) + { + if(D[j] < p) + { + m = j; + p = D[j]; + if(has_map) + { + ip = map[j]; + }; + } + } + if(m != l) + { + D[m] = D[l]; + D[l] = p; + + if(has_map) + { + map[m] = map[l]; + map[l] = ip; + }; + } } } + __syncthreads(); +#ifdef NDEBUG +#else + if(n >= 2) + { + for(auto k = k_start; k < (n - 1); k += k_inc) + { + assert(D[k] <= D[k + 1]); + }; + }; + __syncthreads(); +#endif } -template -ROCSOLVER_KERNEL void copyshift_right(const bool copy, - const rocblas_int dim, - U A, - const rocblas_int shiftA, - const rocblas_int lda, - const rocblas_stride strideA, - T* W, - const rocblas_int shiftW, - const rocblas_int ldw, - const rocblas_stride strideW) +template +__device__ static void selection_sort_descending(const I n, S* D, I* map = nullptr) { - const auto b = hipBlockIdx_z; - const auto j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; - const auto i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + // --------------------------------------------------- + // Sort entries in D[0...(n-1)] + // generates permutation vector in map[] if map[] is available + // Note: performs in a single thread block + // --------------------------------------------------- + if((n <= 0) || (D == nullptr)) + { + return; + }; - T* Ap = load_ptr_batch(A, b, shiftA, strideA); - T* Wp = load_ptr_batch(W, b, shiftW, strideW); + auto const tid = hipThreadIdx_x + hipThreadIdx_y * hipBlockDim_x + + hipThreadIdx_z * (hipBlockDim_x * hipBlockDim_y); - // make first row the identity - if(i == 0 && j == 0 && !copy) - Ap[0] = 1.0; + auto const nthreads = (hipBlockDim_x * hipBlockDim_y) * hipBlockDim_z; - if(i < dim && j < dim && j <= i) + auto const k_start = tid; + auto const k_inc = nthreads; + bool const is_root_thread = (tid == 0); + + bool const has_map = (map != nullptr); + + __syncthreads(); + + if(has_map) { - rocblas_int offset = j * (j + 1) / 2; // to acommodate in smaller array W + for(auto k = k_start; k < n; k += k_inc) + { + map[k] = k; + }; + }; - if(copy) + __syncthreads(); + + if(is_root_thread) + { + for(I ii = 1; ii < n; ii++) { - // copy columns - Wp[i + j * ldw - offset] = (j == 0 ? 0.0 : Ap[i + 1 + (j - 1) * lda]); + auto l = ii - 1; + auto m = l; + auto p = D[l]; + auto ip = (has_map) ? 
map[l] : 0; + + for(auto j = ii; j < n; j++) + { + if(D[j] > p) + { + m = j; + p = D[j]; + if(has_map) + { + ip = map[j]; + }; + } + } + if(m != l) + { + D[m] = D[l]; + D[l] = p; + + if(has_map) + { + map[m] = map[l]; + map[l] = ip; + }; + } } - else + } + __syncthreads(); +#ifdef NDEBUG +#else + if(n >= 2) + { + for(auto k = k_start; k < (n - 1); k += k_inc) { - // shift columns to the right - Ap[i + 1 + j * lda] = Wp[i + j * ldw - offset]; + assert(D[k] >= D[k + 1]); + }; + }; + __syncthreads(); +#endif +} - // make first row the identity - if(i == j) - Ap[(j + 1) * lda] = 0.0; - } +template +__device__ static void selection_sort(const I n, S* a, I* map = nullptr, const bool is_ascending = true) +{ + if((n <= 0) || (a == nullptr)) + { + return; + }; + + if(is_ascending) + { + selection_sort_ascending(n, a, map); } + else + { + selection_sort_descending(n, a, map); + }; } -template -ROCSOLVER_KERNEL void copyshift_left(const bool copy, - const rocblas_int dim, - U A, - const rocblas_int shiftA, - const rocblas_int lda, - const rocblas_stride strideA, - T* W, - const rocblas_int shiftW, - const rocblas_int ldw, - const rocblas_stride strideW) +template +__device__ static void permute_swap(const I n, T* C, I ldc, I* map, const I nev = -1) { - const auto b = hipBlockIdx_z; - const auto j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; - const auto i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + // -------------------------------------------- + // perform swaps to implement permutation vector + // the permutation vector will be restored to the + // identity permutation 0,1,2,... + // + // Note: performs in a thread block + // -------------------------------------------- + { + bool const has_work = (n >= 1) && (C != nullptr) && (ldc >= 1) && (map != nullptr); + if(!has_work) + { + return; + } + } - T* Ap = load_ptr_batch(A, b, shiftA, strideA); - T* Wp = load_ptr_batch(W, b, shiftW, strideW); + auto const tid = hipThreadIdx_x + hipThreadIdx_y * hipBlockDim_x + + hipThreadIdx_z * (hipBlockDim_x * hipBlockDim_y); - // make last row the identity - if(i == 0 && j == 0 && !copy) - Ap[dim + dim * lda] = 1.0; + auto const nthreads = (hipBlockDim_x * hipBlockDim_y) * hipBlockDim_z; - if(i < dim && j < dim && i <= j) + auto const k_start = tid; + auto const k_inc = nthreads; + + bool const is_root_thread = (tid == 0); + + auto const nn = (nev >= 0) ? nev : n; + + for(I i = 0; i < nn; i++) { - rocblas_int offset = j * ldw - j * (j + 1) / 2; // to acommodate in smaller array W + __syncthreads(); - if(copy) - { - // copy columns - Wp[i + j * ldw - offset] = (j == dim - 1 ? 
0.0 : Ap[i + (j + 2) * lda]); - } - else + while(map[i] != i) { - // shift columns to the left - Ap[i + (j + 1) * lda] = Wp[i + j * ldw - offset]; - - // make last row the identity - if(i == j) - Ap[dim + j * lda] = 0.0; - } - } -} + auto const map_i = map[i]; + auto const map_ii = map[map[i]]; -template -ROCSOLVER_KERNEL void copyshift_down(const bool copy, - const rocblas_int dim, - U A, - const rocblas_int shiftA, - const rocblas_int lda, - const rocblas_stride strideA, - T* W, - const rocblas_int shiftW, - const rocblas_int ldw, - const rocblas_stride strideW) -{ - const auto b = hipBlockIdx_z; - const auto j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; - const auto i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + __syncthreads(); - T* Ap = load_ptr_batch(A, b, shiftA, strideA); - T* Wp = load_ptr_batch(W, b, shiftW, strideW); + if(is_root_thread) + { + map[map_i] = map_i; + map[i] = map_ii; + } - // make first column the identity - if(i == 0 && j == 0 && !copy) - Ap[0] = 1.0; + __syncthreads(); - if(i < dim && j < dim && i <= j) - { - rocblas_int offset = j * ldw - j * (j + 1) / 2; // to acommodate in smaller array W + // --------------------------------------- + // swap columns C( 0:(n-1),map_i) and C( 0:(n-1),map_ii) + // --------------------------------------- - if(copy) - { - // copy rows - Wp[i + j * ldw - offset] = (i == 0 ? 0.0 : Ap[i - 1 + (j + 1) * lda]); - } - else - { - // shift rows downward - Ap[i + (j + 1) * lda] = Wp[i + j * ldw - offset]; + for(auto k = k_start; k < n; k += k_inc) + { + auto const k_map_i = k + (map_i * static_cast(ldc)); + auto const k_map_ii = k + (map_ii * static_cast(ldc)); - // make first column the identity - if(i == j) - Ap[i + 1] = 0.0; + auto const ctemp = C[k_map_i]; + C[k_map_i] = C[k_map_ii]; + C[k_map_ii] = ctemp; + } + + __syncthreads(); } } -} - -/** set_offdiag kernel copies the off-diagonal element of A, which is the non-zero element - resulting by applying the Householder reflector to the working column, to E. Then set it - to 1 to prepare for the application of the Householder reflector to the rest of the matrix **/ -template , int> = 0> -ROCSOLVER_KERNEL void set_offdiag(const rocblas_int batch_count, - U A, - const rocblas_int shiftA, - const rocblas_stride strideA, - S* E, - const rocblas_stride strideE) -{ - rocblas_int b = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - - if(b < batch_count) +#ifdef NDEBUG +#else + // ---------------------------------------------------------- + // extra check that map[] is restored to identity permutation + // ---------------------------------------------------------- + __syncthreads(); + for(auto k = k_start; k < nn; k += k_inc) { - T* a = load_ptr_batch(A, b, shiftA, strideA); - S* e = E + b * strideE; - - e[0] = a[0]; - a[0] = T(1); + assert(map[k] == k); } + __syncthreads(); +#endif } -template , int> = 0> -ROCSOLVER_KERNEL void set_offdiag(const rocblas_int batch_count, - U A, - const rocblas_int shiftA, - const rocblas_stride strideA, - S* E, - const rocblas_stride strideE) +// ********************************************************** +// GPU kernels that are used by many rocsolver functions +// ********************************************************** + +enum copymat_direction { - rocblas_int b = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + copymat_to_buffer, + copymat_from_buffer +}; - if(b < batch_count) +/** A mask that is always true. Typically used to make the mask optional, by acting as the default when + no other mask is provided. 
**/ +struct no_mask +{ + __device__ constexpr bool operator[](rocblas_int) const noexcept { - T* a = load_ptr_batch(A, b, shiftA, strideA); - S* e = E + b * strideE; - - e[0] = a[0].real(); - a[0] = T(1); + return true; } -} +}; -/** scale_axpy kernel executes axpy to update tau computing the scalar alpha with other - results in different memopry locations **/ -template -ROCSOLVER_KERNEL void scale_axpy(const rocblas_int n, - T* scl, - T* S, - const rocblas_stride strideS, - U A, - const rocblas_int shiftA, - const rocblas_stride strideA, - T* W, - const rocblas_int shiftW, - const rocblas_stride strideW) +/** An mask defined by an integer array (e.g., the info array). By default, a non-zero value for the ith integer + indicates that the data for batch i should be copied over. Data will not be copied if the mask value is zero. + This behaviour is reversed if the negate transform is passed to the constructor. **/ +struct info_mask { - rocblas_int b = hipBlockIdx_y; - rocblas_int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - - if(i < n) + enum mask_transform { - T* v = load_ptr_batch(A, b, shiftA, strideA); - T* w = load_ptr_batch(W, b, shiftW, strideW); - T* s = S + b * strideS; + none, + negate + }; - // scale - T alpha = -0.5 * s[0] * scl[b]; + explicit constexpr info_mask(rocblas_int* mask, mask_transform transform = none) noexcept + : m_mask(mask) + , m_negate(transform == negate) + { + } - // axpy - w[i] = alpha * v[i] + w[i]; + __device__ constexpr bool operator[](rocblas_int idx) const noexcept + { + return m_negate ^ !!m_mask[idx]; } -} -template -ROCSOLVER_KERNEL void check_singularity(const rocblas_int n, - U A, - const rocblas_int shiftA, - const rocblas_int lda, - const rocblas_stride strideA, - rocblas_int* info) + rocblas_int* m_mask; + bool m_negate; +}; + +/** COPY_MAT copies m-by-n array A into buffer if copymat_to_buffer, or buffer into A if copymat_from_buffer + An optional mask can be provided to limit the copy to selected matricies in the batch + If uplo = rocblas_fill_upper, only the upper triangular part is copied + If uplo = rocblas_fill_lower, only the lower triangular part is copied **/ +template +ROCSOLVER_KERNEL void copy_mat(copymat_direction direction, + const rocblas_int m, + const rocblas_int n, + U A, + const rocblas_int shiftA, + const rocblas_int lda, + const rocblas_stride strideA, + T* buffer, + const Mask mask = no_mask{}, + const rocblas_fill uplo = rocblas_fill_full, + const rocblas_diagonal diag = rocblas_diagonal_non_unit) { - // Checks for singularities in the matrix and updates info to indicate where - // the first singularity (if any) occurs - int b = hipBlockIdx_x; + const auto b = hipBlockIdx_z; + const auto j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + const auto i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - T* a = load_ptr_batch(A, b, shiftA, strideA); + const rocblas_int ldb = m; + const rocblas_stride strideB = rocblas_stride(ldb) * n; - __shared__ rocblas_int _info; + const bool copy = mask[b]; - if(hipThreadIdx_y == 0) - _info = 0; - __syncthreads(); + const bool full = (uplo == rocblas_fill_full); + const bool upper = (uplo == rocblas_fill_upper); + const bool lower = (uplo == rocblas_fill_lower); + const bool cdiag = (diag == rocblas_diagonal_non_unit); - for(int i = hipThreadIdx_y; i < n; i += hipBlockDim_y) + if(i < m && j < n && copy) { - if(a[i + i * lda] == 0) + if(full || (upper && j > i) || (lower && i > j) || (cdiag && i == j)) { - rocblas_int _info_temp = _info; - while(_info_temp == 0 || _info_temp > i 
+ 1) - _info_temp = atomicCAS(&_info, _info_temp, i + 1); + T* Ap = load_ptr_batch(A, b, shiftA, strideA); + T* Bp = &buffer[b * strideB]; + + if(direction == copymat_to_buffer) + Bp[i + j * ldb] = Ap[i + j * lda]; + else // direction == copymat_from_buffer + Ap[i + j * lda] = Bp[i + j * ldb]; } } - __syncthreads(); - - if(hipThreadIdx_y == 0) - info[b] = _info; } -template -__device__ static void shell_sort_ascending(const I n, S* a, I* map = nullptr) +/** COPY_MAT copies m-by-n array A into B + An optional mask can be provided to limit the copy to selected matricies in the batch + If uplo = rocblas_fill_upper, only the upper triangular part is copied + If uplo = rocblas_fill_lower, only the lower triangular part is copied **/ +template +ROCSOLVER_KERNEL void copy_mat(const rocblas_int m, + const rocblas_int n, + U1 A, + const rocblas_int shiftA, + const rocblas_int lda, + const rocblas_stride strideA, + U2 B, + const rocblas_int shiftB, + const rocblas_int ldb, + const rocblas_stride strideB, + const Mask mask = no_mask{}, + const rocblas_fill uplo = rocblas_fill_full, + const rocblas_diagonal diag = rocblas_diagonal_non_unit) { - // ----------------------------------------------- - // Sort array a[0...(n-1)] and generate permutation vector - // in map[] if map[] is available - // Note: performs in a single thread block - // ----------------------------------------------- - if((n <= 0) || (a == nullptr)) - { - return; - }; + const auto b = hipBlockIdx_z; + const auto j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + const auto i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - bool const is_root_thread - = (hipThreadIdx_x == 0) && (hipThreadIdx_y == 0) && (hipThreadIdx_z == 0); + const bool copy = mask[b]; - int constexpr ngaps = 8; - int const gaps[ngaps] = {701, 301, 132, 57, 23, 10, 4, 1}; // # Ciura gap sequence + const bool full = (uplo == rocblas_fill_full); + const bool upper = (uplo == rocblas_fill_upper); + const bool lower = (uplo == rocblas_fill_lower); + const bool cdiag = (diag == rocblas_diagonal_non_unit); - auto const k_start = hipThreadIdx_x + hipThreadIdx_y * hipBlockDim_x - + hipThreadIdx_z * (hipBlockDim_x * hipBlockDim_y); + if(i < m && j < n && copy) + { + if(full || (upper && j > i) || (lower && i > j) || (cdiag && i == j)) + { + T* Ap = load_ptr_batch(A, b, shiftA, strideA); + T* Bp = load_ptr_batch(B, b, shiftB, strideB); + + Bp[i + j * ldb] = Ap[i + j * lda]; + } + } +} + +/** COPY_MAT copies m-by-n array A into buffer if copymat_to_buffer, or buffer into A if copymat_from_buffer + If uplo = rocblas_fill_upper, only the upper triangular part is copied + If uplo = rocblas_fill_lower, only the lower triangular part is copied + Only valid when A is complex and buffer real. 
+/** COPY_MAT copies m-by-n array A into buffer if copymat_to_buffer, or buffer into A if copymat_from_buffer
+    If uplo = rocblas_fill_upper, only the upper triangular part is copied
+    If uplo = rocblas_fill_lower, only the lower triangular part is copied
+    Only valid when A is complex and buffer real. If REAL, only works with real part of A;
+    if !REAL only works with imaginary part of A**/
+template <typename T, typename S, typename U, bool REAL, std::enable_if_t<rocblas_is_complex<T>, int> = 0>
+ROCSOLVER_KERNEL void copy_mat(copymat_direction direction,
+                               const rocblas_int m,
+                               const rocblas_int n,
+                               U A,
+                               const rocblas_int shiftA,
+                               const rocblas_int lda,
+                               const rocblas_stride strideA,
+                               S* buffer,
+                               const rocblas_fill uplo = rocblas_fill_full,
+                               const rocblas_diagonal diag = rocblas_diagonal_non_unit)
+{
+    const auto b = hipBlockIdx_z;
+    const auto j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+    const auto i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;

-    auto const k_inc = (hipBlockDim_x * hipBlockDim_y) * hipBlockDim_z;
+    const rocblas_int ldb = m;
+    const rocblas_stride strideB = rocblas_stride(ldb) * n;

-    bool const has_map = (map != nullptr);
+    const bool lower = (uplo == rocblas_fill_lower);
+    const bool full = (uplo == rocblas_fill_full);
+    const bool upper = (uplo == rocblas_fill_upper);
+    const bool cdiag = (diag == rocblas_diagonal_non_unit);

-    __syncthreads();
-    if(has_map)
+    if(i < m && j < n)
     {
-        for(auto k = k_start; k < n; k += k_inc)
+        if(full || (upper && j > i) || (lower && i > j) || (cdiag && i == j))
         {
-            map[k] = k;
-        };
-    };
-    __syncthreads();
+            T* Ap = load_ptr_batch<T>(A, b, shiftA, strideA);
+            S* Bp = &buffer[b * strideB];

-    if(is_root_thread)
-    {
-        for(auto igap = 0; igap < ngaps; igap++)
-        {
-            auto const gap = gaps[igap];
+            if(direction == copymat_to_buffer)
+                Bp[i + j * ldb] = REAL ? Ap[i + j * lda].real() : Ap[i + j * lda].imag();
+            else if(REAL)
+                Ap[i + j * lda] = rocblas_complex_num<S>(Bp[i + j * ldb], Ap[i + j * lda].imag());
+            else
+                Ap[i + j * lda] = rocblas_complex_num<S>(Ap[i + j * lda].real(), Bp[i + j * ldb]);
+        }
+    }
+}

-            for(I i = gap; i < n; i += 1)
-            {
-                {
-                    S const temp = a[i];
-                    auto const itemp = (has_map) ? map[i] : 0;
+/** COPY_TRANS_MAT copies m-by-n array A into B and transposes it depending on the value of trans.
+    An optional mask can be provided to limit the copy to selected matrices in the batch
+    If uplo = rocblas_fill_upper, only the upper triangular part is copied
+    If uplo = rocblas_fill_lower, only the lower triangular part is copied **/
+template <typename T1, typename T2, typename Mask = no_mask>
+ROCSOLVER_KERNEL void copy_trans_mat(const rocblas_operation trans,
+                                     const rocblas_int m,
+                                     const rocblas_int n,
+                                     T1* A,
+                                     const rocblas_int shiftA,
+                                     const rocblas_int lda,
+                                     const rocblas_stride strideA,
+                                     T2* B,
+                                     const rocblas_int shiftB,
+                                     const rocblas_int ldb,
+                                     const rocblas_stride strideB,
+                                     const Mask mask = no_mask{},
+                                     const rocblas_fill uplo = rocblas_fill_full,
+                                     const rocblas_diagonal diag = rocblas_diagonal_non_unit)
+{
+    const auto b = hipBlockIdx_z;
+    const auto j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+    const auto i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;

-                    auto j = i;
+    const bool copy = mask[b];

-                    while((j >= gap) && (a[j - gap] > temp))
-                    {
-                        a[j] = a[j - gap];
-                        if(has_map)
-                        {
-                            map[j] = map[j - gap];
-                        };
-                        j -= gap;
-                    };
+    const bool full = (uplo == rocblas_fill_full);
+    const bool upper = (uplo == rocblas_fill_upper);
+    const bool lower = (uplo == rocblas_fill_lower);
+    const bool cdiag = (diag == rocblas_diagonal_non_unit);

-                    a[j] = temp;
-                    if(has_map)
-                    {
-                        map[j] = itemp;
-                    };
-                };
-            };
-        };
-    };
-    __syncthreads();
-#ifdef NDEBUG
-#else
-    if(n >= 2)
+    if(i < m && j < n && copy)
     {
-        for(auto k = k_start; k < (n - 1); k += k_inc)
+        if(full || (upper && j > i) || (lower && i > j) || (cdiag && i == j))
         {
-            assert(a[k] <= a[k + 1]);
-        };
-    };
-    __syncthreads();
-#endif
+            T1* Ap = load_ptr_batch<T1>(A, b, shiftA, strideA);
+            T2* Bp = load_ptr_batch<T2>(B, b, shiftB, strideB);
+
+            if(trans == rocblas_operation_conjugate_transpose)
+                Bp[j + i * ldb] = T2(conj(Ap[i + j * lda]));
+            else if(trans == rocblas_operation_transpose)
+                Bp[j + i * ldb] = T2(Ap[i + j * lda]);
+            else
+                Bp[i + j * ldb] = T2(Ap[i + j * lda]);
+        }
+    }
 }

-template <typename S, typename I>
-__device__ static void shell_sort_descending(const I n, S* a, I* map = nullptr)
+template <typename T, typename U>
+ROCSOLVER_KERNEL void init_ident(const rocblas_int m,
+                                 const rocblas_int n,
+                                 U A,
+                                 const rocblas_int shiftA,
+                                 const rocblas_int lda,
+                                 const rocblas_stride strideA)
 {
-    // -----------------------------------------------
-    // Sort array a[0...(n-1)] and generate permutation vector
-    // in map[] if map[] is available
-    // Note: performs in a single thread block
-    // -----------------------------------------------
+    const auto i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+    const auto j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+    const auto b = hipBlockIdx_z;

-    if((n <= 0) || (a == nullptr))
+    if(i < m && j < n)
     {
-        return;
-    };
+        T* a = load_ptr_batch<T>(A, b, shiftA, strideA);

-    bool const is_root_thread
-        = (hipThreadIdx_x == 0) && (hipThreadIdx_y == 0) && (hipThreadIdx_z == 0);
+        if(i == j)
+            a[i + j * lda] = 1.0;
+        else
+            a[i + j * lda] = 0.0;
+    }
+}

-    int constexpr ngaps = 8;
-    int const gaps[ngaps] = {701, 301, 132, 57, 23, 10, 4, 1}; // # Ciura gap sequence
+template <typename T, typename I, typename U>
+ROCSOLVER_KERNEL void reset_info(T* info, const I n, U val, I incr = 0)
+{
+    I idx = hipBlockIdx_x * static_cast<I>(hipBlockDim_x) + hipThreadIdx_x;

-    auto const k_start = hipThreadIdx_x + hipThreadIdx_y * hipBlockDim_x
-        + hipThreadIdx_z * (hipBlockDim_x * hipBlockDim_y);
+    if(idx < n)
+        info[idx] = T(val) + incr * idx;
+}

-    auto const k_inc = (hipBlockDim_x * hipBlockDim_y) * hipBlockDim_z;
+template <typename T, typename U, typename I, typename S>
+ROCSOLVER_KERNEL void reset_batch_info(U info, const rocblas_stride stride, const I n, S val)
+{
+    I idx = hipBlockIdx_x * static_cast<I>(hipBlockDim_x) + hipThreadIdx_x;
+    I b = hipBlockIdx_y;

-    bool const has_map = (map != nullptr);
+    T* inf = load_ptr_batch<T>(info, b, 0, stride);
+    if(idx < n)
+        inf[idx] = T(val);
+}

-    __syncthreads();
-    if(has_map)
-    {
-        for(auto k = k_start; k < n; k += k_inc)
-        {
-            map[k] = k;
-        };
-    };
-    __syncthreads();
+template <typename T, typename I>
+ROCSOLVER_KERNEL void get_array(T** out, T* in, rocblas_stride stride, I batch)
+{
+    I b = hipBlockIdx_x * static_cast<I>(hipBlockDim_x) + hipThreadIdx_x;

-    if(is_root_thread)
-    {
-        for(auto igap = 0; igap < ngaps; igap++)
-        {
-            auto const gap = gaps[igap];
+    if(b < batch)
+        out[b] = in + b * stride;
+}

-            for(rocblas_int i = gap; i < n; i += 1)
-            {
-                {
-                    S const temp = a[i];
-                    auto const itemp = (has_map) ? map[i] : 0;
+template <typename T, typename U, typename I>
+ROCSOLVER_KERNEL void shift_array(T** out, U in, rocblas_stride shift, I batch)
+{
+    I b = hipBlockIdx_x * static_cast<I>(hipBlockDim_x) + hipThreadIdx_x;

-                    auto j = i;
+    if(b < batch)
+        out[b] = in[b] + shift;
+}

-                    while((j >= gap) && (a[j - gap] < temp))
-                    {
-                        a[j] = a[j - gap];
-                        if(has_map)
-                        {
-                            map[j] = map[j - gap];
-                        };
-                        j -= gap;
-                    };
+template <typename T, typename U>
+ROCSOLVER_KERNEL void subtract_tau(const rocblas_int i,
+                                   const rocblas_int j,
+                                   U A,
+                                   const rocblas_int shiftA,
+                                   const rocblas_int lda,
+                                   const rocblas_stride strideA,
+                                   T* ipiv,
+                                   const rocblas_stride strideP)
+{
+    const auto b = hipBlockIdx_x;
+    T* Ap = load_ptr_batch<T>(A, b, shiftA, strideA);
+    T* tau = ipiv + b * strideP;

-                    a[j] = temp;
-                    if(has_map)
-                    {
-                        map[j] = itemp;
-                    };
-                };
-            };
-        };
-    };
+    T t = -(*tau);
+    *tau = t;
+    Ap[i + j * lda] = 1.0 + t;
+}

-    __syncthreads();
-#ifdef NDEBUG
-#else
-    if(n >= 2)
-    {
-        for(auto k = k_start; k < (n - 1); k += k_inc)
-        {
-            assert(a[k] >= a[k + 1]);
-        };
-    };
-    __syncthreads();
-#endif
+template <typename T>
+ROCSOLVER_KERNEL void restau(const rocblas_int k, T* ipiv, const rocblas_stride strideP)
+{
+    const auto blocksizex = hipBlockDim_x;
+    const auto b = hipBlockIdx_y;
+    T* tau = ipiv + b * strideP;
+    const auto i = hipBlockIdx_x * blocksizex + hipThreadIdx_x;
+
+    if(i < k)
+        tau[i] = -tau[i];
 }

-template <typename S, typename I>
-__device__ static void shell_sort(const I n, S* a, I* map = nullptr, const bool is_ascending = true)
+template <typename T, typename S, typename U, typename I, std::enable_if_t<std::is_same_v<T, S> || rocblas_is_complex<S>, int> = 0>
+ROCSOLVER_KERNEL void set_diag(S* D,
+                               const rocblas_stride shiftd,
+                               const rocblas_stride strided,
+                               U A,
+                               const rocblas_stride shifta,
+                               const I lda,
+                               const rocblas_stride stridea,
+                               const I n,
+                               bool set_one)
 {
-    if((n <= 0) || (a == nullptr))
-    {
-        return;
-    };
+    I b = hipBlockIdx_x;
+    I i = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+    I j = i + i * lda;

-    if(is_ascending)
+    S* d = load_ptr_batch<S>(D, b, shiftd, strided);
+    T* a = load_ptr_batch<T>(A, b, shifta, stridea);
+
+    if(i < n)
     {
-        shell_sort_ascending(n, a, map);
+        d[i] = a[j];
+        a[j] = set_one ? T(1) : a[j];
     }
-    else
-    {
-        shell_sort_descending(n, a, map);
-    };
 }

-template <typename S, typename I>
-__device__ static void selection_sort_ascending(const I n, S* D, I* map = nullptr)
+template <typename T, typename S, typename U, typename I, std::enable_if_t<!std::is_same_v<T, S> && !rocblas_is_complex<S>, int> = 0>
+ROCSOLVER_KERNEL void set_diag(S* D,
+                               const rocblas_stride shiftd,
+                               const rocblas_stride strided,
+                               U A,
+                               const rocblas_stride shifta,
+                               const I lda,
+                               const rocblas_stride stridea,
+                               const I n,
+                               bool set_one)
 {
-    // ---------------------------------------------------
-    // Sort entries in D[0...(n-1)]
-    // generates permutation vector in map[] if map[] is available
-    // Note: performs in a single thread block
-    // ---------------------------------------------------
-    if((n <= 0) || (D == nullptr))
-    {
-        return;
-    };
+    I b = hipBlockIdx_x;
+    I i = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+    I j = i + i * lda;

-    auto const tid = hipThreadIdx_x + hipThreadIdx_y * hipBlockDim_x
-        + hipThreadIdx_z * (hipBlockDim_x * hipBlockDim_y);
+    S* d = load_ptr_batch<S>(D, b, shiftd, strided);
+    T* a = load_ptr_batch<T>(A, b, shifta, stridea);

-    auto const nthreads = (hipBlockDim_x * hipBlockDim_y) * hipBlockDim_z;
+    if(i < n)
+    {
+        d[i] = a[j].real();
+        a[j] = set_one ? T(1) : a[j];
+    }
+}

-    auto const k_start = tid;
-    auto const k_inc = nthreads;
-    bool const is_root_thread = (tid == 0);
+template <typename T, typename S, typename U, typename I>
+ROCSOLVER_KERNEL void restore_diag(S* D,
+                                   const rocblas_stride shiftd,
+                                   const rocblas_stride strided,
+                                   U A,
+                                   const rocblas_stride shifta,
+                                   const I lda,
+                                   const rocblas_stride stridea,
+                                   const I n)
+{
+    I b = hipBlockIdx_x;
+    I i = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+    I j = i + i * lda;

-    bool const has_map = (map != nullptr);
+    S* d = load_ptr_batch<S>(D, b, shiftd, strided);
+    T* a = load_ptr_batch<T>(A, b, shifta, stridea);

-    __syncthreads();
+    if(i < n)
+        a[j] = d[i];
+}

-    if(has_map)
+/** SET_ZERO inserts zeros in all the entries of a m-by-n matrix A.
+    If uplo = lower, the lower triangular part of A is kept unchanged.
+    If uplo = upper, the upper triangular part of A is kept unchanged **/
+template <typename T, typename U>
+ROCSOLVER_KERNEL void set_zero(const rocblas_int m,
+                               const rocblas_int n,
+                               U A,
+                               const rocblas_int shiftA,
+                               const rocblas_int lda,
+                               const rocblas_stride strideA,
+                               const rocblas_fill uplo = rocblas_fill_full)
+{
+    const auto b = hipBlockIdx_z;
+    const auto i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+    const auto j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+    const bool lower = (uplo == rocblas_fill_lower);
+    const bool full = (uplo == rocblas_fill_full);
+    const bool upper = (uplo == rocblas_fill_upper);
+
+    if(i < m && j < n)
     {
-        for(auto k = k_start; k < n; k += k_inc)
+        if(full || (lower && j > i) || (upper && i > j))
         {
-            map[k] = k;
-        };
-    };
+            T* Ap = load_ptr_batch<T>(A, b, shiftA, strideA);
+            Ap[i + j * lda] = 0.0;
+        }
+    }
+}

-    __syncthreads();
+template <typename T, typename U>
+ROCSOLVER_KERNEL void copyshift_right(const bool copy,
+                                      const rocblas_int dim,
+                                      U A,
+                                      const rocblas_int shiftA,
+                                      const rocblas_int lda,
+                                      const rocblas_stride strideA,
+                                      T* W,
+                                      const rocblas_int shiftW,
+                                      const rocblas_int ldw,
+                                      const rocblas_stride strideW)
+{
+    const auto b = hipBlockIdx_z;
+    const auto j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+    const auto i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;

-    if(is_root_thread)
+    T* Ap = load_ptr_batch<T>(A, b, shiftA, strideA);
+    T* Wp = load_ptr_batch<T>(W, b, shiftW, strideW);
+
+    // make first row the identity
+    if(i == 0 && j == 0 && !copy)
+        Ap[0] = 1.0;
+
+    if(i < dim && j < dim && j <= i)
     {
-        for(I ii = 1; ii < n; ii++)
-        {
-            auto l = ii - 1;
-            auto m = l;
-            auto p = D[l];
-            auto ip = (has_map) ? map[l] : 0;
+        rocblas_int offset = j * (j + 1) / 2; // to accommodate in smaller array W

-            for(auto j = ii; j < n; j++)
-            {
-                if(D[j] < p)
-                {
-                    m = j;
-                    p = D[j];
-                    if(has_map)
-                    {
-                        ip = map[j];
-                    };
-                }
-            }
-            if(m != l)
-            {
-                D[m] = D[l];
-                D[l] = p;
+        if(copy)
+        {
+            // copy columns
+            Wp[i + j * ldw - offset] = (j == 0 ? 0.0 : Ap[i + 1 + (j - 1) * lda]);
+        }
+        else
+        {
+            // shift columns to the right
+            Ap[i + 1 + j * lda] = Wp[i + j * ldw - offset];

-                if(has_map)
-                {
-                    map[m] = map[l];
-                    map[l] = ip;
-                };
-            }
+            // make first row the identity
+            if(i == j)
+                Ap[(j + 1) * lda] = 0.0;
         }
     }
-    __syncthreads();
-#ifdef NDEBUG
-#else
-    if(n >= 2)
-    {
-        for(auto k = k_start; k < (n - 1); k += k_inc)
-        {
-            assert(D[k] <= D[k + 1]);
-        };
-    };
-    __syncthreads();
-#endif
 }

-template <typename S, typename I>
-__device__ static void selection_sort_descending(const I n, S* D, I* map = nullptr)
+template <typename T, typename U>
+ROCSOLVER_KERNEL void copyshift_left(const bool copy,
+                                     const rocblas_int dim,
+                                     U A,
+                                     const rocblas_int shiftA,
+                                     const rocblas_int lda,
+                                     const rocblas_stride strideA,
+                                     T* W,
+                                     const rocblas_int shiftW,
+                                     const rocblas_int ldw,
+                                     const rocblas_stride strideW)
 {
-    // ---------------------------------------------------
-    // Sort entries in D[0...(n-1)]
-    // generates permutation vector in map[] if map[] is available
-    // Note: performs in a single thread block
-    // ---------------------------------------------------
-    if((n <= 0) || (D == nullptr))
+    const auto b = hipBlockIdx_z;
+    const auto j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+    const auto i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+
+    T* Ap = load_ptr_batch<T>(A, b, shiftA, strideA);
+    T* Wp = load_ptr_batch<T>(W, b, shiftW, strideW);
+
+    // make last row the identity
+    if(i == 0 && j == 0 && !copy)
+        Ap[dim + dim * lda] = 1.0;
+
+    if(i < dim && j < dim && i <= j)
     {
-        return;
-    };
+        rocblas_int offset = j * ldw - j * (j + 1) / 2; // to accommodate in smaller array W

-    auto const tid = hipThreadIdx_x + hipThreadIdx_y * hipBlockDim_x
-        + hipThreadIdx_z * (hipBlockDim_x * hipBlockDim_y);
+        if(copy)
+        {
+            // copy columns
+            Wp[i + j * ldw - offset] = (j == dim - 1 ? 0.0 : Ap[i + (j + 2) * lda]);
+        }
+        else
+        {
+            // shift columns to the left
+            Ap[i + (j + 1) * lda] = Wp[i + j * ldw - offset];

-    auto const nthreads = (hipBlockDim_x * hipBlockDim_y) * hipBlockDim_z;
+            // make last row the identity
+            if(i == j)
+                Ap[dim + j * lda] = 0.0;
+        }
+    }
+}

-    auto const k_start = tid;
-    auto const k_inc = nthreads;
-    bool const is_root_thread = (tid == 0);
+template <typename T, typename U>
+ROCSOLVER_KERNEL void copyshift_down(const bool copy,
+                                     const rocblas_int dim,
+                                     U A,
+                                     const rocblas_int shiftA,
+                                     const rocblas_int lda,
+                                     const rocblas_stride strideA,
+                                     T* W,
+                                     const rocblas_int shiftW,
+                                     const rocblas_int ldw,
+                                     const rocblas_stride strideW)
 {
-    bool const has_map = (map != nullptr);
+    const auto b = hipBlockIdx_z;
+    const auto j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+    const auto i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;

-    __syncthreads();
+    T* Ap = load_ptr_batch<T>(A, b, shiftA, strideA);
+    T* Wp = load_ptr_batch<T>(W, b, shiftW, strideW);

-    if(has_map)
+    // make first column the identity
+    if(i == 0 && j == 0 && !copy)
+        Ap[0] = 1.0;

+    if(i < dim && j < dim && i <= j)
     {
-        for(auto k = k_start; k < n; k += k_inc)
-        {
-            map[k] = k;
-        };
-    };
+        rocblas_int offset = j * ldw - j * (j + 1) / 2; // to accommodate in smaller array W

-    __syncthreads();
-    if(is_root_thread)
-    {
-        for(I ii = 1; ii < n; ii++)
+        if(copy)
         {
-            auto l = ii - 1;
-            auto m = l;
-            auto p = D[l];
-            auto ip = (has_map) ? map[l] : 0;
-
-            for(auto j = ii; j < n; j++)
-            {
-                if(D[j] > p)
-                {
-                    m = j;
-                    p = D[j];
-                    if(has_map)
-                    {
-                        ip = map[j];
-                    };
-                }
-            }
-            if(m != l)
-            {
-                D[m] = D[l];
-                D[l] = p;
+            // copy rows
+            Wp[i + j * ldw - offset] = (i == 0 ? 0.0 : Ap[i - 1 + (j + 1) * lda]);
+        }
+        else
+        {
+            // shift rows downward
+            Ap[i + (j + 1) * lda] = Wp[i + j * ldw - offset];

-                if(has_map)
-                {
-                    map[m] = map[l];
-                    map[l] = ip;
-                };
-            }
+            // make first column the identity
+            if(i == j)
+                Ap[i + 1] = 0.0;
         }
     }
-    __syncthreads();
-#ifdef NDEBUG
-#else
-    if(n >= 2)
-    {
-        for(auto k = k_start; k < (n - 1); k += k_inc)
-        {
-            assert(D[k] >= D[k + 1]);
-        };
-    };
-    __syncthreads();
-#endif
 }

-template <typename S, typename I>
-__device__ static void selection_sort(const I n, S* a, I* map = nullptr, const bool is_ascending = true)
+/** set_offdiag kernel copies to E the off-diagonal element of A, i.e., the non-zero element that
+    results from applying the Householder reflector to the working column. The element is then set
+    to 1 to prepare for the application of the Householder reflector to the rest of the matrix **/
+template <typename T, typename S, typename U, std::enable_if_t<!rocblas_is_complex<T>, int> = 0>
+ROCSOLVER_KERNEL void set_offdiag(const rocblas_int batch_count,
+                                  U A,
+                                  const rocblas_int shiftA,
+                                  const rocblas_stride strideA,
+                                  S* E,
+                                  const rocblas_stride strideE)
 {
-    if((n <= 0) || (a == nullptr))
-    {
-        return;
-    };
+    rocblas_int b = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;

-    if(is_ascending)
+    if(b < batch_count)
     {
-        selection_sort_ascending(n, a, map);
+        T* a = load_ptr_batch<T>(A, b, shiftA, strideA);
+        S* e = E + b * strideE;
+
+        e[0] = a[0];
+        a[0] = T(1);
     }
-    else
-    {
-        selection_sort_descending(n, a, map);
-    };
 }

-template <typename T, typename I>
-__device__ static void permute_swap(const I n, T* C, I ldc, I* map, const I nev = -1)
+template <typename T, typename S, typename U, std::enable_if_t<rocblas_is_complex<T>, int> = 0>
+ROCSOLVER_KERNEL void set_offdiag(const rocblas_int batch_count,
+                                  U A,
+                                  const rocblas_int shiftA,
+                                  const rocblas_stride strideA,
+                                  S* E,
+                                  const rocblas_stride strideE)
 {
-    // --------------------------------------------
-    // perform swaps to implement permutation vector
-    // the permutation vector will be restored to the
-    // identity permutation 0,1,2,...
-    //
-    // Note: performs in a thread block
-    // --------------------------------------------
-    {
-        bool const has_work = (n >= 1) && (C != nullptr) && (ldc >= 1) && (map != nullptr);
-        if(!has_work)
-        {
-            return;
-        }
-    }
+    rocblas_int b = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;

-    auto const tid = hipThreadIdx_x + hipThreadIdx_y * hipBlockDim_x
-        + hipThreadIdx_z * (hipBlockDim_x * hipBlockDim_y);
-    auto const nthreads = (hipBlockDim_x * hipBlockDim_y) * hipBlockDim_z;
+    if(b < batch_count)
+    {
+        T* a = load_ptr_batch<T>(A, b, shiftA, strideA);
+        S* e = E + b * strideE;

-    auto const k_start = tid;
-    auto const k_inc = nthreads;
+        e[0] = a[0].real();
+        a[0] = T(1);
+    }
+}

-    bool const is_root_thread = (tid == 0);
-    auto const nn = (nev >= 0) ? nev : n;
+/** scale_axpy kernel executes axpy to update tau, computing the scalar alpha from other
+    results in different memory locations **/
+template <typename T, typename U>
+ROCSOLVER_KERNEL void scale_axpy(const rocblas_int n,
+                                 T* scl,
+                                 T* S,
+                                 const rocblas_stride strideS,
+                                 U A,
+                                 const rocblas_int shiftA,
+                                 const rocblas_stride strideA,
+                                 T* W,
+                                 const rocblas_int shiftW,
+                                 const rocblas_stride strideW)
+{
+    rocblas_int b = hipBlockIdx_y;
+    rocblas_int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;

-    for(I i = 0; i < nn; i++)
+    if(i < n)
     {
-        __syncthreads();
-
-        while(map[i] != i)
-        {
-            auto const map_i = map[i];
-            auto const map_ii = map[map[i]];
-
-            __syncthreads();
+        T* v = load_ptr_batch<T>(A, b, shiftA, strideA);
+        T* w = load_ptr_batch<T>(W, b, shiftW, strideW);
+        T* s = S + b * strideS;

-            if(is_root_thread)
-            {
-                map[map_i] = map_i;
-                map[i] = map_ii;
-            }
+        // scale
+        T alpha = -0.5 * s[0] * scl[b];

-            __syncthreads();
+        // axpy
+        w[i] = alpha * v[i] + w[i];
+    }
+}

-            // ---------------------------------------
-            // swap columns C( 0:(n-1),map_i) and C( 0:(n-1),map_ii)
-            // ---------------------------------------
+template <typename T, typename U>
+ROCSOLVER_KERNEL void check_singularity(const rocblas_int n,
+                                        U A,
+                                        const rocblas_int shiftA,
+                                        const rocblas_int lda,
+                                        const rocblas_stride strideA,
+                                        rocblas_int* info)
+{
+    // Checks for singularities in the matrix and updates info to indicate where
+    // the first singularity (if any) occurs
+    int b = hipBlockIdx_x;

-            for(auto k = k_start; k < n; k += k_inc)
-            {
-                auto const k_map_i = k + (map_i * static_cast<int64_t>(ldc));
-                auto const k_map_ii = k + (map_ii * static_cast<int64_t>(ldc));
+    T* a = load_ptr_batch<T>(A, b, shiftA, strideA);

-                auto const ctemp = C[k_map_i];
-                C[k_map_i] = C[k_map_ii];
-                C[k_map_ii] = ctemp;
-            }
+    __shared__ rocblas_int _info;

-            __syncthreads();
-        }
-    }
+    if(hipThreadIdx_y == 0)
+        _info = 0;
     __syncthreads();

-#ifdef NDEBUG
-#else
-    // ----------------------------------------------------------
-    // extra check that map[] is restored to identity permutation
-    // ----------------------------------------------------------
-    __syncthreads();
-    for(auto k = k_start; k < nn; k += k_inc)
+    for(int i = hipThreadIdx_y; i < n; i += hipBlockDim_y)
     {
-        assert(map[k] == k);
+        if(a[i + i * lda] == 0)
+        {
+            rocblas_int _info_temp = _info;
+            while(_info_temp == 0 || _info_temp > i + 1)
+                _info_temp = atomicCAS(&_info, _info_temp, i + 1);
+        }
     }
     __syncthreads();
-#endif
+
+    if(hipThreadIdx_y == 0)
+        info[b] = _info;
 }

 /** SWAP swaps the values of vectors x and y of dimension n.
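The `no_mask`/`info_mask` helpers above let the `copy_mat`/`copy_trans_mat` family skip selected batch instances. As a concrete illustration, a host-side launch might look like the following sketch, where `dA`, `dwork`, and `dinfo` are assumed device pointers and the 32x32 tile and `copy_mat<double>` instantiation are illustrative choices rather than the library's tuned parameters:

```cpp
// Copy only the batch instances whose info entry is non-zero into a scratch
// buffer; passing info_mask(dinfo, info_mask::negate) would instead select
// the instances with info == 0. Grid mapping: x = rows, y = columns, z = batch.
dim3 threads(32, 32, 1);
dim3 grid((m - 1) / 32 + 1, (n - 1) / 32 + 1, batch_count);
hipLaunchKernelGGL((copy_mat<double>), grid, threads, 0, stream,
                   copymat_to_buffer, m, n, dA, shiftA, lda, strideA, dwork,
                   info_mask(dinfo));
```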
diff --git a/library/src/include/lib_host_helpers.hpp b/library/src/include/lib_host_helpers.hpp
index 3f44ccf9b..3d72a0cc9 100644
--- a/library/src/include/lib_host_helpers.hpp
+++ b/library/src/include/lib_host_helpers.hpp
@@ -30,6 +30,8 @@
 #include <...>
 #include <...>
 #include <...>
+#include <...>
+#include <...>
 #include <...>
 #include <...>

@@ -170,6 +172,20 @@
 static double imag_part(rocblas_complex_num<double> z)
 {
     return (z.imag());
 }

+static bool is_device_pointer(void* ptr)
+{
+    hipPointerAttribute_t dev_attributes;
+    if(ptr == nullptr)
+        return false;
+
+    auto istat = hipPointerGetAttributes(&dev_attributes, ptr);
+    if(istat != hipSuccess)
+        fmt::print(stderr, "is_device_pointer: istat = {} {}\n", istat, hipGetErrorName(istat));
+
+    assert(istat == hipSuccess);
+    return (dev_attributes.type == hipMemoryTypeDevice);
+}
+
 #ifdef ROCSOLVER_VERIFY_ASSUMPTIONS
 // Ensure __assert_fail is declared.
 #if !__is_identifier(__assert_fail)
diff --git a/library/src/include/rocsolver_handle.hpp b/library/src/include/rocsolver_handle.hpp
index 2e4177556..53091092c 100644
--- a/library/src/include/rocsolver_handle.hpp
+++ b/library/src/include/rocsolver_handle.hpp
@@ -37,6 +37,7 @@ struct rocsolver_handle_data_
     rocblas_int checksum;

     rocsolver_alg_mode bdsqr_mode = rocsolver_alg_mode_gpu;
+    rocsolver_alg_mode sterf_mode = rocsolver_alg_mode_gpu;
 };

 typedef struct rocsolver_handle_data_* rocsolver_handle_data;
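The new `sterf_mode` field mirrors the existing `bdsqr_mode` and is what the public algorithm-mode API reads and writes for STERF. A minimal usage sketch, assuming an already-created `handle` and device arrays `dD`, `dE`, and `dinfo` (error handling elided):

```cpp
// Opt this handle into the hybrid STERF path, then verify the setting took
// effect before computing eigenvalues of a symmetric tridiagonal matrix.
rocsolver_set_alg_mode(handle, rocsolver_function_sterf, rocsolver_alg_mode_hybrid);

rocsolver_alg_mode mode;
rocsolver_get_alg_mode(handle, rocsolver_function_sterf, &mode);
// mode == rocsolver_alg_mode_hybrid

rocsolver_dsterf(handle, n, dD, dE, dinfo);
```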
diff --git a/library/src/include/rocsolver_hybrid_array.hpp b/library/src/include/rocsolver_hybrid_array.hpp
new file mode 100644
index 000000000..ca0abcf45
--- /dev/null
+++ b/library/src/include/rocsolver_hybrid_array.hpp
@@ -0,0 +1,272 @@
+/* **************************************************************************
+ * Copyright (C) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * *************************************************************************/
+
+#pragma once
+
+#include "common_host_helpers.hpp"
+#include "lib_host_helpers.hpp"
+#include "rocsolver/rocsolver.h"
+
+ROCSOLVER_BEGIN_NAMESPACE
+
+/* ROCSOLVER_HYBRID_ARRAY provides a wrapper for data arrays that is intended to simplify host memory
+   allocation as well as data transfers to and from the device. Hybrid algorithms can use rocsolver_hybrid_array
+   to read device data to the host so that the data may be operated upon, and then write the resultant data
+   back to the device. Two modes are supported:
+
+   * Standard mode: Used to read device data onto the host, which may then be operated upon by host code.
+     A typical workflow will call init_async to allocate a host buffer and read the data into the buffer,
+     execute host code on the buffered data through the pointer provided by the [] operator, and then call
+     write_to_device_async to write the resultant data back to the device.
+
+   * Pointers only mode: Used to read device pointers from a batched array onto the host, so that device
+     kernels may be called on each individual batch instance. A typical workflow will call init_pointers_only
+     to allocate a host buffer for the device pointers, and then execute device kernels on the pointers
+     provided by the [] operator. */
+template <typename T, typename I, typename U>
+struct rocsolver_hybrid_array
+{
+    I dim, batch_count;
+    rocblas_stride stride;
+
+    U src_array;
+    T** batch_array;
+    T* val_array;
+
+    /* Constructor */
+    rocsolver_hybrid_array()
+        : src_array(nullptr)
+        , batch_array(nullptr)
+        , val_array(nullptr)
+    {
+    }
+
+    /* Disallow copying. */
+    rocsolver_hybrid_array(const rocsolver_hybrid_array&) = delete;
+
+    /* Disallow assigning. */
+    rocsolver_hybrid_array& operator=(const rocsolver_hybrid_array&) = delete;
+
+    /* Destructor */
+    ~rocsolver_hybrid_array()
+    {
+        if(val_array)
+            free(val_array);
+        if(batch_array && (val_array || this->dim < 0))
+            free(batch_array);
+    }
+
+    /* Used to read device pointers from a batched array for use on the host; no other data is read from the
+       device. */
+    rocblas_status init_pointers_only(U array, rocblas_stride stride, I batch_count, hipStream_t stream)
+    {
+        if(val_array)
+            free(val_array);
+        if(batch_array && (val_array || this->dim < 0))
+            free(batch_array);
+
+        this->dim = -1;
+        this->src_array = array;
+        this->stride = stride;
+        this->batch_count = batch_count;
+
+        bool constexpr is_strided = (std::is_same<U, T*>::value || std::is_same<U, const T*>::value);
+        bool is_device = is_device_pointer((void*)array);
+
+        if(is_device)
+        {
+            // pointers only; don't allocate val_array
+            val_array = nullptr;
+
+            if(is_strided)
+            {
+                // data is strided; batch_array not needed
+                batch_array = nullptr;
+            }
+            else
+            {
+                // data is batched; read device pointers into batch_array
+                size_t batch_bytes = sizeof(T*) * batch_count;
+                batch_array = (T**)malloc(batch_bytes);
+                HIP_CHECK(
+                    hipMemcpyAsync(batch_array, array, batch_bytes, hipMemcpyDeviceToHost, stream));
+                HIP_CHECK(hipStreamSynchronize(stream));
+            }
+        }
+        else
+        {
+            // data on host; use src_array directly
+            val_array = nullptr;
+
+            if(is_strided)
+                batch_array = nullptr;
+            else
+                batch_array = (T**)src_array;
+        }
+
+        return rocblas_status_success;
+    }
+
+    /* Used to read device data into a host buffer, which is managed by this class. Data for all batch instances
+       will be read into the buffer. */
+    rocblas_status init_async(I dim, U array, rocblas_stride stride, I batch_count, hipStream_t stream)
+    {
+        if(val_array)
+            free(val_array);
+        if(batch_array && (val_array || this->dim < 0))
+            free(batch_array);
+
+        if(dim < 0)
+            return rocblas_status_internal_error;
+
+        this->dim = dim;
+        this->src_array = array;
+        this->stride = stride;
+        this->batch_count = batch_count;
+
+        bool constexpr is_strided = (std::is_same<U, T*>::value || std::is_same<U, const T*>::value);
+        bool is_device = is_device_pointer((void*)array);
+
+        if(is_device)
+        {
+            // allocate space on host for data from device
+            size_t dim_bytes = sizeof(T) * dim;
+            size_t val_bytes = sizeof(T) * dim * batch_count;
+            val_array = (T*)malloc(val_bytes);
+
+            if(is_strided)
+            {
+                // data is strided; batch_array not needed
+                batch_array = nullptr;
+
+                // read data to val_array
+                if(batch_count == 1 || stride == dim)
+                {
+                    HIP_CHECK(hipMemcpyAsync(val_array, src_array, val_bytes, hipMemcpyDeviceToHost,
+                                             stream));
+                }
+                else
+                {
+                    for(I bid = 0; bid < batch_count; bid++)
+                    {
+                        HIP_CHECK(hipMemcpyAsync(val_array + bid * dim, src_array + bid * stride,
+                                                 dim_bytes, hipMemcpyDeviceToHost, stream));
+                    }
+                }
+            }
+            else
+            {
+                // data is batched; read device pointers into batch_array
+                size_t batch_bytes = sizeof(T*) * batch_count;
+                batch_array = (T**)malloc(batch_bytes);
+                HIP_CHECK(
+                    hipMemcpyAsync(batch_array, array, batch_bytes, hipMemcpyDeviceToHost, stream));
+                HIP_CHECK(hipStreamSynchronize(stream));
+
+                // read data to val_array
+                for(I bid = 0; bid < batch_count; bid++)
+                {
+                    HIP_CHECK(hipMemcpyAsync(val_array + bid * dim, batch_array[bid], dim_bytes,
+                                             hipMemcpyDeviceToHost, stream));
+                }
+            }
+        }
+        else
+        {
+            // data on host; use src_array directly
+            val_array = nullptr;
+
+            if(is_strided)
+                batch_array = nullptr;
+            else
+                batch_array = (T**)src_array;
+        }
+
+        return rocblas_status_success;
+    }
+
+    /* Copies data from host buffer back to the device. Returns an error if initialized for pointers
+       only. */
+    rocblas_status write_to_device_async(hipStream_t stream)
+    {
+        if(!src_array)
+            return rocblas_status_internal_error;
+        if(dim < 0)
+            return rocblas_status_internal_error;
+
+        if(val_array)
+        {
+            size_t dim_bytes = sizeof(T) * dim;
+            size_t val_bytes = sizeof(T) * dim * batch_count;
+
+            if(!batch_array)
+            {
+                if(batch_count == 1 || stride == dim)
+                {
+                    HIP_CHECK(hipMemcpyAsync(src_array, val_array, val_bytes, hipMemcpyHostToDevice,
+                                             stream));
+                }
+                else
+                {
+                    for(I bid = 0; bid < batch_count; bid++)
+                    {
+                        HIP_CHECK(hipMemcpyAsync(src_array + bid * stride, val_array + bid * dim,
+                                                 dim_bytes, hipMemcpyHostToDevice, stream));
+                    }
+                }
+            }
+            else
+            {
+                for(I bid = 0; bid < batch_count; bid++)
+                {
+                    HIP_CHECK(hipMemcpyAsync(batch_array[bid], val_array + bid * dim, dim_bytes,
+                                             hipMemcpyHostToDevice, stream));
+                }
+            }
+        }
+
+        return rocblas_status_success;
+    }
+
+    /* Gets a pointer to the data for batch index bid. If initialized for pointers only, the
+       returned pointer may be on the device. Otherwise, the pointer is on the host. */
+    T* operator[](I bid)
+    {
+        if(!src_array)
+            return nullptr;
+
+        if(val_array)
+            return val_array + bid * dim;
+        else
+        {
+            if(batch_array)
+                return batch_array[bid];
+            else
+                return (T*)(src_array + bid * stride);
+        }
+    }
+};
+
+ROCSOLVER_END_NAMESPACE
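To make the standard-mode workflow described in the header comment concrete, here is a sketch for a strided-batched off-diagonal array (inside the library's namespace; `dE`, `strideE`, and the `n - 1` element count are illustrative, and the template argument order follows the struct as written above). Because `init_async` only queues the device-to-host copies, the stream must be synchronized before host code touches the buffer:

```cpp
rocsolver_hybrid_array<double, rocblas_int, double*> hE;

// read n - 1 entries per batch instance from the device into a host buffer
if(hE.init_async(n - 1, dE, strideE, batch_count, stream) != rocblas_status_success)
    return rocblas_status_internal_error;
HIP_CHECK(hipStreamSynchronize(stream));

// operate on the buffered data with host code
for(rocblas_int bid = 0; bid < batch_count; bid++)
{
    double* E = hE[bid]; // host pointer in standard mode
    // ... host-side computation on E[0 .. n - 2] ...
}

// queue the results for transfer back to the device
if(hE.write_to_device_async(stream) != rocblas_status_success)
    return rocblas_status_internal_error;
```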