diff --git a/CHANGELOG.md b/CHANGELOG.md index dcbe0bba5..60cc6c5ad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,10 @@ Full documentation for rocSOLVER is available at the [rocSOLVER documentation](h ## (Unreleased) rocSOLVER ### Added + +* Hybrid computation support for existing routines: + - STERF + ### Changed ### Removed ### Optimized @@ -22,7 +26,7 @@ Full documentation for rocSOLVER is available at the [rocSOLVER documentation](h * Algorithm selection mechanism for hybrid computation * Hybrid computation support for existing routines: - BDSQR - - GESVD + - GESVD ### Optimized diff --git a/clients/common/auxiliary/testing_sterf.hpp b/clients/common/auxiliary/testing_sterf.hpp index 7585337f5..bc4e35ff4 100644 --- a/clients/common/auxiliary/testing_sterf.hpp +++ b/clients/common/auxiliary/testing_sterf.hpp @@ -213,6 +213,19 @@ void testing_sterf(Arguments& argus) rocblas_int hot_calls = argus.iters; + if(argus.alg_mode) + { + EXPECT_ROCBLAS_STATUS( + rocsolver_set_alg_mode(handle, rocsolver_function_sterf, rocsolver_alg_mode_hybrid), + rocblas_status_success); + + rocsolver_alg_mode alg_mode; + EXPECT_ROCBLAS_STATUS(rocsolver_get_alg_mode(handle, rocsolver_function_sterf, &alg_mode), + rocblas_status_success); + + EXPECT_EQ(alg_mode, rocsolver_alg_mode_hybrid); + } + // check non-supported values // N/A diff --git a/clients/common/lapack/testing_syev_heev.hpp b/clients/common/lapack/testing_syev_heev.hpp index 00928585f..e50af29b4 100644 --- a/clients/common/lapack/testing_syev_heev.hpp +++ b/clients/common/lapack/testing_syev_heev.hpp @@ -398,6 +398,19 @@ void testing_syev_heev(Arguments& argus) rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; + if(argus.alg_mode) + { + EXPECT_ROCBLAS_STATUS( + rocsolver_set_alg_mode(handle, rocsolver_function_sterf, rocsolver_alg_mode_hybrid), + rocblas_status_success); + + rocsolver_alg_mode alg_mode; + EXPECT_ROCBLAS_STATUS(rocsolver_get_alg_mode(handle, rocsolver_function_sterf, &alg_mode), + rocblas_status_success); + + EXPECT_EQ(alg_mode, rocsolver_alg_mode_hybrid); + } + // check non-supported values if(uplo == rocblas_fill_full || evect == rocblas_evect_tridiagonal) { diff --git a/clients/gtest/auxiliary/sterf_gtest.cpp b/clients/gtest/auxiliary/sterf_gtest.cpp index 9ad0cc949..b266f8727 100644 --- a/clients/gtest/auxiliary/sterf_gtest.cpp +++ b/clients/gtest/auxiliary/sterf_gtest.cpp @@ -65,7 +65,8 @@ Arguments sterf_setup_arguments(sterf_tuple tup) return arg; } -class STERF : public ::TestWithParam<sterf_tuple> +template <int MODE> +class STERF_BASE : public ::TestWithParam<sterf_tuple> { protected: void TearDown() override { @@ -77,6 +78,7 @@ void run_tests() { Arguments arg = sterf_setup_arguments(GetParam()); + arg.alg_mode = MODE; if(arg.peek<rocblas_int>("n") == 0) testing_sterf_bad_arg<T>(); @@ -85,6 +87,14 @@ } }; +class STERF : public STERF_BASE<0> +{ +}; + +class STERF_HYBRID : public STERF_BASE<1> +{ +}; + // non-batch tests TEST_P(STERF, __float) @@ -97,6 +107,20 @@ run_tests<float>(); } TEST_P(STERF, __double) { run_tests<double>(); } +TEST_P(STERF_HYBRID, __float) +{ + run_tests<float>(); +} + +TEST_P(STERF_HYBRID, __double) +{ + run_tests<double>(); +} + INSTANTIATE_TEST_SUITE_P(daily_lapack, STERF, ValuesIn(large_matrix_size_range)); INSTANTIATE_TEST_SUITE_P(checkin_lapack, STERF, ValuesIn(matrix_size_range)); + +INSTANTIATE_TEST_SUITE_P(daily_lapack, STERF_HYBRID, ValuesIn(large_matrix_size_range)); + +INSTANTIATE_TEST_SUITE_P(checkin_lapack, STERF_HYBRID, ValuesIn(matrix_size_range));
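// ---------------------------------------------------------------------------
// Aside: a minimal, self-contained sketch of the fixture pattern introduced
// above, written against plain Google Test. All names here (ModeBase,
// ModeDefault, ModeHybrid) are hypothetical illustrations, not part of this
// patch. The idea: a non-type template parameter fixes the algorithm mode
// once, so identical TEST_P bodies and size ranges can be instantiated for
// both the default and the hybrid configuration.
// ---------------------------------------------------------------------------
#include <gtest/gtest.h>

template <int MODE>
class ModeBase : public ::testing::TestWithParam<int>
{
protected:
    void run_tests()
    {
        int n = GetParam();
        // In the real suites, MODE is forwarded into Arguments::alg_mode
        // before dispatching to the testing_* helper.
        EXPECT_GE(n, 0) << "mode " << MODE;
    }
};

class ModeDefault : public ModeBase<0>
{
};

class ModeHybrid : public ModeBase<1>
{
};

TEST_P(ModeDefault, sizes)
{
    run_tests();
}

TEST_P(ModeHybrid, sizes)
{
    run_tests();
}

INSTANTIATE_TEST_SUITE_P(checkin, ModeDefault, ::testing::Values(0, 16, 64));
INSTANTIATE_TEST_SUITE_P(checkin, ModeHybrid, ::testing::Values(0, 16, 64));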
diff --git a/clients/gtest/lapack/syev_heev_gtest.cpp b/clients/gtest/lapack/syev_heev_gtest.cpp index 82779d976..f5f5c7829 100644 --- a/clients/gtest/lapack/syev_heev_gtest.cpp +++ b/clients/gtest/lapack/syev_heev_gtest.cpp @@ -81,6 +81,7 @@ Arguments syev_heev_setup_arguments(syev_heev_tuple tup) return arg; } +template <int MODE> class SYEV_HEEV : public ::TestWithParam<syev_heev_tuple> { protected: @@ -93,6 +94,7 @@ class SYEV_HEEV : public ::TestWithParam<syev_heev_tuple> void run_tests() { Arguments arg = syev_heev_setup_arguments(GetParam()); + arg.alg_mode = MODE; if(arg.peek<rocblas_int>("n") == 0 && arg.peek<char>("evect") == 'N' && arg.peek<char>("uplo") == 'L') @@ -103,11 +105,19 @@ class SYEV_HEEV : public ::TestWithParam<syev_heev_tuple> } }; -class SYEV : public SYEV_HEEV +class SYEV : public SYEV_HEEV<0> { }; -class HEEV : public SYEV_HEEV +class HEEV : public SYEV_HEEV<0> +{ +}; + +class SYEV_HYBRID : public SYEV_HEEV<1> +{ +}; + +class HEEV_HYBRID : public SYEV_HEEV<1> { }; @@ -133,6 +143,26 @@ TEST_P(HEEV, __double_complex) run_tests<false, false, rocblas_double_complex>(); } +TEST_P(SYEV_HYBRID, __float) +{ + run_tests<false, false, float>(); +} + +TEST_P(SYEV_HYBRID, __double) +{ + run_tests<false, false, double>(); +} + +TEST_P(HEEV_HYBRID, __float_complex) +{ + run_tests<false, false, rocblas_float_complex>(); +} + +TEST_P(HEEV_HYBRID, __double_complex) +{ + run_tests<false, false, rocblas_double_complex>(); +} + // batched tests TEST_P(SYEV, batched__float) @@ -155,6 +185,26 @@ TEST_P(HEEV, batched__double_complex) run_tests<true, true, rocblas_double_complex>(); } +TEST_P(SYEV_HYBRID, batched__float) +{ + run_tests<true, true, float>(); +} + +TEST_P(SYEV_HYBRID, batched__double) +{ + run_tests<true, true, double>(); +} + +TEST_P(HEEV_HYBRID, batched__float_complex) +{ + run_tests<true, true, rocblas_float_complex>(); +} + +TEST_P(HEEV_HYBRID, batched__double_complex) +{ + run_tests<true, true, rocblas_double_complex>(); +} + // strided_batched tests TEST_P(SYEV, strided_batched__float) @@ -177,13 +227,49 @@ TEST_P(HEEV, strided_batched__double_complex) run_tests<false, true, rocblas_double_complex>(); } +TEST_P(SYEV_HYBRID, strided_batched__float) +{ + run_tests<false, true, float>(); +} + +TEST_P(SYEV_HYBRID, strided_batched__double) +{ + run_tests<false, true, double>(); +} + +TEST_P(HEEV_HYBRID, strided_batched__float_complex) +{ + run_tests<false, true, rocblas_float_complex>(); +} + +TEST_P(HEEV_HYBRID, strided_batched__double_complex) +{ + run_tests<false, true, rocblas_double_complex>(); +} + // daily_lapack tests normal execution with medium to large sizes INSTANTIATE_TEST_SUITE_P(daily_lapack, SYEV, Combine(ValuesIn(large_size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, HEEV, Combine(ValuesIn(large_size_range), ValuesIn(op_range))); +INSTANTIATE_TEST_SUITE_P(daily_lapack, + SYEV_HYBRID, + Combine(ValuesIn(large_size_range), ValuesIn(op_range))); + +INSTANTIATE_TEST_SUITE_P(daily_lapack, + HEEV_HYBRID, + Combine(ValuesIn(large_size_range), ValuesIn(op_range))); + // checkin_lapack tests normal execution with small sizes, invalid sizes, // quick returns, and corner cases INSTANTIATE_TEST_SUITE_P(checkin_lapack, SYEV, Combine(ValuesIn(size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, HEEV, Combine(ValuesIn(size_range), ValuesIn(op_range))); + +INSTANTIATE_TEST_SUITE_P(checkin_lapack, + SYEV_HYBRID, + Combine(ValuesIn(size_range), ValuesIn(op_range))); + +INSTANTIATE_TEST_SUITE_P(checkin_lapack, + HEEV_HYBRID, + Combine(ValuesIn(size_range), ValuesIn(op_range))); diff --git a/library/include/rocsolver/rocsolver-extra-types.h b/library/include/rocsolver/rocsolver-extra-types.h index 3335f8f3d..fd6e1ec82 100644 --- a/library/include/rocsolver/rocsolver-extra-types.h +++ b/library/include/rocsolver/rocsolver-extra-types.h @@ -188,6 +188,7 @@ typedef enum rocsolver_function_ { rocsolver_function_bdsqr = 401, rocsolver_function_gesvd = 402, + rocsolver_function_sterf = 403, } rocsolver_function; #endif /* ROCSOLVER_EXTRA_TYPES_H */
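// ---------------------------------------------------------------------------
// Aside: a hedged usage sketch of the new rocsolver_function_sterf enum value.
// This illustrates how a client would opt in to the hybrid STERF path through
// the algorithm-mode API this patch extends; it is an illustration, not code
// from the patch. The device buffers d_D, d_E, d_info are assumed to be
// allocated and populated by the caller.
// ---------------------------------------------------------------------------
#include <rocblas/rocblas.h>
#include <rocsolver/rocsolver.h>

rocblas_status sterf_with_hybrid_mode(rocblas_handle handle,
                                      rocblas_int n,
                                      double* d_D, // diagonal, device memory
                                      double* d_E, // off-diagonal, device memory
                                      rocblas_int* d_info)
{
    // Request the hybrid (CPU+GPU) algorithm for subsequent STERF calls on
    // this handle; rocsolver_alg_mode_gpu restores the default behavior.
    rocblas_status st
        = rocsolver_set_alg_mode(handle, rocsolver_function_sterf, rocsolver_alg_mode_hybrid);
    if(st != rocblas_status_success)
        return st;

    // Optional read-back, mirroring the round trip the tests above verify.
    rocsolver_alg_mode mode;
    st = rocsolver_get_alg_mode(handle, rocsolver_function_sterf, &mode);
    if(st != rocblas_status_success || mode != rocsolver_alg_mode_hybrid)
        return rocblas_status_internal_error;

    return rocsolver_dsterf(handle, n, d_D, d_E, d_info);
}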
diff --git a/library/include/rocsolver/rocsolver-functions.h b/library/include/rocsolver/rocsolver-functions.h index bd9a0592d..12c6ed69e 100644 --- a/library/include/rocsolver/rocsolver-functions.h +++ b/library/include/rocsolver/rocsolver-functions.h @@ -4005,6 +4005,10 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zbdsqr(rocblas_handle handle, The matrix is not represented explicitly, but rather as the array of diagonal elements D and the array of symmetric off-diagonal elements E. + \note + A hybrid (CPU+GPU) approach is available for STERF, primarily intended for + homogeneous architectures. Use \ref rocsolver_set_alg_mode to enable it. + @param[in] handle rocblas_handle. @param[in] diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp index 394aa2339..21e56b410 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp @@ -35,6 +35,7 @@ #include "lapack_host_functions.hpp" #include "rocauxiliary_lasr.hpp" +#include "rocsolver_hybrid_array.hpp" ROCSOLVER_BEGIN_NAMESPACE @@ -1341,15 +1342,15 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, const rocblas_stride strideD, S* E, const rocblas_stride strideE, - W1 V_arg, + W1 V, const I shiftV, const I ldv, const rocblas_stride strideV, - W2 U_arg, + W2 U, const I shiftU, const I ldu, const rocblas_stride strideU, - W3 C_arg, + W3 C, const I shiftC, const I ldc, const rocblas_stride strideC, @@ -1358,203 +1359,48 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, I* splits_map, S* work) { - // ------------------------------- - // lambda expression as helper - // ------------------------------- - auto is_device_pointer = [](void* ptr) -> bool { - hipPointerAttribute_t dev_attributes; - if(ptr == nullptr) - { - return (false); - } - - auto istat = hipPointerGetAttributes(&dev_attributes, ptr); - if(istat != hipSuccess) - { - std::cout << "is_device_pointer: istat = " << istat << " " << hipGetErrorName(istat) - << std::endl; - } - assert(istat == hipSuccess); - return (dev_attributes.type == hipMemoryTypeDevice); - }; - - // ------------------------- - // copy D into hD, E into hE - // ------------------------- hipStream_t stream; rocblas_get_stream(handle, &stream); - W1 V = V_arg; - W2 U = U_arg; - W3 C = C_arg; - - // handle batch case with array of pointers on device - std::vector<T*> Vp_array(batch_count); - std::vector<T*> Up_array(batch_count); - std::vector<T*> Cp_array(batch_count); - + // ----------------------------------- + // transfer arrays from device to host + // ----------------------------------- + rocsolver_hybrid_array hD; + rocsolver_hybrid_array hE; + rocsolver_hybrid_array hV; + rocsolver_hybrid_array hU; + rocsolver_hybrid_array hC; + rocsolver_hybrid_array hInfo; + + ROCBLAS_CHECK(hD.init_async(n, D, strideD, batch_count, stream)); + ROCBLAS_CHECK(hE.init_async(n - 1, E, strideE, batch_count, stream)); if(nv > 0) - { - bool const is_device_V_arg = is_device_pointer((void*)V_arg); - if(is_device_V_arg) - { - // note "T *" and "T * const" may be considered different types - bool constexpr is_array_of_device_pointers - = !(std::is_same<W1, T*>::value || std::is_same<W1, T* const>::value); - bool constexpr need_copy_W1 = is_array_of_device_pointers; - if constexpr(need_copy_W1) - { - size_t const nbytes = sizeof(T*) * batch_count; - void* const dst = (void*)&(Vp_array[0]); - void* const src = (void*)V_arg; - HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, hipMemcpyDefault, stream)); - 
HIP_CHECK(hipStreamSynchronize(stream)); - V = &(Vp_array[0]); - } - } - } - + ROCBLAS_CHECK(hV.init_pointers_only(V + shiftV, strideV, batch_count, stream)); if(nu > 0) - { - bool const is_device_U_arg = is_device_pointer((void*)U_arg); - if(is_device_U_arg) - { - bool constexpr is_array_of_device_pointers - = !(std::is_same<W2, T*>::value || std::is_same<W2, T* const>::value); - bool constexpr need_copy_W2 = is_array_of_device_pointers; - if constexpr(need_copy_W2) - { - size_t const nbytes = sizeof(T*) * batch_count; - void* const dst = (void*)&(Up_array[0]); - void* const src = (void*)U_arg; - HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, hipMemcpyDefault, stream)); - HIP_CHECK(hipStreamSynchronize(stream)); - U = &(Up_array[0]); - } - } - } - + ROCBLAS_CHECK(hU.init_pointers_only(U + shiftU, strideU, batch_count, stream)); if(nc > 0) - { - bool const is_device_C_arg = is_device_pointer((void*)C_arg); - if(is_device_C_arg) - { - bool constexpr is_array_of_device_pointers - = !(std::is_same<W3, T*>::value || std::is_same<W3, T* const>::value); - bool constexpr need_copy_W3 = is_array_of_device_pointers; - if constexpr(need_copy_W3) - { - size_t const nbytes = sizeof(T*) * batch_count; - void* const dst = (void*)&(Cp_array[0]); - void* const src = (void*)C_arg; - HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, hipMemcpyDefault, stream)); - HIP_CHECK(hipStreamSynchronize(stream)); - C = &(Cp_array[0]); - } - } - } - - S* hD = nullptr; - S* hE = nullptr; - - size_t const E_size = (n - 1); - HIP_CHECK(hipHostMalloc(&hD, (sizeof(S) * std::max(1, batch_count)) * n)); - HIP_CHECK(hipHostMalloc(&hE, (sizeof(S) * std::max(1, batch_count)) * E_size)); - - // ---------------------------------------------------- - // copy info_array[] on device to linfo_array[] on host - // ---------------------------------------------------- - I* linfo_array = nullptr; - HIP_CHECK(hipHostMalloc(&linfo_array, sizeof(I) * std::max(1, batch_count))); - { - void* const dst = &(linfo_array[0]); - void* const src = &(info_array[0]); - size_t const nbytes = sizeof(I) * batch_count; - hipMemcpyKind const kind = hipMemcpyDeviceToHost; - - HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, kind, stream)); - } + ROCBLAS_CHECK(hC.init_pointers_only(C + shiftC, strideC, batch_count, stream)); + ROCBLAS_CHECK(hInfo.init_async(1, info_array, 1, batch_count, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); S* hwork = nullptr; HIP_CHECK(hipHostMalloc(&hwork, sizeof(S) * (4 * n))); + S* dwork = nullptr; + HIP_CHECK(hipMalloc(&dwork, sizeof(S) * (4 * n))); - // ------------------------------------------------- - // transfer arrays D(:) and E(:) from Device to Host - // ------------------------------------------------- - bool const use_single_copy_for_D = (batch_count == 1) || (strideD == n); - if(use_single_copy_for_D) - { - void* const dst = (void*)&(hD[0]); - void* const src = (void*)&(D[0]); - size_t const sizeBytes = sizeof(S) * n * batch_count; - hipMemcpyKind const kind = hipMemcpyDeviceToHost; - - HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); - } - else - { - for(I bid = 0; bid < batch_count; bid++) - { - void* const dst = (void*)&(hD[bid * n]); - void* const src = (void*)&(D[bid * strideD]); - size_t const sizeBytes = sizeof(S) * n; - hipMemcpyKind const kind = hipMemcpyDeviceToHost; - HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); - } - } - - { - for(I bid = 0; bid < batch_count; bid++) - { - void* const dst = (void*)&(hE[bid * E_size]); - void* const src = (void*)&(E[bid * strideE]); - size_t const sizeBytes = sizeof(S) * E_size; - hipMemcpyKind 
const kind = hipMemcpyDeviceToHost; - HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); - } - } - - // ------------------------------------------------- + // -------------------------------------- // Execute for each instance in the batch - // ------------------------------------------------- - HIP_CHECK(hipStreamSynchronize(stream)); - - S* dwork_ = nullptr; - HIP_CHECK(hipMalloc(&dwork_, sizeof(S) * (4 * n))); - + // -------------------------------------- for(I bid = 0; bid < batch_count; bid++) { - if(linfo_array[bid] != 0) - { + if(hInfo[bid][0] != 0) continue; - } char uplo = (uplo_in == rocblas_fill_lower) ? 'L' : 'U'; - S* d_ = &(hD[bid * n]); - S* e_ = &(hE[bid * E_size]); - - T* v_ = (nv > 0) ? load_ptr_batch(V, bid, shiftV, strideV) : nullptr; - T* u_ = (nu > 0) ? load_ptr_batch(U, bid, shiftU, strideU) : nullptr; - T* c_ = (nc > 0) ? load_ptr_batch(C, bid, shiftC, strideC) : nullptr; - S* work_ = &(hwork[0]); - I info = 0; - I nru = nu; - I ncc = nc; - - // NOTE: lapack dbdsqr() accepts "VT" and "ldvt" for transpose of V - // as input variable however, rocsolver bdsqr() accepts variable called "V" and "ldv" - // but it is actually holding "VT" - T* vt_ = v_; - I ldvt = ldv; - - I nrv = n; - I ncvt = nv; - bool const values_only = (ncvt == 0) && (nru == 0) && (ncc == 0); - - bdsqr_single_template(handle, uplo, n, ncvt, nru, ncc, d_, e_, vt_, ldvt, u_, ldu, - c_, ldc, work_, info, dwork_, stream); + bdsqr_single_template(handle, uplo, n, nv, nu, nc, hD[bid], hE[bid], hV[bid], ldv, + hU[bid], ldu, hC[bid], ldc, hwork, info, dwork, stream); if(info == 0) { @@ -1563,80 +1409,31 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, S const zero = S(0); for(I i = 0; i < (n - 1); i++) { - e_[i] = zero; + hE[bid][i] = zero; } } - if(linfo_array[bid] == 0) + if(hInfo[bid][0] == 0) { - linfo_array[bid] = info; + hInfo[bid][0] = info; } } // end for bid - // ------------------------------------------------- - // transfer arrays D(:) and E(:) from host to device - // ------------------------------------------------- - if(use_single_copy_for_D) - { - void* const src = (void*)&(hD[0]); - void* const dst = (void*)D; - size_t const sizeBytes = sizeof(S) * n * batch_count; - hipMemcpyKind const kind = hipMemcpyHostToDevice; - - HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); - } - else - { - for(I bid = 0; bid < batch_count; bid++) - { - void* const src = (void*)&(hD[bid * n]); - void* const dst = (void*)(D + bid * strideD); - size_t const sizeBytes = sizeof(S) * n; - hipMemcpyKind const kind = hipMemcpyHostToDevice; - HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); - } - } - - { - for(I bid = 0; bid < batch_count; bid++) - { - void* const src = (void*)&(hE[bid * E_size]); - void* const dst = (void*)&(E[bid * strideE]); - size_t const sizeBytes = sizeof(S) * E_size; - hipMemcpyKind const kind = hipMemcpyHostToDevice; - HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); - } - } - - // ------------------------------------------------------ - // copy linfo_array[] from host to info_array[] on device - // ------------------------------------------------------ - { - void* const src = (void*)&(linfo_array[0]); - void* const dst = (void*)&(info_array[0]); - size_t const nbytes = sizeof(I) * batch_count; - hipMemcpyKind const kind = hipMemcpyHostToDevice; - - HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, kind, stream)); - } - + // ----------------------------------- + // transfer arrays from host to device + // 
----------------------------------- + ROCBLAS_CHECK(hD.write_to_device_async(stream)); + ROCBLAS_CHECK(hE.write_to_device_async(stream)); + ROCBLAS_CHECK(hInfo.write_to_device_async(stream)); HIP_CHECK(hipStreamSynchronize(stream)); // ---------------------- // free allocated storage // ---------------------- HIP_CHECK(hipHostFree(hwork)); - hwork = nullptr; - HIP_CHECK(hipHostFree(hD)); - hD = nullptr; - HIP_CHECK(hipHostFree(hE)); - hE = nullptr; - HIP_CHECK(hipFree(dwork_)); - dwork_ = nullptr; - HIP_CHECK(hipHostFree(linfo_array)); - linfo_array = nullptr; - - return (rocblas_status_success); + HIP_CHECK(hipFree(dwork)); + + return rocblas_status_success; } ROCSOLVER_END_NAMESPACE diff --git a/library/src/auxiliary/rocauxiliary_sterf.hpp b/library/src/auxiliary/rocauxiliary_sterf.hpp index 10922e16b..338ec2327 100644 --- a/library/src/auxiliary/rocauxiliary_sterf.hpp +++ b/library/src/auxiliary/rocauxiliary_sterf.hpp @@ -35,6 +35,7 @@ #include "lapack_device_functions.hpp" #include "rocblas.hpp" #include "rocsolver/rocsolver.h" +#include "rocsolver_hybrid_array.hpp" ROCSOLVER_BEGIN_NAMESPACE @@ -46,21 +47,16 @@ ROCSOLVER_BEGIN_NAMESPACE /** STERF_SQ_E squares the elements of E **/ template <typename T> -__device__ void sterf_sq_e(const rocblas_int start, const rocblas_int end, T* E) +__device__ __host__ void sterf_sq_e(const rocblas_int start, const rocblas_int end, T* E) { for(int i = start; i < end; i++) E[i] = E[i] * E[i]; } -/** STERF_KERNEL implements the main loop of the sterf algorithm - to compute the eigenvalues of a symmetric tridiagonal matrix given by D - and E **/ template <typename T> -ROCSOLVER_KERNEL void sterf_kernel(const rocblas_int n, - T* DD, - const rocblas_stride strideD, - T* EE, - const rocblas_stride strideE, +__device__ __host__ void run_sterf(const rocblas_int n, + T* D, + T* E, rocblas_int* info, rocblas_int* stack, const rocblas_int max_iters, @@ -68,11 +64,6 @@ const T ssfmin, const T ssfmax) { - rocblas_int bid = hipBlockIdx_x; - - T* D = DD + (bid * strideD); - T* E = EE + (bid * strideE); - rocblas_int m, l, lsv, lend, lendsv; rocblas_int l1 = 0; rocblas_int iters = 0; @@ -85,7 +76,7 @@ E[l1 - 1] = 0; for(m = l1; m < n - 1; m++) { - if(abs(E[m]) <= sqrt(abs(D[m])) * sqrt(abs(D[m + 1])) * eps) + if(std::abs(E[m]) <= std::sqrt(std::abs(D[m])) * std::sqrt(std::abs(D[m + 1])) * eps) { E[m] = 0; break; } @@ -109,7 +100,7 @@ sterf_sq_e(l, lend, E); // Choose iteration type (QL or QR) - if(abs(D[lend]) < abs(D[l])) + if(std::abs(D[lend]) < std::abs(D[l])) { lend = lsv; l = lendsv; } @@ -122,7 +113,7 @@ { // Find small subdiagonal element for(m = l; m <= lend - 1; m++) - if(abs(E[m]) <= eps * eps * abs(D[m] * D[m + 1])) + if(std::abs(E[m]) <= eps * eps * std::abs(D[m] * D[m + 1])) break; if(m < lend) @@ -137,7 +128,7 @@ { // Use lae2 to compute 2x2 eigenvalues. Using rte, rt1, rt2. 
T rte, rt1, rt2; - rte = sqrt(E[l]); + rte = std::sqrt(E[l]); lae2(D[l], rte, D[l + 1], rt1, rt2); D[l] = rt1; D[l + 1] = rt2; @@ -153,12 +144,12 @@ T sigma, gamma, r, rte, c, s; // Form shift - rte = sqrt(E[l]); + rte = std::sqrt(E[l]); sigma = (D[l + 1] - p) / (2 * rte); if(sigma >= 0) - r = abs(sqrt(1 + sigma * sigma)); + r = std::abs(std::sqrt(1 + sigma * sigma)); else - r = -abs(sqrt(1 + sigma * sigma)); + r = -std::abs(std::sqrt(1 + sigma * sigma)); sigma = p - (rte / (sigma + r)); c = 1; @@ -198,7 +189,7 @@ { // Find small subdiagonal element for(m = l; m >= lend + 1; m--) - if(abs(E[m - 1]) <= eps * eps * abs(D[m] * D[m - 1])) + if(std::abs(E[m - 1]) <= eps * eps * std::abs(D[m] * D[m - 1])) break; if(m > lend) @@ -213,7 +204,7 @@ { // Use lae2 to compute 2x2 eigenvalues. Using rte, rt1, rt2. T rte, rt1, rt2; - rte = sqrt(E[l - 1]); + rte = std::sqrt(E[l - 1]); lae2(D[l], rte, D[l - 1], rt1, rt2); D[l] = rt1; D[l - 1] = rt2; @@ -229,12 +220,12 @@ T sigma, gamma, r, rte, c, s; // Form shift. Using rte, r, c, s. - rte = sqrt(E[l - 1]); + rte = std::sqrt(E[l - 1]); sigma = (D[l - 1] - p) / (2 * rte); if(sigma >= 0) - r = abs(sqrt(1 + sigma * sigma)); + r = std::abs(std::sqrt(1 + sigma * sigma)); else - r = -abs(sqrt(1 + sigma * sigma)); + r = -std::abs(std::sqrt(1 + sigma * sigma)); sigma = p - (rte / (sigma + r)); c = 1; @@ -277,14 +268,14 @@ // Check for convergence for(int i = 0; i < n - 1; i++) if(E[i] != 0) - info[bid]++; + (*info)++; // Sort eigenvalues /** (TODO: the quick-sort method implemented in lasrt_increasing fails for some cases. Substituting it here with a simple sorting algorithm. If more performance is required in the future, lasrt_increasing should be debugged or another quick-sort method could be implemented) **/ - //lasrt_increasing(n, D, stack + bid * (2 * 32)); + //lasrt_increasing(n, D, stack); for(int ii = 1; ii < n; ii++) { @@ -307,6 +298,32 @@ } }
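// ---------------------------------------------------------------------------
// Aside: the hybrid paths in this patch stage device data on the host through
// rocsolver_hybrid_array (defined in rocsolver_hybrid_array.hpp, which is not
// shown in this diff). The simplified stand-in below is an assumption-based
// sketch of that pattern for strided arrays only, just to illustrate the
// interface used here: init_async copies device data into pinned host memory,
// operator[] exposes a per-batch host pointer for the CPU loop, and
// write_to_device_async copies results back. All names are hypothetical.
// ---------------------------------------------------------------------------
template <typename T>
struct hybrid_stage_sketch
{
    T* hbuf = nullptr; // pinned host staging buffer
    T* dsrc = nullptr; // strided device source/destination
    rocblas_int dim = 0;
    rocblas_stride stride = 0;
    rocblas_int batch_count = 0;

    rocblas_status init_async(rocblas_int dim_in, T* dsrc_in, rocblas_stride stride_in,
                              rocblas_int bc_in, hipStream_t stream)
    {
        dim = dim_in;
        dsrc = dsrc_in;
        stride = stride_in;
        batch_count = bc_in;
        if(hipHostMalloc(&hbuf, sizeof(T) * size_t(dim) * batch_count) != hipSuccess)
            return rocblas_status_memory_error;
        // stage each batch instance device -> host
        for(rocblas_int b = 0; b < batch_count; b++)
            if(hipMemcpyAsync(hbuf + b * size_t(dim), dsrc + b * stride, sizeof(T) * dim,
                              hipMemcpyDeviceToHost, stream)
               != hipSuccess)
                return rocblas_status_internal_error;
        return rocblas_status_success;
    }

    // host-side view of batch instance b (valid once the stream is synchronized)
    T* operator[](rocblas_int b)
    {
        return hbuf + b * size_t(dim);
    }

    rocblas_status write_to_device_async(hipStream_t stream)
    {
        // write each batch instance host -> device
        for(rocblas_int b = 0; b < batch_count; b++)
            if(hipMemcpyAsync(dsrc + b * stride, hbuf + b * size_t(dim), sizeof(T) * dim,
                              hipMemcpyHostToDevice, stream)
               != hipSuccess)
                return rocblas_status_internal_error;
        return rocblas_status_success;
    }

    ~hybrid_stage_sketch()
    {
        if(hbuf != nullptr)
            hipHostFree(hbuf);
    }
};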
+/** STERF_KERNEL implements the main loop of the sterf algorithm to compute the eigenvalues of a symmetric tridiagonal matrix given by D and E **/ +template <typename T> +ROCSOLVER_KERNEL void sterf_kernel(const rocblas_int n, + T* DD, + const rocblas_stride strideD, + T* EE, + const rocblas_stride strideE, + rocblas_int* infoA, + rocblas_int* stackA, + const rocblas_int max_iters, + const T eps, + const T ssfmin, + const T ssfmax) +{ + rocblas_int bid = hipBlockIdx_x; + + T* D = DD + (bid * strideD); + T* E = EE + (bid * strideE); + rocblas_int* info = infoA + bid; + rocblas_int* stack = stackA + bid * (2 * 32); + + run_sterf(n, D, E, info, stack, max_iters, eps, ssfmin, ssfmax); +} + template <typename T> void rocsolver_sterf_getMemorySize(const rocblas_int n, const rocblas_int batch_count, @@ -368,6 +385,9 @@ rocblas_status rocsolver_sterf_template(rocblas_handle handle, hipStream_t stream; rocblas_get_stream(handle, &stream); + rocsolver_alg_mode alg_mode; + ROCBLAS_CHECK(rocsolver_get_alg_mode(handle, rocsolver_function_sterf, &alg_mode)); + rocblas_int blocksReset = (batch_count - 1) / BS1 + 1; dim3 gridReset(blocksReset, 1, 1); dim3 threads(BS1, 1, 1); @@ -385,8 +405,33 @@ rocblas_status rocsolver_sterf_template(rocblas_handle handle, ssfmin = sqrt(ssfmin) / (eps * eps); ssfmax = sqrt(ssfmax) / T(3.0); - ROCSOLVER_LAUNCH_KERNEL(sterf_kernel<T>, dim3(batch_count), dim3(1), 0, stream, n, D + shiftD, - strideD, E + shiftE, strideE, info, stack, 30 * n, eps, ssfmin, ssfmax); + if(alg_mode == rocsolver_alg_mode_hybrid) + { + rocsolver_hybrid_array hD; + rocsolver_hybrid_array hE; + rocsolver_hybrid_array hInfo; + + ROCBLAS_CHECK(hD.init_async(n, D + shiftD, strideD, batch_count, stream)); + ROCBLAS_CHECK(hE.init_async(n - 1, E + shiftE, strideE, batch_count, stream)); + ROCBLAS_CHECK(hInfo.init_async(1, info, 1, batch_count, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + + for(rocblas_int b = 0; b < batch_count; b++) + { + run_sterf(n, hD[b], hE[b], hInfo[b], nullptr, 30 * n, eps, ssfmin, ssfmax); + } + + ROCBLAS_CHECK(hD.write_to_device_async(stream)); + ROCBLAS_CHECK(hE.write_to_device_async(stream)); + ROCBLAS_CHECK(hInfo.write_to_device_async(stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + } + else + { + ROCSOLVER_LAUNCH_KERNEL(sterf_kernel<T>, dim3(batch_count), dim3(1), 0, stream, n, + D + shiftD, strideD, E + shiftE, strideE, info, stack, 30 * n, eps, + ssfmin, ssfmax); + } return rocblas_status_success; } diff --git a/library/src/common/rocsolver_handle.cpp b/library/src/common/rocsolver_handle.cpp index 949d60fd2..b3680e5b7 100644 --- a/library/src/common/rocsolver_handle.cpp +++ b/library/src/common/rocsolver_handle.cpp @@ -66,6 +66,12 @@ rocblas_status rocsolver_set_alg_mode_impl(rocblas_handle handle, handle_data->bdsqr_mode = mode; return rocblas_status_success; } + case rocsolver_function_sterf: + if(mode == rocsolver_alg_mode_gpu || mode == rocsolver_alg_mode_hybrid) + { + handle_data->sterf_mode = mode; + return rocblas_status_success; + } } return rocblas_status_invalid_value; @@ -95,6 +101,7 @@ rocblas_status rocsolver_get_alg_mode_impl(rocblas_handle handle, { case rocsolver_function_gesvd: case rocsolver_function_bdsqr: *mode = 
handle_data->bdsqr_mode; break; + case rocsolver_function_sterf: *mode = handle_data->sterf_mode; break; default: return rocblas_status_invalid_value; } } diff --git a/library/src/include/lapack_device_functions.hpp b/library/src/include/lapack_device_functions.hpp index 6e4760c44..e766596ef 100644 --- a/library/src/include/lapack_device_functions.hpp +++ b/library/src/include/lapack_device_functions.hpp @@ -363,7 +363,7 @@ __device__ void lasr(const rocblas_side side, [ a b ] [ b c ] **/ template , int> = 0> -__device__ void lae2(T& a, T& b, T& c, T& rt1, T& rt2) +__device__ __host__ void lae2(T& a, T& b, T& c, T& rt1, T& rt2) { T sm = a + c; T adf = abs(a - c); diff --git a/library/src/include/lib_device_helpers.hpp b/library/src/include/lib_device_helpers.hpp index dc011f03a..08155d0e0 100644 --- a/library/src/include/lib_device_helpers.hpp +++ b/library/src/include/lib_device_helpers.hpp @@ -98,7 +98,7 @@ __device__ void /** FIND_MAX_TRIDIAG finds the element with the largest magnitude in the tridiagonal matrix **/ template -__device__ T find_max_tridiag(const rocblas_int start, const rocblas_int end, T* D, T* E) +__device__ __host__ T find_max_tridiag(const rocblas_int start, const rocblas_int end, T* D, T* E) { T anorm = abs(D[end]); for(int i = start; i < end; i++) @@ -109,7 +109,8 @@ __device__ T find_max_tridiag(const rocblas_int start, const rocblas_int end, T* /** SCALE_TRIDIAG scales the elements of the tridiagonal matrix by a given scale factor **/ template -__device__ void scale_tridiag(const rocblas_int start, const rocblas_int end, T* D, T* E, T scale) +__device__ __host__ void + scale_tridiag(const rocblas_int start, const rocblas_int end, T* D, T* E, T scale) { D[end] *= scale; for(int i = start; i < end; i++) @@ -119,1122 +120,1124 @@ __device__ void scale_tridiag(const rocblas_int start, const rocblas_int end, T* } } -// ********************************************************** -// GPU kernels that are used by many rocsolver functions -// ********************************************************** - -enum copymat_direction -{ - copymat_to_buffer, - copymat_from_buffer -}; - -/** A mask that is always true. Typically used to make the mask optional, - by acting as the default when no other mask is provided. 
**/ -struct no_mask -{ - __device__ constexpr bool operator[](rocblas_int) const noexcept - { - return true; - } -}; - -/** An mask defined by an integer array (e.g., the info array) **/ -struct info_mask +template +__device__ static void shell_sort_ascending(const I n, S* a, I* map = nullptr) { - enum mask_transform + // ----------------------------------------------- + // Sort array a[0...(n-1)] and generate permutation vector + // in map[] if map[] is available + // Note: performs in a single thread block + // ----------------------------------------------- + if((n <= 0) || (a == nullptr)) { - none, - negate + return; }; - explicit constexpr info_mask(rocblas_int* mask, mask_transform transform = none) noexcept - : m_mask(mask) - , m_negate(transform == negate) - { - } - - __device__ constexpr bool operator[](rocblas_int idx) const noexcept - { - return m_negate ^ !!m_mask[idx]; - } - - rocblas_int* m_mask; - bool m_negate; -}; - -/** COPY_MAT copies m-by-n array A into buffer if copymat_to_buffer, or buffer into A if copymat_from_buffer - An optional mask can be provided to limit the copy to selected matricies in the batch - If uplo = rocblas_fill_upper, only the upper triangular part is copied - If uplo = rocblas_fill_lower, only the lower triangular part is copied **/ -template -ROCSOLVER_KERNEL void copy_mat(copymat_direction direction, - const rocblas_int m, - const rocblas_int n, - U A, - const rocblas_int shiftA, - const rocblas_int lda, - const rocblas_stride strideA, - T* buffer, - const Mask mask = no_mask{}, - const rocblas_fill uplo = rocblas_fill_full, - const rocblas_diagonal diag = rocblas_diagonal_non_unit) -{ - const auto b = hipBlockIdx_z; - const auto j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; - const auto i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - - const rocblas_int ldb = m; - const rocblas_stride strideB = rocblas_stride(ldb) * n; - - const bool copy = mask[b]; - - const bool full = (uplo == rocblas_fill_full); - const bool upper = (uplo == rocblas_fill_upper); - const bool lower = (uplo == rocblas_fill_lower); - const bool cdiag = (diag == rocblas_diagonal_non_unit); - - if(i < m && j < n && copy) - { - if(full || (upper && j > i) || (lower && i > j) || (cdiag && i == j)) - { - T* Ap = load_ptr_batch(A, b, shiftA, strideA); - T* Bp = &buffer[b * strideB]; + bool const is_root_thread + = (hipThreadIdx_x == 0) && (hipThreadIdx_y == 0) && (hipThreadIdx_z == 0); - if(direction == copymat_to_buffer) - Bp[i + j * ldb] = Ap[i + j * lda]; - else // direction == copymat_from_buffer - Ap[i + j * lda] = Bp[i + j * ldb]; - } - } -} + int constexpr ngaps = 8; + int const gaps[ngaps] = {701, 301, 132, 57, 23, 10, 4, 1}; // # Ciura gap sequence -/** COPY_MAT copies m-by-n array A into B - An optional mask can be provided to limit the copy to selected matricies in the batch - If uplo = rocblas_fill_upper, only the upper triangular part is copied - If uplo = rocblas_fill_lower, only the lower triangular part is copied **/ -template -ROCSOLVER_KERNEL void copy_mat(const rocblas_int m, - const rocblas_int n, - U1 A, - const rocblas_int shiftA, - const rocblas_int lda, - const rocblas_stride strideA, - U2 B, - const rocblas_int shiftB, - const rocblas_int ldb, - const rocblas_stride strideB, - const Mask mask = no_mask{}, - const rocblas_fill uplo = rocblas_fill_full, - const rocblas_diagonal diag = rocblas_diagonal_non_unit) -{ - const auto b = hipBlockIdx_z; - const auto j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; - const auto i = hipBlockIdx_x * 
hipBlockDim_x + hipThreadIdx_x; + auto const k_start = hipThreadIdx_x + hipThreadIdx_y * hipBlockDim_x + + hipThreadIdx_z * (hipBlockDim_x * hipBlockDim_y); - const bool copy = mask[b]; + auto const k_inc = (hipBlockDim_x * hipBlockDim_y) * hipBlockDim_z; - const bool full = (uplo == rocblas_fill_full); - const bool upper = (uplo == rocblas_fill_upper); - const bool lower = (uplo == rocblas_fill_lower); - const bool cdiag = (diag == rocblas_diagonal_non_unit); + bool const has_map = (map != nullptr); - if(i < m && j < n && copy) + __syncthreads(); + if(has_map) { - if(full || (upper && j > i) || (lower && i > j) || (cdiag && i == j)) + for(auto k = k_start; k < n; k += k_inc) { - T* Ap = load_ptr_batch(A, b, shiftA, strideA); - T* Bp = load_ptr_batch(B, b, shiftB, strideB); - - Bp[i + j * ldb] = Ap[i + j * lda]; - } - } -} - -/** COPY_MAT copies m-by-n array A into buffer if copymat_to_buffer, or buffer into A if copymat_from_buffer - If uplo = rocblas_fill_upper, only the upper triangular part is copied - If uplo = rocblas_fill_lower, only the lower triangular part is copied - Only valid when A is complex and buffer real. If REAL, only works with real part of A; - if !REAL only works with imaginary part of A**/ -template , int> = 0> -ROCSOLVER_KERNEL void copy_mat(copymat_direction direction, - const rocblas_int m, - const rocblas_int n, - U A, - const rocblas_int shiftA, - const rocblas_int lda, - const rocblas_stride strideA, - S* buffer, - const rocblas_fill uplo = rocblas_fill_full, - const rocblas_diagonal diag = rocblas_diagonal_non_unit) -{ - const auto b = hipBlockIdx_z; - const auto j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; - const auto i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - - const rocblas_int ldb = m; - const rocblas_stride strideB = rocblas_stride(ldb) * n; - - const bool lower = (uplo == rocblas_fill_lower); - const bool full = (uplo == rocblas_fill_full); - const bool upper = (uplo == rocblas_fill_upper); - const bool cdiag = (diag == rocblas_diagonal_non_unit); + map[k] = k; + }; + }; + __syncthreads(); - if(i < m && j < n) + if(is_root_thread) { - if(full || (upper && j > i) || (lower && i > j) || (cdiag && i == j)) + for(auto igap = 0; igap < ngaps; igap++) { - T* Ap = load_ptr_batch(A, b, shiftA, strideA); - S* Bp = &buffer[b * strideB]; - - if(direction == copymat_to_buffer) - Bp[i + j * ldb] = REAL ? Ap[i + j * lda].real() : Ap[i + j * lda].imag(); - else if(REAL) - Ap[i + j * lda] = rocblas_complex_num(Bp[i + j * ldb], Ap[i + j * lda].imag()); - else - Ap[i + j * lda] = rocblas_complex_num(Ap[i + j * lda].real(), Bp[i + j * ldb]); - } - } -} - -/** COPY_TRANS_MAT copies m-by-n array A into B and transpose it depending on the value of trans. 
- An optional mask can be provided to limit the copy to selected matricies in the batch - If uplo = rocblas_fill_upper, only the upper triangular part is copied - If uplo = rocblas_fill_lower, only the lower triangular part is copied **/ -template -ROCSOLVER_KERNEL void copy_trans_mat(const rocblas_operation trans, - const rocblas_int m, - const rocblas_int n, - T1* A, - const rocblas_int shiftA, - const rocblas_int lda, - const rocblas_stride strideA, - T2* B, - const rocblas_int shiftB, - const rocblas_int ldb, - const rocblas_stride strideB, - const Mask mask = no_mask{}, - const rocblas_fill uplo = rocblas_fill_full, - const rocblas_diagonal diag = rocblas_diagonal_non_unit) -{ - const auto b = hipBlockIdx_z; - const auto j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; - const auto i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - - const bool copy = mask[b]; - - const bool full = (uplo == rocblas_fill_full); - const bool upper = (uplo == rocblas_fill_upper); - const bool lower = (uplo == rocblas_fill_lower); - const bool cdiag = (diag == rocblas_diagonal_non_unit); + auto const gap = gaps[igap]; - if(i < m && j < n && copy) - { - if(full || (upper && j > i) || (lower && i > j) || (cdiag && i == j)) - { - T1* Ap = load_ptr_batch(A, b, shiftA, strideA); - T2* Bp = load_ptr_batch(B, b, shiftB, strideB); + for(I i = gap; i < n; i += 1) + { + { + S const temp = a[i]; + auto const itemp = (has_map) ? map[i] : 0; - if(trans == rocblas_operation_conjugate_transpose) - Bp[j + i * ldb] = T2(conj(Ap[i + j * lda])); - else if(trans == rocblas_operation_transpose) - Bp[j + i * ldb] = T2(Ap[i + j * lda]); - else - Bp[i + j * ldb] = T2(Ap[i + j * lda]); - } - } -} + auto j = i; -template -ROCSOLVER_KERNEL void init_ident(const rocblas_int m, - const rocblas_int n, - U A, - const rocblas_int shiftA, - const rocblas_int lda, - const rocblas_stride strideA) -{ - const auto i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - const auto j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; - const auto b = hipBlockIdx_z; + while((j >= gap) && (a[j - gap] > temp)) + { + a[j] = a[j - gap]; + if(has_map) + { + map[j] = map[j - gap]; + }; + j -= gap; + }; - if(i < m && j < n) + a[j] = temp; + if(has_map) + { + map[j] = itemp; + }; + }; + }; + }; + }; + __syncthreads(); +#ifdef NDEBUG +#else + if(n >= 2) { - T* a = load_ptr_batch(A, b, shiftA, strideA); - - if(i == j) - a[i + j * lda] = 1.0; - else - a[i + j * lda] = 0.0; - } + for(auto k = k_start; k < (n - 1); k += k_inc) + { + assert(a[k] <= a[k + 1]); + }; + }; + __syncthreads(); +#endif } -template -ROCSOLVER_KERNEL void reset_info(T* info, const I n, U val, I incr = 0) +template +__device__ static void shell_sort_descending(const I n, S* a, I* map = nullptr) { - I idx = hipBlockIdx_x * static_cast(hipBlockDim_x) + hipThreadIdx_x; + // ----------------------------------------------- + // Sort array a[0...(n-1)] and generate permutation vector + // in map[] if map[] is available + // Note: performs in a single thread block + // ----------------------------------------------- - if(idx < n) - info[idx] = T(val) + incr * idx; -} + if((n <= 0) || (a == nullptr)) + { + return; + }; -template -ROCSOLVER_KERNEL void reset_batch_info(U info, const rocblas_stride stride, const I n, S val) -{ - I idx = hipBlockIdx_x * static_cast(hipBlockDim_x) + hipThreadIdx_x; - I b = hipBlockIdx_y; + bool const is_root_thread + = (hipThreadIdx_x == 0) && (hipThreadIdx_y == 0) && (hipThreadIdx_z == 0); - T* inf = load_ptr_batch(info, b, 0, stride); - if(idx < n) - 
inf[idx] = T(val); -} + int constexpr ngaps = 8; + int const gaps[ngaps] = {701, 301, 132, 57, 23, 10, 4, 1}; // # Ciura gap sequence -template -ROCSOLVER_KERNEL void get_array(T** out, T* in, rocblas_stride stride, I batch) -{ - I b = hipBlockIdx_x * static_cast(hipBlockDim_x) + hipThreadIdx_x; + auto const k_start = hipThreadIdx_x + hipThreadIdx_y * hipBlockDim_x + + hipThreadIdx_z * (hipBlockDim_x * hipBlockDim_y); - if(b < batch) - out[b] = in + b * stride; -} + auto const k_inc = (hipBlockDim_x * hipBlockDim_y) * hipBlockDim_z; -template -ROCSOLVER_KERNEL void shift_array(T** out, U in, rocblas_stride shift, I batch) -{ - I b = hipBlockIdx_x * static_cast(hipBlockDim_x) + hipThreadIdx_x; + bool const has_map = (map != nullptr); - if(b < batch) - out[b] = in[b] + shift; -} + __syncthreads(); + if(has_map) + { + for(auto k = k_start; k < n; k += k_inc) + { + map[k] = k; + }; + }; + __syncthreads(); -template -ROCSOLVER_KERNEL void subtract_tau(const rocblas_int i, - const rocblas_int j, - U A, - const rocblas_int shiftA, - const rocblas_int lda, - const rocblas_stride strideA, - T* ipiv, - const rocblas_stride strideP) -{ - const auto b = hipBlockIdx_x; - T* Ap = load_ptr_batch(A, b, shiftA, strideA); - T* tau = ipiv + b * strideP; + if(is_root_thread) + { + for(auto igap = 0; igap < ngaps; igap++) + { + auto const gap = gaps[igap]; - T t = -(*tau); - *tau = t; - Ap[i + j * lda] = 1.0 + t; -} + for(rocblas_int i = gap; i < n; i += 1) + { + { + S const temp = a[i]; + auto const itemp = (has_map) ? map[i] : 0; -template -ROCSOLVER_KERNEL void restau(const rocblas_int k, T* ipiv, const rocblas_stride strideP) -{ - const auto blocksizex = hipBlockDim_x; - const auto b = hipBlockIdx_y; - T* tau = ipiv + b * strideP; - const auto i = hipBlockIdx_x * blocksizex + hipThreadIdx_x; + auto j = i; - if(i < k) - tau[i] = -tau[i]; + while((j >= gap) && (a[j - gap] < temp)) + { + a[j] = a[j - gap]; + if(has_map) + { + map[j] = map[j - gap]; + }; + j -= gap; + }; + + a[j] = temp; + if(has_map) + { + map[j] = itemp; + }; + }; + }; + }; + }; + __syncthreads(); +#ifdef NDEBUG +#else + if(n >= 2) + { + for(auto k = k_start; k < (n - 1); k += k_inc) + { + assert(a[k] >= a[k + 1]); + }; + }; + __syncthreads(); +#endif } -template || rocblas_is_complex, int> = 0> -ROCSOLVER_KERNEL void set_diag(S* D, - const rocblas_stride shiftd, - const rocblas_stride strided, - U A, - const rocblas_stride shifta, - const I lda, - const rocblas_stride stridea, - const I n, - bool set_one) +template +__device__ static void shell_sort(const I n, S* a, I* map = nullptr, const bool is_ascending = true) { - I b = hipBlockIdx_x; - I i = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; - I j = i + i * lda; - - S* d = load_ptr_batch(D, b, shiftd, strided); - T* a = load_ptr_batch(A, b, shifta, stridea); + if((n <= 0) || (a == nullptr)) + { + return; + }; - if(i < n) + if(is_ascending) { - d[i] = a[j]; - a[j] = set_one ? 
T(1) : a[j]; + shell_sort_ascending(n, a, map); } + else + { + shell_sort_descending(n, a, map); + }; } -template && !rocblas_is_complex, int> = 0> -ROCSOLVER_KERNEL void set_diag(S* D, - const rocblas_stride shiftd, - const rocblas_stride strided, - U A, - const rocblas_stride shifta, - const I lda, - const rocblas_stride stridea, - const I n, - bool set_one) +template +__device__ static void selection_sort_ascending(const I n, S* D, I* map = nullptr) { - I b = hipBlockIdx_x; - I i = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; - I j = i + i * lda; + // --------------------------------------------------- + // Sort entries in D[0...(n-1)] + // generates permutation vector in map[] if map[] is available + // Note: performs in a single thread block + // --------------------------------------------------- + if((n <= 0) || (D == nullptr)) + { + return; + }; - S* d = load_ptr_batch(D, b, shiftd, strided); - T* a = load_ptr_batch(A, b, shifta, stridea); + auto const tid = hipThreadIdx_x + hipThreadIdx_y * hipBlockDim_x + + hipThreadIdx_z * (hipBlockDim_x * hipBlockDim_y); - if(i < n) - { - d[i] = a[j].real(); - a[j] = set_one ? T(1) : a[j]; - } -} + auto const nthreads = (hipBlockDim_x * hipBlockDim_y) * hipBlockDim_z; -template -ROCSOLVER_KERNEL void restore_diag(S* D, - const rocblas_stride shiftd, - const rocblas_stride strided, - U A, - const rocblas_stride shifta, - const I lda, - const rocblas_stride stridea, - const I n) -{ - I b = hipBlockIdx_x; - I i = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; - I j = i + i * lda; + auto const k_start = tid; + auto const k_inc = nthreads; + bool const is_root_thread = (tid == 0); - S* d = load_ptr_batch(D, b, shiftd, strided); - T* a = load_ptr_batch(A, b, shifta, stridea); + bool const has_map = (map != nullptr); - if(i < n) - a[j] = d[i]; -} + __syncthreads(); -/** SET_ZERO inserts zeros in all the entries of a m-by-n matrix A. - If uplo = lower, the lower triangular part of A is kept unchanged. - If uplo = upper, the upper triangular part of A is kept unchanged **/ -template -ROCSOLVER_KERNEL void set_zero(const rocblas_int m, - const rocblas_int n, - U A, - const rocblas_int shiftA, - const rocblas_int lda, - const rocblas_stride strideA, - const rocblas_fill uplo = rocblas_fill_full) -{ - const auto b = hipBlockIdx_z; - const auto i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - const auto j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; - const bool lower = (uplo == rocblas_fill_lower); - const bool full = (uplo == rocblas_fill_full); - const bool upper = (uplo == rocblas_fill_upper); + if(has_map) + { + for(auto k = k_start; k < n; k += k_inc) + { + map[k] = k; + }; + }; + + __syncthreads(); - if(i < m && j < n) + if(is_root_thread) { - if(full || (lower && j > i) || (upper && i > j)) + for(I ii = 1; ii < n; ii++) { - T* Ap = load_ptr_batch(A, b, shiftA, strideA); - Ap[i + j * lda] = 0.0; + auto l = ii - 1; + auto m = l; + auto p = D[l]; + auto ip = (has_map) ? 
map[l] : 0; + + for(auto j = ii; j < n; j++) + { + if(D[j] < p) + { + m = j; + p = D[j]; + if(has_map) + { + ip = map[j]; + }; + } + } + if(m != l) + { + D[m] = D[l]; + D[l] = p; + + if(has_map) + { + map[m] = map[l]; + map[l] = ip; + }; + } } } + __syncthreads(); +#ifdef NDEBUG +#else + if(n >= 2) + { + for(auto k = k_start; k < (n - 1); k += k_inc) + { + assert(D[k] <= D[k + 1]); + }; + }; + __syncthreads(); +#endif } -template -ROCSOLVER_KERNEL void copyshift_right(const bool copy, - const rocblas_int dim, - U A, - const rocblas_int shiftA, - const rocblas_int lda, - const rocblas_stride strideA, - T* W, - const rocblas_int shiftW, - const rocblas_int ldw, - const rocblas_stride strideW) +template +__device__ static void selection_sort_descending(const I n, S* D, I* map = nullptr) { - const auto b = hipBlockIdx_z; - const auto j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; - const auto i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + // --------------------------------------------------- + // Sort entries in D[0...(n-1)] + // generates permutation vector in map[] if map[] is available + // Note: performs in a single thread block + // --------------------------------------------------- + if((n <= 0) || (D == nullptr)) + { + return; + }; - T* Ap = load_ptr_batch(A, b, shiftA, strideA); - T* Wp = load_ptr_batch(W, b, shiftW, strideW); + auto const tid = hipThreadIdx_x + hipThreadIdx_y * hipBlockDim_x + + hipThreadIdx_z * (hipBlockDim_x * hipBlockDim_y); - // make first row the identity - if(i == 0 && j == 0 && !copy) - Ap[0] = 1.0; + auto const nthreads = (hipBlockDim_x * hipBlockDim_y) * hipBlockDim_z; - if(i < dim && j < dim && j <= i) + auto const k_start = tid; + auto const k_inc = nthreads; + bool const is_root_thread = (tid == 0); + + bool const has_map = (map != nullptr); + + __syncthreads(); + + if(has_map) { - rocblas_int offset = j * (j + 1) / 2; // to acommodate in smaller array W + for(auto k = k_start; k < n; k += k_inc) + { + map[k] = k; + }; + }; - if(copy) + __syncthreads(); + + if(is_root_thread) + { + for(I ii = 1; ii < n; ii++) { - // copy columns - Wp[i + j * ldw - offset] = (j == 0 ? 0.0 : Ap[i + 1 + (j - 1) * lda]); + auto l = ii - 1; + auto m = l; + auto p = D[l]; + auto ip = (has_map) ? 
map[l] : 0; + + for(auto j = ii; j < n; j++) + { + if(D[j] > p) + { + m = j; + p = D[j]; + if(has_map) + { + ip = map[j]; + }; + } + } + if(m != l) + { + D[m] = D[l]; + D[l] = p; + + if(has_map) + { + map[m] = map[l]; + map[l] = ip; + }; + } } - else + } + __syncthreads(); +#ifdef NDEBUG +#else + if(n >= 2) + { + for(auto k = k_start; k < (n - 1); k += k_inc) { - // shift columns to the right - Ap[i + 1 + j * lda] = Wp[i + j * ldw - offset]; + assert(D[k] >= D[k + 1]); + }; + }; + __syncthreads(); +#endif +} - // make first row the identity - if(i == j) - Ap[(j + 1) * lda] = 0.0; - } +template +__device__ static void selection_sort(const I n, S* a, I* map = nullptr, const bool is_ascending = true) +{ + if((n <= 0) || (a == nullptr)) + { + return; + }; + + if(is_ascending) + { + selection_sort_ascending(n, a, map); } + else + { + selection_sort_descending(n, a, map); + }; } -template -ROCSOLVER_KERNEL void copyshift_left(const bool copy, - const rocblas_int dim, - U A, - const rocblas_int shiftA, - const rocblas_int lda, - const rocblas_stride strideA, - T* W, - const rocblas_int shiftW, - const rocblas_int ldw, - const rocblas_stride strideW) +template +__device__ static void permute_swap(const I n, T* C, I ldc, I* map, const I nev = -1) { - const auto b = hipBlockIdx_z; - const auto j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; - const auto i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + // -------------------------------------------- + // perform swaps to implement permutation vector + // the permutation vector will be restored to the + // identity permutation 0,1,2,... + // + // Note: performs in a thread block + // -------------------------------------------- + { + bool const has_work = (n >= 1) && (C != nullptr) && (ldc >= 1) && (map != nullptr); + if(!has_work) + { + return; + } + } - T* Ap = load_ptr_batch(A, b, shiftA, strideA); - T* Wp = load_ptr_batch(W, b, shiftW, strideW); + auto const tid = hipThreadIdx_x + hipThreadIdx_y * hipBlockDim_x + + hipThreadIdx_z * (hipBlockDim_x * hipBlockDim_y); - // make last row the identity - if(i == 0 && j == 0 && !copy) - Ap[dim + dim * lda] = 1.0; + auto const nthreads = (hipBlockDim_x * hipBlockDim_y) * hipBlockDim_z; - if(i < dim && j < dim && i <= j) + auto const k_start = tid; + auto const k_inc = nthreads; + + bool const is_root_thread = (tid == 0); + + auto const nn = (nev >= 0) ? nev : n; + + for(I i = 0; i < nn; i++) { - rocblas_int offset = j * ldw - j * (j + 1) / 2; // to acommodate in smaller array W + __syncthreads(); - if(copy) - { - // copy columns - Wp[i + j * ldw - offset] = (j == dim - 1 ? 
0.0 : Ap[i + (j + 2) * lda]); - } - else + while(map[i] != i) { - // shift columns to the left - Ap[i + (j + 1) * lda] = Wp[i + j * ldw - offset]; - - // make last row the identity - if(i == j) - Ap[dim + j * lda] = 0.0; - } - } -} + auto const map_i = map[i]; + auto const map_ii = map[map[i]]; -template -ROCSOLVER_KERNEL void copyshift_down(const bool copy, - const rocblas_int dim, - U A, - const rocblas_int shiftA, - const rocblas_int lda, - const rocblas_stride strideA, - T* W, - const rocblas_int shiftW, - const rocblas_int ldw, - const rocblas_stride strideW) -{ - const auto b = hipBlockIdx_z; - const auto j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; - const auto i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + __syncthreads(); - T* Ap = load_ptr_batch(A, b, shiftA, strideA); - T* Wp = load_ptr_batch(W, b, shiftW, strideW); + if(is_root_thread) + { + map[map_i] = map_i; + map[i] = map_ii; + } - // make first column the identity - if(i == 0 && j == 0 && !copy) - Ap[0] = 1.0; + __syncthreads(); - if(i < dim && j < dim && i <= j) - { - rocblas_int offset = j * ldw - j * (j + 1) / 2; // to acommodate in smaller array W + // --------------------------------------- + // swap columns C( 0:(n-1),map_i) and C( 0:(n-1),map_ii) + // --------------------------------------- - if(copy) - { - // copy rows - Wp[i + j * ldw - offset] = (i == 0 ? 0.0 : Ap[i - 1 + (j + 1) * lda]); - } - else - { - // shift rows downward - Ap[i + (j + 1) * lda] = Wp[i + j * ldw - offset]; + for(auto k = k_start; k < n; k += k_inc) + { + auto const k_map_i = k + (map_i * static_cast(ldc)); + auto const k_map_ii = k + (map_ii * static_cast(ldc)); - // make first column the identity - if(i == j) - Ap[i + 1] = 0.0; + auto const ctemp = C[k_map_i]; + C[k_map_i] = C[k_map_ii]; + C[k_map_ii] = ctemp; + } + + __syncthreads(); } } -} - -/** set_offdiag kernel copies the off-diagonal element of A, which is the non-zero element - resulting by applying the Householder reflector to the working column, to E. Then set it - to 1 to prepare for the application of the Householder reflector to the rest of the matrix **/ -template , int> = 0> -ROCSOLVER_KERNEL void set_offdiag(const rocblas_int batch_count, - U A, - const rocblas_int shiftA, - const rocblas_stride strideA, - S* E, - const rocblas_stride strideE) -{ - rocblas_int b = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - - if(b < batch_count) +#ifdef NDEBUG +#else + // ---------------------------------------------------------- + // extra check that map[] is restored to identity permutation + // ---------------------------------------------------------- + __syncthreads(); + for(auto k = k_start; k < nn; k += k_inc) { - T* a = load_ptr_batch(A, b, shiftA, strideA); - S* e = E + b * strideE; - - e[0] = a[0]; - a[0] = T(1); + assert(map[k] == k); } + __syncthreads(); +#endif } -template , int> = 0> -ROCSOLVER_KERNEL void set_offdiag(const rocblas_int batch_count, - U A, - const rocblas_int shiftA, - const rocblas_stride strideA, - S* E, - const rocblas_stride strideE) +// ********************************************************** +// GPU kernels that are used by many rocsolver functions +// ********************************************************** + +enum copymat_direction { - rocblas_int b = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + copymat_to_buffer, + copymat_from_buffer +}; - if(b < batch_count) +/** A mask that is always true. Typically used to make the mask optional, by acting as the default when + no other mask is provided. 
**/ +struct no_mask +{ + __device__ constexpr bool operator[](rocblas_int) const noexcept { - T* a = load_ptr_batch(A, b, shiftA, strideA); - S* e = E + b * strideE; - - e[0] = a[0].real(); - a[0] = T(1); + return true; } -} +}; -/** scale_axpy kernel executes axpy to update tau computing the scalar alpha with other - results in different memopry locations **/ -template -ROCSOLVER_KERNEL void scale_axpy(const rocblas_int n, - T* scl, - T* S, - const rocblas_stride strideS, - U A, - const rocblas_int shiftA, - const rocblas_stride strideA, - T* W, - const rocblas_int shiftW, - const rocblas_stride strideW) +/** An mask defined by an integer array (e.g., the info array). By default, a non-zero value for the ith integer + indicates that the data for batch i should be copied over. Data will not be copied if the mask value is zero. + This behaviour is reversed if the negate transform is passed to the constructor. **/ +struct info_mask { - rocblas_int b = hipBlockIdx_y; - rocblas_int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - - if(i < n) + enum mask_transform { - T* v = load_ptr_batch(A, b, shiftA, strideA); - T* w = load_ptr_batch(W, b, shiftW, strideW); - T* s = S + b * strideS; + none, + negate + }; - // scale - T alpha = -0.5 * s[0] * scl[b]; + explicit constexpr info_mask(rocblas_int* mask, mask_transform transform = none) noexcept + : m_mask(mask) + , m_negate(transform == negate) + { + } - // axpy - w[i] = alpha * v[i] + w[i]; + __device__ constexpr bool operator[](rocblas_int idx) const noexcept + { + return m_negate ^ !!m_mask[idx]; } -} -template -ROCSOLVER_KERNEL void check_singularity(const rocblas_int n, - U A, - const rocblas_int shiftA, - const rocblas_int lda, - const rocblas_stride strideA, - rocblas_int* info) + rocblas_int* m_mask; + bool m_negate; +}; + +/** COPY_MAT copies m-by-n array A into buffer if copymat_to_buffer, or buffer into A if copymat_from_buffer + An optional mask can be provided to limit the copy to selected matricies in the batch + If uplo = rocblas_fill_upper, only the upper triangular part is copied + If uplo = rocblas_fill_lower, only the lower triangular part is copied **/ +template +ROCSOLVER_KERNEL void copy_mat(copymat_direction direction, + const rocblas_int m, + const rocblas_int n, + U A, + const rocblas_int shiftA, + const rocblas_int lda, + const rocblas_stride strideA, + T* buffer, + const Mask mask = no_mask{}, + const rocblas_fill uplo = rocblas_fill_full, + const rocblas_diagonal diag = rocblas_diagonal_non_unit) { - // Checks for singularities in the matrix and updates info to indicate where - // the first singularity (if any) occurs - int b = hipBlockIdx_x; + const auto b = hipBlockIdx_z; + const auto j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + const auto i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - T* a = load_ptr_batch(A, b, shiftA, strideA); + const rocblas_int ldb = m; + const rocblas_stride strideB = rocblas_stride(ldb) * n; - __shared__ rocblas_int _info; + const bool copy = mask[b]; - if(hipThreadIdx_y == 0) - _info = 0; - __syncthreads(); + const bool full = (uplo == rocblas_fill_full); + const bool upper = (uplo == rocblas_fill_upper); + const bool lower = (uplo == rocblas_fill_lower); + const bool cdiag = (diag == rocblas_diagonal_non_unit); - for(int i = hipThreadIdx_y; i < n; i += hipBlockDim_y) + if(i < m && j < n && copy) { - if(a[i + i * lda] == 0) + if(full || (upper && j > i) || (lower && i > j) || (cdiag && i == j)) { - rocblas_int _info_temp = _info; - while(_info_temp == 0 || _info_temp > i 
+ 1) - _info_temp = atomicCAS(&_info, _info_temp, i + 1); + T* Ap = load_ptr_batch(A, b, shiftA, strideA); + T* Bp = &buffer[b * strideB]; + + if(direction == copymat_to_buffer) + Bp[i + j * ldb] = Ap[i + j * lda]; + else // direction == copymat_from_buffer + Ap[i + j * lda] = Bp[i + j * ldb]; } } - __syncthreads(); - - if(hipThreadIdx_y == 0) - info[b] = _info; } -template -__device__ static void shell_sort_ascending(const I n, S* a, I* map = nullptr) +/** COPY_MAT copies m-by-n array A into B + An optional mask can be provided to limit the copy to selected matricies in the batch + If uplo = rocblas_fill_upper, only the upper triangular part is copied + If uplo = rocblas_fill_lower, only the lower triangular part is copied **/ +template +ROCSOLVER_KERNEL void copy_mat(const rocblas_int m, + const rocblas_int n, + U1 A, + const rocblas_int shiftA, + const rocblas_int lda, + const rocblas_stride strideA, + U2 B, + const rocblas_int shiftB, + const rocblas_int ldb, + const rocblas_stride strideB, + const Mask mask = no_mask{}, + const rocblas_fill uplo = rocblas_fill_full, + const rocblas_diagonal diag = rocblas_diagonal_non_unit) { - // ----------------------------------------------- - // Sort array a[0...(n-1)] and generate permutation vector - // in map[] if map[] is available - // Note: performs in a single thread block - // ----------------------------------------------- - if((n <= 0) || (a == nullptr)) - { - return; - }; + const auto b = hipBlockIdx_z; + const auto j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + const auto i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - bool const is_root_thread - = (hipThreadIdx_x == 0) && (hipThreadIdx_y == 0) && (hipThreadIdx_z == 0); + const bool copy = mask[b]; - int constexpr ngaps = 8; - int const gaps[ngaps] = {701, 301, 132, 57, 23, 10, 4, 1}; // # Ciura gap sequence + const bool full = (uplo == rocblas_fill_full); + const bool upper = (uplo == rocblas_fill_upper); + const bool lower = (uplo == rocblas_fill_lower); + const bool cdiag = (diag == rocblas_diagonal_non_unit); - auto const k_start = hipThreadIdx_x + hipThreadIdx_y * hipBlockDim_x - + hipThreadIdx_z * (hipBlockDim_x * hipBlockDim_y); + if(i < m && j < n && copy) + { + if(full || (upper && j > i) || (lower && i > j) || (cdiag && i == j)) + { + T* Ap = load_ptr_batch(A, b, shiftA, strideA); + T* Bp = load_ptr_batch(B, b, shiftB, strideB); + + Bp[i + j * ldb] = Ap[i + j * lda]; + } + } +} + +/** COPY_MAT copies m-by-n array A into buffer if copymat_to_buffer, or buffer into A if copymat_from_buffer + If uplo = rocblas_fill_upper, only the upper triangular part is copied + If uplo = rocblas_fill_lower, only the lower triangular part is copied + Only valid when A is complex and buffer real. 
+/** COPY_MAT copies m-by-n array A into buffer if copymat_to_buffer, or buffer into A if copymat_from_buffer
+    If uplo = rocblas_fill_upper, only the upper triangular part is copied
+    If uplo = rocblas_fill_lower, only the lower triangular part is copied
+    Only valid when A is complex and buffer real. If REAL, only works with real part of A;
+    if !REAL only works with imaginary part of A**/
+template <typename T, typename S, typename U, bool REAL, std::enable_if_t<rocblas_is_complex<T>, int> = 0>
+ROCSOLVER_KERNEL void copy_mat(copymat_direction direction,
+                               const rocblas_int m,
+                               const rocblas_int n,
+                               U A,
+                               const rocblas_int shiftA,
+                               const rocblas_int lda,
+                               const rocblas_stride strideA,
+                               S* buffer,
+                               const rocblas_fill uplo = rocblas_fill_full,
+                               const rocblas_diagonal diag = rocblas_diagonal_non_unit)
+{
+    const auto b = hipBlockIdx_z;
+    const auto j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+    const auto i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;

-    auto const k_inc = (hipBlockDim_x * hipBlockDim_y) * hipBlockDim_z;
+    const rocblas_int ldb = m;
+    const rocblas_stride strideB = rocblas_stride(ldb) * n;

-    bool const has_map = (map != nullptr);
+    const bool lower = (uplo == rocblas_fill_lower);
+    const bool full = (uplo == rocblas_fill_full);
+    const bool upper = (uplo == rocblas_fill_upper);
+    const bool cdiag = (diag == rocblas_diagonal_non_unit);

-    __syncthreads();
-    if(has_map)
+    if(i < m && j < n)
     {
-        for(auto k = k_start; k < n; k += k_inc)
+        if(full || (upper && j > i) || (lower && i > j) || (cdiag && i == j))
         {
-            map[k] = k;
-        };
-    };
-    __syncthreads();
+            T* Ap = load_ptr_batch<T>(A, b, shiftA, strideA);
+            S* Bp = &buffer[b * strideB];

-    if(is_root_thread)
-    {
-        for(auto igap = 0; igap < ngaps; igap++)
-        {
-            auto const gap = gaps[igap];
+            if(direction == copymat_to_buffer)
+                Bp[i + j * ldb] = REAL ? Ap[i + j * lda].real() : Ap[i + j * lda].imag();
+            else if(REAL)
+                Ap[i + j * lda] = rocblas_complex_num<S>(Bp[i + j * ldb], Ap[i + j * lda].imag());
+            else
+                Ap[i + j * lda] = rocblas_complex_num<S>(Ap[i + j * lda].real(), Bp[i + j * ldb]);
+        }
+    }
+}

-            for(I i = gap; i < n; i += 1)
-            {
-                {
-                    S const temp = a[i];
-                    auto const itemp = (has_map) ? map[i] : 0;
+/** COPY_TRANS_MAT copies m-by-n array A into B and transposes it depending on the value of trans.
+    An optional mask can be provided to limit the copy to selected matrices in the batch
+    If uplo = rocblas_fill_upper, only the upper triangular part is copied
+    If uplo = rocblas_fill_lower, only the lower triangular part is copied **/
+template <typename T1, typename T2, typename Mask = no_mask>
+ROCSOLVER_KERNEL void copy_trans_mat(const rocblas_operation trans,
+                                     const rocblas_int m,
+                                     const rocblas_int n,
+                                     T1* A,
+                                     const rocblas_int shiftA,
+                                     const rocblas_int lda,
+                                     const rocblas_stride strideA,
+                                     T2* B,
+                                     const rocblas_int shiftB,
+                                     const rocblas_int ldb,
+                                     const rocblas_stride strideB,
+                                     const Mask mask = no_mask{},
+                                     const rocblas_fill uplo = rocblas_fill_full,
+                                     const rocblas_diagonal diag = rocblas_diagonal_non_unit)
+{
+    const auto b = hipBlockIdx_z;
+    const auto j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+    const auto i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;

-                    auto j = i;
+    const bool copy = mask[b];

-                    while((j >= gap) && (a[j - gap] > temp))
-                    {
-                        a[j] = a[j - gap];
-                        if(has_map)
-                        {
-                            map[j] = map[j - gap];
-                        };
-                        j -= gap;
-                    };
+    const bool full = (uplo == rocblas_fill_full);
+    const bool upper = (uplo == rocblas_fill_upper);
+    const bool lower = (uplo == rocblas_fill_lower);
+    const bool cdiag = (diag == rocblas_diagonal_non_unit);

-                    a[j] = temp;
-                    if(has_map)
-                    {
-                        map[j] = itemp;
-                    };
-                };
-            };
-        };
-    };
-    __syncthreads();
-#ifdef NDEBUG
-#else
-    if(n >= 2)
+    if(i < m && j < n && copy)
     {
-        for(auto k = k_start; k < (n - 1); k += k_inc)
+        if(full || (upper && j > i) || (lower && i > j) || (cdiag && i == j))
         {
-            assert(a[k] <= a[k + 1]);
-        };
-    };
-    __syncthreads();
-#endif
+            T1* Ap = load_ptr_batch<T1>(A, b, shiftA, strideA);
+            T2* Bp = load_ptr_batch<T2>(B, b, shiftB, strideB);
+
+            if(trans == rocblas_operation_conjugate_transpose)
+                Bp[j + i * ldb] = T2(conj(Ap[i + j * lda]));
+            else if(trans == rocblas_operation_transpose)
+                Bp[j + i * ldb] = T2(Ap[i + j * lda]);
+            else
+                Bp[i + j * ldb] = T2(Ap[i + j * lda]);
+        }
+    }
 }

-template <typename S, typename I>
-__device__ static void shell_sort_descending(const I n, S* a, I* map = nullptr)
+template <typename T, typename U>
+ROCSOLVER_KERNEL void init_ident(const rocblas_int m,
+                                 const rocblas_int n,
+                                 U A,
+                                 const rocblas_int shiftA,
+                                 const rocblas_int lda,
+                                 const rocblas_stride strideA)
 {
-    // -----------------------------------------------
-    // Sort array a[0...(n-1)] and generate permutation vector
-    // in map[] if map[] is available
-    // Note: performs in a single thread block
-    // -----------------------------------------------
+    const auto i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+    const auto j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+    const auto b = hipBlockIdx_z;

-    if((n <= 0) || (a == nullptr))
+    if(i < m && j < n)
     {
-        return;
-    };
+        T* a = load_ptr_batch<T>(A, b, shiftA, strideA);

-    bool const is_root_thread
-        = (hipThreadIdx_x == 0) && (hipThreadIdx_y == 0) && (hipThreadIdx_z == 0);
+        if(i == j)
+            a[i + j * lda] = 1.0;
+        else
+            a[i + j * lda] = 0.0;
+    }
+}

-    int constexpr ngaps = 8;
-    int const gaps[ngaps] = {701, 301, 132, 57, 23, 10, 4, 1}; // # Ciura gap sequence
+template <typename T, typename I, typename U>
+ROCSOLVER_KERNEL void reset_info(T* info, const I n, U val, I incr = 0)
+{
+    I idx = hipBlockIdx_x * static_cast<I>(hipBlockDim_x) + hipThreadIdx_x;

-    auto const k_start = hipThreadIdx_x + hipThreadIdx_y * hipBlockDim_x
-        + hipThreadIdx_z * (hipBlockDim_x * hipBlockDim_y);
+    if(idx < n)
+        info[idx] = T(val) + incr * idx;
+}

-    auto const k_inc = (hipBlockDim_x * hipBlockDim_y) * hipBlockDim_z;
+template <typename T, typename U, typename I, typename S>
+ROCSOLVER_KERNEL void reset_batch_info(U info, const rocblas_stride stride, const I n, S val)
+{
+    I idx = hipBlockIdx_x * static_cast<I>(hipBlockDim_x) + hipThreadIdx_x;
+    I b = hipBlockIdx_y;

-    bool const has_map = (map != nullptr);
+    T* inf = load_ptr_batch<T>(info, b, 0, stride);
+    if(idx < n)
+        inf[idx] = T(val);
+}

-    __syncthreads();
-    if(has_map)
-    {
-        for(auto k = k_start; k < n; k += k_inc)
-        {
-            map[k] = k;
-        };
-    };
-    __syncthreads();
+template <typename T, typename I>
+ROCSOLVER_KERNEL void get_array(T** out, T* in, rocblas_stride stride, I batch)
+{
+    I b = hipBlockIdx_x * static_cast<I>(hipBlockDim_x) + hipThreadIdx_x;

-    if(is_root_thread)
-    {
-        for(auto igap = 0; igap < ngaps; igap++)
-        {
-            auto const gap = gaps[igap];
+    if(b < batch)
+        out[b] = in + b * stride;
+}

-            for(rocblas_int i = gap; i < n; i += 1)
-            {
-                {
-                    S const temp = a[i];
-                    auto const itemp = (has_map) ? map[i] : 0;
+template <typename T, typename U, typename I>
+ROCSOLVER_KERNEL void shift_array(T** out, U in, rocblas_stride shift, I batch)
+{
+    I b = hipBlockIdx_x * static_cast<I>(hipBlockDim_x) + hipThreadIdx_x;

-                    auto j = i;
+    if(b < batch)
+        out[b] = in[b] + shift;
+}

-                    while((j >= gap) && (a[j - gap] < temp))
-                    {
-                        a[j] = a[j - gap];
-                        if(has_map)
-                        {
-                            map[j] = map[j - gap];
-                        };
-                        j -= gap;
-                    };
+template <typename T, typename U>
+ROCSOLVER_KERNEL void subtract_tau(const rocblas_int i,
+                                   const rocblas_int j,
+                                   U A,
+                                   const rocblas_int shiftA,
+                                   const rocblas_int lda,
+                                   const rocblas_stride strideA,
+                                   T* ipiv,
+                                   const rocblas_stride strideP)
+{
+    const auto b = hipBlockIdx_x;
+    T* Ap = load_ptr_batch<T>(A, b, shiftA, strideA);
+    T* tau = ipiv + b * strideP;

-                    a[j] = temp;
-                    if(has_map)
-                    {
-                        map[j] = itemp;
-                    };
-                };
-            };
-        };
-    };
+    T t = -(*tau);
+    *tau = t;
+    Ap[i + j * lda] = 1.0 + t;
+}

-    __syncthreads();
-#ifdef NDEBUG
-#else
-    if(n >= 2)
-    {
-        for(auto k = k_start; k < (n - 1); k += k_inc)
-        {
-            assert(a[k] >= a[k + 1]);
-        };
-    };
-    __syncthreads();
-#endif
+template <typename T>
+ROCSOLVER_KERNEL void restau(const rocblas_int k, T* ipiv, const rocblas_stride strideP)
+{
+    const auto blocksizex = hipBlockDim_x;
+    const auto b = hipBlockIdx_y;
+    T* tau = ipiv + b * strideP;
+    const auto i = hipBlockIdx_x * blocksizex + hipThreadIdx_x;
+
+    if(i < k)
+        tau[i] = -tau[i];
 }

-template <typename S, typename I>
-__device__ static void shell_sort(const I n, S* a, I* map = nullptr, const bool is_ascending = true)
+template <typename T, typename S, typename U, typename I, std::enable_if_t<std::is_same_v<T, S> || rocblas_is_complex<S>, int> = 0>
+ROCSOLVER_KERNEL void set_diag(S* D,
+                               const rocblas_stride shiftd,
+                               const rocblas_stride strided,
+                               U A,
+                               const rocblas_stride shifta,
+                               const I lda,
+                               const rocblas_stride stridea,
+                               const I n,
+                               bool set_one)
 {
-    if((n <= 0) || (a == nullptr))
-    {
-        return;
-    };
+    I b = hipBlockIdx_x;
+    I i = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+    I j = i + i * lda;

-    if(is_ascending)
+    S* d = load_ptr_batch<S>(D, b, shiftd, strided);
+    T* a = load_ptr_batch<T>(A, b, shifta, stridea);
+
+    if(i < n)
     {
-        shell_sort_ascending(n, a, map);
+        d[i] = a[j];
+        a[j] = set_one ? T(1) : a[j];
     }
-    else
-    {
-        shell_sort_descending(n, a, map);
-    };
 }

-template <typename S, typename I>
-__device__ static void selection_sort_ascending(const I n, S* D, I* map = nullptr)
+template <typename T, typename S, typename U, typename I, std::enable_if_t<!std::is_same_v<T, S> && !rocblas_is_complex<S>, int> = 0>
+ROCSOLVER_KERNEL void set_diag(S* D,
+                               const rocblas_stride shiftd,
+                               const rocblas_stride strided,
+                               U A,
+                               const rocblas_stride shifta,
+                               const I lda,
+                               const rocblas_stride stridea,
+                               const I n,
+                               bool set_one)
 {
-    // ---------------------------------------------------
-    // Sort entries in D[0...(n-1)]
-    // generates permutation vector in map[] if map[] is available
-    // Note: performs in a single thread block
-    // ---------------------------------------------------
-    if((n <= 0) || (D == nullptr))
-    {
-        return;
-    };
+    I b = hipBlockIdx_x;
+    I i = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+    I j = i + i * lda;

-    auto const tid = hipThreadIdx_x + hipThreadIdx_y * hipBlockDim_x
-        + hipThreadIdx_z * (hipBlockDim_x * hipBlockDim_y);
+    S* d = load_ptr_batch<S>(D, b, shiftd, strided);
+    T* a = load_ptr_batch<T>(A, b, shifta, stridea);

-    auto const nthreads = (hipBlockDim_x * hipBlockDim_y) * hipBlockDim_z;
+    if(i < n)
+    {
+        d[i] = a[j].real();
+        a[j] = set_one ? T(1) : a[j];
+    }
+}

-    auto const k_start = tid;
-    auto const k_inc = nthreads;
-    bool const is_root_thread = (tid == 0);
+template <typename T, typename S, typename U, typename I>
+ROCSOLVER_KERNEL void restore_diag(S* D,
+                                   const rocblas_stride shiftd,
+                                   const rocblas_stride strided,
+                                   U A,
+                                   const rocblas_stride shifta,
+                                   const I lda,
+                                   const rocblas_stride stridea,
+                                   const I n)
+{
+    I b = hipBlockIdx_x;
+    I i = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+    I j = i + i * lda;

-    bool const has_map = (map != nullptr);
+    S* d = load_ptr_batch<S>(D, b, shiftd, strided);
+    T* a = load_ptr_batch<T>(A, b, shifta, stridea);

-    __syncthreads();
+    if(i < n)
+        a[j] = d[i];
+}

-    if(has_map)
+/** SET_ZERO inserts zeros in all the entries of a m-by-n matrix A.
+    If uplo = lower, the lower triangular part of A is kept unchanged.
+    If uplo = upper, the upper triangular part of A is kept unchanged **/
+template <typename T, typename U>
+ROCSOLVER_KERNEL void set_zero(const rocblas_int m,
+                               const rocblas_int n,
+                               U A,
+                               const rocblas_int shiftA,
+                               const rocblas_int lda,
+                               const rocblas_stride strideA,
+                               const rocblas_fill uplo = rocblas_fill_full)
+{
+    const auto b = hipBlockIdx_z;
+    const auto i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+    const auto j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+    const bool lower = (uplo == rocblas_fill_lower);
+    const bool full = (uplo == rocblas_fill_full);
+    const bool upper = (uplo == rocblas_fill_upper);
+
+    if(i < m && j < n)
     {
-        for(auto k = k_start; k < n; k += k_inc)
+        if(full || (lower && j > i) || (upper && i > j))
         {
-            map[k] = k;
-        };
-    };
+            T* Ap = load_ptr_batch<T>(A, b, shiftA, strideA);
+            Ap[i + j * lda] = 0.0;
+        }
+    }
+}

-    __syncthreads();
+template <typename T, typename U>
+ROCSOLVER_KERNEL void copyshift_right(const bool copy,
+                                      const rocblas_int dim,
+                                      U A,
+                                      const rocblas_int shiftA,
+                                      const rocblas_int lda,
+                                      const rocblas_stride strideA,
+                                      T* W,
+                                      const rocblas_int shiftW,
+                                      const rocblas_int ldw,
+                                      const rocblas_stride strideW)
+{
+    const auto b = hipBlockIdx_z;
+    const auto j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+    const auto i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;

-    if(is_root_thread)
+    T* Ap = load_ptr_batch<T>(A, b, shiftA, strideA);
+    T* Wp = load_ptr_batch<T>(W, b, shiftW, strideW);
+
+    // make first row the identity
+    if(i == 0 && j == 0 && !copy)
+        Ap[0] = 1.0;
+
+    if(i < dim && j < dim && j <= i)
     {
-        for(I ii = 1; ii < n; ii++)
-        {
-            auto l = ii - 1;
-            auto m = l;
-            auto p = D[l];
-            auto ip = (has_map) ? map[l] : 0;
+        rocblas_int offset = j * (j + 1) / 2; // to accommodate in smaller array W

-            for(auto j = ii; j < n; j++)
-            {
-                if(D[j] < p)
-                {
-                    m = j;
-                    p = D[j];
-                    if(has_map)
-                    {
-                        ip = map[j];
-                    };
-                }
-            }
-            if(m != l)
-            {
-                D[m] = D[l];
-                D[l] = p;
+        if(copy)
+        {
+            // copy columns
+            Wp[i + j * ldw - offset] = (j == 0 ? 0.0 : Ap[i + 1 + (j - 1) * lda]);
+        }
+        else
+        {
+            // shift columns to the right
+            Ap[i + 1 + j * lda] = Wp[i + j * ldw - offset];

-                if(has_map)
-                {
-                    map[m] = map[l];
-                    map[l] = ip;
-                };
-            }
+            // make first row the identity
+            if(i == j)
+                Ap[(j + 1) * lda] = 0.0;
         }
     }
-    __syncthreads();
-#ifdef NDEBUG
-#else
-    if(n >= 2)
-    {
-        for(auto k = k_start; k < (n - 1); k += k_inc)
-        {
-            assert(D[k] <= D[k + 1]);
-        };
-    };
-    __syncthreads();
-#endif
 }

-template <typename S, typename I>
-__device__ static void selection_sort_descending(const I n, S* D, I* map = nullptr)
+template <typename T, typename U>
+ROCSOLVER_KERNEL void copyshift_left(const bool copy,
+                                     const rocblas_int dim,
+                                     U A,
+                                     const rocblas_int shiftA,
+                                     const rocblas_int lda,
+                                     const rocblas_stride strideA,
+                                     T* W,
+                                     const rocblas_int shiftW,
+                                     const rocblas_int ldw,
+                                     const rocblas_stride strideW)
 {
-    // ---------------------------------------------------
-    // Sort entries in D[0...(n-1)]
-    // generates permutation vector in map[] if map[] is available
-    // Note: performs in a single thread block
-    // ---------------------------------------------------
-    if((n <= 0) || (D == nullptr))
+    const auto b = hipBlockIdx_z;
+    const auto j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+    const auto i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+
+    T* Ap = load_ptr_batch<T>(A, b, shiftA, strideA);
+    T* Wp = load_ptr_batch<T>(W, b, shiftW, strideW);
+
+    // make last row the identity
+    if(i == 0 && j == 0 && !copy)
+        Ap[dim + dim * lda] = 1.0;
+
+    if(i < dim && j < dim && i <= j)
     {
-        return;
-    };
+        rocblas_int offset = j * ldw - j * (j + 1) / 2; // to accommodate in smaller array W

-    auto const tid = hipThreadIdx_x + hipThreadIdx_y * hipBlockDim_x
-        + hipThreadIdx_z * (hipBlockDim_x * hipBlockDim_y);
+        if(copy)
+        {
+            // copy columns
+            Wp[i + j * ldw - offset] = (j == dim - 1 ? 0.0 : Ap[i + (j + 2) * lda]);
+        }
+        else
+        {
+            // shift columns to the left
+            Ap[i + (j + 1) * lda] = Wp[i + j * ldw - offset];

-    auto const nthreads = (hipBlockDim_x * hipBlockDim_y) * hipBlockDim_z;
+            // make last row the identity
+            if(i == j)
+                Ap[dim + j * lda] = 0.0;
+        }
+    }
+}

-    auto const k_start = tid;
-    auto const k_inc = nthreads;
-    bool const is_root_thread = (tid == 0);
+template <typename T, typename U>
+ROCSOLVER_KERNEL void copyshift_down(const bool copy,
+                                     const rocblas_int dim,
+                                     U A,
+                                     const rocblas_int shiftA,
+                                     const rocblas_int lda,
+                                     const rocblas_stride strideA,
+                                     T* W,
+                                     const rocblas_int shiftW,
+                                     const rocblas_int ldw,
+                                     const rocblas_stride strideW)
 {
-    bool const has_map = (map != nullptr);
+    const auto b = hipBlockIdx_z;
+    const auto j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+    const auto i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;

-    __syncthreads();
+    T* Ap = load_ptr_batch<T>(A, b, shiftA, strideA);
+    T* Wp = load_ptr_batch<T>(W, b, shiftW, strideW);

-    if(has_map)
+    // make first column the identity
+    if(i == 0 && j == 0 && !copy)
+        Ap[0] = 1.0;

+    if(i < dim && j < dim && i <= j)
     {
-        for(auto k = k_start; k < n; k += k_inc)
-        {
-            map[k] = k;
-        };
-    };
+        rocblas_int offset = j * ldw - j * (j + 1) / 2; // to accommodate in smaller array W

-    __syncthreads();
-    if(is_root_thread)
-    {
-        for(I ii = 1; ii < n; ii++)
+        if(copy)
         {
-            auto l = ii - 1;
-            auto m = l;
-            auto p = D[l];
-            auto ip = (has_map) ? map[l] : 0;
-
-            for(auto j = ii; j < n; j++)
-            {
-                if(D[j] > p)
-                {
-                    m = j;
-                    p = D[j];
-                    if(has_map)
-                    {
-                        ip = map[j];
-                    };
-                }
-            }
-            if(m != l)
-            {
-                D[m] = D[l];
-                D[l] = p;
+            // copy rows
+            Wp[i + j * ldw - offset] = (i == 0 ? 0.0 : Ap[i - 1 + (j + 1) * lda]);
+        }
+        else
+        {
+            // shift rows downward
+            Ap[i + (j + 1) * lda] = Wp[i + j * ldw - offset];

-                if(has_map)
-                {
-                    map[m] = map[l];
-                    map[l] = ip;
-                };
-            }
+            // make first column the identity
+            if(i == j)
+                Ap[i + 1] = 0.0;
         }
     }
-    __syncthreads();
-#ifdef NDEBUG
-#else
-    if(n >= 2)
-    {
-        for(auto k = k_start; k < (n - 1); k += k_inc)
-        {
-            assert(D[k] >= D[k + 1]);
-        };
-    };
-    __syncthreads();
-#endif
 }

-template <typename S, typename I>
-__device__ static void selection_sort(const I n, S* a, I* map = nullptr, const bool is_ascending = true)
+/** set_offdiag kernel copies to E the off-diagonal element of A, i.e., the non-zero element that
+    results from applying the Householder reflector to the working column. The element is then set
+    to 1 to prepare for the application of the Householder reflector to the rest of the matrix **/
+template <typename T, typename S, typename U, std::enable_if_t<!rocblas_is_complex<T>, int> = 0>
+ROCSOLVER_KERNEL void set_offdiag(const rocblas_int batch_count,
+                                  U A,
+                                  const rocblas_int shiftA,
+                                  const rocblas_stride strideA,
+                                  S* E,
+                                  const rocblas_stride strideE)
 {
-    if((n <= 0) || (a == nullptr))
-    {
-        return;
-    };
+    rocblas_int b = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;

-    if(is_ascending)
+    if(b < batch_count)
     {
-        selection_sort_ascending(n, a, map);
+        T* a = load_ptr_batch<T>(A, b, shiftA, strideA);
+        S* e = E + b * strideE;
+
+        e[0] = a[0];
+        a[0] = T(1);
     }
-    else
-    {
-        selection_sort_descending(n, a, map);
-    };
 }

-template <typename T, typename I>
-__device__ static void permute_swap(const I n, T* C, I ldc, I* map, const I nev = -1)
+template <typename T, typename S, typename U, std::enable_if_t<rocblas_is_complex<T>, int> = 0>
+ROCSOLVER_KERNEL void set_offdiag(const rocblas_int batch_count,
+                                  U A,
+                                  const rocblas_int shiftA,
+                                  const rocblas_stride strideA,
+                                  S* E,
+                                  const rocblas_stride strideE)
 {
-    // --------------------------------------------
-    // perform swaps to implement permutation vector
-    // the permutation vector will be restored to the
-    // identity permutation 0,1,2,...
-    //
-    // Note: performs in a thread block
-    // --------------------------------------------
-    {
-        bool const has_work = (n >= 1) && (C != nullptr) && (ldc >= 1) && (map != nullptr);
-        if(!has_work)
-        {
-            return;
-        }
-    }
+    rocblas_int b = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;

-    auto const tid = hipThreadIdx_x + hipThreadIdx_y * hipBlockDim_x
-        + hipThreadIdx_z * (hipBlockDim_x * hipBlockDim_y);
-    auto const nthreads = (hipBlockDim_x * hipBlockDim_y) * hipBlockDim_z;
+    if(b < batch_count)
+    {
+        T* a = load_ptr_batch<T>(A, b, shiftA, strideA);
+        S* e = E + b * strideE;

-    auto const k_start = tid;
-    auto const k_inc = nthreads;
+        e[0] = a[0].real();
+        a[0] = T(1);
+    }
+}

-    bool const is_root_thread = (tid == 0);
-    auto const nn = (nev >= 0) ? nev : n;
+/** scale_axpy kernel executes axpy to update tau, computing the scalar alpha from other
+    results in different memory locations **/
+template <typename T, typename U>
+ROCSOLVER_KERNEL void scale_axpy(const rocblas_int n,
+                                 T* scl,
+                                 T* S,
+                                 const rocblas_stride strideS,
+                                 U A,
+                                 const rocblas_int shiftA,
+                                 const rocblas_stride strideA,
+                                 T* W,
+                                 const rocblas_int shiftW,
+                                 const rocblas_stride strideW)
+{
+    rocblas_int b = hipBlockIdx_y;
+    rocblas_int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;

-    for(I i = 0; i < nn; i++)
+    if(i < n)
     {
-        __syncthreads();
-
-        while(map[i] != i)
-        {
-            auto const map_i = map[i];
-            auto const map_ii = map[map[i]];
-
-            __syncthreads();
+        T* v = load_ptr_batch<T>(A, b, shiftA, strideA);
+        T* w = load_ptr_batch<T>(W, b, shiftW, strideW);
+        T* s = S + b * strideS;

-            if(is_root_thread)
-            {
-                map[map_i] = map_i;
-                map[i] = map_ii;
-            }
+        // scale
+        T alpha = -0.5 * s[0] * scl[b];

-            __syncthreads();
+        // axpy
+        w[i] = alpha * v[i] + w[i];
+    }
+}

-            // ---------------------------------------
-            // swap columns C( 0:(n-1),map_i) and C( 0:(n-1),map_ii)
-            // ---------------------------------------
+template <typename T, typename U>
+ROCSOLVER_KERNEL void check_singularity(const rocblas_int n,
+                                        U A,
+                                        const rocblas_int shiftA,
+                                        const rocblas_int lda,
+                                        const rocblas_stride strideA,
+                                        rocblas_int* info)
+{
+    // Checks for singularities in the matrix and updates info to indicate where
+    // the first singularity (if any) occurs
+    int b = hipBlockIdx_x;

-            for(auto k = k_start; k < n; k += k_inc)
-            {
-                auto const k_map_i = k + (map_i * static_cast<int64_t>(ldc));
-                auto const k_map_ii = k + (map_ii * static_cast<int64_t>(ldc));
+    T* a = load_ptr_batch<T>(A, b, shiftA, strideA);

-                auto const ctemp = C[k_map_i];
-                C[k_map_i] = C[k_map_ii];
-                C[k_map_ii] = ctemp;
-            }
+    __shared__ rocblas_int _info;

-            __syncthreads();
-        }
-    }
+    if(hipThreadIdx_y == 0)
+        _info = 0;
     __syncthreads();

-#ifdef NDEBUG
-#else
-    // ----------------------------------------------------------
-    // extra check that map[] is restored to identity permutation
-    // ----------------------------------------------------------
-    __syncthreads();
-    for(auto k = k_start; k < nn; k += k_inc)
+    for(int i = hipThreadIdx_y; i < n; i += hipBlockDim_y)
     {
-        assert(map[k] == k);
+        if(a[i + i * lda] == 0)
+        {
+            rocblas_int _info_temp = _info;
+            while(_info_temp == 0 || _info_temp > i + 1)
+                _info_temp = atomicCAS(&_info, _info_temp, i + 1);
+        }
     }
     __syncthreads();
-#endif
+
+    if(hipThreadIdx_y == 0)
+        info[b] = _info;
 }

 /** SWAP swaps the values of vectors x and y of dimension n.
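The `no_mask`/`info_mask` helpers above let the `copy_mat`/`copy_trans_mat` family skip selected batch instances. As a concrete illustration, a host-side launch might look like the following sketch, where `dA`, `dwork`, and `dinfo` are assumed device pointers and the 32x32 tile and `copy_mat<double>` instantiation are illustrative choices rather than the library's tuned parameters:

```cpp
// Copy only the batch instances whose info entry is non-zero into a scratch
// buffer; passing info_mask(dinfo, info_mask::negate) would instead select
// the instances with info == 0. Grid mapping: x = rows, y = columns, z = batch.
dim3 threads(32, 32, 1);
dim3 grid((m - 1) / 32 + 1, (n - 1) / 32 + 1, batch_count);
hipLaunchKernelGGL((copy_mat<double>), grid, threads, 0, stream,
                   copymat_to_buffer, m, n, dA, shiftA, lda, strideA, dwork,
                   info_mask(dinfo));
```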
diff --git a/library/src/include/lib_host_helpers.hpp b/library/src/include/lib_host_helpers.hpp
index 3f44ccf9b..3d72a0cc9 100644
--- a/library/src/include/lib_host_helpers.hpp
+++ b/library/src/include/lib_host_helpers.hpp
@@ -30,6 +30,8 @@
 #include <...>
 #include <...>
 #include <...>
+#include <...>
+#include <...>
 #include <...>
 #include <...>

@@ -170,6 +172,20 @@
 static double imag_part(rocblas_complex_num<double> z)
 {
     return (z.imag());
 }

+static bool is_device_pointer(void* ptr)
+{
+    hipPointerAttribute_t dev_attributes;
+    if(ptr == nullptr)
+        return false;
+
+    auto istat = hipPointerGetAttributes(&dev_attributes, ptr);
+    if(istat != hipSuccess)
+        fmt::print(stderr, "is_device_pointer: istat = {} {}\n", istat, hipGetErrorName(istat));
+
+    assert(istat == hipSuccess);
+    return (dev_attributes.type == hipMemoryTypeDevice);
+}
+
 #ifdef ROCSOLVER_VERIFY_ASSUMPTIONS
 // Ensure __assert_fail is declared.
 #if !__is_identifier(__assert_fail)
diff --git a/library/src/include/rocsolver_handle.hpp b/library/src/include/rocsolver_handle.hpp
index 2e4177556..53091092c 100644
--- a/library/src/include/rocsolver_handle.hpp
+++ b/library/src/include/rocsolver_handle.hpp
@@ -37,6 +37,7 @@ struct rocsolver_handle_data_
     rocblas_int checksum;

     rocsolver_alg_mode bdsqr_mode = rocsolver_alg_mode_gpu;
+    rocsolver_alg_mode sterf_mode = rocsolver_alg_mode_gpu;
 };

 typedef struct rocsolver_handle_data_* rocsolver_handle_data;
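The new `sterf_mode` field mirrors the existing `bdsqr_mode` and is what the public algorithm-mode API reads and writes for STERF. A minimal usage sketch, assuming an already-created `handle` and device arrays `dD`, `dE`, and `dinfo` (error handling elided):

```cpp
// Opt this handle into the hybrid STERF path, then verify the setting took
// effect before computing eigenvalues of a symmetric tridiagonal matrix.
rocsolver_set_alg_mode(handle, rocsolver_function_sterf, rocsolver_alg_mode_hybrid);

rocsolver_alg_mode mode;
rocsolver_get_alg_mode(handle, rocsolver_function_sterf, &mode);
// mode == rocsolver_alg_mode_hybrid

rocsolver_dsterf(handle, n, dD, dE, dinfo);
```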
diff --git a/library/src/include/rocsolver_hybrid_array.hpp b/library/src/include/rocsolver_hybrid_array.hpp
new file mode 100644
index 000000000..ca0abcf45
--- /dev/null
+++ b/library/src/include/rocsolver_hybrid_array.hpp
@@ -0,0 +1,272 @@
+/* **************************************************************************
+ * Copyright (C) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * *************************************************************************/
+
+#pragma once
+
+#include "common_host_helpers.hpp"
+#include "lib_host_helpers.hpp"
+#include "rocsolver/rocsolver.h"
+
+ROCSOLVER_BEGIN_NAMESPACE
+
+/* ROCSOLVER_HYBRID_ARRAY provides a wrapper for data arrays that is intended to simplify host memory
+   allocation as well as data transfers to and from the device. Hybrid algorithms can use rocsolver_hybrid_array
+   to read device data to the host so that the data may be operated upon, and then write the resultant data
+   back to the device. Two modes are supported:
+
+   * Standard mode: Used to read device data onto the host, which may then be operated upon by host code.
+     A typical workflow will call init_async to allocate a host buffer and read the data into the buffer,
+     execute host code on the buffered data through the pointer provided by the [] operator, and then call
+     write_to_device_async to write the resultant data back to the device.
+
+   * Pointers only mode: Used to read device pointers from a batched array onto the host, so that device
+     kernels may be called on each individual batch instance. A typical workflow will call init_pointers_only
+     to allocate a host buffer for the device pointers, and then execute device kernels on the pointers
+     provided by the [] operator. */
+template <typename T, typename I, typename U>
+struct rocsolver_hybrid_array
+{
+    I dim, batch_count;
+    rocblas_stride stride;
+
+    U src_array;
+    T** batch_array;
+    T* val_array;
+
+    /* Constructor */
+    rocsolver_hybrid_array()
+        : src_array(nullptr)
+        , batch_array(nullptr)
+        , val_array(nullptr)
+    {
+    }
+
+    /* Disallow copying. */
+    rocsolver_hybrid_array(const rocsolver_hybrid_array&) = delete;
+
+    /* Disallow assigning. */
+    rocsolver_hybrid_array& operator=(const rocsolver_hybrid_array&) = delete;
+
+    /* Destructor */
+    ~rocsolver_hybrid_array()
+    {
+        if(val_array)
+            free(val_array);
+        if(batch_array && (val_array || this->dim < 0))
+            free(batch_array);
+    }
+
+    /* Used to read device pointers from a batched array for use on the host; no other data is read from the
+       device. */
+    rocblas_status init_pointers_only(U array, rocblas_stride stride, I batch_count, hipStream_t stream)
+    {
+        if(val_array)
+            free(val_array);
+        if(batch_array && (val_array || this->dim < 0))
+            free(batch_array);
+
+        this->dim = -1;
+        this->src_array = array;
+        this->stride = stride;
+        this->batch_count = batch_count;
+
+        bool constexpr is_strided = (std::is_same<U, T*>::value || std::is_same<U, const T*>::value);
+        bool is_device = is_device_pointer((void*)array);
+
+        if(is_device)
+        {
+            // pointers only; don't allocate val_array
+            val_array = nullptr;
+
+            if(is_strided)
+            {
+                // data is strided; batch_array not needed
+                batch_array = nullptr;
+            }
+            else
+            {
+                // data is batched; read device pointers into batch_array
+                size_t batch_bytes = sizeof(T*) * batch_count;
+                batch_array = (T**)malloc(batch_bytes);
+                HIP_CHECK(
+                    hipMemcpyAsync(batch_array, array, batch_bytes, hipMemcpyDeviceToHost, stream));
+                HIP_CHECK(hipStreamSynchronize(stream));
+            }
+        }
+        else
+        {
+            // data on host; use src_array directly
+            val_array = nullptr;
+
+            if(is_strided)
+                batch_array = nullptr;
+            else
+                batch_array = (T**)src_array;
+        }
+
+        return rocblas_status_success;
+    }
+
+    /* Used to read device data into a host buffer, which is managed by this class. Data for all batch instances
+       will be read into the buffer. */
+    rocblas_status init_async(I dim, U array, rocblas_stride stride, I batch_count, hipStream_t stream)
+    {
+        if(val_array)
+            free(val_array);
+        if(batch_array && (val_array || this->dim < 0))
+            free(batch_array);
+
+        if(dim < 0)
+            return rocblas_status_internal_error;
+
+        this->dim = dim;
+        this->src_array = array;
+        this->stride = stride;
+        this->batch_count = batch_count;
+
+        bool constexpr is_strided = (std::is_same<U, T*>::value || std::is_same<U, const T*>::value);
+        bool is_device = is_device_pointer((void*)array);
+
+        if(is_device)
+        {
+            // allocate space on host for data from device
+            size_t dim_bytes = sizeof(T) * dim;
+            size_t val_bytes = sizeof(T) * dim * batch_count;
+            val_array = (T*)malloc(val_bytes);
+
+            if(is_strided)
+            {
+                // data is strided; batch_array not needed
+                batch_array = nullptr;
+
+                // read data to val_array
+                if(batch_count == 1 || stride == dim)
+                {
+                    HIP_CHECK(hipMemcpyAsync(val_array, src_array, val_bytes, hipMemcpyDeviceToHost,
+                                             stream));
+                }
+                else
+                {
+                    for(I bid = 0; bid < batch_count; bid++)
+                    {
+                        HIP_CHECK(hipMemcpyAsync(val_array + bid * dim, src_array + bid * stride,
+                                                 dim_bytes, hipMemcpyDeviceToHost, stream));
+                    }
+                }
+            }
+            else
+            {
+                // data is batched; read device pointers into batch_array
+                size_t batch_bytes = sizeof(T*) * batch_count;
+                batch_array = (T**)malloc(batch_bytes);
+                HIP_CHECK(
+                    hipMemcpyAsync(batch_array, array, batch_bytes, hipMemcpyDeviceToHost, stream));
+                HIP_CHECK(hipStreamSynchronize(stream));
+
+                // read data to val_array
+                for(I bid = 0; bid < batch_count; bid++)
+                {
+                    HIP_CHECK(hipMemcpyAsync(val_array + bid * dim, batch_array[bid], dim_bytes,
+                                             hipMemcpyDeviceToHost, stream));
+                }
+            }
+        }
+        else
+        {
+            // data on host; use src_array directly
+            val_array = nullptr;
+
+            if(is_strided)
+                batch_array = nullptr;
+            else
+                batch_array = (T**)src_array;
+        }
+
+        return rocblas_status_success;
+    }
+
+    /* Copies data from host buffer back to the device. Returns an error if initialized for pointers
+       only. */
+    rocblas_status write_to_device_async(hipStream_t stream)
+    {
+        if(!src_array)
+            return rocblas_status_internal_error;
+        if(dim < 0)
+            return rocblas_status_internal_error;
+
+        if(val_array)
+        {
+            size_t dim_bytes = sizeof(T) * dim;
+            size_t val_bytes = sizeof(T) * dim * batch_count;
+
+            if(!batch_array)
+            {
+                if(batch_count == 1 || stride == dim)
+                {
+                    HIP_CHECK(hipMemcpyAsync(src_array, val_array, val_bytes, hipMemcpyHostToDevice,
+                                             stream));
+                }
+                else
+                {
+                    for(I bid = 0; bid < batch_count; bid++)
+                    {
+                        HIP_CHECK(hipMemcpyAsync(src_array + bid * stride, val_array + bid * dim,
+                                                 dim_bytes, hipMemcpyHostToDevice, stream));
+                    }
+                }
+            }
+            else
+            {
+                for(I bid = 0; bid < batch_count; bid++)
+                {
+                    HIP_CHECK(hipMemcpyAsync(batch_array[bid], val_array + bid * dim, dim_bytes,
+                                             hipMemcpyHostToDevice, stream));
+                }
+            }
+        }
+
+        return rocblas_status_success;
+    }
+
+    /* Gets a pointer to the data for batch index bid. If initialized for pointers only, the
+       returned pointer may be on the device. Otherwise, the pointer is on the host. */
+    T* operator[](I bid)
+    {
+        if(!src_array)
+            return nullptr;
+
+        if(val_array)
+            return val_array + bid * dim;
+        else
+        {
+            if(batch_array)
+                return batch_array[bid];
+            else
+                return (T*)(src_array + bid * stride);
+        }
+    }
+};
+
+ROCSOLVER_END_NAMESPACE
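To make the standard-mode workflow described in the header comment concrete, here is a sketch for a strided-batched off-diagonal array (inside the library's namespace; `dE`, `strideE`, and the `n - 1` element count are illustrative, and the template argument order follows the struct as written above). Because `init_async` only queues the device-to-host copies, the stream must be synchronized before host code touches the buffer:

```cpp
rocsolver_hybrid_array<double, rocblas_int, double*> hE;

// read n - 1 entries per batch instance from the device into a host buffer
if(hE.init_async(n - 1, dE, strideE, batch_count, stream) != rocblas_status_success)
    return rocblas_status_internal_error;
HIP_CHECK(hipStreamSynchronize(stream));

// operate on the buffered data with host code
for(rocblas_int bid = 0; bid < batch_count; bid++)
{
    double* E = hE[bid]; // host pointer in standard mode
    // ... host-side computation on E[0 .. n - 2] ...
}

// queue the results for transfer back to the device
if(hE.write_to_device_async(stream) != rocblas_status_success)
    return rocblas_status_internal_error;
```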