From 742c2fd342cc6a5f9c3d18124e7b08e246378b43 Mon Sep 17 00:00:00 2001
From: Julio Machado Silva <161654951+jmachado-amd@users.noreply.github.com>
Date: Fri, 10 Jan 2025 12:00:49 -0700
Subject: [PATCH 1/2] Improve robustness of tests (part1) (#810)

* Improve robustness of SYGVDX/HEGVDX tests

* Preliminary changes to sygvdx's error bound computation

* Enable hashing output for sygvdx (#854)

* enable hash checking for sygvdx

* address feedback

* add support for getrf, potrf, syevx, sygvx

* amend comments, update sygvdx

(cherry picked from commit 38c7a3a2496cae4a547cd6121b875c25cfd098c5)

* Update sygvdx test to handle case in which number of eigenvalues differs

This commit updates sygvdx's test to allow it to gracefully handle the
case in which the number of eigenvalues computed in rocSOLVER differs
from the number of reference eigenvalues.

* Refactor sygvdx-hegvdx tests

This commit updates the error bounds and fixes bugs in the sygvdx-hegvdx
tests.  It also adds code to allow selecting whether the tests should
enforce equality on the number of reference and computed eigenvalues
(default is yes); and whether the tests should use new or legacy
error bounds (default is to use new error bounds).

* Add documentation and small improvements to code

* Update comments

* Update `clss::print_debug()` method

* Make sygvdx-hegvdx tests slightly more strict

* Apply clang-format

* Implement suggestions and improve comments

* Fix typos

* Fix typo

* Add minor fixes and improvements

* Add minor clarifications

* Add missing license and update dates on modified files

* Address review comments

---------

Co-authored-by: Jonah Quist <jonquist@amd.com>
(cherry picked from commit b94dba092aecb41bdcfcdd63a89b088e570f2125)
---
 .../common/lapack/testing_sygvdx_hegvdx.hpp   | 358 ++++++--
 clients/common/matrix_utils/host_matrix.hpp   |  82 +-
 .../matrix_utils/matrix_utils_detail.hpp      |  26 +-
 clients/common/misc/clss.hpp                  | 797 ++++++++++++++++++
 4 files changed, 1209 insertions(+), 54 deletions(-)
 create mode 100644 clients/common/misc/clss.hpp

diff --git a/clients/common/lapack/testing_sygvdx_hegvdx.hpp b/clients/common/lapack/testing_sygvdx_hegvdx.hpp
index 40075077e..b5e631ae9 100644
--- a/clients/common/lapack/testing_sygvdx_hegvdx.hpp
+++ b/clients/common/lapack/testing_sygvdx_hegvdx.hpp
@@ -1,5 +1,5 @@
 /* **************************************************************************
- * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (C) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -27,8 +27,10 @@
 
 #pragma once
 
+#include "common/matrix_utils/matrix_utils.hpp"
 #include "common/misc/client_util.hpp"
 #include "common/misc/clientcommon.hpp"
+#include "common/misc/clss.hpp"
 #include "common/misc/lapack_host_reference.hpp"
 #include "common/misc/norm.hpp"
 #include "common/misc/rocsolver.hpp"
@@ -206,6 +208,69 @@ void testing_sygvdx_hegvdx_bad_arg()
     }
 }
 
+//
+// Returns true when the environment variable
+//
+// ROCSOLVER_SYGVDX_HEGVDX_USE_LEGACY_TESTS
+//
+// is defined (its value is not inspected), in which case
+// `sygvdx_hegvdx_getError` will compute errors using the legacy error bounds
+// (for debugging purposes).
+// Otherwise returns false and the new error bounds are always used.
+//
+static bool sygvdx_hegvdx_use_legacy_tests()
+{
+    bool status = false;
+    if(std::getenv("ROCSOLVER_SYGVDX_HEGVDX_USE_LEGACY_TESTS") != nullptr)
+    {
+        status = true;
+    }
+    return status;
+}
+
+//
+// The default behaviour of `sygvdx_hegvdx_getError()` is to check if the
+// number of computed eigenvalues matches the number of reference eigenvalues,
+// and then to check all computed eigenvalues for their accuracy, but this
+// behaviour can be relaxed.  This leads to two modes of operation: a relaxed
+// check and a full (default) check.  Those are controlled by function
+// `test_for_equality_of_number_of_computed_eigenvalues()`, below, in the
+// following manner:
+//
+// a) If `ROCSOLVER_LAX_EIGENSOLVERS_TESTS` is defined, either as a preprocessor
+// macro at build time or as an environment variable at run time, then the test
+// suite will only use the subset of computed eigenvalues that match reference
+// eigenvalues (up to the given tolerance); except
+// b) If the environment variable `ROCSOLVER_FULL_EIGENSOLVERS_TESTS` is defined,
+// which takes precedence over a), all eigenvalues are checked for their accuracy.
+//
+// The relaxed tests are intended as a means to decouple the computation of
+// error bounds of eigenvalues and eigenvectors, allowing tests to pass in the
+// case that not all eigenvalues could be accurately computed, but all accurate
+// eigenvalues have accurate eigenvectors.  If eigenvectors are not accurate,
+// the corresponding tests will fail both in full mode and in relaxed mode.
+//
+// Note: the relaxed version of the tests is only supported when using the new
+// error bounds, see also function `sygvdx_hegvdx_use_legacy_tests()`.
+//
+static bool test_for_equality_of_number_of_computed_eigenvalues()
+{
+    bool status = true; // full check unless relaxed below
+#if defined(ROCSOLVER_LAX_EIGENSOLVERS_TESTS)
+    status = false;
+#else
+    if(std::getenv("ROCSOLVER_LAX_EIGENSOLVERS_TESTS") != nullptr)
+    {
+        status = false;
+    }
+#endif
+    if(std::getenv("ROCSOLVER_FULL_EIGENSOLVERS_TESTS") != nullptr)
+    {
+        status = true; // the full check overrides the relaxed setting
+    }
+    return status;
+}
+
 template <bool CPU, bool GPU, typename T, typename Td, typename Th>
 void sygvdx_hegvdx_initData(const rocblas_handle handle,
                             const rocblas_eform itype,
@@ -233,11 +298,15 @@ void sygvdx_hegvdx_initData(const rocblas_handle handle,
         rocblas_init<T>(hA, true);
         rocblas_init<T>(U, true);
 
+        bool use_legacy_tests = sygvdx_hegvdx_use_legacy_tests();
+
         for(rocblas_int b = 0; b < bc; ++b)
         {
             // for testing purposes, we start with a reduced matrix M for the standard equivalent problem
             // with spectrum in a desired range (-20, 20). Then we construct the generalized pair
             // (A, B) from there.
+            memset(hB[b], 0,
+                   sizeof(T) * n * ldb); // since ldb >= n, make sure all entries of B are initialized
             for(rocblas_int i = 0; i < n; i++)
             {
                 // scale matrices and set hA = M (symmetric/hermitian), hB = U (upper triangular)
@@ -316,15 +385,23 @@ void sygvdx_hegvdx_initData(const rocblas_handle handle,
                 {
                     for(rocblas_int j = 0; j < n; j++)
                     {
-                        if(itype != rocblas_eform_bax)
+                        if(use_legacy_tests)
                         {
-                            A[b][i + j * lda] = hA[b][i + j * lda];
-                            B[b][i + j * ldb] = hB[b][i + j * ldb];
+                            if(itype != rocblas_eform_bax)
+                            {
+                                A[b][i + j * lda] = hA[b][i + j * lda];
+                                B[b][i + j * ldb] = hB[b][i + j * ldb];
+                            }
+                            else
+                            {
+                                A[b][i + j * lda] = hB[b][i + j * ldb];
+                                B[b][i + j * ldb] = hA[b][i + j * lda];
+                            }
                         }
                         else
                         {
-                            A[b][i + j * lda] = hB[b][i + j * ldb];
-                            B[b][i + j * ldb] = hA[b][i + j * lda];
+                            A[b][i + j * lda] = hA[b][i + j * lda];
+                            B[b][i + j * ldb] = hB[b][i + j * ldb];
                         }
                     }
                 }
@@ -378,6 +455,8 @@ void sygvdx_hegvdx_getError(const rocblas_handle handle,
                             double* max_err,
                             const bool singular)
 {
+    using HMat = HostMatrix<T, rocblas_int>;
+    using BDesc = typename HMat::BlockDescriptor;
     constexpr bool COMPLEX = rocblas_is_complex<T>;
 
     int lwork = (COMPLEX ? 2 * n : 8 * n);
@@ -390,11 +469,68 @@ void sygvdx_hegvdx_getError(const rocblas_handle handle,
     std::vector<int> hIfail(n);
     host_strided_batch_vector<T> A(lda * n, 1, lda * n, bc);
     host_strided_batch_vector<T> B(ldb * n, 1, ldb * n, bc);
+    std::vector<closest_largest_subsequences<S>> clss(bc);
+    std::vector<bool> skip_test(bc, false);
+
+    bool use_legacy_tests = sygvdx_hegvdx_use_legacy_tests();
+    bool test_for_equality = test_for_equality_of_number_of_computed_eigenvalues();
 
     // input data initialization
     sygvdx_hegvdx_initData<true, true, T>(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc,
                                           hA, hB, A, B, true, singular);
 
+    // CPU lapack
+    // abstol = 0 ensures max accuracy in rocsolver; for lapack we should use 2*safemin
+    S atol = 2 * get_safemin<S>();
+    for(rocblas_int b = 0; b < bc; ++b)
+    {
+        cpu_sygvx_hegvx(itype, evect, erange, uplo, n, hA[b], lda, hB[b], ldb, vl, vu, il, iu, atol,
+                        hNev[b], hW[b], hZ[b], ldz, work.data(), lwork, rwork.data(), iwork.data(),
+                        hIfail.data(), hInfo[b]);
+
+        // Capture failures where B is not positive definite (hInfo[b][0] > n),
+        // or where the i-argument has an illegal value (hInfo[b][0] < 0).  All other LAPACK
+        // failures skip the test.
+        if((hInfo[b][0] > 0) && (hInfo[b][0] <= n))
+        {
+            skip_test[b] = true;
+        }
+    }
+
+    //
+    // Given an eigenvalue l_i of the symmetric matrix A and a computed
+    // eigenvalue l_i^* (obtained with a backward stable method), Weyl's
+    // theorem yields |l_i - l_i^*| <= K*ulp*||A||_2, where K depends on n.
+    // For the sake of this test, we will set K = C * n, with C ~ 1.
+    //
+    // Thus, if the range to look for eigenvalues is the interval (vl, vu],
+    // calls to the solver should look for computed eigenvalues in the range
+    // (vl - tol, vu + tol], where `tol = C * n * ulp * ||A||`.
+    //
+    S C = 4;
+    std::vector<S> tols(bc, 0);
+    std::vector<S> norms(bc, 0);
+    S tol = 0;
+    for(rocblas_int b = 0; b < bc; ++b)
+    {
+        if(hNev[b][0] > 0)
+        {
+            // Get lapack eigenvalues (reference to which rocSOLVER's sygvdx will be compared to)
+            auto eigsLapack = *HMat::Convert(hW[b], hNev[b][0], 1);
+            norms[b] = eigsLapack.max_coeff_norm();
+        }
+        else
+        {
+            norms[b] = S(0);
+        }
+
+        tols[b] = C * n * std::numeric_limits<S>::epsilon() * norms[b];
+        if(std::isfinite(tols[b]) && (tols[b] > tol))
+        {
+            tol = tols[b];
+        }
+    }
+
     // execute computations
     // GPU lapack
     CHECK_ROCBLAS_ERROR(rocsolver_sygvdx_hegvdx(
@@ -407,101 +543,225 @@ void sygvdx_hegvdx_getError(const rocblas_handle handle,
     if(evect != rocblas_evect_none)
         CHECK_HIP_ERROR(hZRes.transfer_from(dZ));
 
-    // CPU lapack
-    // abstol = 0 ensures max accuracy in rocsolver; for lapack we should use 2*safemin
-    S atol = 2 * get_safemin<S>();
-    for(rocblas_int b = 0; b < bc; ++b)
-    {
-        cpu_sygvx_hegvx(itype, evect, erange, uplo, n, hA[b], lda, hB[b], ldb, vl, vu, il, iu, atol,
-                        hNev[b], hW[b], hZ[b], ldz, work.data(), lwork, rwork.data(), iwork.data(),
-                        hIfail.data(), hInfo[b]);
-    }
-
-    // (We expect the used input matrices to always converge. Testing
-    // implicitly the equivalent non-converged matrix is very complicated and it boils
-    // down to essentially run the algorithm again and until convergence is achieved.
-    // We do test with indefinite matrices B).
+    // Except for the cases in which B is indefinite, we expect the eigensolver
+    // to converge for all input matrices.
 
-    // check info for non-convergence and/or positive-definiteness
+    // check info for illegal values and/or positive-definiteness
     *max_err = 0;
     for(rocblas_int b = 0; b < bc; ++b)
     {
+        // Capture failures where B is not positive definite (hInfo[b][0] > n),
+        // or where the i-argument has an illegal value (hInfo[b][0] < 0).  All other LAPACK
+        // failures skip the test.
+        if(skip_test[b])
+            continue;
+
         EXPECT_EQ(hInfo[b][0], hInfoRes[b][0]) << "where b = " << b;
         if(hInfo[b][0] != hInfoRes[b][0])
             *max_err += 1;
-    }
 
-    // Check number of returned eigenvalues
-    for(rocblas_int b = 0; b < bc; ++b)
-    {
-        EXPECT_EQ(hNev[b][0], hNevRes[b][0]) << "where b = " << b;
-        if(hNev[b][0] != hNevRes[b][0])
-            *max_err += 1;
+        auto numMatchingEigs = clss[b](hW[b], hNev[b][0], hWRes[b], hNevRes[b][0], tols[b]);
+        if(test_for_equality)
+        {
+            EXPECT_EQ(hNev[b][0], numMatchingEigs) << "where b = " << b;
+            if(hNev[b][0] != numMatchingEigs)
+                *max_err += 1;
+        }
     }
 
+    //
+    // Compute errors
+    //
     double err;
 
     for(rocblas_int b = 0; b < bc; ++b)
     {
+        auto [lapackEigs, rocsolverEigs] = clss[b].subseqs();
+        auto [_, rocsolverEigsIds] = clss[b].subseqs_ids();
+        auto numMatchingEigs = rocsolverEigs.size();
+
+        // Number of eigenvalues computed by rocSOLVER
+        auto numRocsolverEigs = hNevRes[b][0];
+
+        // Only check accuracy for tests in which both computed and reference values exist and are well defined.
+        if(skip_test[b] || (numMatchingEigs == 0) || (hInfo[b][0] != 0))
+            continue;
+
         if(evect == rocblas_evect_none)
         {
-            // only eigenvalues needed; can compare with LAPACK
+            //
+            // Only eigenvalues
+            //
 
-            // error is ||hW - hWRes|| / ||hW||
-            // using frobenius norm
-            if(hInfo[b][0] == 0)
+            if(use_legacy_tests)
+            {
+                err = norm_error('F', 1, numMatchingEigs, 1, lapackEigs.data(), rocsolverEigs.data());
+                *max_err = err > *max_err ? err : *max_err;
+            }
+            else
             {
-                err = norm_error('F', 1, hNev[b][0], 1, hW[b], hWRes[b]);
+                // Get computed eigenvalues
+                auto eigs
+                    = *HMat::Convert(rocsolverEigs.data(), rocsolverEigs.size(),
+                                     1); // convert eigenvalues from type S to type T, if required
+
+                // Get lapack (reference) eigenvalues
+                auto eigsRef
+                    = *HMat::Convert(lapackEigs.data(), lapackEigs.size(),
+                                     1); // convert eigenvalues from type S to type T, if required
+                err = (eigs - eigsRef).norm() / eigsRef.norm();
                 *max_err = err > *max_err ? err : *max_err;
             }
         }
         else
         {
-            // both eigenvalues and eigenvectors needed; need to implicitly test
-            // eigenvectors due to non-uniqueness of eigenvectors under scaling
-            if(hInfo[b][0] == 0)
+            //
+            // Both eigenvalues and eigenvectors
+            //
+
+            if(use_legacy_tests)
             {
                 T alpha = 1;
                 T beta = 0;
 
                 // hZRes contains eigenvectors x
                 // compute B*x (or A*x) and store in hB
-                cpu_symm_hemm(rocblas_side_left, uplo, n, hNev[b][0], alpha, B[b], ldb, hZRes[b],
-                              ldz, beta, hB[b], ldb);
+                cpu_symm_hemm(rocblas_side_left, uplo, n, numRocsolverEigs, alpha, B[b], ldb,
+                              hZRes[b], ldz, beta, hB[b], ldb);
 
+                auto [_, hWResIds] = clss[b].subseqs_ids();
                 if(itype == rocblas_eform_ax)
                 {
                     // problem is A*x = (lambda)*B*x
 
                     // compute (1/lambda)*A*x and store in hA
-                    for(int j = 0; j < hNev[b][0]; j++)
+                    for(int j = 0; j < numMatchingEigs; j++)
                     {
-                        alpha = T(1) / hWRes[b][j];
-                        cpu_symv_hemv(uplo, n, alpha, A[b], lda, hZRes[b] + j * ldz, 1, beta,
+                        int jj = hWResIds[j]; // Id of rocSOLVER eigen-pair associated to j-th LAPACK eigen-pair
+                        alpha = T(1) / hWRes[b][jj];
+                        cpu_symv_hemv(uplo, n, alpha, A[b], lda, hZRes[b] + jj * ldz, 1, beta,
                                       hA[b] + j * lda, 1);
                     }
 
                     // move B*x into hZRes
                     for(rocblas_int i = 0; i < n; i++)
-                        for(rocblas_int j = 0; j < hNev[b][0]; j++)
-                            hZRes[b][i + j * ldz] = hB[b][i + j * ldb];
+                    {
+                        for(rocblas_int j = 0; j < numMatchingEigs; j++)
+                        {
+                            int jj = hWResIds[j]; // Id of rocSOLVER eigen-pair associated to j-th LAPACK eigen-pair
+                            hZRes[b][i + j * ldz] = hB[b][i + jj * ldb];
+                        }
+                    }
                 }
                 else
                 {
                     // problem is A*B*x = (lambda)*x or B*A*x = (lambda)*x
 
                     // compute (1/lambda)*A*B*x or (1/lambda)*B*A*x and store in hA
-                    for(int j = 0; j < hNev[b][0]; j++)
+                    for(int j = 0; j < numMatchingEigs; j++)
                     {
-                        alpha = T(1) / hWRes[b][j];
-                        cpu_symv_hemv(uplo, n, alpha, A[b], lda, hB[b] + j * ldb, 1, beta,
+                        int jj = hWResIds[j]; // Id of rocSOLVER eigen-pair associated to j-th LAPACK eigen-pair
+                        alpha = T(1) / hWRes[b][jj];
+                        cpu_symv_hemv(uplo, n, alpha, A[b], lda, hB[b] + jj * ldb, 1, beta,
                                       hA[b] + j * lda, 1);
                     }
+                    // move hZRes
+                    for(rocblas_int i = 0; i < n; i++)
+                    {
+                        for(rocblas_int j = 0; j < numMatchingEigs; j++)
+                        {
+                            int jj = hWResIds[j]; // Id of rocSOLVER eigen-pair associated to j-th LAPACK eigen-pair
+                            if(j != jj)
+                                hZRes[b][i + j * ldz] = hZRes[b][i + jj * ldz];
+                        }
+                    }
                 }
 
                 // error is ||hA - hZRes|| / ||hA||
                 // using frobenius norm
-                err = norm_error('F', n, hNev[b][0], lda, hA[b], hZRes[b], ldz);
+                err = norm_error('F', n, numMatchingEigs, lda, hA[b], hZRes[b], ldz);
+                *max_err = err > *max_err ? err : *max_err;
+            }
+            else // if(!use_legacy_tests)
+            {
+                //
+                // Prepare input
+                //
+
+                // Get computed eigenvalues
+                auto eigs
+                    = *HMat::Convert(rocsolverEigs.data(), rocsolverEigs.size(),
+                                     1); // convert eigenvalues from type S to type T, if required
+
+                // Get lapack (reference) eigenvalues
+                auto eigsRef
+                    = *HMat::Convert(lapackEigs.data(), lapackEigs.size(),
+                                     1); // convert eigenvalues from type S to type T, if required
+
+                // Create thin wrappers of input matrices A and B
+                auto AWrap = HMat::Wrap(A.data() + b * lda * n, lda, n);
+                auto BWrap = HMat::Wrap(B.data() + b * ldb * n, ldb, n);
+
+                // We want the sub-blocks starting from row 0, col 0 and with size n x n of A and B
+                auto A_b = (*AWrap).block(BDesc().nrows(n).ncols(n));
+                auto B_b = (*BWrap).block(BDesc().nrows(n).ncols(n));
+
+                // Get computed eigenvectors
+                auto V_b
+                    = (*HMat::Wrap(hZRes[b], ldz, n)).block(BDesc().nrows(n).ncols(numRocsolverEigs));
+
+                // If rocSOLVER computed more eigen-pairs than the number of
+                // reference eigenvalues, select the eigen-pairs that match the
+                // reference
+                if(numRocsolverEigs > numMatchingEigs)
+                {
+                    rocblas_int ii;
+                    for(rocblas_int i = 0; i < numMatchingEigs; ++i)
+                    {
+                        ii = rocsolverEigsIds[i];
+                        V_b.col(i, V_b.col(ii));
+                    }
+                    V_b = V_b.block(BDesc().nrows(n).ncols(numMatchingEigs));
+                }
+
+                //
+                // Check eigenpairs' accuracy with a "Relative Weyl" error
+                // bound, which (at its simplest form) states the following.
+                //
+                // Let X (cond(X) < Inf), and A (A^* = A) be such that A has
+                // eigenvalues {a_i} and H = X^t*A*X has eigenvalues {h_i}.
+                // Then:
+                //
+                // |a_i - h_i| <= |a_i|*||X^t*X - I||_2
+                //
+                // Note: for rocSOLVER's sygv, if V is the eigenvectors' matrix
+                // and B = L*L^t, then either X = L^t*V (cases 1 and 2) or X =
+                // inv(L)*V (case 3).
+                //
+                auto VE = HMat::Empty();
+                if(itype == rocblas_eform_bax)
+                {
+                    VE = adjoint(V_b) * inv(B_b) * V_b - HMat::Eye(numMatchingEigs);
+                }
+                else // if ((itype == rocblas_eform_ax) || (itype == rocblas_eform_abx))
+                {
+                    VE = adjoint(V_b) * B_b * V_b - HMat::Eye(numMatchingEigs);
+                }
+                S eta = std::max(VE.norm(), std::numeric_limits<S>::epsilon());
+                *max_err = eta > *max_err ? eta : *max_err;
+
+                auto AE = HMat::Empty();
+                if(itype == rocblas_eform_abx)
+                {
+                    auto Z = B_b * V_b;
+                    AE = adjoint(Z) * A_b * Z - HMat::Zeros(numMatchingEigs).diag(eigs);
+                }
+                else // if ((itype == rocblas_eform_ax) || (itype == rocblas_eform_bax))
+                {
+                    AE = adjoint(V_b) * A_b * V_b - HMat::Zeros(numMatchingEigs).diag(eigs);
+                }
+                err = AE.norm() / eigsRef.norm();
+                err *= std::numeric_limits<S>::epsilon() / eta;
                 *max_err = err > *max_err ? err : *max_err;
             }
         }
@@ -834,9 +1094,9 @@ void testing_sygvdx_hegvdx(Arguments& argus)
     }
 
     // validate results for rocsolver-test
-    // using 3 * n * machine_precision as tolerance
+    // using 4 * n * machine_precision as tolerance
     if(argus.unit_check)
-        ROCSOLVER_TEST_CHECK(T, max_error, 3 * n);
+        ROCSOLVER_TEST_CHECK(T, max_error, 4 * n);
 
     // output results for rocsolver-bench
     if(argus.timing)
diff --git a/clients/common/matrix_utils/host_matrix.hpp b/clients/common/matrix_utils/host_matrix.hpp
index fd84e100b..df6b2eae6 100644
--- a/clients/common/matrix_utils/host_matrix.hpp
+++ b/clients/common/matrix_utils/host_matrix.hpp
@@ -1,5 +1,5 @@
 /* **************************************************************************
- * Copyright (C) 2018-2024 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (C) 2018-2025 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -113,14 +113,14 @@ class HostMatrix : public MatrixInterface<T_, I_>
         return ptr;
     }
 
-    template <typename S_>
-    static auto Convert(const HostMatrix<S_, I_>& In) -> HostMatrix<T_, I_>
+    template <template <typename, typename> class HostMatrix_, typename TT_, typename II_>
+    static auto Convert(const HostMatrix_<TT_, II_>& In) -> HostMatrix<T_, I_>
     {
         HostMatrix<T_, I_> Out(In.nrows(), In.ncols());
 
         for(I i = 0; i < Out.size(); ++i)
         {
-            Out[i] = T(In[i]);
+            Out[i] = T_(In[i]);
         }
 
         return Out;
@@ -440,6 +440,7 @@ class HostMatrix : public MatrixInterface<T_, I_>
 
         nrows_ = nrows;
         ncols_ = ncols;
+        ld_ = nrows;
 
         return true;
     }
@@ -1316,4 +1317,77 @@ auto eig_lower(const HostMatrix_<T, I>& A)
     return std::make_tuple(U, Lambda);
 }
 
+template <template <typename, typename> class HostMatrix_, typename T, typename I>
+auto svd(const HostMatrix_<T, I>& A) -> std::tuple<HostMatrix_<T, I> /* Left Singular Vectors: U */,
+                                                   HostMatrix_<T, I> /* Singular Values */,
+                                                   HostMatrix_<T, I> /* Right Singular Vectors */>
+{
+    using S = typename HostMatrix_<T, I>::S;
+    // Full SVD shapes: U is nrows x nrows, Sigma is nrows x ncols, V is ncols x ncols.
+    I nrows = A.nrows();
+    I ncols = A.ncols();
+
+    I dim = std::min(nrows, ncols); // number of singular values
+    HostMatrix_<T, I> U(nrows, nrows), V(ncols, ncols), Sigma(nrows, ncols);
+    HostMatrix_<S, I> sigma_diag(dim, 1); // singular values, stored with scalar type S
+    // NOTE(review): the status returned by detail::lapack_ge_svd is ignored below.
+    if constexpr(std::is_same<std::decay_t<I>, int>::value)
+    {
+        detail::lapack_ge_svd(A.data(), nrows, ncols, U.data(), sigma_diag.data(), V.data());
+    }
+    else
+    {
+        bool within_lapack_limits
+            = static_cast<std::int64_t>(nrows) * static_cast<std::int64_t>(ncols)
+                <= static_cast<std::int64_t>(std::numeric_limits<int>::max())
+            && static_cast<std::int64_t>(nrows)
+                <= static_cast<std::int64_t>(std::numeric_limits<int>::max())
+            && static_cast<std::int64_t>(ncols)
+                <= static_cast<std::int64_t>(std::numeric_limits<int>::max());
+        // The LAPACK wrapper takes int sizes: narrow only when nothing overflows, else throw.
+        if(within_lapack_limits)
+        {
+            detail::lapack_ge_svd(A.data(), static_cast<int>(nrows), static_cast<int>(ncols),
+                                  U.data(), sigma_diag.data(), V.data());
+        }
+        else
+        {
+            throw std::domain_error(
+                "Error computing svd(A): A.nrows(), A.ncols(), A.nrows()*A.ncols() must be "
+                "smaller or equal to INT_MAX");
+        }
+    }
+
+    // Lapack *gesvd returns V^* instead of V, hence the adjoint taken below.
+    Sigma.diag(HostMatrix_<T, I>::Convert(sigma_diag));
+    V = adjoint(V);
+    return std::make_tuple(U, Sigma, V);
+}
+
+template <template <typename, typename> class HostMatrix_, typename T, typename I>
+auto inv(const HostMatrix_<T, I>& A) -> HostMatrix_<T, I> /* Pseudo-Inverse of A */
+{
+    using S = typename HostMatrix_<T, I>::S;
+    // Built from svd(A): returns adjoint(U * Sigma' * adjoint(V)), Sigma' being defined below.
+    auto [U, Sigma, V] = svd(A);
+    I nrows = A.nrows();
+    I ncols = A.ncols();
+    I dim = std::min(nrows, ncols);
+    // Sigma': diagonal entries above std::numeric_limits<S>::min() are inverted, others zeroed.
+    for(I i = 0; i < dim; ++i)
+    {
+        if(std::abs(Sigma(i, i)) > std::max(std::numeric_limits<S>::min(), S(0)))
+        {
+            Sigma(i, i) = T(1) / Sigma(i, i);
+        }
+        else
+        {
+            Sigma(i, i) = T(0);
+        }
+    }
+
+    auto iA = adjoint(U * Sigma * adjoint(V));
+    return iA;
+}
+
 } // namespace matxu
diff --git a/clients/common/matrix_utils/matrix_utils_detail.hpp b/clients/common/matrix_utils/matrix_utils_detail.hpp
index ca41a255a..1f890fa04 100644
--- a/clients/common/matrix_utils/matrix_utils_detail.hpp
+++ b/clients/common/matrix_utils/matrix_utils_detail.hpp
@@ -1,5 +1,5 @@
 /* **************************************************************************
- * Copyright (C) 2018-2024 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (C) 2018-2025 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -191,6 +191,30 @@ namespace detail
         return (info == 0);
     }
 
+    // Compute singular values D and singular vectors U, V of A with lapack_*gesvd
+    template <typename T, typename S>
+    bool lapack_ge_svd(T const* A, const int nrows, const int ncols, T* U, S* D, T* V)
+    {
+        if(A == nullptr || nrows < 1 || ncols < 1)
+        {
+            return false;
+        }
+        // Returns false above for null/empty input, and below when info != 0.
+        int info; // status code, passed to cpu_gesvd by address
+        int worksize = 32 * std::max(1, 2 * std::min(nrows, ncols) + std::max(nrows, ncols));
+        std::vector<T> work(worksize, T(0.));
+        int worksize_real = 5 * std::min(nrows, ncols);
+        std::vector<S> work_real(worksize_real, S(0.));
+        T* Acpy; // scratch copy of A, so that the caller's matrix is never written to
+        Acpy = (T*)malloc(sizeof(T) * nrows * ncols); // NOTE(review): result is not null-checked
+        memcpy(Acpy, A, sizeof(T) * nrows * ncols);
+        cpu_gesvd(rocblas_svect_all, rocblas_svect_all, nrows, ncols, Acpy, nrows, D, U, nrows, V,
+                  ncols, work.data(), worksize, work_real.data(), &info);
+        free(Acpy);
+
+        return (info == 0);
+    }
+
 } // namespace detail
 
 } // namespace matxu
diff --git a/clients/common/misc/clss.hpp b/clients/common/misc/clss.hpp
new file mode 100644
index 000000000..3eaf7c30f
--- /dev/null
+++ b/clients/common/misc/clss.hpp
@@ -0,0 +1,797 @@
+/* **************************************************************************
+ * Copyright (C) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * *************************************************************************/
+
+#pragma once
+
+#include <complex>
+#include <cstring>
+#include <iomanip>
+#include <mutex>
+#include <sstream>
+#include <type_traits>
+#include <vector>
+
+//
+// @brief `class closest_largest_subsequences`: Functor to compute the closest
+// largest subsequences of a given pair of sequences.
+//
+// Given a tolerance `tol` and a pair of sequences:
+//
+// (a_i), (b_j) with 0 <= i <= n, 0 <= j <= m;
+//
+// `closest_largest_subsequences` (`clss`) extracts the subsequences:
+//
+// (a_l) .=. (a_l1, a_l2, ..., a_lP) with 0 <= l1 < l2 < ... < lP <= n, and
+// (b_k) .=. (b_k1, b_k2, ..., b_kP) with 0 <= k1 < k2 < ... < kP <= m;
+//
+// (where 0 <= P <= n, m) that satisfy the following properties:
+//
+// 1. |a_l1 - b_k1| <= tol, |a_l2 - b_k2| <= tol, ..., |a_lP - b_kP| <= tol;
+//
+// 2. maximizes P (the size of the subsequences); and, for this maximal P,
+//
+// 3. minimizes ||a_l - b_k||_1 = \sum_{1 <= q <= P} |a_l_q - b_k_q|;
+//
+// in O(max{n, m}^2) space and time.  For a commented example, see Usage
+// section down below.
+//
+// \tparam T Type of elements in sequences (a_i), (b_j); expected to be an
+// arithmetic type; otherwise, T must be endowed with an overload to
+// `operator<` that defines a strict partial order.
+//
+// \tparam I Signed integer type to index the sequences.
+//
+//
+// ## Usage:
+//
+// The primary use of functor `clss` is to improve the tests of the expert
+// eigensolvers' drivers, and allow extracting a sub-sequence of the computed
+// eigenvalues that matches a given list of eigenvalues.
+//
+// For example, consider the use of the bisection driver (STEBZ) to compute the
+// eigenvalues of a matrix A with two irreducible blocks.  The spectrum of A is
+// given as:
+//
+// - eig(A) = {-2., -1., 1., 2., -3., -2., -1., 1., 2.}.
+//
+// Say that the eigenvalues computed by STEBZ (grouped with the "by block"
+// ordering) are (to working precision `eps` = 0.015):
+//
+// - STEBZ::eig(A) = {-2., -0.99, 1.01, 1.99, -3.0, -2.01, -0.99, 1.01, 2.01},
+//
+// and those are meant to be compared with matrix eig(A) in the range (-1, 2].
+// One would find that
+//
+// - eig(A) \intersect (-1, 2] = {1., 2., 1., 2.}; whereas
+//
+// - STEBZ::eig(A) \intersect (-1, 2] = {-0.99, 1.01, 1.99, -0.99, 1.01}.
+//
+// Even though the computation is correct to working precision, the sets
+// `eig(A)` and `STEBZ::eig(A)` have different sizes (which breaks tests that
+// target their equality) and unmatched eigenvalues (which breaks tests that
+// compare the eigenvalues directly).
+//
+// One can avoid such problems by, instead, comparing the sub-sequences
+// produced by using functor `clss` with inputs:
+//
+// i) eig(A) \intersect (-1, 2];
+//
+// ii) STEBZ eigenvalues in the interval (-1 - tol, 2 + tol]; and
+//
+// iii) tolerance `tol` (which will be arbitrarily set to 2*`eps` = 0.03 here;
+// in general, `tol` is a function of `eps` and matrix A).
+//
+// For this example, such a call would look like:
+//
+// - `clss({1., 2., 1., 2.}, {-0.99, 1.01, 1.99, -0.99, 1.01, 2.01}, tol)`;
+//
+// which yields the subsequences (obtained with `clss::subseqs`):
+//
+// - {1.,   2.,   1.,   2.}, (i.e., the reference eigenvalues) and
+//
+// - {1.01, 1.99, 1.01, 2.01};
+//
+// where the latter is the maximal subsequence of STEBZ::eig(A) that satisfies
+// properties (1), (2) and (3) of the definition of `clss` above.
+//
+// For this example, the computed `clss::distance` (i.e., the subsequences' l^1
+// distance) is 0.04, and the computed `clss::inf_norm_distance` (i.e., the
+// sub-sequences' l^\inf distance) is `0.01`.
+//
+// Moreover, method `clss::subseqs_ids` returns the indices of the elements of
+// the subsequences in their original sequences.  For this example,
+// `clss::subseqs_ids` would return:
+//
+// - {0, 1, 2, 3}, (i.e., indices in the reference eigenvalues list) and
+//
+// - {1, 2, 4, 5};
+//
+// where the latter contains the indices of the elements of the second
+// subsequence ({1.01, 1.99, 1.01, 2.01}) with respect to the original sequence
+// they belong to ({-0.99, 1.01, 1.99, -0.99, 1.01, 2.01}, meant to have been
+// computed by STEBZ).
+//
+template <typename T,
+          typename I = std::int64_t,
+          typename = typename std::enable_if<std::is_signed<std::decay_t<I>>::value>::type>
+class closest_largest_subsequences
+{
+public:
+    using S = decltype(std::real(T{}));
+
+    //
+    // Computes the largest closest subsequences of input sequences `a` and `b`.
+    //
+    // \param a:      pointer to first sequence, array of const T.
+    //
+    // \param size_a: number of elements in first sequence.
+    //
+    // \param b:      pointer to second sequence, array of const T.
+    //
+    // \param size_b: number of elements in second sequence.
+    //
+    // \param tol:    largest |a_i - b_j| for which a_i and b_j may be matched.
+    //
+    // \return size of subsequences (equals the maximal number of matching
+    // elements of the original sequences)
+    [[maybe_unused]] auto operator()(T const* a, I size_a, T const* b, I size_b, S tol)
+        -> /**! Size of subsequences */ I
+    {
+        std::lock_guard<std::mutex> lock(m_);
+
+        clear(); // drop the results of any previous call; an early exit reports size 0
+        const bool has_data = (a != nullptr) && (b != nullptr) && (size_a > 0) && (size_b > 0);
+        if(has_data && (tol >= 0)) // `tol >= 0` is also false for a NaN tolerance
+        {
+            //
+            // Initialize members; a memo entry whose size is I(-1) has not been
+            // computed yet (see `memo_valid`)
+            //
+            this->tol_ = tol;
+            this->size_a_ = size_a;
+            this->size_b_ = size_b;
+            this->memo_distances_.resize(size_a * size_b, std::numeric_limits<S>::infinity());
+            this->memo_sizes_.resize(size_a * size_b, I(-1));
+            this->memo_next_.resize(size_a * size_b, I(-1));
+            // Keep copies of the original sequences; `print_debug` prints them
+            this->seq_a_.assign(a, a + size_a);
+            this->seq_b_.assign(b, b + size_b);
+
+            //
+            // Call recursive, memoized, implementation to compute subsequences
+            //
+            auto [distance, sseqs_size, _] = clss_implr(a, size_a - 1, b, size_b - 1);
+            this->distance_ = distance;
+            this->sseqs_size_ = sseqs_size;
+
+            //
+            // Extract `sseq_a_` and `sseq_b_` from `a` and `b` and set:
+            // inf_norm_ = ||sseq_a_ - sseq_b_||_inf
+            //
+            this->inf_norm_ = extract_subsequences(a, size_a, b, size_b);
+        }
+
+        return sseqs_size_;
+    }
+
+    //
+    // Computes the largest closest subsequences of input sequences `a` and `b`.
+    //
+    // \param a:      pointer to first sequence, array of T.
+    //
+    // \param size_a: number of elements in first sequence.
+    //
+    // \param b:      pointer to second sequence, array of T.
+    //
+    // \param size_b: number of elements in second sequence.
+    //
+    // \return size of subsequences (equals the maximal number of matching
+    // elements of the original sequences)
+    //
+    [[maybe_unused]] auto operator()(T* a, I size_a, T* b, I size_b, S tol)
+        -> /**! Size of subsequences */ I
+    {
+        // Explicit pointer-to-const views select the `T const*` overload (no self-call).
+        return (*this)(static_cast<T const*>(a), size_a, static_cast<T const*>(b), size_b, tol);
+    }
+
+    //
+    // Computes the largest closest subsequences of input sequences `a` and `b`.
+    //
+    // \param a:      pointer to first sequence, array of const T.
+    //
+    // \param size_a: number of elements in first sequence; type can differ from
+    // template parameter I.
+    //
+    // \param b:      pointer to second sequence, array of const T.
+    //
+    // \param size_b: number of elements in second sequence; type can differ from
+    // template parameter I.
+    //
+    // \return size of subsequences (equals the maximal number of matching
+    // elements of the original sequences)
+    //
+    template <typename J, typename = typename std::enable_if<std::is_integral<J>::value>::type>
+    [[maybe_unused]] auto operator()(T const* a, J size_a, T const* b, J size_b, S tol) -> I
+    {
+        // Convert both sizes to the index type I, then defer to the overload taking I sizes.
+        return (*this)(a, static_cast<I>(size_a), b, static_cast<I>(size_b), tol);
+    }
+
+    //
+    // Computes the largest closest subsequences of input sequences `a` and `b`.
+    //
+    // \param a:      pointer to first sequence, array of T.
+    //
+    // \param size_a: number of elements in first sequence; type can differ from
+    // template parameter I.
+    //
+    // \param b:      pointer to second sequence, array of T.
+    //
+    // \param size_b: number of elements in second sequence; type can differ from
+    // template parameter I.
+    //
+    // \return size of subsequences (equals the maximal number of matching
+    // elements of the original sequences)
+    //
+    template <typename J, typename = typename std::enable_if<std::is_integral<J>::value>::type>
+    [[maybe_unused]] auto operator()(T* a, J size_a, T* b, J size_b, S tol) -> I
+    {
+        T const* const_a = a; // read-only views of both inputs
+        T const* const_b = b;
+        return (*this)(const_a, static_cast<I>(size_a), const_b, static_cast<I>(size_b), tol);
+    }
+
+    //
+    // Computes the largest closest subsequences of input sequences `a` and `b`.
+    //
+    // \param a:      first sequence, const vector of T.
+    //
+    // \param b:      second sequence, const vector of T.
+    //
+    // \return size of subsequences (equals the maximal number of matching
+    // elements of the original sequences)
+    //
+    [[maybe_unused]] auto operator()(const std::vector<T>& a, const std::vector<T>& b, S tol) -> I
+    {
+        // Unpack both vectors into (pointer, size) pairs for the pointer-based overloads.
+        return (*this)(a.data(), a.size(), b.data(), b.size(), tol);
+    }
+
+    //
+    // Returns the l^1 distance between subsequences, or Inf if at least one of
+    // them is empty.
+    //
+    // \return l^1 distance between subsequences.
+    //
+    S distance()
+    {
+        const std::lock_guard<std::mutex> guard{m_}; // operator() updates distance_ under m_
+        return distance_;
+    }
+
+    //
+    // Returns the l^\inf distance between subsequences, or Inf if at least one
+    // of them is empty.
+    //
+    // \return l^\inf distance between subsequences.
+    //
+    S inf_norm_distance()
+    {
+        const std::lock_guard<std::mutex> guard{m_}; // operator() updates inf_norm_ under m_
+        return inf_norm_;
+    }
+
+    //
+    // Returns the indices of the elements of the subsequences in their
+    // original sequences.
+    //
+    // Let a, b denote the original sequences, and sseq_a, sseq_b denote
+    // subsequences computed by functor `clss`.  Write:
+    //
+    // `auto [a_ids, b_ids] = clss::subseqs_ids();`
+    //
+    // Then:
+    //
+    // a) For 0 <= i < sseq_a.size(), sseq_a[i] == a[a_ids[i]];
+    //
+    // b) For 0 <= j < sseq_b.size(), sseq_b[j] == b[b_ids[j]].
+    //
+    // \return std::pair of std::vector containing indices of subsequences'
+    // elements as they appear in the original sequences.
+    //
+    auto subseqs_ids() -> std::pair<std::vector<T>, std::vector<T>>
+    {
+        std::lock_guard<std::mutex> lock(m_); // copies are made while m_ is held
+        return std::make_pair(sseq_a_ids_, sseq_b_ids_); // ids are stored as values of type T
+    }
+
+    //
+    // Returns two subsequences satisfying properties (1), (2) and (3)
+    // of the functor description.
+    //
+    // \return std::pair of std::vector containing subsequences.
+    //
+    auto subseqs() -> std::pair<std::vector<T>, std::vector<T>>
+    {
+        std::lock_guard<std::mutex> lock(m_); // copies are made while m_ is held
+        return std::make_pair(sseq_a_, sseq_b_); // elements keep the sequences' type T
+    }
+
+    //
+    // Returns the number of elements of the subsequences.
+    //
+    // \return number of elements of the subsequences.
+    //
+    I subseqs_size()
+    {
+        const std::lock_guard<std::mutex> guard{m_}; // operator() updates sseqs_size_ under m_
+        return sseqs_size_;
+    }
+
+    ///
+    /// For debugging
+    ///
+
+    //
+    // Prints internal information for debugging purposes.
+    //
+    // \return std::string with debug information.
+    //
+    std::string print_debug_str()
+    {
+        std::ostringstream buffer; // print_debug appends to it and returns it by reference
+        return print_debug(buffer).str();
+    }
+
+    //
+    // Prints internal information for debugging purposes.
+    //
+    // \param os: reference to a variable of a type that derives from
+    // std::ostream, in which debug information is meant to be appended to.
+    //
+    // \return *reference* to input parameter `os`, for convenience.
+    //
+    // See `clss::print_debug_str` for an example of usage.
+    //
+    template <typename K = std::ostringstream,
+              typename = typename std::enable_if<std::is_base_of_v<std::ostream, K>>::type>
+    [[maybe_unused]] auto print_debug(K& os) -> K&
+    {
+        std::lock_guard<std::mutex> lock(m_);
+
+        // Remember the caller's formatting; it is restored before returning
+        const auto default_flags{os.flags()};
+        const auto default_precision{os.precision()};
+
+        // Fixed notation with enough decimals to resolve differences of the
+        // order of `tol_`; without a positive tolerance fall back to
+        // max_digits10 of S (the real type the distances are stored in)
+        const auto digits
+            = static_cast<I>(tol_ > S(0) ? std::ceil(-std::min(std::log10(tol_), S(0))) + 2
+                                         : std::numeric_limits<S>::max_digits10);
+        os << std::fixed << std::setprecision(digits);
+
+        // Prints one stored input sequence as ":: :: <name> = {x0, x1, ...}"
+        auto print_sequence = [&os](const char* name, const std::vector<T>& seq) {
+            os << ":: :: " << name << " = {";
+            for(std::size_t i = 0; i < seq.size(); ++i)
+            {
+                os << seq[i];
+                if(i + 1 != seq.size())
+                {
+                    os << ", ";
+                }
+            }
+            os << "}\n\n";
+        };
+
+        //
+        // Header, inputs and tolerance
+        //
+        os << ">>>>>>>>>>>>\n";
+        os << ":: :: closest_largest_subsequences::print_debug()\n\n" << std::flush;
+        os << ">>> Input: \n";
+        print_sequence("a", seq_a_);
+        print_sequence("b", seq_b_);
+        os << ":: :: tol = " << tol_ << std::endl << std::endl;
+
+        //
+        // Summary of the result, followed by the matched pairs
+        //
+        os << "++++++++++++\n";
+        os << ":: :: Subsequences sub_a, sub_b have distance: " << distance_
+           << ", size: " << sseqs_size_ << ", and ||sub_a - sub_b||_inf = " << inf_norm_ << std::endl
+           << std::endl;
+
+        print_extract_subsequences(os);
+        os << "<<<<<<<<<<<<\n" << std::flush;
+
+        // Restore the caller's formatting (float notation and precision)
+        os.flags(default_flags);
+        os.precision(default_precision);
+
+        return os;
+    }
+
+private:
+    S tol_{};
+    I sseqs_size_{};
+    S distance_ = std::numeric_limits<S>::infinity();
+    S inf_norm_ = std::numeric_limits<S>::infinity();
+    I size_a_{};
+    I size_b_{};
+    std::vector<T> seq_a_{};
+    std::vector<T> seq_b_{};
+    std::vector<T> sseq_a_{};
+    std::vector<T> sseq_b_{};
+    std::vector<T> sseq_a_ids_{};
+    std::vector<T> sseq_b_ids_{};
+    std::vector<S> memo_distances_{};
+    std::vector<I> memo_sizes_{};
+    std::vector<I> memo_next_{};
+    std::mutex m_;
+
+    void clear()
+    {
+        // Reset every member to its freshly-constructed value: zero tolerance and
+        // sizes, infinite distances, empty sequences and memo tables
+        tol_ = {};
+        sseqs_size_ = size_a_ = size_b_ = I{};
+        distance_ = std::numeric_limits<S>::infinity(); // S is the type of distance_
+        inf_norm_ = std::numeric_limits<S>::infinity();
+        seq_a_ = {};
+        seq_b_ = {};
+        sseq_a_ = {};
+        sseq_b_ = {};
+        sseq_a_ids_ = {};
+        sseq_b_ids_ = {};
+        memo_distances_ = {};
+        memo_sizes_ = {};
+        memo_next_ = {};
+    }
+
+    /// Recursive implementation with memoization
+    auto clss_implr(T const* a, I sa, T const* b, I sb)
+        -> std::tuple</* acc distance */ S, /* size */ I, /* next */ I>
+    {
+        // `a`, `b` point to the current elements; `sa`, `sb` count the elements
+        // that follow them.  Base case: (sa, sb) fails `in_range`, i.e., at least
+        // one of the remaining sequences is empty and nothing can be matched
+        if(!in_range(sa, sb))
+        {
+            return std::make_tuple(std::numeric_limits<S>::infinity(), I(0), I(-1));
+        }
+
+        //
+        // Memo hit: `dist` and `size` were already stored for this (sa, sb) pair
+        //
+        auto [dist, size, _] = memo(sa, sb);
+        I next_index = I(-1);
+
+        if(memo_valid(dist, size))
+        {
+            // Hand back this entry's own flat index, so the caller can link to it
+            next_index = ij2index(sa, sb);
+
+            return std::make_tuple(dist, size, next_index);
+        }
+
+        //
+        // Otherwise, compute new `dist`, `size` and `next_index`
+        //
+
+        // Start from "nothing matched": infinite distance and size zero
+        dist = std::numeric_limits<S>::infinity();
+        size = I(0);
+        // Adopt candidate (d, s) if it is larger or, for equal sizes, strictly closer
+        auto do_update = [](S d, I s, I nindex, S& dist, I& size, I& next_index) -> bool {
+            bool update = false;
+            if(size < s)
+            {
+                dist = d;
+                size = s;
+                next_index = nindex;
+                update = true;
+            }
+            else if(size == s)
+            {
+                if(dist > d)
+                {
+                    dist = d;
+                    next_index = nindex;
+                    update = true;
+                }
+            }
+
+            return update;
+        };
+        [[maybe_unused]] bool update = false;
+
+        // Case 1: a[0] .=. b[0], pair them and solve for the elements following both
+        if(equiv(a[0], b[0]))
+        {
+            auto [d, s, nindex] = clss_implr(a + I(1), sa - I(1), b + I(1), sb - I(1));
+            if(d == std::numeric_limits<S>::infinity()) // no match further on
+            {
+                dist = std::abs(a[0] - b[0]);
+                size = I(1);
+                next_index = ij2index(sa, sb); // self-link: the chain ends here
+                update = true;
+            }
+            else
+            {
+                d += std::abs(a[0] - b[0]);
+                ++s;
+                update = do_update(d, s, nindex, dist, size, next_index);
+            }
+        }
+
+        // Case 2: skip a[0], i.e., match what follows it in `a` against `b` from b[0] on
+        {
+            auto [d, s, nindex] = clss_implr(a + I(1), sa - I(1), b, sb);
+            update = do_update(d, s, nindex, dist, size, next_index);
+        }
+
+        // Case 3: skip b[0], i.e., match `a` from a[0] on against what follows b[0]
+        {
+            auto [d, s, nindex] = clss_implr(a, sa, b + I(1), sb - I(1));
+            update = do_update(d, s, nindex, dist, size, next_index);
+        }
+
+        // Save best results from the 3 cases
+        memo_dist(sa, sb) = dist;
+        memo_size(sa, sb) = size;
+        memo_next(sa, sb) = next_index;
+
+        // As for a memo hit: the caller links to this entry's flat index
+        next_index = ij2index(sa, sb);
+
+        return std::make_tuple(dist, size, next_index);
+    }
+
+    auto extract_subsequences(T const* a, I size_a, T const* b, I size_b)
+        -> /* || sseq_a_ - sseq_b_ ||_inf */ S
+    {
+        S inf_norm = std::numeric_limits<S>::infinity();
+        I sa = size_a - I(1);
+        I sb = size_b - I(1);
+        // Start at the memo entry (size_a - 1, size_b - 1); bail out with Inf if nothing matched
+        I index = ij2index(sa, sb);
+        if(!in_range(index) || (sseqs_size_ == I(0)))
+        {
+            return inf_norm;
+        }
+        // Follow the `memo_next_` links until an entry links to itself (or to nothing valid)
+        I next_index = index;
+        inf_norm = static_cast<S>(0);
+        do
+        {
+            index = next_index;
+            next_index = memo_next(index);
+            next_index = in_range(next_index) ? next_index : index;
+            // A matched pair is recorded where the size drops, or at the end of the chain
+            I ia, ib;
+            I si = memo_size(index);
+            I nsi = memo_size(next_index);
+            if((nsi < si) || (index == next_index))
+            {
+                auto [ja, jb] = index2ij(index);
+                // Memo coordinates count down from (sa, sb); turn them into positions in a, b
+                ia = sa - ja;
+                sseq_a_ids_.push_back(ia);
+                sseq_a_.push_back(a[ia]);
+
+                ib = sb - jb;
+                sseq_b_ids_.push_back(ib);
+                sseq_b_.push_back(b[ib]);
+
+                S norm = std::abs(a[ia] - b[ib]);
+                inf_norm = std::max(inf_norm, norm);
+            }
+        } while((index != next_index) && in_range(index));
+
+        return inf_norm;
+    }
+
+    template <typename K = std::ostream>
+    void print_extract_subsequences(K&& os)
+    {
+        os << ">>> Traversing:";
+        I sa = size_a_ - I(1);
+        I sb = size_b_ - I(1);
+        I index = ij2index(sa, sb);
+        if(!in_range(index) || (sseqs_size_ == I(0)))
+        {
+            os << " nothing to print\n";
+            return;
+        }
+        os << std::endl;
+        // Same traversal of the `memo_next_` links as in `extract_subsequences`
+        I next_index = index, i = I(0);
+        do
+        {
+            index = next_index;
+            next_index = memo_next(index);
+            next_index = in_range(next_index) ? next_index : index;
+            // One line is printed where the size drops, or at the end of the chain
+            I ia, ib;
+            I si = memo_size(index);
+            I nsi = memo_size(next_index);
+            if((nsi < si) || (index == next_index))
+            {
+                auto [ja, jb] = index2ij(index);
+                // Memo coordinates count down from (sa, sb); turn them into positions
+                ia = sa - ja;
+                ib = sb - jb;
+                // `i` counts printed pairs and indexes the stored subsequences
+                os << ""
+                   << ":: :: Indices: (" << ia << ", " << ib << ") :: Elements: (" << sseq_a_[i]
+                   << ", " << sseq_b_[i] << ") :: (acc dist = " << memo_dist(ja, jb)
+                   << ", size = " << memo_size(ja, jb) << ")\n";
+                ++i;
+            }
+        } while((index != next_index) && in_range(index));
+
+        return;
+    }
+
+    ///
+    /// Helper functions
+    ///
+
+    /// lhs, rhs are "equivalent" (symbolic notation: lhs .=. rhs)
+    /// when |lhs - rhs| <= tol.
+    ///
+    /// This is not a true equivalence relation.
+    bool equiv(T lhs, T rhs) const
+    {
+        // The gap |lhs - rhs| is compared against the stored tolerance `tol_`.
+        // A NaN gap compares false, so it never counts as a match.
+        const auto gap = std::abs(lhs - rhs);
+        const bool within_tol = (gap <= tol_);
+
+        return within_tol;
+    }
+
+    bool in_range(I i, I j) const
+    {
+        // Guard clause: reject as soon as `i` leaves [0, size_a_).
+        if((i < 0) || (i >= size_a_))
+        {
+            return false;
+        }
+
+        const bool j_ok = (j >= I(0)) && (j < size_b_);
+        return j_ok;
+    }
+
+    bool in_range(I index) const
+    {
+        // Valid flat indices are 0, 1, ..., size_a_ * size_b_ - 1.
+        const I count = size_a_ * size_b_;
+        if(index < I(0))
+        {
+            return false;
+        }
+
+        const bool below_end = (index < count);
+        return below_end;
+    }
+
+    auto memo(I i, I j) const -> std::tuple<S, I, I>
+    {
+        // Returns (accumulated distance, subsequence size, next index) as stored
+        // for pair (i, j).  Reads the tables directly, so the method can be const.
+        const I flat = i + size_a_ * j; // same flat layout as `ij2index`
+
+        return std::make_tuple(memo_distances_[flat], memo_sizes_[flat], memo_next_[flat]);
+    }
+
+    S memo_dist(I i, I j) const&
+    {
+        // Read-only lookup (returns a copy); flat index laid out as in `ij2index`
+        return memo_distances_[i + size_a_ * j];
+    }
+
+    S& memo_dist(I i, I j) &
+    {
+        const I flat = ij2index(i, j);
+        return memo_distances_[flat]; // mutable slot; callers assign through it
+    }
+
+    I memo_size(I i, I j) const&
+    {
+        // Read-only lookup (returns a copy); flat index laid out as in `ij2index`
+        return memo_sizes_[i + size_a_ * j];
+    }
+
+    I& memo_size(I i, I j) &
+    {
+        const I flat = ij2index(i, j);
+        return memo_sizes_[flat]; // mutable slot; callers assign through it
+    }
+
+    I memo_size(I index) const&
+    {
+        // Read-only lookup by flat index; returns a copy of the entry
+        return memo_sizes_[index];
+    }
+
+    I& memo_size(I index) &
+    {
+        // Mutable slot addressed directly by its flat index
+        return memo_sizes_[index];
+    }
+
+    I memo_next(I i, I j) const&
+    {
+        // Read-only lookup (returns a copy); flat index laid out as in `ij2index`
+        return memo_next_[i + size_a_ * j];
+    }
+
+    I& memo_next(I i, I j) &
+    {
+        const I flat = ij2index(i, j);
+        return memo_next_[flat]; // mutable slot; callers assign through it
+    }
+
+    I memo_next(I index) const&
+    {
+        // Read-only lookup by flat index; returns a copy of the entry
+        return memo_next_[index];
+    }
+
+    I& memo_next(I index) &
+    {
+        // Mutable slot addressed directly by its flat index
+        return memo_next_[index];
+    }
+
+    bool memo_valid(S d, I s) const
+    {
+        // An entry counts as computed unless its distance or its size equals
+        // the value -1.
+        const bool has_distance = (d != S(-1));
+        const bool has_size = (s != I(-1));
+        const bool valid = has_distance && has_size;
+
+        return valid;
+    }
+
+    auto ij2index(I i, I j) const -> I
+    {
+        return i + size_a_ * j; // flat index: `i` is contiguous, `j` strides by size_a_
+    }
+
+    auto index2ij(I index) const -> std::pair<I, I>
+    {
+        const I i = index % size_a_; // inverse of `ij2index`: index == i + size_a_ * j
+        const I j = (index - i) / size_a_;
+        return std::make_pair(i, j);
+    }
+};

From 7cd5433e738cc029f7d6fe75eebc3cc44e0d1f2d Mon Sep 17 00:00:00 2001
From: Julio Machado Silva <161654951+jmachado-amd@users.noreply.github.com>
Date: Tue, 14 Jan 2025 10:43:30 -0700
Subject: [PATCH 2/2] Fix tolerance for HEGVDX tests (#879)

As part of PR #810, the tolerances for `sygvdx` and `hegvdx` tests were
set too tight, which made 3 of `hegvdx` daily lapack tests fail.

This PR changes the tolerance to allow all tests to pass on all architectures.

(cherry picked from commit adeb5346ae67d8e13fedd0917f7ad987e5eedb57)
---
 clients/common/lapack/testing_sygvdx_hegvdx.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clients/common/lapack/testing_sygvdx_hegvdx.hpp b/clients/common/lapack/testing_sygvdx_hegvdx.hpp
index b5e631ae9..755bc1b83 100644
--- a/clients/common/lapack/testing_sygvdx_hegvdx.hpp
+++ b/clients/common/lapack/testing_sygvdx_hegvdx.hpp
@@ -1094,9 +1094,9 @@ void testing_sygvdx_hegvdx(Arguments& argus)
     }
 
     // validate results for rocsolver-test
-    // using 4 * n * machine_precision as tolerance
+    // using 5 * n * machine_precision as tolerance
     if(argus.unit_check)
-        ROCSOLVER_TEST_CHECK(T, max_error, 4 * n);
+        ROCSOLVER_TEST_CHECK(T, max_error, 5 * n);
 
     // output results for rocsolver-bench
     if(argus.timing)