Fix stein initial eigenvectors' choices (#714)

Improve choice of starting eigenvectors in STEIN This commit introduces a pseudorandom number generator that is subsequently used to generate more robust initial eigenvectors in `roclapack_stein`.
ROCm · Aug 1, 2024 · b6820dc · b6820dc
1 parent ec88119
commit b6820dc
Show file tree

Hide file tree

Showing 2 changed files with 299 additions and 3 deletions.
diff --git a/library/src/auxiliary/rocauxiliary_stein.hpp b/library/src/auxiliary/rocauxiliary_stein.hpp
@@ -35,6 +35,7 @@
 #include "lapack_device_functions.hpp"
 #include "rocblas.hpp"
 #include "rocsolver/rocsolver.h"
+#include "rocsolver_prng.hpp"
 
 ROCSOLVER_BEGIN_NAMESPACE
 
@@ -112,7 +113,7 @@ __device__ void run_stein(const int tid,
                           S ssfmin)
 {
     __shared__ rocblas_int _info;
-    rocblas_int i, j, j1 = 0, b1, bn, blksize, gpind;
+    rocblas_int i, j, j1 = 0, b1, bn, blksize, gpind, seed;
     S scl, onenrm, ortol, stpcrt, xj, xjm;
 
     // zero info and ifail
@@ -122,6 +123,9 @@ __device__ void run_stein(const int tid,
         for(i = tid; i < nev; i += MAX_THDS)
             ifail[i] = 0;
 
+    auto normalize = [](rocblas_int value, rocblas_int max) -> S {
+        return static_cast<S>(static_cast<double>(value) / max);
+    };
     // iterate over submatrix blocks
     for(rocblas_int nblk = 0; nblk < iblock[nev - 1]; nblk++)
     {
@@ -170,9 +174,21 @@ __device__ void run_stein(const int tid,
                 rocblas_int nrmchk = 0;
 
                 // initialize starting eigenvector
-                // TODO: how to make it random?
+
+                // deterministic seed, varies with thread ids and with each new computed eigenvalue
+                seed = tid + (j + nblk * nev) * MAX_THDS + 1;
+
+                // rocSOLVER's prng can be initialized with one or two seeds (better results are achieved if
+                // two, independent, seeds are used).  The two seeds used here are not independent, they just spread
+                // the first pseudo-random number each thread computes more evenly.
+                rocsolver_int_prng<rocblas_int> prng(
+                    seed, rocsolver_int_prng<rocblas_int>::max() + 1 - seed);
+
                 for(i = tid; i < blksize; i += MAX_THDS)
-                    work[i] = (i == j - j1 ? S(1) : S(-1) / (blksize - 1));
+                {
+                    // work[i] lies on the interval [0, 1]
+                    work[i] = normalize(prng(), prng.max());
+                }
 
                 // copy the matrix so it won't be destroyed by factorization
                 for(i = tid; i < blksize - 1; i += MAX_THDS)

diff --git a/library/src/include/rocsolver_prng.hpp b/library/src/include/rocsolver_prng.hpp
@@ -0,0 +1,280 @@
+/* **************************************************************************
+ * Copyright (C) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * *************************************************************************/
+
+#pragma once
+
+#include <type_traits>
+
+#include "rocblas_utility.hpp"
+
+ROCSOLVER_BEGIN_NAMESPACE
+
+namespace detail
+{
+///
+/// @brief Pseudorandom number generator
+///
+/// Generates pseudorandom numbers 0 <= X < 2^31 - 1,
+/// U = X/(2^31 - 1) in [0, 1) using the sequence
+///
+/// X_n = Y_n - Z_n, (*)
+///
+/// where Y_n, Z_n are the following linear congruential generators
+/// - Y_n = 48271 * Y_{n-1} mod (2^31 - 1);
+/// - Z_n = 40692 * Z_{n-1} mod (2^31 - 249).
+///
+/// Sequence X_n (*) has the following properties:
+/// - it can be computed with 32-bit signed integer arithmetic;
+/// - its period length size (2^31 - 2)(2^31 - 250)/62 ~ 2^56
+///   renders it useful for generating significantly more than 2^32
+///   pseudorandom numbers per run;
+/// - it achieves good ratings on the spectral test.
+///
+/// This generator appears on Eq. 3.4.4-(38) of Donald Knuth, The Art
+/// of Computer Programming Volume 2 -- Seminumerical Algorithms,
+/// 3rd ed. See book and references therein for facts about Xn (*),
+/// (in particular, see entries 20, 21 and 24 in Table 1, pg. 106,
+/// and the discussion found in Subsection 3.3.4 near Eq. 3.4.4-(38)).
+///
+/// \tparam I Type used for input and output of numbers Xn, Yn, Zn.
+///         Must be an integer and, at least, 32-bit wide.
+///
+template <typename I, typename = typename std::enable_if<std::is_integral<I>::value && sizeof(I) >= 4>::type>
+struct pseudorandom_number_generator
+{
+    static constexpr std::int32_t m_m31_mXY
+        = (1LL << 31) - 1; ///< Modulus of Xn and Yn, Mersenne prime M_31 = 2^31 - 1
+    static constexpr std::int32_t m_mZ = (1LL << 31) - 249; ///< Modulus of Zn sequence, another prime
+
+    ///
+    /// Range check Y0, such that 0 < Y0 < m_m31_mXY = 2^31 - 1.
+    ///
+    /// The number Y' = range_check_Y(Y0) yields next_Y(Y') != 0,
+    /// for all Y0. Hence, this method is meant to be used only
+    /// when seeding the Zn generator.
+    ///
+    /// \param Z0 Input: number to check, output: 0 < Y' < 2^31 - 1.
+    ///
+    /// \return Number Y' lying strictly between 0 and m_mZ = 2^31 - 1.
+    ///
+    template <typename K = I,
+              typename = typename std::enable_if<std::is_same<std::decay_t<K>, I>::value>::type>
+    __device__ __host__ static auto range_check_Y(K&& Y0) -> I
+    {
+        Y0 = std::max(Y0 % static_cast<I>(m_m31_mXY), static_cast<I>(1));
+        return Y0;
+    }
+
+    ///
+    /// Range check Z0, such that 0 < Z0 < m_mZ = 2^31 - 249.
+    ///
+    /// The number Z' = range_check_Z(Z0) yields next_Z(Z') != 0,
+    /// for all Z0. Hence, this method is meant to be used only
+    /// when seeding the Zn generator.
+    ///
+    /// \param Z0 Input: number to check, output: 0 < Z' < 2^31 - 249.
+    ///
+    /// \return Number Z' lying strictly between 0 and m_mZ = 2^31 - 249.
+    ///
+    template <typename K = I,
+              typename = typename std::enable_if<std::is_same<std::decay_t<K>, I>::value>::type>
+    __device__ __host__ static auto range_check_Z(K&& Z0) -> I
+    {
+        Z0 = std::max(Z0 % static_cast<I>(m_mZ), static_cast<I>(1));
+        return Z0;
+    }
+
+    ///
+    /// Iterates Y, Z and computes U = (Y - Z)/(2^31 - 1) \in [0., 1.).
+    ///
+    /// \tparam S Output type, typically a floating point number.
+    ///
+    /// \param Y Input: iterate n-1 of Yn, output: next_Y(Y).
+    ///
+    /// \param Z [Optional] Input: iterate n-1 of Zn, output: next_Z(Z).
+    //
+    /// \return Number (Y - Z)/(2^31 - 1) cast to type S.
+    ///
+    template <typename S,
+              typename K = I,
+              typename = typename std::enable_if<std::is_same<std::decay_t<K>, I>::value>::type>
+    __device__ __host__ static auto uniform01(K&& Y, K&& Z = 0) -> S
+    {
+        constexpr double range = static_cast<double>(m_m31_mXY);
+        I Xnext = next_X(Y, Z);
+        double U = static_cast<double>(Xnext) / range;
+        return static_cast<S>(U);
+    }
+
+    ///
+    /// Computes next iterate of sequence Xn.
+    ///
+    /// \param Y Input: iterate n-1 of Yn, output: next_Y(Y).
+    ///
+    /// \param Z [Optional] Input: iterate n-1 of Zn, output: next_Z(Z).
+    ///
+    /// \return Next iterate of Xn, 0 <= Xnext < m_m31_mXY = 2^31 - 1.
+    ///
+    template <typename K = I,
+              typename = typename std::enable_if<std::is_same<std::decay_t<K>, I>::value>::type>
+    __device__ __host__ static auto next_X(K&& Y, K&& Z = 0) -> I
+    {
+        std::int32_t Xnext, Ynext, Znext, sgnX;
+
+        Ynext = static_cast<std::int32_t>(next_Y(Y));
+        Znext = static_cast<std::int32_t>(next_Z(Z));
+
+        Xnext = Ynext - Znext;
+        sgnX = Xnext < 0 ? -1 : 1;
+        Xnext += std::max(-sgnX, 0) * m_m31_mXY;
+
+        return static_cast<I>(Xnext);
+    }
+
+    ///
+    /// Computes next iterate of sequence Yn.
+    ///
+    /// Facts:
+    /// - Yn has period m_m31_mXY = 2^31 - 2;
+    /// - next_Y(0) == 0;
+    /// - if 0 < Y' < 2^31 - 1, next_Y(Y') != 0,
+    ///   thus, for all integers k, next_Y(range_check_Y(k)) != 0.
+    ///
+    /// \param Y Input: iterate n-1 of Yn, output: next_Y(Y).
+    ///
+    /// \return Next iterate of Yn, 0 <= Ynext < m_m31_mXY = 2^31 - 1.
+    ///
+    template <typename K = I,
+              typename = typename std::enable_if<std::is_same<std::decay_t<K>, I>::value>::type>
+    __device__ __host__ static auto next_Y(K&& Y) -> I
+    {
+        std::int32_t Ynext = static_cast<std::int32_t>(Y), sgnY;
+
+        constexpr std::int32_t ay = 48271;
+        constexpr std::int32_t qy = 44488; // qy = floor(m_m31_mXY/ay)
+        constexpr std::int32_t ry = 3399; // ry = m_m31_mXY % ay
+
+        // Ynext = (ay * Y) % m_m31_mXY;
+        Ynext = ay * (Ynext % qy) - ry * static_cast<std::int32_t>(Ynext / qy);
+        sgnY = Ynext < 0 ? -1 : 1;
+        Ynext += std::max(-sgnY, 0) * m_m31_mXY;
+
+        Y = static_cast<I>(Ynext);
+        return Y;
+    }
+
+    ///
+    /// Computes next iterate of sequence Zn.
+    ///
+    /// Facts:
+    /// - Zn has period m_Z = 2^31 - 250;
+    /// - next_Z(0) == 0;
+    /// - if 0 < Z' < 2^31 - 249, next_Z(Z') != 0,
+    ///   thus, for all integers k, next_Z(range_check_Z(k)) != 0.
+    ///
+    /// \param Z Input: iterate n-1 of Zn, output: next_Z(Z).
+    ///
+    /// \return Next iterate of Zn, 0 <= Znext < m_Z = 2^31 - 249.
+    ///
+    template <typename K = I,
+              typename = typename std::enable_if<std::is_same<std::decay_t<K>, I>::value>::type>
+    __device__ __host__ static auto next_Z(K&& Z) -> I
+    {
+        std::int32_t Znext = static_cast<std::int32_t>(Z), sgnZ;
+
+        constexpr std::int32_t az = 40692;
+        constexpr std::int32_t qz = 52774; // qz = floor(m_mZ/az)
+        constexpr std::int32_t rz = 3791; // rz = m_mZ % az
+
+        // Znext = (az * Z) % m_mZ;
+        Znext = az * (Znext % qz) - rz * static_cast<std::int32_t>(Znext / qz);
+        sgnZ = Znext < 0 ? -1 : 1;
+        Znext += std::max(-sgnZ, 0) * m_mZ;
+
+        Z = static_cast<I>(Znext);
+        return Z;
+    }
+};
+} /// namespace detail
+
+/// \class rocSOLVER wrapper class for integer, pseudo-random, number generation
+///
+/// \brief Produces pseudo-random integer values distributed on the interval
+/// [0, rocsolver_int_prng<T>::max()].  If T is at least 32-bit wide,
+/// then generated values are uniformly distributed.
+///
+/// \tparam T - type of generated values, usually an integral type.
+///
+/// \sa detail::pseudorandom_number_generator for more details.
+///
+template <class T = std::int32_t>
+class rocsolver_int_prng
+{
+public:
+    using rocsolver_prng_impl_t = typename detail::pseudorandom_number_generator<T>;
+
+    /// Constructor.
+    __device__ __host__ rocsolver_int_prng(T Y0, T Z0 = 0)
+    {
+        m_Y = rocsolver_prng_impl_t::range_check_Y(static_cast<std::int32_t>(Y0));
+        if(Z0 == 0)
+        {
+            m_Z = m_Y;
+        }
+        else
+        {
+            m_Z = rocsolver_prng_impl_t::range_check_Z(static_cast<std::int32_t>(Z0));
+        }
+    }
+
+    /// Default destructor.
+    __device__ __host__ ~rocsolver_int_prng() = default;
+
+    /// Returns the smallest possible value that can be generated.
+    static constexpr __device__ __host__ auto min() -> T
+    {
+        return static_cast<T>(0);
+    }
+
+    /// Returns the largest possible value that can be generated.
+    static constexpr __device__ __host__ auto max() -> T
+    {
+        return static_cast<T>(rocsolver_prng_impl_t::m_m31_mXY - 1);
+    }
+
+    /// Generates a new pseudo-random integer.
+    __device__ __host__ auto operator()(void) -> T
+    {
+        auto output = rocsolver_prng_impl_t::next_X(m_Y, m_Z);
+        return static_cast<T>(output);
+    }
+
+private:
+    std::int32_t m_Y{}, m_Z{};
+};
+
+ROCSOLVER_END_NAMESPACE