unified final reduction kernel for asum, dot and nrm2 (#2850)

* have unified part-2 reduction kernel * remove include headers * Add include reduction.hpp
ROCm · Jan 28, 2025 · c7c0ac4 · c7c0ac4
1 parent ac2f87a
commit c7c0ac4
Show file tree

Hide file tree

Showing 5 changed files with 72 additions and 98 deletions.
diff --git a/library/src/blas1/rocblas_asum_nrm2_kernels.hpp b/library/src/blas1/rocblas_asum_nrm2_kernels.hpp
@@ -112,40 +112,6 @@ rocblas_reduction_kernel_part1(rocblas_int    n,
 #endif
 }
 
-// kernel 2 is used from non-strided reduction_batched see include file
-// kernel 2 gathers all the partial results in workspace and finishes the final reduction;
-// number of threads (NB) loop blocks
-template <int NB, int WIN, typename FINALIZE, typename To, typename Tr>
-ROCBLAS_KERNEL(NB)
-rocblas_reduction_kernel_part2(rocblas_int nblocks, To* workspace, Tr* result)
-{
-    To sum = 0;
-
-    size_t offset = size_t(blockIdx.x) * nblocks;
-    workspace += offset;
-
-    int inc = NB * WIN;
-
-    int i         = threadIdx.x * WIN;
-    int remainder = nblocks & (WIN - 1);
-    int end       = nblocks - remainder;
-    for(; i < end; i += inc) // cover all sums as 1 block
-    {
-        for(int j = 0; j < WIN; j++)
-            sum += workspace[i + j];
-    }
-    if(threadIdx.x < remainder)
-    {
-        sum += workspace[nblocks - 1 - threadIdx.x];
-    }
-
-    sum = rocblas_dot_block_reduce<NB, To>(sum);
-
-    // Store result on device or in workspace
-    if(threadIdx.x == 0)
-        result[blockIdx.x] = Tr(FINALIZE{}(sum));
-}
-
 /*! \brief
 
     \details

diff --git a/library/src/blas1/rocblas_dot.hpp b/library/src/blas1/rocblas_dot.hpp
@@ -1,5 +1,5 @@
 /* ************************************************************************
- * Copyright (C) 2016-2023 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (C) 2016-2025 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -23,6 +23,7 @@
 #pragma once
 
 #include "handle.hpp"
+#include "reduction.hpp"
 #include "rocblas.h"
 #include "rocblas_reduction.hpp"
 

diff --git a/library/src/blas1/rocblas_dot_kernels.hpp b/library/src/blas1/rocblas_dot_kernels.hpp
@@ -305,35 +305,6 @@ rocblas_dot_kernel_magsq(rocblas_int n,
 #endif
 }
 
-template <int NB, int WIN, typename V, typename T = V>
-ROCBLAS_KERNEL(NB)
-rocblas_dot_kernel_reduce(int n_sums, V* __restrict__ in, T* __restrict__ out)
-{
-    V sum = 0;
-
-    size_t offset = size_t(blockIdx.x) * n_sums;
-    in += offset;
-
-    int inc = NB * WIN;
-
-    int i         = threadIdx.x * WIN;
-    int remainder = n_sums % WIN;
-    int end       = n_sums - remainder;
-    for(; i < end; i += inc) // cover all sums as 1 block
-    {
-        for(int j = 0; j < WIN; j++)
-            sum += in[i + j];
-    }
-    if(threadIdx.x < remainder)
-    {
-        sum += in[n_sums - 1 - threadIdx.x];
-    }
-
-    sum = rocblas_dot_block_reduce<NB>(sum);
-    if(threadIdx.x == 0)
-        out[blockIdx.x] = T(sum);
-}
-
 template <typename API_INT, int NB_X, int NB_Y, bool CONJ, typename V, typename T, typename U>
 ROCBLAS_KERNEL(NB_X* NB_Y)
 rocblas_dot_batched_4_kernel(rocblas_int n,
@@ -637,14 +608,15 @@ rocblas_status rocblas_internal_dot_launcher(rocblas_handle __restrict__ handle,
                               workspace,
                               output);
 
-        ROCBLAS_LAUNCH_KERNEL((rocblas_dot_kernel_reduce<DOT_NB, DOT_NELEM>),
-                              dim3(batch_count),
-                              threads,
-                              0,
-                              handle->get_stream(),
-                              blocks,
-                              workspace,
-                              output);
+        ROCBLAS_LAUNCH_KERNEL(
+            (rocblas_reduction_kernel_part2<DOT_NB, DOT_NELEM, rocblas_finalize_identity>),
+            dim3(batch_count),
+            threads,
+            0,
+            handle->get_stream(),
+            blocks,
+            workspace,
+            output);
 
         if(handle->pointer_mode == rocblas_pointer_mode_host)
         {
@@ -733,14 +705,15 @@ rocblas_status rocblas_internal_dot_launcher(rocblas_handle __restrict__ handle,
         }
 
         if(blocks > 1) // if single block first kernel did all work
-            ROCBLAS_LAUNCH_KERNEL((rocblas_dot_kernel_reduce<NB, WIN>),
-                                  dim3(batch_count),
-                                  threads,
-                                  0,
-                                  handle->get_stream(),
-                                  blocks,
-                                  workspace,
-                                  output);
+            ROCBLAS_LAUNCH_KERNEL(
+                (rocblas_reduction_kernel_part2<NB, WIN, rocblas_finalize_identity>),
+                dim3(batch_count),
+                threads,
+                0,
+                handle->get_stream(),
+                blocks,
+                workspace,
+                output);
 
         if(handle->pointer_mode == rocblas_pointer_mode_host)
         {

diff --git a/library/src/blas1/rocblas_reduction.hpp b/library/src/blas1/rocblas_reduction.hpp
@@ -1,5 +1,5 @@
 /* ************************************************************************
- * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (C) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -249,6 +249,38 @@ size_t rocblas_reduction_workspace_non_chunked_size(API_INT n, API_INT batch_cou
     return sizeof(To) * (blocks + 1) * batch_count;
 }
 
+/*! \brief rocblas_reduction_kernel_part2
+    gathers all the partial results in workspace and finishes the final reduction.
+    ********************************************************************/
+template <int NB, int WIN, typename FINALIZE, typename V, typename T = V>
+ROCBLAS_KERNEL(NB)
+rocblas_reduction_kernel_part2(int n_sums, V* __restrict__ in, T* __restrict__ out)
+{
+    V sum = 0;
+
+    size_t offset = size_t(blockIdx.x) * n_sums;
+    in += offset;
+
+    int inc = NB * WIN;
+
+    int i         = threadIdx.x * WIN;
+    int remainder = n_sums % WIN;
+    int end       = n_sums - remainder;
+    for(; i < end; i += inc) // cover all sums as 1 block
+    {
+        for(int j = 0; j < WIN; j++)
+            sum += in[i + j];
+    }
+    if(threadIdx.x < remainder)
+    {
+        sum += in[n_sums - 1 - threadIdx.x];
+    }
+
+    sum = rocblas_dot_block_reduce<NB>(sum);
+    if(threadIdx.x == 0)
+        out[blockIdx.x] = T(FINALIZE{}(sum));
+}
+
 /*! \brief rocblas_reduction_batched_kernel_workspace_size
     Work area for reductions where full reduction to single value occurs
     Additional passes add to workspace requirement for ILP64 subdivisions but size limited to chunks

diff --git a/library/src/src64/blas1/rocblas_dot_kernels_64.cpp b/library/src/src64/blas1/rocblas_dot_kernels_64.cpp
@@ -1,5 +1,5 @@
 /* ************************************************************************
- * Copyright (C) 2016-2024 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (C) 2016-2025 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -137,14 +137,15 @@ rocblas_status rocblas_internal_dot_launcher_64(rocblas_handle __restrict__ hand
             // reduce n partitions within batch chunk
 
             // sum partial_results to results always needed if only to down convert
-            ROCBLAS_LAUNCH_KERNEL((rocblas_dot_kernel_reduce<NB, WIN>),
-                                  dim3(batch_count),
-                                  dim3(NB),
-                                  0,
-                                  handle->get_stream(),
-                                  n_passes,
-                                  partial_results,
-                                  output);
+            ROCBLAS_LAUNCH_KERNEL(
+                (rocblas_reduction_kernel_part2<NB, WIN, rocblas_finalize_identity>),
+                dim3(batch_count),
+                dim3(NB),
+                0,
+                handle->get_stream(),
+                n_passes,
+                partial_results,
+                output);
 
             if(handle->pointer_mode == rocblas_pointer_mode_host)
             {
@@ -230,14 +231,15 @@ rocblas_status rocblas_internal_dot_launcher_64(rocblas_handle __restrict__ hand
 
                 // reduce n partitions within batch chunk
                 // sum partial_results to results always needed as may down convert
-                ROCBLAS_LAUNCH_KERNEL((rocblas_dot_kernel_reduce<NB, WIN>),
-                                      dim3(batch_count),
-                                      dim3(NB),
-                                      0,
-                                      handle->get_stream(),
-                                      n_passes,
-                                      partial_results,
-                                      output);
+                ROCBLAS_LAUNCH_KERNEL(
+                    (rocblas_reduction_kernel_part2<NB, WIN, rocblas_finalize_identity>),
+                    dim3(batch_count),
+                    dim3(NB),
+                    0,
+                    handle->get_stream(),
+                    n_passes,
+                    partial_results,
+                    output);
 
                 if(handle->pointer_mode == rocblas_pointer_mode_host)
                 {