Skip to content

Commit

Permalink
unified final reduction kernel for asum, dot and nrm2 (#2850)
Browse files Browse the repository at this point in the history
* have unified part-2 reduction kernel

* remove include headers

* Add include reduction.hpp
  • Loading branch information
NaveenElumalaiAMD authored Jan 28, 2025
1 parent ac2f87a commit c7c0ac4
Show file tree
Hide file tree
Showing 5 changed files with 72 additions and 98 deletions.
34 changes: 0 additions & 34 deletions library/src/blas1/rocblas_asum_nrm2_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,40 +112,6 @@ rocblas_reduction_kernel_part1(rocblas_int n,
#endif
}

// kernel 2 is used from non-strided reduction_batched see include file
// kernel 2 gathers all the partial results in workspace and finishes the final reduction;
// number of threads (NB) loop blocks
template <int NB, int WIN, typename FINALIZE, typename To, typename Tr>
ROCBLAS_KERNEL(NB)
rocblas_reduction_kernel_part2(rocblas_int nblocks, To* workspace, Tr* result)
{
To sum = 0;

size_t offset = size_t(blockIdx.x) * nblocks;
workspace += offset;

int inc = NB * WIN;

int i = threadIdx.x * WIN;
int remainder = nblocks & (WIN - 1);
int end = nblocks - remainder;
for(; i < end; i += inc) // cover all sums as 1 block
{
for(int j = 0; j < WIN; j++)
sum += workspace[i + j];
}
if(threadIdx.x < remainder)
{
sum += workspace[nblocks - 1 - threadIdx.x];
}

sum = rocblas_dot_block_reduce<NB, To>(sum);

// Store result on device or in workspace
if(threadIdx.x == 0)
result[blockIdx.x] = Tr(FINALIZE{}(sum));
}

/*! \brief
\details
Expand Down
3 changes: 2 additions & 1 deletion library/src/blas1/rocblas_dot.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/* ************************************************************************
* Copyright (C) 2016-2023 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2016-2025 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
Expand All @@ -23,6 +23,7 @@
#pragma once

#include "handle.hpp"
#include "reduction.hpp"
#include "rocblas.h"
#include "rocblas_reduction.hpp"

Expand Down
63 changes: 18 additions & 45 deletions library/src/blas1/rocblas_dot_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -305,35 +305,6 @@ rocblas_dot_kernel_magsq(rocblas_int n,
#endif
}

template <int NB, int WIN, typename V, typename T = V>
ROCBLAS_KERNEL(NB)
rocblas_dot_kernel_reduce(int n_sums, V* __restrict__ in, T* __restrict__ out)
{
V sum = 0;

size_t offset = size_t(blockIdx.x) * n_sums;
in += offset;

int inc = NB * WIN;

int i = threadIdx.x * WIN;
int remainder = n_sums % WIN;
int end = n_sums - remainder;
for(; i < end; i += inc) // cover all sums as 1 block
{
for(int j = 0; j < WIN; j++)
sum += in[i + j];
}
if(threadIdx.x < remainder)
{
sum += in[n_sums - 1 - threadIdx.x];
}

sum = rocblas_dot_block_reduce<NB>(sum);
if(threadIdx.x == 0)
out[blockIdx.x] = T(sum);
}

template <typename API_INT, int NB_X, int NB_Y, bool CONJ, typename V, typename T, typename U>
ROCBLAS_KERNEL(NB_X* NB_Y)
rocblas_dot_batched_4_kernel(rocblas_int n,
Expand Down Expand Up @@ -637,14 +608,15 @@ rocblas_status rocblas_internal_dot_launcher(rocblas_handle __restrict__ handle,
workspace,
output);

ROCBLAS_LAUNCH_KERNEL((rocblas_dot_kernel_reduce<DOT_NB, DOT_NELEM>),
dim3(batch_count),
threads,
0,
handle->get_stream(),
blocks,
workspace,
output);
ROCBLAS_LAUNCH_KERNEL(
(rocblas_reduction_kernel_part2<DOT_NB, DOT_NELEM, rocblas_finalize_identity>),
dim3(batch_count),
threads,
0,
handle->get_stream(),
blocks,
workspace,
output);

if(handle->pointer_mode == rocblas_pointer_mode_host)
{
Expand Down Expand Up @@ -733,14 +705,15 @@ rocblas_status rocblas_internal_dot_launcher(rocblas_handle __restrict__ handle,
}

if(blocks > 1) // if single block first kernel did all work
ROCBLAS_LAUNCH_KERNEL((rocblas_dot_kernel_reduce<NB, WIN>),
dim3(batch_count),
threads,
0,
handle->get_stream(),
blocks,
workspace,
output);
ROCBLAS_LAUNCH_KERNEL(
(rocblas_reduction_kernel_part2<NB, WIN, rocblas_finalize_identity>),
dim3(batch_count),
threads,
0,
handle->get_stream(),
blocks,
workspace,
output);

if(handle->pointer_mode == rocblas_pointer_mode_host)
{
Expand Down
34 changes: 33 additions & 1 deletion library/src/blas1/rocblas_reduction.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/* ************************************************************************
* Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -249,6 +249,38 @@ size_t rocblas_reduction_workspace_non_chunked_size(API_INT n, API_INT batch_cou
return sizeof(To) * (blocks + 1) * batch_count;
}

/*! \brief rocblas_reduction_kernel_part2
gathers all the partial results in workspace and finishes the final reduction.
********************************************************************/
template <int NB, int WIN, typename FINALIZE, typename V, typename T = V>
ROCBLAS_KERNEL(NB)
rocblas_reduction_kernel_part2(int n_sums, V* __restrict__ in, T* __restrict__ out)
{
V sum = 0;

size_t offset = size_t(blockIdx.x) * n_sums;
in += offset;

int inc = NB * WIN;

int i = threadIdx.x * WIN;
int remainder = n_sums % WIN;
int end = n_sums - remainder;
for(; i < end; i += inc) // cover all sums as 1 block
{
for(int j = 0; j < WIN; j++)
sum += in[i + j];
}
if(threadIdx.x < remainder)
{
sum += in[n_sums - 1 - threadIdx.x];
}

sum = rocblas_dot_block_reduce<NB>(sum);
if(threadIdx.x == 0)
out[blockIdx.x] = T(FINALIZE{}(sum));
}

/*! \brief rocblas_reduction_batched_kernel_workspace_size
Work area for reductions where full reduction to single value occurs
Additional passes add to workspace requirement for ILP64 subdivisions but size limited to chunks
Expand Down
36 changes: 19 additions & 17 deletions library/src/src64/blas1/rocblas_dot_kernels_64.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/* ************************************************************************
* Copyright (C) 2016-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2016-2025 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -137,14 +137,15 @@ rocblas_status rocblas_internal_dot_launcher_64(rocblas_handle __restrict__ hand
// reduce n partitions within batch chunk

// sum partial_results to results always needed if only to down convert
ROCBLAS_LAUNCH_KERNEL((rocblas_dot_kernel_reduce<NB, WIN>),
dim3(batch_count),
dim3(NB),
0,
handle->get_stream(),
n_passes,
partial_results,
output);
ROCBLAS_LAUNCH_KERNEL(
(rocblas_reduction_kernel_part2<NB, WIN, rocblas_finalize_identity>),
dim3(batch_count),
dim3(NB),
0,
handle->get_stream(),
n_passes,
partial_results,
output);

if(handle->pointer_mode == rocblas_pointer_mode_host)
{
Expand Down Expand Up @@ -230,14 +231,15 @@ rocblas_status rocblas_internal_dot_launcher_64(rocblas_handle __restrict__ hand

// reduce n partitions within batch chunk
// sum partial_results to results always needed as may down convert
ROCBLAS_LAUNCH_KERNEL((rocblas_dot_kernel_reduce<NB, WIN>),
dim3(batch_count),
dim3(NB),
0,
handle->get_stream(),
n_passes,
partial_results,
output);
ROCBLAS_LAUNCH_KERNEL(
(rocblas_reduction_kernel_part2<NB, WIN, rocblas_finalize_identity>),
dim3(batch_count),
dim3(NB),
0,
handle->get_stream(),
n_passes,
partial_results,
output);

if(handle->pointer_mode == rocblas_pointer_mode_host)
{
Expand Down

0 comments on commit c7c0ac4

Please sign in to comment.