-
Notifications
You must be signed in to change notification settings - Fork 53
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add GEMM device function #880
Open
AGonzales-amd
wants to merge
8
commits into
ROCm:develop
Choose a base branch
from
AGonzales-amd:mfma-gemm-internal
base: develop
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+1,402
−204
Open
Changes from 3 commits
Commits
Show all changes
8 commits
Select commit
Hold shift + click to select a range
a4baff0
add mfma internal gemm kernel and device functions
AGonzales-amd 64d9825
temp change
AGonzales-amd 262fb78
revert temp change
AGonzales-amd 24e7364
documentation and formatting
AGonzales-amd eefef23
fixes
AGonzales-amd e937d3f
rocsolver_gemm test
AGonzales-amd de84520
Merge branch 'develop' into mfma-gemm-internal
AGonzales-amd 288ac4d
windows ci
AGonzales-amd File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
258 changes: 258 additions & 0 deletions
258
library/src/specialized/roclapack_gemm_device_functions.hpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,258 @@ | ||
/* ************************************************************************** | ||
* Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. | ||
* | ||
* Redistribution and use in source and binary forms, with or without | ||
* modification, are permitted provided that the following conditions | ||
* are met: | ||
* | ||
* 1. Redistributions of source code must retain the above copyright | ||
* notice, this list of conditions and the following disclaimer. | ||
* | ||
* 2. Redistributions in binary form must reproduce the above copyright | ||
* notice, this list of conditions and the following disclaimer in the | ||
* documentation and/or other materials provided with the distribution. | ||
* | ||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND | ||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | ||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
* SUCH DAMAGE. | ||
* *************************************************************************/ | ||
|
||
#pragma once | ||
|
||
#include "rocblas.hpp" | ||
#include "rocsolver/rocsolver.h" | ||
|
||
#if defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) | ||
#define ROCSOLVER_MFMA_ENABLED 1 | ||
#else | ||
#define ROCSOLVER_MFMA_ENABLED 0 | ||
#endif // defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) | ||
|
||
ROCSOLVER_BEGIN_NAMESPACE | ||
|
||
#if ROCSOLVER_MFMA_ENABLED | ||
|
||
template <typename T> | ||
struct mfma_16x16x4_base | ||
{ | ||
using RegT = T; | ||
using AccT = __attribute__( (__vector_size__(4 * sizeof(T)) )) T; | ||
}; | ||
|
||
template <typename T> | ||
struct mfma_16x16x4; | ||
|
||
// float specialization | ||
template <> | ||
struct mfma_16x16x4<float>: public mfma_16x16x4_base<float> | ||
{ | ||
__device__ inline auto operator()(const RegT& a, const RegT& b, const AccT& c) const | ||
{ | ||
return __builtin_amdgcn_mfma_f32_16x16x4f32(a, b, c, 0, 0, 0); | ||
} | ||
}; | ||
|
||
// double specialization | ||
template <> | ||
struct mfma_16x16x4<double>: public mfma_16x16x4_base<double> | ||
{ | ||
__device__ inline auto operator()(const RegT& a, const RegT& b, const AccT& c) const | ||
{ | ||
return __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, c, 0, 0, 0); | ||
} | ||
}; | ||
|
||
// complex specialization | ||
template <typename T> | ||
struct mfma_16x16x4 | ||
{ | ||
using RegT = T; | ||
using AccT = std::array<T, 4>; | ||
|
||
using S = decltype(std::real(T{})); | ||
using RegS = typename mfma_16x16x4_base<S>::RegT; | ||
using AccS = typename mfma_16x16x4_base<S>::AccT; | ||
|
||
__device__ inline auto operator()(const RegT& a, const RegT& b, const AccT& c) const | ||
{ | ||
RegS ar = a.real(); | ||
RegS ai = a.imag(); | ||
RegS br = b.real(); | ||
RegS bi = b.imag(); | ||
AccS cr = {c[0].real(), c[1].real(), c[2].real(), c[3].real()}; | ||
AccS ci = {c[0].imag(), c[1].imag(), c[2].imag(), c[3].imag()}; | ||
AccS zero = {0}; | ||
|
||
const auto mfma_S = mfma_16x16x4<S>(); | ||
|
||
// real x real | ||
auto arbr = mfma_S(ar, br, zero); | ||
|
||
// real x imag | ||
auto arbi = mfma_S(ar, bi, zero); | ||
|
||
// imag x real | ||
auto aibr = mfma_S(ai, br, zero); | ||
|
||
// imag x imag | ||
auto aibi = mfma_S(ai, bi, zero); | ||
|
||
// cr += r x r - i x i | ||
cr += arbr - aibi; | ||
// ci += r x i + i x r | ||
ci += arbi + aibr; | ||
|
||
return AccT{rocblas_complex_num<S>(cr[0], ci[0]), | ||
rocblas_complex_num<S>(cr[1], ci[1]), | ||
rocblas_complex_num<S>(cr[2], ci[2]), | ||
rocblas_complex_num<S>(cr[3], ci[3])}; | ||
} | ||
}; | ||
|
||
template <typename T, typename I, std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, rocblas_float_complex>, int> = 0> | ||
__device__ inline I get_c_col(I li, I lj, I gpri, I inc_C, I ldc) | ||
{ | ||
return lj; | ||
} | ||
|
||
template <typename T, typename I, std::enable_if_t<std::is_same_v<T, double> || std::is_same_v<T, rocblas_double_complex>, int> = 0> | ||
__device__ inline I get_c_col(I li, I lj, I gpri, I inc_C, I ldc) | ||
{ | ||
return lj; | ||
} | ||
|
||
template <typename T, typename I, std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, rocblas_float_complex>, int> = 0> | ||
__device__ inline I get_c_row(I li, I lj, I gpri, I inc_C, I ldc) | ||
{ | ||
return gpri + li * 4; | ||
} | ||
|
||
template <typename T, typename I, std::enable_if_t<std::is_same_v<T, double> || std::is_same_v<T, rocblas_double_complex>, int> = 0> | ||
__device__ inline I get_c_row(I li, I lj, I gpri, I inc_C, I ldc) | ||
{ | ||
return gpri * 4 + li; | ||
} | ||
|
||
/** GEMM device function to compute C = alpha * A * B + beta * C. | ||
|
||
Where C is an m x n matrix, A is an m x p matrix, and B is an | ||
p x n matrix. This is a wave function, every lane of the wave | ||
must perform call this function. | ||
- m: 0 < m <= 16 | ||
- n: 0 < n <= 16 | ||
- p: 0 < p | ||
**/ | ||
// Run with warpSize sized block | ||
template <typename T, typename I> | ||
__device__ void gemm_16x16xp(rocblas_operation transA, | ||
rocblas_operation transB, | ||
I m, | ||
I n, | ||
I p, | ||
T alpha, | ||
const T *A, | ||
I inc_A, | ||
I lda, | ||
const T *B, | ||
I inc_B, | ||
I ldb, | ||
T beta, | ||
T *C, | ||
I inc_C, | ||
I ldc) | ||
{ | ||
using T4 = typename mfma_16x16x4<T>::AccT; | ||
|
||
const I lid = threadIdx.x % warpSize; | ||
|
||
const I cmajor_i_16x4 = lid % 16; | ||
const I cmajor_j_16x4 = lid / 16; | ||
|
||
const I cmajor_i_4x16 = lid % 4; | ||
const I cmajor_j_4x16 = lid / 4; | ||
|
||
const I rmajor_i_4x16 = cmajor_j_16x4; | ||
const I rmajor_j_4x16 = cmajor_i_16x4; | ||
|
||
// addresses to transpose B from col-major to row-major | ||
// and transpose C from row-major to col-major | ||
const auto c2r_src = rmajor_j_4x16 * 4 + rmajor_i_4x16; | ||
const auto r2c_src = cmajor_i_4x16 * 16 + cmajor_j_4x16; | ||
|
||
T4 dmn = {0}; | ||
|
||
for(I kb = 0; kb < p; kb += 4) | ||
{ | ||
// read A and B in col-major | ||
T amk = 0; | ||
T bkn = 0; | ||
|
||
// load A | ||
if(transA == rocblas_operation_none) | ||
{ | ||
// read col major 16x4 A | ||
if(cmajor_i_16x4 < m && (kb + cmajor_j_16x4) < p) | ||
amk = A[(kb + cmajor_j_16x4) * lda + cmajor_i_16x4 * inc_A]; | ||
} | ||
else | ||
{ | ||
// read col major 4x16 op(A) | ||
if(cmajor_j_4x16 < m && (kb + cmajor_i_4x16) < p) | ||
amk = A[cmajor_j_4x16 * lda + (kb + cmajor_i_4x16) * inc_A]; | ||
|
||
// transpose op(A) to 16x4 | ||
amk = shfl(amk, c2r_src); | ||
} | ||
|
||
// load B | ||
if(transB == rocblas_operation_none) | ||
{ | ||
// read col major 4x16 B | ||
if(cmajor_j_4x16 < n && (kb + cmajor_i_4x16) < p) | ||
bkn = B[cmajor_j_4x16 * ldb + (kb + cmajor_i_4x16) * inc_B]; | ||
|
||
// transpose B to row major | ||
bkn = shfl(bkn, c2r_src); | ||
} | ||
else | ||
{ | ||
// read col major 16x4 op(B) | ||
if(cmajor_i_16x4 < n && (kb + cmajor_j_16x4) < p) | ||
bkn = B[(kb + cmajor_j_16x4) * ldb + cmajor_i_16x4 * inc_B]; | ||
} | ||
|
||
if constexpr (rocblas_is_complex<T>) | ||
if(transA == rocblas_operation_conjugate_transpose) | ||
amk = conj(amk); | ||
if(transB == rocblas_operation_conjugate_transpose) | ||
bkn = conj(bkn); | ||
|
||
dmn = mfma_16x16x4<T>()(amk, bkn, dmn); | ||
} | ||
|
||
#pragma unroll | ||
for (I i = 0; i < 4; ++i) | ||
{ | ||
const I c_col = get_c_col<T>(cmajor_i_4x16, cmajor_j_4x16, i, inc_C, ldc); | ||
const I c_row = get_c_row<T>(cmajor_i_4x16, cmajor_j_4x16, i, inc_C, ldc); | ||
const I idx = (c_col * ldc) + (c_row * inc_C); | ||
|
||
// transpose C to col major | ||
dmn[i] = shfl(dmn[i], r2c_src); | ||
|
||
if(c_col < n && c_row < m) | ||
C[idx] = alpha * dmn[i] + beta * C[idx]; | ||
} | ||
} | ||
|
||
#endif // ROCSOLVER_MFMA_ENABLED | ||
|
||
ROCSOLVER_END_NAMESPACE |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is the inc_A, inc_B, ... etc the same as "stride" like strideA? If not, perhaps some description would be helpful. Just a suggestion.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added function documentation, thanks for the suggestion