additional replacements
fbusato committed Feb 11, 2025
1 parent b016219 commit 251a174
Showing 20 changed files with 439 additions and 443 deletions.
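
The replacements are mechanical: every use of the _CCCL_IF_CONSTEXPR compatibility macro becomes a plain if constexpr, which is valid now that C++17 is the library baseline. A minimal before/after sketch of the pattern — the pre-C++17 fallback shown here is an assumption; the doxygen predefinition removed below only confirms the C++17 expansion:

// Sketch only, not the actual CCCL definition.
// Assumption: pre-C++17 builds expanded the macro to a plain `if`.
#if __cplusplus >= 201703L
#  define _CCCL_IF_CONSTEXPR if constexpr
#else
#  define _CCCL_IF_CONSTEXPR if
#endif

// Before: portable spelling via the macro
template <int N>
int width_before()
{
  _CCCL_IF_CONSTEXPR (N == 4) { return 32; }
  else { return 64; }
}

// After: C++17 is the baseline, so the keyword is spelled out directly
template <int N>
int width_after()
{
  if constexpr (N == 4) { return 32; }
  else { return 64; }
}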
3 changes: 0 additions & 3 deletions .clang-format
@@ -87,9 +87,6 @@ ContinuationIndentWidth: 2
EmptyLineAfterAccessModifier: Never
EmptyLineBeforeAccessModifier: Always
FixNamespaceComments: true
- IfMacros: [
-   '_CCCL_IF_CONSTEXPR'
- ]
IndentWrappedFunctionNames: false
IncludeBlocks: Regroup
IncludeCategories:
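
Dropping the IfMacros entry is the matching formatter change: clang-format only needs that hint to lay out macro invocations the way it lays out if statements, and the macro no longer exists. An illustrative snippet (cond and do_a are placeholder names):

// With the IfMacros hint, clang-format formatted the macro like an if:
_CCCL_IF_CONSTEXPR (cond)
{
  do_a();
}

// The keyword form needs no special casing:
if constexpr (cond)
{
  do_a();
}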
1 change: 0 additions & 1 deletion docs/repo.toml
@@ -422,7 +422,6 @@ doxygen_predefined = [
"_CCCL_HIDE_FROM_ABI=",
"_CCCL_HOST=",
"_CCCL_HOST_DEVICE=",
"_CCCL_IF_CONSTEXPR=if constexpr",
"_CCCL_INLINE_VAR=inline",
"_CCCL_NODISCARD=[[nodiscard]]",
"_CCCL_NODISCARD_FRIEND=",
@@ -113,15 +113,15 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
// __space == space_global (due to parameter type constraint)
static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
- _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1)
+ if constexpr (__cta_group == cta_group_1)
{
asm("cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%0], [%1, {%2}], "
"[%3];"
:
: "r"(__as_ptr_smem(__dstMem)), "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__smem_bar))
: "memory");
}
- else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2)
+ else if constexpr (__cta_group == cta_group_2)
{
asm("cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%0], [%1, {%2}], "
"[%3];"
@@ -290,7 +290,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
// __space == space_global (due to parameter type constraint)
static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
- _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1)
+ if constexpr (__cta_group == cta_group_1)
{
asm("cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%0], [%1, {%2, "
"%3}], [%4];"
@@ -302,7 +302,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
"r"(__as_ptr_smem(__smem_bar))
: "memory");
}
- else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2)
+ else if constexpr (__cta_group == cta_group_2)
{
asm("cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%0], [%1, {%2, "
"%3}], [%4];"
@@ -477,7 +477,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
// __space == space_global (due to parameter type constraint)
static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
- _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1)
+ if constexpr (__cta_group == cta_group_1)
{
asm("cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%0], [%1, {%2, %3, "
"%4}], [%5];"
@@ -490,7 +490,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
"r"(__as_ptr_smem(__smem_bar))
: "memory");
}
- else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2)
+ else if constexpr (__cta_group == cta_group_2)
{
asm("cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%0], [%1, {%2, %3, "
"%4}], [%5];"
@@ -673,7 +673,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
// __space == space_global (due to parameter type constraint)
static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
- _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1)
+ if constexpr (__cta_group == cta_group_1)
{
asm("cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%0], [%1, {%2, %3, "
"%4, %5}], [%6];"
@@ -687,7 +687,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
"r"(__as_ptr_smem(__smem_bar))
: "memory");
}
- else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2)
+ else if constexpr (__cta_group == cta_group_2)
{
asm("cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%0], [%1, {%2, %3, "
"%4, %5}], [%6];"
@@ -875,7 +875,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
// __space == space_global (due to parameter type constraint)
static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
- _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1)
+ if constexpr (__cta_group == cta_group_1)
{
asm("cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%0], [%1, {%2, %3, "
"%4, %5, %6}], [%7];"
@@ -890,7 +890,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
"r"(__as_ptr_smem(__smem_bar))
: "memory");
}
- else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2)
+ else if constexpr (__cta_group == cta_group_2)
{
asm("cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%0], [%1, {%2, %3, "
"%4, %5, %6}], [%7];"
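
Each of the wrappers above follows the same shape: a compile-time cta_group parameter selects between the .cta_group::1 and .cta_group::2 PTX variants, so exactly one asm block survives instantiation. A condensed, self-contained sketch of the 1D case — the wrapper name and raw-address parameters are simplifications (the real generated functions live in cuda::ptx and convert pointers with __as_ptr_smem), and the instruction assumes an SM100-class target:

#include <cstdint>

enum class cta_group { one, two };

template <cta_group Group>
__device__ inline void bulk_tensor_1d_sketch(
  uint32_t dst_smem,       // shared-memory address of the destination
  const void* tensor_map,  // global tensor-map descriptor
  int32_t coord0,          // tensor coordinate
  uint32_t smem_bar)       // shared-memory address of the mbarrier
{
  static_assert(Group == cta_group::one || Group == cta_group::two, "");
  if constexpr (Group == cta_group::one)
  {
    asm("cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 "
        "[%0], [%1, {%2}], [%3];"
        :
        : "r"(dst_smem), "l"(tensor_map), "r"(coord0), "r"(smem_bar)
        : "memory");
  }
  else // Group == cta_group::two
  {
    asm("cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 "
        "[%0], [%1, {%2}], [%3];"
        :
        : "r"(dst_smem), "l"(tensor_map), "r"(coord0), "r"(smem_bar)
        : "memory");
  }
}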
@@ -82,7 +82,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor_tile_gather4(
// __space == space_global (due to parameter type constraint)
static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
- _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1)
+ if constexpr (__cta_group == cta_group_1)
{
asm("cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::1 [%0], [%1, "
"{%2, %3, %4, %5, %6}], [%7];"
@@ -97,7 +97,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor_tile_gather4(
"r"(__as_ptr_smem(__smem_bar))
: "memory");
}
- else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2)
+ else if constexpr (__cta_group == cta_group_2)
{
asm("cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::2 [%0], [%1, "
"{%2, %3, %4, %5, %6}], [%7];"
@@ -203,7 +203,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor_tile_gather4(
// __space == space_global (due to parameter type constraint)
static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
- _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1)
+ if constexpr (__cta_group == cta_group_1)
{
asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster."
"cta_group::1 [%0], [%1, {%2, %3, %4, %5, %6}], [%7], %8;"
Expand All @@ -219,7 +219,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor_tile_gather4(
"h"(__ctaMask)
: "memory");
}
- else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2)
+ else if constexpr (__cta_group == cta_group_2)
{
asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster."
"cta_group::2 [%0], [%1, {%2, %3, %4, %5, %6}], [%7], %8;"
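
The multicast variants in this file and the next differ from the plain copies in exactly one operand: a 16-bit CTA mask, bound with the "h" constraint, that selects which CTAs of the cluster receive the data (%8 in the 2D gather4 form above). A condensed sketch of that operand binding, with the same caveats as before (hypothetical wrapper name, raw addresses instead of __as_ptr_smem, SM100-class target assumed):

#include <cstdint>

__device__ inline void gather4_multicast_sketch(
  uint32_t dst_smem, const void* tensor_map,
  int32_t c0, int32_t c1, int32_t c2, int32_t c3, int32_t c4,
  uint32_t smem_bar,
  uint16_t cta_mask)  // one bit per CTA in the cluster
{
  asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes."
      "multicast::cluster.cta_group::1 [%0], [%1, {%2, %3, %4, %5, %6}], [%7], %8;"
      :
      : "r"(dst_smem), "l"(tensor_map), "r"(c0), "r"(c1), "r"(c2), "r"(c3), "r"(c4),
        "r"(smem_bar), "h"(cta_mask)
      : "memory");
}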
@@ -84,7 +84,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
// __space == space_global (due to parameter type constraint)
static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
- _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1)
+ if constexpr (__cta_group == cta_group_1)
{
asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:"
":1 [%0], [%1, {%2}], [%3], %4;"
@@ -96,7 +96,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
"h"(__ctaMask)
: "memory");
}
- else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2)
+ else if constexpr (__cta_group == cta_group_2)
{
asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:"
":2 [%0], [%1, {%2}], [%3], %4;"
@@ -197,7 +197,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
// __space == space_global (due to parameter type constraint)
static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
- _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1)
+ if constexpr (__cta_group == cta_group_1)
{
asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:"
":1 [%0], [%1, {%2, %3}], [%4], %5;"
@@ -210,7 +210,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
"h"(__ctaMask)
: "memory");
}
- else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2)
+ else if constexpr (__cta_group == cta_group_2)
{
asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:"
":2 [%0], [%1, {%2, %3}], [%4], %5;"
@@ -313,7 +313,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
// __space == space_global (due to parameter type constraint)
static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
- _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1)
+ if constexpr (__cta_group == cta_group_1)
{
asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:"
":1 [%0], [%1, {%2, %3, %4}], [%5], %6;"
@@ -327,7 +327,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
"h"(__ctaMask)
: "memory");
}
- else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2)
+ else if constexpr (__cta_group == cta_group_2)
{
asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:"
":2 [%0], [%1, {%2, %3, %4}], [%5], %6;"
@@ -432,7 +432,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
// __space == space_global (due to parameter type constraint)
static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
- _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1)
+ if constexpr (__cta_group == cta_group_1)
{
asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:"
":1 [%0], [%1, {%2, %3, %4, %5}], [%6], %7;"
@@ -447,7 +447,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
"h"(__ctaMask)
: "memory");
}
- else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2)
+ else if constexpr (__cta_group == cta_group_2)
{
asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:"
":2 [%0], [%1, {%2, %3, %4, %5}], [%6], %7;"
@@ -554,7 +554,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
// __space == space_global (due to parameter type constraint)
static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
- _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1)
+ if constexpr (__cta_group == cta_group_1)
{
asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:"
":1 [%0], [%1, {%2, %3, %4, %5, %6}], [%7], %8;"
@@ -570,7 +570,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
"h"(__ctaMask)
: "memory");
}
- else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2)
+ else if constexpr (__cta_group == cta_group_2)
{
asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:"
":2 [%0], [%1, {%2, %3, %4, %5, %6}], [%7], %8;"
@@ -665,14 +665,14 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk(
static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, "");
// __op == op_and_op (due to parameter type constraint)
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
- _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4)
+ if constexpr (sizeof(_Type) == 4)
{
asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b32 [%0], [%1], %2; // 3."
:
: "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size)
: "memory");
}
- else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8)
+ else if constexpr (sizeof(_Type) == 8)
{
asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b64 [%0], [%1], %2; // 3."
:
@@ -712,14 +712,14 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk(
static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, "");
// __op == op_or_op (due to parameter type constraint)
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
- _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4)
+ if constexpr (sizeof(_Type) == 4)
{
asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b32 [%0], [%1], %2; // 3."
:
: "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size)
: "memory");
}
- else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8)
+ else if constexpr (sizeof(_Type) == 8)
{
asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b64 [%0], [%1], %2; // 3."
:
@@ -759,14 +759,14 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk(
static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, "");
// __op == op_xor_op (due to parameter type constraint)
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
- _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4)
+ if constexpr (sizeof(_Type) == 4)
{
asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b32 [%0], [%1], %2; // 3."
:
: "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size)
: "memory");
}
- else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8)
+ else if constexpr (sizeof(_Type) == 8)
{
asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b64 [%0], [%1], %2; // 3."
:
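
The cp_reduce_async_bulk hunks apply the same treatment where the discriminator is the element width instead of the CTA group: sizeof(_Type) == 4 selects the .b32 instruction and sizeof(_Type) == 8 the .b64 one. A condensed sketch of the and.b32/and.b64 pair (hypothetical wrapper name; the real code converts pointers with __as_ptr_gmem/__as_ptr_smem and guards for sm_90 or later):

#include <cstdint>

template <typename T>
__device__ inline void reduce_and_sketch(
  uint64_t dst_gmem,  // global-memory address of the destination
  uint32_t src_smem,  // shared-memory address of the source
  uint32_t size)      // size in bytes
{
  static_assert(sizeof(T) == 4 || sizeof(T) == 8, "");
  if constexpr (sizeof(T) == 4)
  {
    asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b32 [%0], [%1], %2;"
        :
        : "l"(dst_gmem), "r"(src_smem), "r"(size)
        : "memory");
  }
  else // sizeof(T) == 8
  {
    asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b64 [%0], [%1], %2;"
        :
        : "l"(dst_gmem), "r"(src_smem), "r"(size)
        : "memory");
  }
}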