additional replacements
fbusato committed Feb 11, 2025
1 parent b016219 commit 251a174
Showing 20 changed files with 439 additions and 443 deletions.
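
The replacements are mechanical: every use of the _CCCL_IF_CONSTEXPR compatibility macro becomes a plain if constexpr, which is valid now that C++17 is the library baseline. A minimal before/after sketch of the pattern — the pre-C++17 fallback shown here is an assumption; the doxygen predefinition removed below only confirms the C++17 expansion:

// Sketch only, not the actual CCCL definition.
// Assumption: pre-C++17 builds expanded the macro to a plain `if`.
#if __cplusplus >= 201703L
#  define _CCCL_IF_CONSTEXPR if constexpr
#else
#  define _CCCL_IF_CONSTEXPR if
#endif

// Before: portable spelling via the macro
template <int N>
int width_before()
{
  _CCCL_IF_CONSTEXPR (N == 4) { return 32; }
  else { return 64; }
}

// After: C++17 is the baseline, so the keyword is spelled out directly
template <int N>
int width_after()
{
  if constexpr (N == 4) { return 32; }
  else { return 64; }
}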
3 changes: 0 additions & 3 deletions .clang-format
@@ -87,9 +87,6 @@ ContinuationIndentWidth: 2
EmptyLineAfterAccessModifier: Never
EmptyLineBeforeAccessModifier: Always
FixNamespaceComments: true
- IfMacros: [
-   '_CCCL_IF_CONSTEXPR'
- ]
IndentWrappedFunctionNames: false
IncludeBlocks: Regroup
IncludeCategories:
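
Dropping the IfMacros entry is the matching formatter change: clang-format only needs that hint to lay out macro invocations the way it lays out if statements, and the macro no longer exists. An illustrative snippet (cond and do_a are placeholder names):

// With the IfMacros hint, clang-format formatted the macro like an if:
_CCCL_IF_CONSTEXPR (cond)
{
  do_a();
}

// The keyword form needs no special casing:
if constexpr (cond)
{
  do_a();
}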
1 change: 0 additions & 1 deletion docs/repo.toml
@@ -422,7 +422,6 @@ doxygen_predefined = [
"_CCCL_HIDE_FROM_ABI=",
"_CCCL_HOST=",
"_CCCL_HOST_DEVICE=",
"_CCCL_IF_CONSTEXPR=if constexpr",
"_CCCL_INLINE_VAR=inline",
"_CCCL_NODISCARD=[[nodiscard]]",
"_CCCL_NODISCARD_FRIEND=",
@@ -113,15 +113,15 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
// __space == space_global (due to parameter type constraint)
static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
- _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1)
+ if constexpr (__cta_group == cta_group_1)
{
asm("cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%0], [%1, {%2}], "
"[%3];"
:
: "r"(__as_ptr_smem(__dstMem)), "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__smem_bar))
: "memory");
}
- else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2)
+ else if constexpr (__cta_group == cta_group_2)
{
asm("cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%0], [%1, {%2}], "
"[%3];"
@@ -290,7 +290,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
// __space == space_global (due to parameter type constraint)
static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
- _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1)
+ if constexpr (__cta_group == cta_group_1)
{
asm("cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%0], [%1, {%2, "
"%3}], [%4];"
@@ -302,7 +302,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
"r"(__as_ptr_smem(__smem_bar))
: "memory");
}
- else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2)
+ else if constexpr (__cta_group == cta_group_2)
{
asm("cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%0], [%1, {%2, "
"%3}], [%4];"
@@ -477,7 +477,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
// __space == space_global (due to parameter type constraint)
static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
- _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1)
+ if constexpr (__cta_group == cta_group_1)
{
asm("cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%0], [%1, {%2, %3, "
"%4}], [%5];"
@@ -490,7 +490,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
"r"(__as_ptr_smem(__smem_bar))
: "memory");
}
- else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2)
+ else if constexpr (__cta_group == cta_group_2)
{
asm("cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%0], [%1, {%2, %3, "
"%4}], [%5];"
@@ -673,7 +673,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
// __space == space_global (due to parameter type constraint)
static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
- _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1)
+ if constexpr (__cta_group == cta_group_1)
{
asm("cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%0], [%1, {%2, %3, "
"%4, %5}], [%6];"
@@ -687,7 +687,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
"r"(__as_ptr_smem(__smem_bar))
: "memory");
}
- else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2)
+ else if constexpr (__cta_group == cta_group_2)
{
asm("cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%0], [%1, {%2, %3, "
"%4, %5}], [%6];"
@@ -875,7 +875,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
// __space == space_global (due to parameter type constraint)
static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
- _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1)
+ if constexpr (__cta_group == cta_group_1)
{
asm("cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%0], [%1, {%2, %3, "
"%4, %5, %6}], [%7];"
@@ -890,7 +890,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
"r"(__as_ptr_smem(__smem_bar))
: "memory");
}
- else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2)
+ else if constexpr (__cta_group == cta_group_2)
{
asm("cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%0], [%1, {%2, %3, "
"%4, %5, %6}], [%7];"
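
Each of the wrappers above follows the same shape: a compile-time cta_group parameter selects between the .cta_group::1 and .cta_group::2 PTX variants, so exactly one asm block survives instantiation. A condensed, self-contained sketch of the 1D case — the wrapper name and raw-address parameters are simplifications (the real generated functions live in cuda::ptx and convert pointers with __as_ptr_smem), and the instruction assumes an SM100-class target:

#include <cstdint>

enum class cta_group { one, two };

template <cta_group Group>
__device__ inline void bulk_tensor_1d_sketch(
  uint32_t dst_smem,       // shared-memory address of the destination
  const void* tensor_map,  // global tensor-map descriptor
  int32_t coord0,          // tensor coordinate
  uint32_t smem_bar)       // shared-memory address of the mbarrier
{
  static_assert(Group == cta_group::one || Group == cta_group::two, "");
  if constexpr (Group == cta_group::one)
  {
    asm("cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 "
        "[%0], [%1, {%2}], [%3];"
        :
        : "r"(dst_smem), "l"(tensor_map), "r"(coord0), "r"(smem_bar)
        : "memory");
  }
  else // Group == cta_group::two
  {
    asm("cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 "
        "[%0], [%1, {%2}], [%3];"
        :
        : "r"(dst_smem), "l"(tensor_map), "r"(coord0), "r"(smem_bar)
        : "memory");
  }
}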
@@ -82,7 +82,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor_tile_gather4(
// __space == space_global (due to parameter type constraint)
static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
- _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1)
+ if constexpr (__cta_group == cta_group_1)
{
asm("cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::1 [%0], [%1, "
"{%2, %3, %4, %5, %6}], [%7];"
@@ -97,7 +97,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor_tile_gather4(
"r"(__as_ptr_smem(__smem_bar))
: "memory");
}
- else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2)
+ else if constexpr (__cta_group == cta_group_2)
{
asm("cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::2 [%0], [%1, "
"{%2, %3, %4, %5, %6}], [%7];"
@@ -203,7 +203,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor_tile_gather4(
// __space == space_global (due to parameter type constraint)
static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
- _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1)
+ if constexpr (__cta_group == cta_group_1)
{
asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster."
"cta_group::1 [%0], [%1, {%2, %3, %4, %5, %6}], [%7], %8;"
Expand All @@ -219,7 +219,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor_tile_gather4(
"h"(__ctaMask)
: "memory");
}
- else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2)
+ else if constexpr (__cta_group == cta_group_2)
{
asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster."
"cta_group::2 [%0], [%1, {%2, %3, %4, %5, %6}], [%7], %8;"
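
The multicast variants in this file and the next differ from the plain copies in exactly one operand: a 16-bit CTA mask, bound with the "h" constraint, that selects which CTAs of the cluster receive the data (%8 in the 2D gather4 form above). A condensed sketch of that operand binding, with the same caveats as before (hypothetical wrapper name, raw addresses instead of __as_ptr_smem, SM100-class target assumed):

#include <cstdint>

__device__ inline void gather4_multicast_sketch(
  uint32_t dst_smem, const void* tensor_map,
  int32_t c0, int32_t c1, int32_t c2, int32_t c3, int32_t c4,
  uint32_t smem_bar,
  uint16_t cta_mask)  // one bit per CTA in the cluster
{
  asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes."
      "multicast::cluster.cta_group::1 [%0], [%1, {%2, %3, %4, %5, %6}], [%7], %8;"
      :
      : "r"(dst_smem), "l"(tensor_map), "r"(c0), "r"(c1), "r"(c2), "r"(c3), "r"(c4),
        "r"(smem_bar), "h"(cta_mask)
      : "memory");
}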
@@ -84,7 +84,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
// __space == space_global (due to parameter type constraint)
static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
- _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1)
+ if constexpr (__cta_group == cta_group_1)
{
asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:"
":1 [%0], [%1, {%2}], [%3], %4;"
@@ -96,7 +96,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
"h"(__ctaMask)
: "memory");
}
- else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2)
+ else if constexpr (__cta_group == cta_group_2)
{
asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:"
":2 [%0], [%1, {%2}], [%3], %4;"
@@ -197,7 +197,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
// __space == space_global (due to parameter type constraint)
static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
- _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1)
+ if constexpr (__cta_group == cta_group_1)
{
asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:"
":1 [%0], [%1, {%2, %3}], [%4], %5;"
@@ -210,7 +210,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
"h"(__ctaMask)
: "memory");
}
- else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2)
+ else if constexpr (__cta_group == cta_group_2)
{
asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:"
":2 [%0], [%1, {%2, %3}], [%4], %5;"
@@ -313,7 +313,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
// __space == space_global (due to parameter type constraint)
static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
- _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1)
+ if constexpr (__cta_group == cta_group_1)
{
asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:"
":1 [%0], [%1, {%2, %3, %4}], [%5], %6;"
@@ -327,7 +327,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
"h"(__ctaMask)
: "memory");
}
- else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2)
+ else if constexpr (__cta_group == cta_group_2)
{
asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:"
":2 [%0], [%1, {%2, %3, %4}], [%5], %6;"
@@ -432,7 +432,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
// __space == space_global (due to parameter type constraint)
static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
- _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1)
+ if constexpr (__cta_group == cta_group_1)
{
asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:"
":1 [%0], [%1, {%2, %3, %4, %5}], [%6], %7;"
@@ -447,7 +447,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
"h"(__ctaMask)
: "memory");
}
- else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2)
+ else if constexpr (__cta_group == cta_group_2)
{
asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:"
":2 [%0], [%1, {%2, %3, %4, %5}], [%6], %7;"
@@ -554,7 +554,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
// __space == space_global (due to parameter type constraint)
static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
- _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1)
+ if constexpr (__cta_group == cta_group_1)
{
asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:"
":1 [%0], [%1, {%2, %3, %4, %5, %6}], [%7], %8;"
@@ -570,7 +570,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor(
"h"(__ctaMask)
: "memory");
}
- else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2)
+ else if constexpr (__cta_group == cta_group_2)
{
asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:"
":2 [%0], [%1, {%2, %3, %4, %5, %6}], [%7], %8;"
@@ -665,14 +665,14 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk(
static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, "");
// __op == op_and_op (due to parameter type constraint)
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
- _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4)
+ if constexpr (sizeof(_Type) == 4)
{
asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b32 [%0], [%1], %2; // 3."
:
: "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size)
: "memory");
}
- else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8)
+ else if constexpr (sizeof(_Type) == 8)
{
asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b64 [%0], [%1], %2; // 3."
:
@@ -712,14 +712,14 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk(
static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, "");
// __op == op_or_op (due to parameter type constraint)
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
- _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4)
+ if constexpr (sizeof(_Type) == 4)
{
asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b32 [%0], [%1], %2; // 3."
:
: "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size)
: "memory");
}
- else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8)
+ else if constexpr (sizeof(_Type) == 8)
{
asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b64 [%0], [%1], %2; // 3."
:
@@ -759,14 +759,14 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk(
static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, "");
// __op == op_xor_op (due to parameter type constraint)
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
- _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4)
+ if constexpr (sizeof(_Type) == 4)
{
asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b32 [%0], [%1], %2; // 3."
:
: "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size)
: "memory");
}
- else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8)
+ else if constexpr (sizeof(_Type) == 8)
{
asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b64 [%0], [%1], %2; // 3."
:
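
The cp_reduce_async_bulk hunks apply the same treatment where the discriminator is the element width instead of the CTA group: sizeof(_Type) == 4 selects the .b32 instruction and sizeof(_Type) == 8 the .b64 one. A condensed sketch of the and.b32/and.b64 pair (hypothetical wrapper name; the real code converts pointers with __as_ptr_gmem/__as_ptr_smem and guards for sm_90 or later):

#include <cstdint>

template <typename T>
__device__ inline void reduce_and_sketch(
  uint64_t dst_gmem,  // global-memory address of the destination
  uint32_t src_smem,  // shared-memory address of the source
  uint32_t size)      // size in bytes
{
  static_assert(sizeof(T) == 4 || sizeof(T) == 8, "");
  if constexpr (sizeof(T) == 4)
  {
    asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b32 [%0], [%1], %2;"
        :
        : "l"(dst_gmem), "r"(src_smem), "r"(size)
        : "memory");
  }
  else // sizeof(T) == 8
  {
    asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b64 [%0], [%1], %2;"
        :
        : "l"(dst_gmem), "r"(src_smem), "r"(size)
        : "memory");
  }
}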