NVIDIA · bernhardmgruber · Feb 17, 2025
@@ -212,6 +212,10 @@ struct bfloat16_t
   }
 };
 
+#ifdef __GNUC__
+#  pragma GCC diagnostic pop
+#endif
+
 /******************************************************************************
  * I/O stream overloads
  ******************************************************************************/
@@ -230,18 +234,17 @@ inline std::ostream& operator<<(std::ostream& out, const __nv_bfloat16& x)
 }
 
 /******************************************************************************
- * Traits overloads
+ * traits and limits
  ******************************************************************************/
 
 _LIBCUDACXX_BEGIN_NAMESPACE_STD
 template <>
 struct __is_extended_floating_point<bfloat16_t> : true_type
 {};
-
-#ifndef _CCCL_NO_VARIABLE_TEMPLATES
+#ifndef _CCCL_NO_INLINE_VARIABLES
 template <>
 _CCCL_INLINE_VAR constexpr bool __is_extended_floating_point_v<bfloat16_t> = true;
-#endif // _CCCL_NO_VARIABLE_TEMPLATES
+#endif // _CCCL_NO_INLINE_VARIABLES
 
 template <>
 class numeric_limits<bfloat16_t>
@@ -266,13 +269,13 @@ public:
 };
 _LIBCUDACXX_END_NAMESPACE_STD
 
-_CCCL_SUPPRESS_DEPRECATED_PUSH
 template <>
-struct CUB_NS_QUALIFIER::NumericTraits<bfloat16_t>
-    : CUB_NS_QUALIFIER::BaseTraits<FLOATING_POINT, unsigned short, bfloat16_t>
-{};
-_CCCL_SUPPRESS_DEPRECATED_POP
+struct CUB_NS_QUALIFIER::detail::unsigned_bits<bfloat16_t, void>
+{
+  using type = unsigned short;
+};
 
-#ifdef __GNUC__
-#  pragma GCC diagnostic pop
-#endif
+// template <>
+// struct CUB_NS_QUALIFIER::detail::NumericTraits<bfloat16_t>
+//     : CUB_NS_QUALIFIER::detail::BaseTraits<FLOATING_POINT, unsigned short, bfloat16_t>
+// {};
@@ -307,6 +307,10 @@ struct half_t
   }
 };
 
+#ifdef __GNUC__
+#  pragma GCC diagnostic pop
+#endif
+
 /******************************************************************************
  * I/O stream overloads
  ******************************************************************************/
@@ -325,18 +329,17 @@ inline std::ostream& operator<<(std::ostream& out, const __half& x)
 }
 
 /******************************************************************************
- * Traits overloads
+ * traits and limits
  ******************************************************************************/
 
 _LIBCUDACXX_BEGIN_NAMESPACE_STD
 template <>
 struct __is_extended_floating_point<half_t> : true_type
 {};
-
-#ifndef _CCCL_NO_VARIABLE_TEMPLATES
+#ifndef _CCCL_NO_INLINE_VARIABLES
 template <>
 _CCCL_INLINE_VAR constexpr bool __is_extended_floating_point_v<half_t> = true;
-#endif // _CCCL_NO_VARIABLE_TEMPLATES
+#endif // _CCCL_NO_INLINE_VARIABLES
 
 template <>
 class numeric_limits<half_t>
@@ -361,12 +364,13 @@ public:
 };
 _LIBCUDACXX_END_NAMESPACE_STD
 
-_CCCL_SUPPRESS_DEPRECATED_PUSH
 template <>
-struct CUB_NS_QUALIFIER::NumericTraits<half_t> : CUB_NS_QUALIFIER::BaseTraits<FLOATING_POINT, unsigned short, half_t>
-{};
-_CCCL_SUPPRESS_DEPRECATED_POP
+struct CUB_NS_QUALIFIER::detail::unsigned_bits<half_t, void>
+{
+  using type = unsigned short;
+};
 
-#ifdef __GNUC__
-#  pragma GCC diagnostic pop
-#endif
+// template <>
+// struct CUB_NS_QUALIFIER::detail::NumericTraits<half_t>
+//     : CUB_NS_QUALIFIER::detail::BaseTraits<FLOATING_POINT, unsigned short, half_t>
+// {};
@@ -289,7 +289,7 @@ C2H_VEC_OVERLOAD(ulonglong, unsigned long long)
 C2H_VEC_OVERLOAD(float, float)
 C2H_VEC_OVERLOAD(double, double)
 
-// Specialize cub::NumericTraits and cuda::std::numeric_limits for vector types.
+// Specialize cuda::std::numeric_limits for vector types.
 
 #  define REPEAT_TO_LIST_1(a)  a
 #  define REPEAT_TO_LIST_2(a)  a, a
@@ -298,23 +298,6 @@ C2H_VEC_OVERLOAD(double, double)
 #  define REPEAT_TO_LIST(N, a) _CCCL_PP_CAT(REPEAT_TO_LIST_, N)(a)
 
 #  define C2H_VEC_TRAITS_OVERLOAD_IMPL(T, BaseT, N)                               \
-    CUB_NAMESPACE_BEGIN                                                           \
-    template <>                                                                   \
-    struct NumericTraits<T>                                                       \
-    {                                                                             \
-      static __host__ __device__ T Max()                                          \
-      {                                                                           \
-        T retval = {REPEAT_TO_LIST(N, NumericTraits<BaseT>::Max())};              \
-        return retval;                                                            \
-      }                                                                           \
-      static __host__ __device__ T Lowest()                                       \
-      {                                                                           \
-        T retval = {REPEAT_TO_LIST(N, NumericTraits<BaseT>::Lowest())};           \
-        return retval;                                                            \
-      }                                                                           \
-    };                                                                            \
-    CUB_NAMESPACE_END                                                             \
-                                                                                  \
     _LIBCUDACXX_BEGIN_NAMESPACE_STD                                               \
     template <>                                                                   \
     class numeric_limits<T>                                                       \

@@ -57,7 +57,9 @@ struct policy_hub_t
     // Type used for the final result
     using output_tuple_t = cub::KeyValuePair<global_offset_t, T>;
 
-    auto const init = ::cuda::std::is_same_v<OpT, cub::ArgMin> ? cub::Traits<T>::Max() : cub::Traits<T>::Lowest();
+    auto const init = ::cuda::std::is_same_v<OpT, cub::ArgMin>
+                      ? ::cuda::std::numeric_limits<T>::max()
+                      : ::cuda::std::numeric_limits<T>::lowest();
 
 #if !TUNE_BASE
     using policy_t   = policy_hub_t<output_tuple_t, per_partition_offset_t>;

@@ -228,7 +228,7 @@ struct AgentReduceByKey
   // Whether or not the scan operation has a zero-valued identity value (true
   // if we're performing addition on a primitive type)
   static constexpr int HAS_IDENTITY_ZERO =
-    (::cuda::std::is_same_v<ReductionOpT, ::cuda::std::plus<>>) && (is_primitive<AccumT>::value);
+    (::cuda::std::is_same_v<ReductionOpT, ::cuda::std::plus<>>) && is_primitive<AccumT>::value;
 
   // Cache-modified Input iterator wrapper type (for applying cache modifier)
   // for keys Wrap the native input pointer with

@@ -161,7 +161,7 @@ class AgentSubWarpSort
 
   _CCCL_DEVICE static bool get_oob_default(::cuda::std::true_type /* is bool */)
   {
-    // Traits<KeyT>::MAX_KEY for `bool` is 0xFF which is different from `true` and makes
+    // key_traits<KeyT>::max_key for `bool` is 0xFF which is different from `true` and makes
     // comparison with oob unreliable.
     return !IS_DESCENDING;
   }

@@ -53,7 +53,7 @@
 #include <cuda/std/__algorithm/min.h>
 #include <cuda/std/cstdint>
 #include <cuda/std/tuple>
-#include <cuda/std/type_traits>
+#include <cuda/type_traits>
 #include <cuda/type_traits>
 
 CUB_NAMESPACE_BEGIN
@@ -77,8 +77,7 @@ CUB_NAMESPACE_BEGIN
 template <typename KeyT, bool IsFP = ::cuda::is_floating_point_v<KeyT>>
 struct BaseDigitExtractor
 {
-  using TraitsT      = Traits<KeyT>;
-  using UnsignedBits = typename TraitsT::UnsignedBits;
+  using UnsignedBits = typename key_traits<KeyT>::unsigned_bits;
 
   static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits ProcessFloatMinusZero(UnsignedBits key)
   {
@@ -89,14 +88,13 @@ struct BaseDigitExtractor
 template <typename KeyT>
 struct BaseDigitExtractor<KeyT, true>
 {
-  using TraitsT      = Traits<KeyT>;
-  using UnsignedBits = typename TraitsT::UnsignedBits;
+  using UnsignedBits = typename key_traits<KeyT>::unsigned_bits;
 
   static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits ProcessFloatMinusZero(UnsignedBits key)
   {
     UnsignedBits TWIDDLED_MINUS_ZERO_BITS =
-      TraitsT::TwiddleIn(UnsignedBits(1) << UnsignedBits(8 * sizeof(UnsignedBits) - 1));
-    UnsignedBits TWIDDLED_ZERO_BITS = TraitsT::TwiddleIn(0);
+      key_traits<KeyT>::twiddle_in(UnsignedBits(1) << UnsignedBits(8 * sizeof(UnsignedBits) - 1));
+    UnsignedBits TWIDDLED_ZERO_BITS = key_traits<KeyT>::twiddle_in(0);
     return key == TWIDDLED_MINUS_ZERO_BITS ? TWIDDLED_ZERO_BITS : key;
   }
 };
@@ -209,7 +207,7 @@ struct is_fundamental_type
 };
 
 template <class T>
-struct is_fundamental_type<T, ::cuda::std::void_t<typename Traits<T>::UnsignedBits>>
+struct is_fundamental_type<T, ::cuda::std::void_t<typename key_traits<T>::unsigned_bits>>
 {
   static constexpr bool value = true;
 };
@@ -233,23 +231,23 @@ using decomposer_check_t = is_tuple_of_references_to_fundamental_types_t<invoke_
 template <class T>
 struct bit_ordered_conversion_policy_t
 {
-  using bit_ordered_type = typename Traits<T>::UnsignedBits;
+  using bit_ordered_type = typename key_traits<T>::unsigned_bits;
 
   static _CCCL_HOST_DEVICE bit_ordered_type to_bit_ordered(detail::identity_decomposer_t, bit_ordered_type val)
   {
-    return Traits<T>::TwiddleIn(val);
+    return key_traits<T>::twiddle_in(val);
   }
 
   static _CCCL_HOST_DEVICE bit_ordered_type from_bit_ordered(detail::identity_decomposer_t, bit_ordered_type val)
   {
-    return Traits<T>::TwiddleOut(val);
+    return key_traits<T>::twiddle_out(val);
   }
 };
 
 template <class T>
 struct bit_ordered_inversion_policy_t
 {
-  using bit_ordered_type = typename Traits<T>::UnsignedBits;
+  using bit_ordered_type = typename key_traits<T>::unsigned_bits;
 
   static _CCCL_HOST_DEVICE bit_ordered_type inverse(detail::identity_decomposer_t, bit_ordered_type val)
   {
@@ -260,7 +258,7 @@ struct bit_ordered_inversion_policy_t
 template <class T, bool = is_fundamental_type<T>::value>
 struct traits_t
 {
-  using bit_ordered_type              = typename Traits<T>::UnsignedBits;
+  using bit_ordered_type              = typename key_traits<T>::unsigned_bits;
   using bit_ordered_conversion_policy = bit_ordered_conversion_policy_t<T>;
   using bit_ordered_inversion_policy  = bit_ordered_inversion_policy_t<T>;
 
@@ -269,12 +267,20 @@ struct traits_t
 
   static _CCCL_HOST_DEVICE bit_ordered_type min_raw_binary_key(detail::identity_decomposer_t)
   {
-    return Traits<T>::LOWEST_KEY;
+    // TODO(bgruber): sanity check, remove eventually
+    _CCCL_SUPPRESS_DEPRECATED_PUSH
+    static_assert(key_traits<T>::lowest_key == Traits<T>::LOWEST_KEY, "");
+    _CCCL_SUPPRESS_DEPRECATED_POP
+    return key_traits<T>::lowest_key;
   }
 
   static _CCCL_HOST_DEVICE bit_ordered_type max_raw_binary_key(detail::identity_decomposer_t)
   {
-    return Traits<T>::MAX_KEY;
+    // TODO(bgruber): sanity check, remove eventually
+    _CCCL_SUPPRESS_DEPRECATED_PUSH
+    static_assert(key_traits<T>::max_key == Traits<T>::MAX_KEY, "");
+    _CCCL_SUPPRESS_DEPRECATED_POP
+    return key_traits<T>::max_key;
   }
 
   static _CCCL_HOST_DEVICE int default_end_bit(detail::identity_decomposer_t)

@@ -51,6 +51,8 @@
 
 #include <thrust/iterator/tabulate_output_iterator.h>
 
+#include <cuda/std/limits>
+
 #include <iterator>
 
 CUB_NAMESPACE_BEGIN
@@ -334,7 +336,7 @@ struct DeviceReduce
   //! @rst
   //! Computes a device-wide minimum using the less-than (``<``) operator.
   //!
-  //! - Uses ``std::numeric_limits<T>::max()`` as the initial value of the reduction.
+  //! - Uses ``::cuda::std::numeric_limits<T>::max()`` as the initial value of the reduction.
   //! - Does not support ``<`` operators that are non-commutative.
   //! - Provides "run-to-run" determinism for pseudo-associative reduction
   //!   (e.g., addition of floating point types) on the same GPU device.
@@ -433,8 +435,7 @@ struct DeviceReduce
       d_out,
       static_cast<OffsetT>(num_items),
       ::cuda::minimum<>{},
-      // TODO(bgruber): replace with ::cuda::std::numeric_limits<T>::max() (breaking change)
-      Traits<InitT>::Max(),
+      ::cuda::std::numeric_limits<InitT>::max(),
       stream);
   }
 
@@ -583,7 +584,7 @@ struct DeviceReduce
   //!   (assuming the value type of ``d_in`` is ``T``)
   //!
   //!   - The minimum is written to ``d_out.value`` and its offset in the input array is written to ``d_out.key``.
-  //!   - The ``{1, std::numeric_limits<T>::max()}`` tuple is produced for zero-length inputs
+  //!   - The ``{1, ::cuda::std::numeric_limits<T>::max()}`` tuple is produced for zero-length inputs
   //!
   //! - Does not support ``<`` operators that are non-commutative.
   //! - Provides "run-to-run" determinism for pseudo-associative reduction
@@ -690,8 +691,7 @@ struct DeviceReduce
     ArgIndexInputIteratorT d_indexed_in(d_in);
 
     // Initial value
-    // TODO Address https://github.com/NVIDIA/cub/issues/651
-    InitT initial_value{AccumT(1, Traits<InputValueT>::Max())};
+    InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::max())};
 
     return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMin, InitT, AccumT>::Dispatch(
       d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMin(), initial_value, stream);
@@ -700,7 +700,7 @@ struct DeviceReduce
   //! @rst
   //! Computes a device-wide maximum using the greater-than (``>``) operator.
   //!
-  //! - Uses ``std::numeric_limits<T>::lowest()`` as the initial value of the reduction.
+  //! - Uses ``::cuda::std::numeric_limits<T>::lowest()`` as the initial value of the reduction.
   //! - Does not support ``>`` operators that are non-commutative.
   //! - Provides "run-to-run" determinism for pseudo-associative reduction
   //!   (e.g., addition of floating point types) on the same GPU device.
@@ -796,8 +796,7 @@ struct DeviceReduce
       d_out,
       static_cast<OffsetT>(num_items),
       ::cuda::maximum<>{},
-      // TODO(bgruber): replace with ::cuda::std::numeric_limits<T>::lowest() (breaking change)
-      Traits<InitT>::Lowest(),
+      ::cuda::std::numeric_limits<InitT>::lowest(),
       stream);
   }
 
@@ -948,7 +947,7 @@ struct DeviceReduce
   //!
   //!   - The maximum is written to ``d_out.value`` and its offset in the input
   //!     array is written to ``d_out.key``.
-  //!   - The ``{1, std::numeric_limits<T>::lowest()}`` tuple is produced for zero-length inputs
+  //!   - The ``{1, ::cuda::std::numeric_limits<T>::lowest()}`` tuple is produced for zero-length inputs
   //!
   //! - Does not support ``>`` operators that are non-commutative.
   //! - Provides "run-to-run" determinism for pseudo-associative reduction
@@ -1057,9 +1056,7 @@ struct DeviceReduce
     ArgIndexInputIteratorT d_indexed_in(d_in);
 
     // Initial value
-    // TODO Address https://github.com/NVIDIA/cub/issues/651
-    // TODO(bgruber): replace with ::cuda::std::numeric_limits<T>::lowest() (breaking change)
-    InitT initial_value{AccumT(1, Traits<InputValueT>::Lowest())};
+    InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::lowest())};
 
     return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMax, InitT, AccumT>::Dispatch(
       d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMax(), initial_value, stream);