Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace cub::Traits by numeric_limits and deprecate it #3384

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 15 additions & 12 deletions c2h/include/c2h/bfloat16.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,10 @@ struct bfloat16_t
}
};

#ifdef __GNUC__
# pragma GCC diagnostic pop
#endif

/******************************************************************************
* I/O stream overloads
******************************************************************************/
Expand All @@ -230,18 +234,17 @@ inline std::ostream& operator<<(std::ostream& out, const __nv_bfloat16& x)
}

/******************************************************************************
* Traits overloads
* traits and limits
******************************************************************************/

_LIBCUDACXX_BEGIN_NAMESPACE_STD
template <>
struct __is_extended_floating_point<bfloat16_t> : true_type
{};

#ifndef _CCCL_NO_VARIABLE_TEMPLATES
#ifndef _CCCL_NO_INLINE_VARIABLES
template <>
_CCCL_INLINE_VAR constexpr bool __is_extended_floating_point_v<bfloat16_t> = true;
#endif // _CCCL_NO_VARIABLE_TEMPLATES
#endif // _CCCL_NO_INLINE_VARIABLES

template <>
class numeric_limits<bfloat16_t>
Expand All @@ -266,13 +269,13 @@ public:
};
_LIBCUDACXX_END_NAMESPACE_STD

_CCCL_SUPPRESS_DEPRECATED_PUSH
template <>
struct CUB_NS_QUALIFIER::NumericTraits<bfloat16_t>
: CUB_NS_QUALIFIER::BaseTraits<FLOATING_POINT, unsigned short, bfloat16_t>
{};
_CCCL_SUPPRESS_DEPRECATED_POP
struct CUB_NS_QUALIFIER::detail::unsigned_bits<bfloat16_t, void>
{
using type = unsigned short;
};

#ifdef __GNUC__
# pragma GCC diagnostic pop
#endif
// template <>
// struct CUB_NS_QUALIFIER::detail::NumericTraits<bfloat16_t>
// : CUB_NS_QUALIFIER::detail::BaseTraits<FLOATING_POINT, unsigned short, bfloat16_t>
// {};
26 changes: 15 additions & 11 deletions c2h/include/c2h/half.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,10 @@ struct half_t
}
};

#ifdef __GNUC__
# pragma GCC diagnostic pop
#endif

/******************************************************************************
* I/O stream overloads
******************************************************************************/
Expand All @@ -325,18 +329,17 @@ inline std::ostream& operator<<(std::ostream& out, const __half& x)
}

/******************************************************************************
* Traits overloads
* traits and limits
******************************************************************************/

_LIBCUDACXX_BEGIN_NAMESPACE_STD
template <>
struct __is_extended_floating_point<half_t> : true_type
{};

#ifndef _CCCL_NO_VARIABLE_TEMPLATES
#ifndef _CCCL_NO_INLINE_VARIABLES
template <>
_CCCL_INLINE_VAR constexpr bool __is_extended_floating_point_v<half_t> = true;
#endif // _CCCL_NO_VARIABLE_TEMPLATES
#endif // _CCCL_NO_INLINE_VARIABLES

template <>
class numeric_limits<half_t>
Expand All @@ -361,12 +364,13 @@ public:
};
_LIBCUDACXX_END_NAMESPACE_STD

_CCCL_SUPPRESS_DEPRECATED_PUSH
template <>
struct CUB_NS_QUALIFIER::NumericTraits<half_t> : CUB_NS_QUALIFIER::BaseTraits<FLOATING_POINT, unsigned short, half_t>
{};
_CCCL_SUPPRESS_DEPRECATED_POP
struct CUB_NS_QUALIFIER::detail::unsigned_bits<half_t, void>
{
using type = unsigned short;
};

#ifdef __GNUC__
# pragma GCC diagnostic pop
#endif
// template <>
// struct CUB_NS_QUALIFIER::detail::NumericTraits<half_t>
// : CUB_NS_QUALIFIER::detail::BaseTraits<FLOATING_POINT, unsigned short, half_t>
// {};
19 changes: 1 addition & 18 deletions c2h/include/c2h/test_util_vec.h
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ C2H_VEC_OVERLOAD(ulonglong, unsigned long long)
C2H_VEC_OVERLOAD(float, float)
C2H_VEC_OVERLOAD(double, double)

// Specialize cub::NumericTraits and cuda::std::numeric_limits for vector types.
// Specialize cuda::std::numeric_limits for vector types.

# define REPEAT_TO_LIST_1(a) a
# define REPEAT_TO_LIST_2(a) a, a
Expand All @@ -298,23 +298,6 @@ C2H_VEC_OVERLOAD(double, double)
# define REPEAT_TO_LIST(N, a) _CCCL_PP_CAT(REPEAT_TO_LIST_, N)(a)

# define C2H_VEC_TRAITS_OVERLOAD_IMPL(T, BaseT, N) \
CUB_NAMESPACE_BEGIN \
template <> \
struct NumericTraits<T> \
{ \
static __host__ __device__ T Max() \
{ \
T retval = {REPEAT_TO_LIST(N, NumericTraits<BaseT>::Max())}; \
return retval; \
} \
static __host__ __device__ T Lowest() \
{ \
T retval = {REPEAT_TO_LIST(N, NumericTraits<BaseT>::Lowest())}; \
return retval; \
} \
}; \
CUB_NAMESPACE_END \
\
_LIBCUDACXX_BEGIN_NAMESPACE_STD \
template <> \
class numeric_limits<T> \
Expand Down
4 changes: 3 additions & 1 deletion cub/benchmarks/bench/reduce/arg_extrema.cu
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,9 @@ struct policy_hub_t
// Type used for the final result
using output_tuple_t = cub::KeyValuePair<global_offset_t, T>;

auto const init = ::cuda::std::is_same_v<OpT, cub::ArgMin> ? cub::Traits<T>::Max() : cub::Traits<T>::Lowest();
auto const init = ::cuda::std::is_same_v<OpT, cub::ArgMin>
? ::cuda::std::numeric_limits<T>::max()
: ::cuda::std::numeric_limits<T>::lowest();

#if !TUNE_BASE
using policy_t = policy_hub_t<output_tuple_t, per_partition_offset_t>;
Expand Down
2 changes: 1 addition & 1 deletion cub/cub/agent/agent_reduce_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ struct AgentReduceByKey
// Whether or not the scan operation has a zero-valued identity value (true
// if we're performing addition on a primitive type)
static constexpr int HAS_IDENTITY_ZERO =
(::cuda::std::is_same_v<ReductionOpT, ::cuda::std::plus<>>) && (is_primitive<AccumT>::value);
(::cuda::std::is_same_v<ReductionOpT, ::cuda::std::plus<>>) && is_primitive<AccumT>::value;

// Cache-modified Input iterator wrapper type (for applying cache modifier)
// for keys Wrap the native input pointer with
Expand Down
2 changes: 1 addition & 1 deletion cub/cub/agent/agent_sub_warp_merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ class AgentSubWarpSort

_CCCL_DEVICE static bool get_oob_default(::cuda::std::true_type /* is bool */)
{
// Traits<KeyT>::MAX_KEY for `bool` is 0xFF which is different from `true` and makes
// key_traits<KeyT>::max_key for `bool` is 0xFF which is different from `true` and makes
// comparison with oob unreliable.
return !IS_DESCENDING;
}
Expand Down
36 changes: 21 additions & 15 deletions cub/cub/block/radix_rank_sort_operations.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@
#include <cuda/std/__algorithm/min.h>
#include <cuda/std/cstdint>
#include <cuda/std/tuple>
#include <cuda/std/type_traits>
#include <cuda/type_traits>
#include <cuda/type_traits>

CUB_NAMESPACE_BEGIN
Expand All @@ -77,8 +77,7 @@ CUB_NAMESPACE_BEGIN
template <typename KeyT, bool IsFP = ::cuda::is_floating_point_v<KeyT>>
struct BaseDigitExtractor
{
using TraitsT = Traits<KeyT>;
using UnsignedBits = typename TraitsT::UnsignedBits;
using UnsignedBits = typename key_traits<KeyT>::unsigned_bits;

static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits ProcessFloatMinusZero(UnsignedBits key)
{
Expand All @@ -89,14 +88,13 @@ struct BaseDigitExtractor
template <typename KeyT>
struct BaseDigitExtractor<KeyT, true>
{
using TraitsT = Traits<KeyT>;
using UnsignedBits = typename TraitsT::UnsignedBits;
using UnsignedBits = typename key_traits<KeyT>::unsigned_bits;

static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits ProcessFloatMinusZero(UnsignedBits key)
{
UnsignedBits TWIDDLED_MINUS_ZERO_BITS =
TraitsT::TwiddleIn(UnsignedBits(1) << UnsignedBits(8 * sizeof(UnsignedBits) - 1));
UnsignedBits TWIDDLED_ZERO_BITS = TraitsT::TwiddleIn(0);
key_traits<KeyT>::twiddle_in(UnsignedBits(1) << UnsignedBits(8 * sizeof(UnsignedBits) - 1));
UnsignedBits TWIDDLED_ZERO_BITS = key_traits<KeyT>::twiddle_in(0);
return key == TWIDDLED_MINUS_ZERO_BITS ? TWIDDLED_ZERO_BITS : key;
}
};
Expand Down Expand Up @@ -209,7 +207,7 @@ struct is_fundamental_type
};

template <class T>
struct is_fundamental_type<T, ::cuda::std::void_t<typename Traits<T>::UnsignedBits>>
struct is_fundamental_type<T, ::cuda::std::void_t<typename key_traits<T>::unsigned_bits>>
{
static constexpr bool value = true;
};
Expand All @@ -233,23 +231,23 @@ using decomposer_check_t = is_tuple_of_references_to_fundamental_types_t<invoke_
template <class T>
struct bit_ordered_conversion_policy_t
{
using bit_ordered_type = typename Traits<T>::UnsignedBits;
using bit_ordered_type = typename key_traits<T>::unsigned_bits;

static _CCCL_HOST_DEVICE bit_ordered_type to_bit_ordered(detail::identity_decomposer_t, bit_ordered_type val)
{
return Traits<T>::TwiddleIn(val);
return key_traits<T>::twiddle_in(val);
}

static _CCCL_HOST_DEVICE bit_ordered_type from_bit_ordered(detail::identity_decomposer_t, bit_ordered_type val)
{
return Traits<T>::TwiddleOut(val);
return key_traits<T>::twiddle_out(val);
}
};

template <class T>
struct bit_ordered_inversion_policy_t
{
using bit_ordered_type = typename Traits<T>::UnsignedBits;
using bit_ordered_type = typename key_traits<T>::unsigned_bits;

static _CCCL_HOST_DEVICE bit_ordered_type inverse(detail::identity_decomposer_t, bit_ordered_type val)
{
Expand All @@ -260,7 +258,7 @@ struct bit_ordered_inversion_policy_t
template <class T, bool = is_fundamental_type<T>::value>
struct traits_t
{
using bit_ordered_type = typename Traits<T>::UnsignedBits;
using bit_ordered_type = typename key_traits<T>::unsigned_bits;
using bit_ordered_conversion_policy = bit_ordered_conversion_policy_t<T>;
using bit_ordered_inversion_policy = bit_ordered_inversion_policy_t<T>;

Expand All @@ -269,12 +267,20 @@ struct traits_t

static _CCCL_HOST_DEVICE bit_ordered_type min_raw_binary_key(detail::identity_decomposer_t)
{
return Traits<T>::LOWEST_KEY;
// TODO(bgruber): sanity check, remove eventually
_CCCL_SUPPRESS_DEPRECATED_PUSH
static_assert(key_traits<T>::lowest_key == Traits<T>::LOWEST_KEY, "");
_CCCL_SUPPRESS_DEPRECATED_POP
return key_traits<T>::lowest_key;
}

static _CCCL_HOST_DEVICE bit_ordered_type max_raw_binary_key(detail::identity_decomposer_t)
{
return Traits<T>::MAX_KEY;
// TODO(bgruber): sanity check, remove eventually
_CCCL_SUPPRESS_DEPRECATED_PUSH
static_assert(key_traits<T>::max_key == Traits<T>::MAX_KEY, "");
_CCCL_SUPPRESS_DEPRECATED_POP
return key_traits<T>::max_key;
}

static _CCCL_HOST_DEVICE int default_end_bit(detail::identity_decomposer_t)
Expand Down
23 changes: 10 additions & 13 deletions cub/cub/device/device_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@

#include <thrust/iterator/tabulate_output_iterator.h>

#include <cuda/std/limits>

#include <iterator>

CUB_NAMESPACE_BEGIN
Expand Down Expand Up @@ -334,7 +336,7 @@ struct DeviceReduce
//! @rst
//! Computes a device-wide minimum using the less-than (``<``) operator.
//!
//! - Uses ``std::numeric_limits<T>::max()`` as the initial value of the reduction.
//! - Uses ``::cuda::std::numeric_limits<T>::max()`` as the initial value of the reduction.
//! - Does not support ``<`` operators that are non-commutative.
//! - Provides "run-to-run" determinism for pseudo-associative reduction
//! (e.g., addition of floating point types) on the same GPU device.
Expand Down Expand Up @@ -433,8 +435,7 @@ struct DeviceReduce
d_out,
static_cast<OffsetT>(num_items),
::cuda::minimum<>{},
// TODO(bgruber): replace with ::cuda::std::numeric_limits<T>::max() (breaking change)
Traits<InitT>::Max(),
::cuda::std::numeric_limits<InitT>::max(),
stream);
}

Expand Down Expand Up @@ -583,7 +584,7 @@ struct DeviceReduce
//! (assuming the value type of ``d_in`` is ``T``)
//!
//! - The minimum is written to ``d_out.value`` and its offset in the input array is written to ``d_out.key``.
//! - The ``{1, std::numeric_limits<T>::max()}`` tuple is produced for zero-length inputs
//! - The ``{1, ::cuda::std::numeric_limits<T>::max()}`` tuple is produced for zero-length inputs
//!
//! - Does not support ``<`` operators that are non-commutative.
//! - Provides "run-to-run" determinism for pseudo-associative reduction
Expand Down Expand Up @@ -690,8 +691,7 @@ struct DeviceReduce
ArgIndexInputIteratorT d_indexed_in(d_in);

// Initial value
// TODO Address https://github.com/NVIDIA/cub/issues/651
InitT initial_value{AccumT(1, Traits<InputValueT>::Max())};
InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::max())};

return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMin, InitT, AccumT>::Dispatch(
d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMin(), initial_value, stream);
Expand All @@ -700,7 +700,7 @@ struct DeviceReduce
//! @rst
//! Computes a device-wide maximum using the greater-than (``>``) operator.
//!
//! - Uses ``std::numeric_limits<T>::lowest()`` as the initial value of the reduction.
//! - Uses ``::cuda::std::numeric_limits<T>::lowest()`` as the initial value of the reduction.
//! - Does not support ``>`` operators that are non-commutative.
//! - Provides "run-to-run" determinism for pseudo-associative reduction
//! (e.g., addition of floating point types) on the same GPU device.
Expand Down Expand Up @@ -796,8 +796,7 @@ struct DeviceReduce
d_out,
static_cast<OffsetT>(num_items),
::cuda::maximum<>{},
// TODO(bgruber): replace with ::cuda::std::numeric_limits<T>::lowest() (breaking change)
Traits<InitT>::Lowest(),
::cuda::std::numeric_limits<InitT>::lowest(),
stream);
}

Expand Down Expand Up @@ -948,7 +947,7 @@ struct DeviceReduce
//!
//! - The maximum is written to ``d_out.value`` and its offset in the input
//! array is written to ``d_out.key``.
//! - The ``{1, std::numeric_limits<T>::lowest()}`` tuple is produced for zero-length inputs
//! - The ``{1, ::cuda::std::numeric_limits<T>::lowest()}`` tuple is produced for zero-length inputs
//!
//! - Does not support ``>`` operators that are non-commutative.
//! - Provides "run-to-run" determinism for pseudo-associative reduction
Expand Down Expand Up @@ -1057,9 +1056,7 @@ struct DeviceReduce
ArgIndexInputIteratorT d_indexed_in(d_in);

// Initial value
// TODO Address https://github.com/NVIDIA/cub/issues/651
// TODO(bgruber): replace with ::cuda::std::numeric_limits<T>::lowest() (breaking change)
InitT initial_value{AccumT(1, Traits<InputValueT>::Lowest())};
InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::lowest())};

return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMax, InitT, AccumT>::Dispatch(
d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMax(), initial_value, stream);
Expand Down
Loading
Loading