Drop deprecated CUB macros #3821

Open · wants to merge 2 commits into base: main

5 changes: 3 additions & 2 deletions c/parallel/src/reduce.cu
@@ -14,6 +14,7 @@
#include <cub/grid/grid_even_share.cuh>
#include <cub/util_device.cuh>

+#include <cuda/std/__algorithm/clamp.h>
#include <cuda/std/cstdint>
#include <cuda/std/functional>
#include <cuda/std/variant>
@@ -100,8 +101,8 @@ reduce_runtime_tuning_policy get_policy(int cc, cccl_type_info accumulator_type)
auto [_, block_size, items_per_thread, vector_load_length] = find_tuning(cc, chain);

// Implement part of MemBoundScaling
-items_per_thread = CUB_MAX(1, CUB_MIN(items_per_thread * 4 / accumulator_type.size, items_per_thread * 2));
-block_size = CUB_MIN(block_size, (((1024 * 48) / (accumulator_type.size * items_per_thread)) + 31) / 32 * 32);
+items_per_thread = cuda::std::clamp(items_per_thread * 4 / accumulator_type.size, 1, items_per_thread * 2);
+block_size = _CUDA_VSTD::min(block_size, (((1024 * 48) / (accumulator_type.size * items_per_thread)) + 31) / 32 * 32);

return {block_size, items_per_thread, vector_load_length};
}
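
Note on the clamp rewrites here and in the benchmark policies below: they rely on the identity max(lo, min(v, hi)) == clamp(v, lo, hi), which holds whenever lo <= hi. A minimal host-side sketch of that equivalence (cuda::std::clamp mirrors std::clamp; the variable names below are illustrative stand-ins for the values in the hunk above):

```cpp
#include <algorithm> // std::min, std::max, std::clamp
#include <cassert>

int main()
{
  // Stand-ins for items_per_thread and accumulator_type.size.
  const int items = 8, size = 16;
  // Old form: CUB_MAX(1, CUB_MIN(a, b)) expands to max(1, min(a, b)).
  const int old_form = std::max(1, std::min(items * 4 / size, items * 2));
  // New form: clamp(a, 1, b) equals max(1, min(a, b)) because 1 <= items * 2.
  const int new_form = std::clamp(items * 4 / size, 1, items * 2);
  assert(old_form == new_form);
  return 0;
}
```
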
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/partition/flagged.cu
@@ -63,7 +63,7 @@ struct policy_hub_t
static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;

static constexpr int ITEMS_PER_THREAD =
-CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
+_CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD);

using SelectIfPolicyT =
cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/partition/if.cu
@@ -63,7 +63,7 @@ struct policy_hub_t
static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;

static constexpr int ITEMS_PER_THREAD =
-CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
+_CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD);

using SelectIfPolicyT =
cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/select/flagged.cu
@@ -61,7 +61,7 @@ struct policy_hub_t
static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;

static constexpr int ITEMS_PER_THREAD =
-CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
+_CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD);

using SelectIfPolicyT =
cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/select/if.cu
@@ -63,7 +63,7 @@ struct policy_hub_t
static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;

static constexpr int ITEMS_PER_THREAD =
-CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
+_CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD);

using SelectIfPolicyT =
cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,
4 changes: 2 additions & 2 deletions cub/benchmarks/bench/select/unique.cu
@@ -36,8 +36,8 @@ struct policy_hub_t
{
static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;

-static constexpr int ITEMS_PER_THREAD =
-CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
+static constexpr int ITEMS_PER_THREAD = _CUDA_VSTD::min(
+NOMINAL_4B_ITEMS_PER_THREAD, _CUDA_VSTD::max(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));

using SelectIfPolicyT =
cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,
13 changes: 7 additions & 6 deletions cub/cub/agent/agent_batch_memcpy.cuh
@@ -52,6 +52,7 @@
#include <cub/util_ptx.cuh>
#include <cub/util_type.cuh>

+#include <cuda/cmath>
#include <cuda/std/type_traits>

#include <cstdint>
@@ -171,9 +172,9 @@ _CCCL_DEVICE _CCCL_FORCEINLINE PointerRange<VectorT>
GetAlignedPtrs(const void* in_begin, void* out_begin, ByteOffsetT num_bytes)
{
// Data type size used for vectorized stores
-constexpr size_t out_datatype_size = sizeof(VectorT);
+constexpr auto out_datatype_size = uint32_t{sizeof(VectorT)};
// Data type size used for type-aliased loads
-constexpr size_t in_datatype_size = sizeof(uint32_t);
+constexpr auto in_datatype_size = uint32_t{sizeof(uint32_t)};

// char-aliased ptrs to simplify pointer arithmetic
char* out_ptr = reinterpret_cast<char*>(out_begin);
@@ -194,8 +195,8 @@ GetAlignedPtrs(const void* in_begin, void* out_begin, ByteOffsetT num_bytes)
uint32_t in_offset_req = in_extra_bytes;

// Bytes after `out_chars_aligned` to the first VectorT-aligned address at or after `out_begin`
-uint32_t out_start_aligned =
-CUB_QUOTIENT_CEILING(in_offset_req + alignment_offset, out_datatype_size) * out_datatype_size;
+uint32_t out_start_aligned = cuda::round_up(in_offset_req + alignment_offset, out_datatype_size);

// Compute the beginning of the aligned ranges (output and input pointers)
VectorT* out_aligned_begin = reinterpret_cast<VectorT*>(out_chars_aligned + out_start_aligned);
@@ -398,7 +398,7 @@ private:
static constexpr uint32_t USED_BITS_PER_UNIT = ITEMS_PER_UNIT * BITS_PER_ITEM;

/// The number of backing data types required to store the given number of items
-static constexpr uint32_t NUM_TOTAL_UNITS = CUB_QUOTIENT_CEILING(NumItems, ITEMS_PER_UNIT);
+static constexpr uint32_t NUM_TOTAL_UNITS = ::cuda::ceil_div(NumItems, ITEMS_PER_UNIT);

/// This is the net number of bit-storage provided by each unit (remainder bits are unused)
static constexpr uint32_t UNIT_MASK =
@@ -805,7 +805,8 @@ private:
if (blev_buffer_offset < num_blev_buffers)
{
BlockBufferOffsetT tile_buffer_id = buffers_by_size_class[blev_buffer_offset].buffer_id;
-block_offset[i] = CUB_QUOTIENT_CEILING(tile_buffer_sizes[tile_buffer_id], BLOCK_LEVEL_TILE_SIZE);
+block_offset[i] =
+::cuda::ceil_div(static_cast<BlockBufferOffsetT>(tile_buffer_sizes[tile_buffer_id]), BLOCK_LEVEL_TILE_SIZE);
}
else
{
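
The two <cuda/cmath> helpers introduced above replace CUB_QUOTIENT_CEILING, which expanded to (a + b - 1) / b. A host-side sketch of their semantics for positive integers (reference semantics only; the real functions are templates in namespace cuda):

```cpp
// Reference semantics, assuming positive integer operands.
constexpr unsigned ceil_div(unsigned a, unsigned b) { return (a + b - 1) / b; } // quotient, rounded up
constexpr unsigned round_up(unsigned a, unsigned b) { return ceil_div(a, b) * b; } // next multiple of b

static_assert(ceil_div(10, 4) == 3, "ceil(10 / 4)");
static_assert(round_up(10, 4) == 12, "smallest multiple of 4 that is >= 10");
// So CUB_QUOTIENT_CEILING(x, y) maps to ceil_div(x, y), and
// CUB_QUOTIENT_CEILING(x, y) * y maps to round_up(x, y), as in out_start_aligned.

int main() { return 0; }
```
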
23 changes: 10 additions & 13 deletions cub/cub/agent/agent_radix_sort_histogram.cuh
@@ -66,7 +66,7 @@ struct AgentRadixSortHistogramPolicy
* ID. However, lanes with the same ID in different warp use the same private
* histogram. This arrangement helps reduce the degree of conflicts in atomic
* operations. */
-NUM_PARTS = CUB_MAX(1, NOMINAL_4B_NUM_PARTS * 4 / CUB_MAX(sizeof(ComputeT), 4)),
+NUM_PARTS = _CUDA_VSTD::max(1, NOMINAL_4B_NUM_PARTS * 4 / _CUDA_VSTD::max(int{sizeof(ComputeT)}, 4)),
RADIX_BITS = _RADIX_BITS,
};
};
@@ -94,16 +94,13 @@ template <typename AgentRadixSortHistogramPolicy,
struct AgentRadixSortHistogram
{
// constants
-enum
-{
-ITEMS_PER_THREAD = AgentRadixSortHistogramPolicy::ITEMS_PER_THREAD,
-BLOCK_THREADS = AgentRadixSortHistogramPolicy::BLOCK_THREADS,
-TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
-RADIX_BITS = AgentRadixSortHistogramPolicy::RADIX_BITS,
-RADIX_DIGITS = 1 << RADIX_BITS,
-MAX_NUM_PASSES = (sizeof(KeyT) * 8 + RADIX_BITS - 1) / RADIX_BITS,
-NUM_PARTS = AgentRadixSortHistogramPolicy::NUM_PARTS,
-};
+static constexpr int ITEMS_PER_THREAD = AgentRadixSortHistogramPolicy::ITEMS_PER_THREAD;
+static constexpr int BLOCK_THREADS = AgentRadixSortHistogramPolicy::BLOCK_THREADS;
+static constexpr int TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD;
+static constexpr int RADIX_BITS = AgentRadixSortHistogramPolicy::RADIX_BITS;
+static constexpr int RADIX_DIGITS = 1 << RADIX_BITS;
+static constexpr int MAX_NUM_PASSES = (sizeof(KeyT) * 8 + RADIX_BITS - 1) / RADIX_BITS;
+static constexpr int NUM_PARTS = AgentRadixSortHistogramPolicy::NUM_PARTS;

using traits = radix::traits_t<KeyT>;
using bit_ordered_type = typename traits::bit_ordered_type;
@@ -210,7 +207,7 @@ struct AgentRadixSortHistogram
#pragma unroll
for (int current_bit = begin_bit, pass = 0; current_bit < end_bit; current_bit += RADIX_BITS, ++pass)
{
-int num_bits = CUB_MIN(RADIX_BITS, end_bit - current_bit);
+int num_bits = _CUDA_VSTD::min(+RADIX_BITS, end_bit - current_bit);
#pragma unroll
for (int u = 0; u < ITEMS_PER_THREAD; ++u)
{
@@ -258,7 +255,7 @@ struct AgentRadixSortHistogram

// Process the tiles.
OffsetT portion_offset = portion * MAX_PORTION_SIZE;
-OffsetT portion_size = CUB_MIN(MAX_PORTION_SIZE, num_items - portion_offset);
+OffsetT portion_size = _CUDA_VSTD::min(MAX_PORTION_SIZE, num_items - portion_offset);
for (OffsetT offset = blockIdx.x * TILE_ITEMS; offset < portion_size; offset += TILE_ITEMS * gridDim.x)
{
OffsetT tile_offset = portion_offset + offset;
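
Two details of this file's changes are worth spelling out: the anonymous enum becomes individually typed static constexpr int constants, and min(+RADIX_BITS, ...) carries a unary plus because _CUDA_VSTD::min takes its arguments by const reference; binding a reference to the constant ODR-uses it, which in C++14 requires an out-of-line definition. A minimal sketch of the idiom with std::min (struct and function names are illustrative):

```cpp
#include <algorithm>

struct Agent
{
  static constexpr int RADIX_BITS = 8; // in C++14, no out-of-line definition is provided

  static int num_bits(int end_bit, int current_bit)
  {
    // '+RADIX_BITS' materializes a temporary int, so std::min's const int&
    // parameters never bind to (and never ODR-use) the static member itself.
    return std::min(+RADIX_BITS, end_bit - current_bit);
  }
};

int main() { return Agent::num_bits(32, 28) == 4 ? 0 : 1; }
```
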
6 changes: 3 additions & 3 deletions cub/cub/agent/agent_radix_sort_upsweep.cuh
@@ -160,17 +160,17 @@ struct AgentRadixSortUpsweep
PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter),
LOG_PACKING_RATIO = Log2<PACKING_RATIO>::VALUE,

-LOG_COUNTER_LANES = CUB_MAX(0, int(RADIX_BITS) - int(LOG_PACKING_RATIO)),
+LOG_COUNTER_LANES = _CUDA_VSTD::max(0, int(RADIX_BITS) - int(LOG_PACKING_RATIO)),
COUNTER_LANES = 1 << LOG_COUNTER_LANES,

// To prevent counter overflow, we must periodically unpack and aggregate the
// digit counters back into registers. Each counter lane is assigned to a
// warp for aggregation.

-LANES_PER_WARP = CUB_MAX(1, (COUNTER_LANES + WARPS - 1) / WARPS),
+LANES_PER_WARP = _CUDA_VSTD::max(1, (COUNTER_LANES + WARPS - 1) / WARPS),

// Unroll tiles in batches without risk of counter overflow
-UNROLL_COUNT = CUB_MIN(64, 255 / KEYS_PER_THREAD),
+UNROLL_COUNT = _CUDA_VSTD::min(64, 255 / KEYS_PER_THREAD),
UNROLLED_ELEMENTS = UNROLL_COUNT * TILE_ITEMS,
};

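
As context for UNROLL_COUNT above, a sketch of the overflow bound it enforces, assuming DigitCounter is an 8-bit unsigned type (maximum value 255) and each thread contributes KEYS_PER_THREAD increments per tile; the KEYS_PER_THREAD value below is illustrative, not taken from a real tuning policy:

```cpp
#include <algorithm>

constexpr int KEYS_PER_THREAD = 17; // illustrative; set by the tuning policy in CUB
constexpr int UNROLL_COUNT    = std::min(64, 255 / KEYS_PER_THREAD);

// After UNROLL_COUNT unrolled tiles, a byte-sized digit counter has absorbed at
// most UNROLL_COUNT * KEYS_PER_THREAD increments, which must not exceed 255.
static_assert(UNROLL_COUNT * KEYS_PER_THREAD <= 255, "8-bit counters cannot overflow");

int main() { return 0; }
```
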
2 changes: 1 addition & 1 deletion cub/cub/agent/agent_reduce.cuh
@@ -159,7 +159,7 @@ struct AgentReduce
static constexpr int BLOCK_THREADS = AgentReducePolicy::BLOCK_THREADS;
static constexpr int ITEMS_PER_THREAD = AgentReducePolicy::ITEMS_PER_THREAD;
static constexpr int TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD;
-static constexpr int VECTOR_LOAD_LENGTH = CUB_MIN(ITEMS_PER_THREAD, AgentReducePolicy::VECTOR_LOAD_LENGTH);
+static constexpr int VECTOR_LOAD_LENGTH = _CUDA_VSTD::min(ITEMS_PER_THREAD, AgentReducePolicy::VECTOR_LOAD_LENGTH);

// Can vectorize according to the policy if the input iterator is a native
// pointer to a primitive type
11 changes: 6 additions & 5 deletions cub/cub/block/block_exchange.cuh
@@ -151,11 +151,12 @@ class BlockExchange
// C++14
static constexpr int LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(0);

-static constexpr int TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD;
-static constexpr int TIME_SLICES = WARP_TIME_SLICING ? WARPS : 1;
-static constexpr int TIME_SLICED_THREADS = WARP_TIME_SLICING ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS;
-static constexpr int TIME_SLICED_ITEMS = TIME_SLICED_THREADS * ITEMS_PER_THREAD;
-static constexpr int WARP_TIME_SLICED_THREADS = CUB_MIN(BLOCK_THREADS, WARP_THREADS);
+static constexpr int TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD;
+static constexpr int TIME_SLICES = WARP_TIME_SLICING ? WARPS : 1;
+static constexpr int TIME_SLICED_THREADS =
+WARP_TIME_SLICING ? _CUDA_VSTD::min(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS;
+static constexpr int TIME_SLICED_ITEMS = TIME_SLICED_THREADS * ITEMS_PER_THREAD;
+static constexpr int WARP_TIME_SLICED_THREADS = _CUDA_VSTD::min(BLOCK_THREADS, WARP_THREADS);
static constexpr int WARP_TIME_SLICED_ITEMS = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD;

// Insert padding to avoid bank conflicts during raking when items per thread is a power of two and > 4 (otherwise
6 changes: 3 additions & 3 deletions cub/cub/block/block_radix_rank.cuh
@@ -242,7 +242,7 @@ private:
LOG_PACKING_RATIO = Log2<PACKING_RATIO>::VALUE,

// Always at least one lane
-LOG_COUNTER_LANES = CUB_MAX((int(RADIX_BITS) - int(LOG_PACKING_RATIO)), 0),
+LOG_COUNTER_LANES = _CUDA_VSTD::max((int(RADIX_BITS) - int(LOG_PACKING_RATIO)), 0),
COUNTER_LANES = 1 << LOG_COUNTER_LANES,

// The number of packed counters per thread (plus one for padding)
@@ -254,7 +254,7 @@ public:
enum
{
/// Number of bin-starting offsets tracked per thread
-BINS_TRACKED_PER_THREAD = CUB_MAX(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS),
+BINS_TRACKED_PER_THREAD = _CUDA_VSTD::max(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS),
};

private:
@@ -587,7 +587,7 @@ public:
enum
{
/// Number of bin-starting offsets tracked per thread
-BINS_TRACKED_PER_THREAD = CUB_MAX(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS),
+BINS_TRACKED_PER_THREAD = _CUDA_VSTD::max(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS),
};

private:
4 changes: 2 additions & 2 deletions cub/cub/block/block_radix_sort.cuh
@@ -431,7 +431,7 @@ private:
// Radix sorting passes
while (true)
{
-int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit);
+int pass_bits = _CUDA_VSTD::min(RADIX_BITS, end_bit - begin_bit);
auto digit_extractor =
traits::template digit_extractor<fundamental_digit_extractor_t>(begin_bit, pass_bits, decomposer);

@@ -510,7 +510,7 @@ public:
// Radix sorting passes
while (true)
{
-int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit);
+int pass_bits = _CUDA_VSTD::min(RADIX_BITS, end_bit - begin_bit);
auto digit_extractor =
traits::template digit_extractor<fundamental_digit_extractor_t>(begin_bit, pass_bits, decomposer);

2 changes: 1 addition & 1 deletion cub/cub/block/block_raking_layout.cuh
@@ -81,7 +81,7 @@ struct BlockRakingLayout
SHARED_ELEMENTS = BLOCK_THREADS,

/// Maximum number of warp-synchronous raking threads
-MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(0)),
+MAX_RAKING_THREADS = _CUDA_VSTD::min(BLOCK_THREADS, CUB_WARP_THREADS(0)),

/// Number of raking elements per warp-synchronous raking thread (rounded up)
SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS,
2 changes: 1 addition & 1 deletion cub/cub/block/block_store.cuh
@@ -175,7 +175,7 @@ StoreDirectBlockedVectorized(int linear_tid, T* block_ptr, T (&items)[ITEMS_PER_
enum
{
// Maximum CUDA vector size is 4 elements
-MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD),
+MAX_VEC_SIZE = _CUDA_VSTD::min(4, ITEMS_PER_THREAD),

// Vector size must be a power of two and an even divisor of the items per thread
VEC_SIZE =
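
VEC_SIZE's initializer is cut off in this view; the constraint stated in the comment is that it be a power of two, at most MAX_VEC_SIZE, that evenly divides the items per thread. A hypothetical constexpr helper with those properties (not the expression block_store.cuh actually uses):

```cpp
// Hypothetical helper, not CUB's actual VEC_SIZE expression: largest power of
// two that is <= max_vec and evenly divides items.
constexpr int pick_vec_size(int items, int max_vec)
{
  int v = 1;
  while (v * 2 <= max_vec && items % (v * 2) == 0)
  {
    v *= 2;
  }
  return v;
}

static_assert(pick_vec_size(8, 4) == 4, "8 items -> 4-wide vectors");
static_assert(pick_vec_size(6, 4) == 2, "4 does not divide 6, fall back to 2");
static_assert(pick_vec_size(5, 4) == 1, "odd counts store scalar");

int main() { return 0; }
```
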
@@ -96,7 +96,7 @@ struct BlockReduceRakingCommutativeOnly
RAKING_THREADS = WARP_THREADS,

/// Number of threads actually sharing items with the raking threads
-SHARING_THREADS = CUB_MAX(1, BLOCK_THREADS - RAKING_THREADS),
+SHARING_THREADS = _CUDA_VSTD::max(1, BLOCK_THREADS - RAKING_THREADS),

/// Number of raking elements per warp synchronous raking thread
SEGMENT_LENGTH = SHARING_THREADS / WARP_THREADS,
@@ -84,7 +84,7 @@ struct BlockReduceWarpReductions
WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,

/// The logical warp size for warp reductions
-LOGICAL_WARP_SIZE = CUB_MIN(BLOCK_THREADS, WARP_THREADS),
+LOGICAL_WARP_SIZE = _CUDA_VSTD::min(BLOCK_THREADS, WARP_THREADS),

/// Whether or not the logical warp size evenly divides the thread block size
EVEN_WARP_MULTIPLE = (BLOCK_THREADS % LOGICAL_WARP_SIZE == 0)
6 changes: 4 additions & 2 deletions cub/cub/device/dispatch/dispatch_histogram.cuh
@@ -362,9 +362,11 @@ struct dispatch_histogram
// Get grid dimensions, trying to keep total blocks ~histogram_sweep_occupancy
int pixels_per_tile = block_threads * pixels_per_thread;
int tiles_per_row = static_cast<int>(::cuda::ceil_div(num_row_pixels, pixels_per_tile));
-int blocks_per_row = CUB_MIN(histogram_sweep_occupancy, tiles_per_row);
+int blocks_per_row = _CUDA_VSTD::min(histogram_sweep_occupancy, tiles_per_row);
int blocks_per_col =
-(blocks_per_row > 0) ? int(CUB_MIN(histogram_sweep_occupancy / blocks_per_row, num_rows)) : 0;
+(blocks_per_row > 0)
+? int(_CUDA_VSTD::min(static_cast<OffsetT>(histogram_sweep_occupancy / blocks_per_row), num_rows))
+: 0;
int num_thread_blocks = blocks_per_row * blocks_per_col;

dim3 sweep_grid_dims;
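
The casts added in this hunk (and int{sizeof(ComputeT)} earlier) exist because CUB_MIN/CUB_MAX were ternary macros, so mixed operand types went through the usual arithmetic conversions, whereas _CUDA_VSTD::min/max deduce a single template parameter and reject mixed types. An illustration with std::min, using int64_t as a stand-in for OffsetT:

```cpp
#include <algorithm>
#include <cstdint>

int main()
{
  int     occupancy = 4;
  int64_t num_rows  = 10; // stand-in for OffsetT, assumed 64-bit here
  // std::min(occupancy, num_rows);  // ill-formed: T deduced as both int and int64_t
  int64_t blocks = std::min(static_cast<int64_t>(occupancy), num_rows); // mirrors the diff's fix
  return blocks == 4 ? 0 : 1;
}
```
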
22 changes: 11 additions & 11 deletions cub/cub/device/dispatch/dispatch_radix_sort.cuh
@@ -275,7 +275,7 @@ struct DispatchRadixSort
cudaError error = cudaSuccess;
do
{
-int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));
+int pass_bits = _CUDA_VSTD::min(pass_config.radix_bits, (end_bit - current_bit));

// Log upsweep_kernel configuration
#ifdef CUB_DEBUG_LOG
@@ -447,7 +447,7 @@ struct DispatchRadixSort
max_downsweep_grid_size = (downsweep_config.sm_occupancy * sm_count) * CUB_SUBSCRIPTION_FACTOR(0);

even_share.DispatchInit(
-num_items, max_downsweep_grid_size, CUB_MAX(downsweep_config.tile_size, upsweep_config.tile_size));
+num_items, max_downsweep_grid_size, _CUDA_VSTD::max(downsweep_config.tile_size, upsweep_config.tile_size));

} while (0);
return error;
@@ -472,8 +472,8 @@
constexpr PortionOffsetT PORTION_SIZE = ((1 << 28) - 1) / ONESWEEP_TILE_ITEMS * ONESWEEP_TILE_ITEMS;
int num_passes = ::cuda::ceil_div(end_bit - begin_bit, RADIX_BITS);
OffsetT num_portions = static_cast<OffsetT>(::cuda::ceil_div(num_items, PORTION_SIZE));
-PortionOffsetT max_num_blocks =
-::cuda::ceil_div(static_cast<int>(CUB_MIN(num_items, static_cast<OffsetT>(PORTION_SIZE))), ONESWEEP_TILE_ITEMS);
+PortionOffsetT max_num_blocks = ::cuda::ceil_div(
+static_cast<int>(_CUDA_VSTD::min(num_items, static_cast<OffsetT>(PORTION_SIZE))), ONESWEEP_TILE_ITEMS);

size_t value_size = KEYS_ONLY ? 0 : sizeof(ValueT);
size_t allocation_sizes[] = {
@@ -611,11 +611,11 @@ struct DispatchRadixSort

for (int current_bit = begin_bit, pass = 0; current_bit < end_bit; current_bit += RADIX_BITS, ++pass)
{
-int num_bits = CUB_MIN(end_bit - current_bit, RADIX_BITS);
+int num_bits = _CUDA_VSTD::min(end_bit - current_bit, RADIX_BITS);
for (OffsetT portion = 0; portion < num_portions; ++portion)
{
PortionOffsetT portion_num_items = static_cast<PortionOffsetT>(
-CUB_MIN(num_items - portion * PORTION_SIZE, static_cast<OffsetT>(PORTION_SIZE)));
+_CUDA_VSTD::min(num_items - portion * PORTION_SIZE, static_cast<OffsetT>(PORTION_SIZE)));

PortionOffsetT num_blocks = ::cuda::ceil_div(portion_num_items, ONESWEEP_TILE_ITEMS);

@@ -777,7 +777,7 @@ struct DispatchRadixSort
}

// Get maximum spine length
-int max_grid_size = CUB_MAX(pass_config.max_downsweep_grid_size, alt_pass_config.max_downsweep_grid_size);
+int max_grid_size = _CUDA_VSTD::max(pass_config.max_downsweep_grid_size, alt_pass_config.max_downsweep_grid_size);
int spine_length = (max_grid_size * pass_config.radix_digits) + pass_config.scan_config.tile_size;

// Temporary storage allocation requirements
@@ -812,7 +812,7 @@ struct DispatchRadixSort
int num_passes = ::cuda::ceil_div(num_bits, pass_config.radix_bits);
bool is_num_passes_odd = num_passes & 1;
int max_alt_passes = (num_passes * pass_config.radix_bits) - num_bits;
-int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits));
+int alt_end_bit = _CUDA_VSTD::min(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits));

// Alias the temporary storage allocations
OffsetT* d_spine = static_cast<OffsetT*>(allocations[0]);
@@ -1241,7 +1241,7 @@ struct DispatchSegmentedRadixSort
cudaError error = cudaSuccess;
do
{
-int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));
+int pass_bits = _CUDA_VSTD::min(pass_config.radix_bits, (end_bit - current_bit));

// Log kernel configuration
#ifdef CUB_DEBUG_LOG
@@ -1381,10 +1381,10 @@ struct DispatchSegmentedRadixSort
int radix_bits = ActivePolicyT::SegmentedPolicy::RADIX_BITS;
int alt_radix_bits = ActivePolicyT::AltSegmentedPolicy::RADIX_BITS;
int num_bits = end_bit - begin_bit;
-int num_passes = CUB_MAX(::cuda::ceil_div(num_bits, radix_bits), 1);
+int num_passes = _CUDA_VSTD::max(::cuda::ceil_div(num_bits, radix_bits), 1);
bool is_num_passes_odd = num_passes & 1;
int max_alt_passes = (num_passes * radix_bits) - num_bits;
-int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_radix_bits));
+int alt_end_bit = _CUDA_VSTD::min(end_bit, begin_bit + (max_alt_passes * alt_radix_bits));

DoubleBuffer<KeyT> d_keys_remaining_passes(
(is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast<KeyT*>(allocations[0]),
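
Taken together, the replacements in this PR follow a handful of mechanical patterns; a summary sketch for reference (comments only; the compiled forms are shown in the hunks above):

```cpp
// CUB_MIN(a, b)                  -> _CUDA_VSTD::min(a, b)   // operands must share one type
// CUB_MAX(a, b)                  -> _CUDA_VSTD::max(a, b)
// CUB_MIN(hi, CUB_MAX(lo, v))    -> cuda::std::clamp(v, lo, hi)
// CUB_QUOTIENT_CEILING(a, b)     -> ::cuda::ceil_div(a, b)
// CUB_QUOTIENT_CEILING(a, b) * b -> ::cuda::round_up(a, b)
// anonymous enum { ... }         -> static constexpr int constants
```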