Drop deprecated CUB macros #3821

Open · wants to merge 2 commits into base: main

5 changes: 3 additions & 2 deletions c/parallel/src/reduce.cu
@@ -14,6 +14,7 @@
#include <cub/grid/grid_even_share.cuh>
#include <cub/util_device.cuh>

+#include <cuda/std/__algorithm/clamp.h>
#include <cuda/std/cstdint>
#include <cuda/std/functional>
#include <cuda/std/variant>
@@ -100,8 +101,8 @@ reduce_runtime_tuning_policy get_policy(int cc, cccl_type_info accumulator_type)
auto [_, block_size, items_per_thread, vector_load_length] = find_tuning(cc, chain);

// Implement part of MemBoundScaling
-items_per_thread = CUB_MAX(1, CUB_MIN(items_per_thread * 4 / accumulator_type.size, items_per_thread * 2));
-block_size = CUB_MIN(block_size, (((1024 * 48) / (accumulator_type.size * items_per_thread)) + 31) / 32 * 32);
+items_per_thread = cuda::std::clamp(items_per_thread * 4 / accumulator_type.size, 1, items_per_thread * 2);
+block_size = _CUDA_VSTD::min(block_size, (((1024 * 48) / (accumulator_type.size * items_per_thread)) + 31) / 32 * 32);

return {block_size, items_per_thread, vector_load_length};
}
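
Note on the clamp rewrites here and in the benchmark policies below: they rely on the identity max(lo, min(v, hi)) == clamp(v, lo, hi), which holds whenever lo <= hi. A minimal host-side sketch of that equivalence (cuda::std::clamp mirrors std::clamp; the variable names below are illustrative stand-ins for the values in the hunk above):

```cpp
#include <algorithm> // std::min, std::max, std::clamp
#include <cassert>

int main()
{
  // Stand-ins for items_per_thread and accumulator_type.size.
  const int items = 8, size = 16;
  // Old form: CUB_MAX(1, CUB_MIN(a, b)) expands to max(1, min(a, b)).
  const int old_form = std::max(1, std::min(items * 4 / size, items * 2));
  // New form: clamp(a, 1, b) equals max(1, min(a, b)) because 1 <= items * 2.
  const int new_form = std::clamp(items * 4 / size, 1, items * 2);
  assert(old_form == new_form);
  return 0;
}
```
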
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/partition/flagged.cu
@@ -63,7 +63,7 @@ struct policy_hub_t
static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;

static constexpr int ITEMS_PER_THREAD =
-CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
+_CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD);

using SelectIfPolicyT =
cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/partition/if.cu
@@ -63,7 +63,7 @@ struct policy_hub_t
static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;

static constexpr int ITEMS_PER_THREAD =
-CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
+_CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD);

using SelectIfPolicyT =
cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/select/flagged.cu
@@ -61,7 +61,7 @@ struct policy_hub_t
static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;

static constexpr int ITEMS_PER_THREAD =
-CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
+_CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD);

using SelectIfPolicyT =
cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/select/if.cu
@@ -63,7 +63,7 @@ struct policy_hub_t
static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;

static constexpr int ITEMS_PER_THREAD =
-CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
+_CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD);

using SelectIfPolicyT =
cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,
4 changes: 2 additions & 2 deletions cub/benchmarks/bench/select/unique.cu
@@ -36,8 +36,8 @@ struct policy_hub_t
{
static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;

-static constexpr int ITEMS_PER_THREAD =
-CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
+static constexpr int ITEMS_PER_THREAD = _CUDA_VSTD::min(
+NOMINAL_4B_ITEMS_PER_THREAD, _CUDA_VSTD::max(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));

using SelectIfPolicyT =
cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,
13 changes: 7 additions & 6 deletions cub/cub/agent/agent_batch_memcpy.cuh
@@ -52,6 +52,7 @@
#include <cub/util_ptx.cuh>
#include <cub/util_type.cuh>

+#include <cuda/cmath>
#include <cuda/std/type_traits>

#include <cstdint>
@@ -171,9 +172,9 @@ _CCCL_DEVICE _CCCL_FORCEINLINE PointerRange<VectorT>
GetAlignedPtrs(const void* in_begin, void* out_begin, ByteOffsetT num_bytes)
{
// Data type size used for vectorized stores
-constexpr size_t out_datatype_size = sizeof(VectorT);
+constexpr auto out_datatype_size = uint32_t{sizeof(VectorT)};
// Data type size used for type-aliased loads
-constexpr size_t in_datatype_size = sizeof(uint32_t);
+constexpr auto in_datatype_size = uint32_t{sizeof(uint32_t)};

// char-aliased ptrs to simplify pointer arithmetic
char* out_ptr = reinterpret_cast<char*>(out_begin);
@@ -194,8 +195,8 @@ GetAlignedPtrs(const void* in_begin, void* out_begin, ByteOffsetT num_bytes)
uint32_t in_offset_req = in_extra_bytes;

// Bytes after `out_chars_aligned` to the first VectorT-aligned address at or after `out_begin`
-uint32_t out_start_aligned =
-CUB_QUOTIENT_CEILING(in_offset_req + alignment_offset, out_datatype_size) * out_datatype_size;
+uint32_t out_start_aligned = cuda::round_up(in_offset_req + alignment_offset, out_datatype_size);

// Compute the beginning of the aligned ranges (output and input pointers)
VectorT* out_aligned_begin = reinterpret_cast<VectorT*>(out_chars_aligned + out_start_aligned);
@@ -398,7 +398,7 @@ private:
static constexpr uint32_t USED_BITS_PER_UNIT = ITEMS_PER_UNIT * BITS_PER_ITEM;

/// The number of backing data types required to store the given number of items
-static constexpr uint32_t NUM_TOTAL_UNITS = CUB_QUOTIENT_CEILING(NumItems, ITEMS_PER_UNIT);
+static constexpr uint32_t NUM_TOTAL_UNITS = ::cuda::ceil_div(NumItems, ITEMS_PER_UNIT);

/// This is the net number of bit-storage provided by each unit (remainder bits are unused)
static constexpr uint32_t UNIT_MASK =
@@ -805,7 +805,8 @@ private:
if (blev_buffer_offset < num_blev_buffers)
{
BlockBufferOffsetT tile_buffer_id = buffers_by_size_class[blev_buffer_offset].buffer_id;
-block_offset[i] = CUB_QUOTIENT_CEILING(tile_buffer_sizes[tile_buffer_id], BLOCK_LEVEL_TILE_SIZE);
+block_offset[i] =
+::cuda::ceil_div(static_cast<BlockBufferOffsetT>(tile_buffer_sizes[tile_buffer_id]), BLOCK_LEVEL_TILE_SIZE);
}
else
{
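
The two <cuda/cmath> helpers introduced above replace CUB_QUOTIENT_CEILING, which expanded to (a + b - 1) / b. A host-side sketch of their semantics for positive integers (reference semantics only; the real functions are templates in namespace cuda):

```cpp
// Reference semantics, assuming positive integer operands.
constexpr unsigned ceil_div(unsigned a, unsigned b) { return (a + b - 1) / b; } // quotient, rounded up
constexpr unsigned round_up(unsigned a, unsigned b) { return ceil_div(a, b) * b; } // next multiple of b

static_assert(ceil_div(10, 4) == 3, "ceil(10 / 4)");
static_assert(round_up(10, 4) == 12, "smallest multiple of 4 that is >= 10");
// So CUB_QUOTIENT_CEILING(x, y) maps to ceil_div(x, y), and
// CUB_QUOTIENT_CEILING(x, y) * y maps to round_up(x, y), as in out_start_aligned.

int main() { return 0; }
```
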
23 changes: 10 additions & 13 deletions cub/cub/agent/agent_radix_sort_histogram.cuh
@@ -66,7 +66,7 @@ struct AgentRadixSortHistogramPolicy
* ID. However, lanes with the same ID in different warp use the same private
* histogram. This arrangement helps reduce the degree of conflicts in atomic
* operations. */
-NUM_PARTS = CUB_MAX(1, NOMINAL_4B_NUM_PARTS * 4 / CUB_MAX(sizeof(ComputeT), 4)),
+NUM_PARTS = _CUDA_VSTD::max(1, NOMINAL_4B_NUM_PARTS * 4 / _CUDA_VSTD::max(int{sizeof(ComputeT)}, 4)),
RADIX_BITS = _RADIX_BITS,
};
};
@@ -94,16 +94,13 @@ template <typename AgentRadixSortHistogramPolicy,
struct AgentRadixSortHistogram
{
// constants
-enum
-{
-ITEMS_PER_THREAD = AgentRadixSortHistogramPolicy::ITEMS_PER_THREAD,
-BLOCK_THREADS = AgentRadixSortHistogramPolicy::BLOCK_THREADS,
-TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
-RADIX_BITS = AgentRadixSortHistogramPolicy::RADIX_BITS,
-RADIX_DIGITS = 1 << RADIX_BITS,
-MAX_NUM_PASSES = (sizeof(KeyT) * 8 + RADIX_BITS - 1) / RADIX_BITS,
-NUM_PARTS = AgentRadixSortHistogramPolicy::NUM_PARTS,
-};
+static constexpr int ITEMS_PER_THREAD = AgentRadixSortHistogramPolicy::ITEMS_PER_THREAD;
+static constexpr int BLOCK_THREADS = AgentRadixSortHistogramPolicy::BLOCK_THREADS;
+static constexpr int TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD;
+static constexpr int RADIX_BITS = AgentRadixSortHistogramPolicy::RADIX_BITS;
+static constexpr int RADIX_DIGITS = 1 << RADIX_BITS;
+static constexpr int MAX_NUM_PASSES = (sizeof(KeyT) * 8 + RADIX_BITS - 1) / RADIX_BITS;
+static constexpr int NUM_PARTS = AgentRadixSortHistogramPolicy::NUM_PARTS;

using traits = radix::traits_t<KeyT>;
using bit_ordered_type = typename traits::bit_ordered_type;
@@ -210,7 +207,7 @@ struct AgentRadixSortHistogram
#pragma unroll
for (int current_bit = begin_bit, pass = 0; current_bit < end_bit; current_bit += RADIX_BITS, ++pass)
{
-int num_bits = CUB_MIN(RADIX_BITS, end_bit - current_bit);
+int num_bits = _CUDA_VSTD::min(+RADIX_BITS, end_bit - current_bit);
#pragma unroll
for (int u = 0; u < ITEMS_PER_THREAD; ++u)
{
@@ -258,7 +255,7 @@ struct AgentRadixSortHistogram

// Process the tiles.
OffsetT portion_offset = portion * MAX_PORTION_SIZE;
-OffsetT portion_size = CUB_MIN(MAX_PORTION_SIZE, num_items - portion_offset);
+OffsetT portion_size = _CUDA_VSTD::min(MAX_PORTION_SIZE, num_items - portion_offset);
for (OffsetT offset = blockIdx.x * TILE_ITEMS; offset < portion_size; offset += TILE_ITEMS * gridDim.x)
{
OffsetT tile_offset = portion_offset + offset;
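
Two details of this file's changes are worth spelling out: the anonymous enum becomes individually typed static constexpr int constants, and min(+RADIX_BITS, ...) carries a unary plus because _CUDA_VSTD::min takes its arguments by const reference; binding a reference to the constant ODR-uses it, which in C++14 requires an out-of-line definition. A minimal sketch of the idiom with std::min (struct and function names are illustrative):

```cpp
#include <algorithm>

struct Agent
{
  static constexpr int RADIX_BITS = 8; // in C++14, no out-of-line definition is provided

  static int num_bits(int end_bit, int current_bit)
  {
    // '+RADIX_BITS' materializes a temporary int, so std::min's const int&
    // parameters never bind to (and never ODR-use) the static member itself.
    return std::min(+RADIX_BITS, end_bit - current_bit);
  }
};

int main() { return Agent::num_bits(32, 28) == 4 ? 0 : 1; }
```
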
6 changes: 3 additions & 3 deletions cub/cub/agent/agent_radix_sort_upsweep.cuh
@@ -160,17 +160,17 @@ struct AgentRadixSortUpsweep
PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter),
LOG_PACKING_RATIO = Log2<PACKING_RATIO>::VALUE,

-LOG_COUNTER_LANES = CUB_MAX(0, int(RADIX_BITS) - int(LOG_PACKING_RATIO)),
+LOG_COUNTER_LANES = _CUDA_VSTD::max(0, int(RADIX_BITS) - int(LOG_PACKING_RATIO)),
COUNTER_LANES = 1 << LOG_COUNTER_LANES,

// To prevent counter overflow, we must periodically unpack and aggregate the
// digit counters back into registers. Each counter lane is assigned to a
// warp for aggregation.

-LANES_PER_WARP = CUB_MAX(1, (COUNTER_LANES + WARPS - 1) / WARPS),
+LANES_PER_WARP = _CUDA_VSTD::max(1, (COUNTER_LANES + WARPS - 1) / WARPS),

// Unroll tiles in batches without risk of counter overflow
-UNROLL_COUNT = CUB_MIN(64, 255 / KEYS_PER_THREAD),
+UNROLL_COUNT = _CUDA_VSTD::min(64, 255 / KEYS_PER_THREAD),
UNROLLED_ELEMENTS = UNROLL_COUNT * TILE_ITEMS,
};

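
As context for UNROLL_COUNT above, a sketch of the overflow bound it enforces, assuming DigitCounter is an 8-bit unsigned type (maximum value 255) and each thread contributes KEYS_PER_THREAD increments per tile; the KEYS_PER_THREAD value below is illustrative, not taken from a real tuning policy:

```cpp
#include <algorithm>

constexpr int KEYS_PER_THREAD = 17; // illustrative; set by the tuning policy in CUB
constexpr int UNROLL_COUNT    = std::min(64, 255 / KEYS_PER_THREAD);

// After UNROLL_COUNT unrolled tiles, a byte-sized digit counter has absorbed at
// most UNROLL_COUNT * KEYS_PER_THREAD increments, which must not exceed 255.
static_assert(UNROLL_COUNT * KEYS_PER_THREAD <= 255, "8-bit counters cannot overflow");

int main() { return 0; }
```
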
2 changes: 1 addition & 1 deletion cub/cub/agent/agent_reduce.cuh
@@ -159,7 +159,7 @@ struct AgentReduce
static constexpr int BLOCK_THREADS = AgentReducePolicy::BLOCK_THREADS;
static constexpr int ITEMS_PER_THREAD = AgentReducePolicy::ITEMS_PER_THREAD;
static constexpr int TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD;
-static constexpr int VECTOR_LOAD_LENGTH = CUB_MIN(ITEMS_PER_THREAD, AgentReducePolicy::VECTOR_LOAD_LENGTH);
+static constexpr int VECTOR_LOAD_LENGTH = _CUDA_VSTD::min(ITEMS_PER_THREAD, AgentReducePolicy::VECTOR_LOAD_LENGTH);

// Can vectorize according to the policy if the input iterator is a native
// pointer to a primitive type
11 changes: 6 additions & 5 deletions cub/cub/block/block_exchange.cuh
@@ -151,11 +151,12 @@ class BlockExchange
// C++14
static constexpr int LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(0);

-static constexpr int TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD;
-static constexpr int TIME_SLICES = WARP_TIME_SLICING ? WARPS : 1;
-static constexpr int TIME_SLICED_THREADS = WARP_TIME_SLICING ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS;
-static constexpr int TIME_SLICED_ITEMS = TIME_SLICED_THREADS * ITEMS_PER_THREAD;
-static constexpr int WARP_TIME_SLICED_THREADS = CUB_MIN(BLOCK_THREADS, WARP_THREADS);
+static constexpr int TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD;
+static constexpr int TIME_SLICES = WARP_TIME_SLICING ? WARPS : 1;
+static constexpr int TIME_SLICED_THREADS =
+WARP_TIME_SLICING ? _CUDA_VSTD::min(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS;
+static constexpr int TIME_SLICED_ITEMS = TIME_SLICED_THREADS * ITEMS_PER_THREAD;
+static constexpr int WARP_TIME_SLICED_THREADS = _CUDA_VSTD::min(BLOCK_THREADS, WARP_THREADS);
static constexpr int WARP_TIME_SLICED_ITEMS = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD;

// Insert padding to avoid bank conflicts during raking when items per thread is a power of two and > 4 (otherwise
6 changes: 3 additions & 3 deletions cub/cub/block/block_radix_rank.cuh
@@ -242,7 +242,7 @@ private:
LOG_PACKING_RATIO = Log2<PACKING_RATIO>::VALUE,

// Always at least one lane
-LOG_COUNTER_LANES = CUB_MAX((int(RADIX_BITS) - int(LOG_PACKING_RATIO)), 0),
+LOG_COUNTER_LANES = _CUDA_VSTD::max((int(RADIX_BITS) - int(LOG_PACKING_RATIO)), 0),
COUNTER_LANES = 1 << LOG_COUNTER_LANES,

// The number of packed counters per thread (plus one for padding)
@@ -254,7 +254,7 @@ public:
enum
{
/// Number of bin-starting offsets tracked per thread
-BINS_TRACKED_PER_THREAD = CUB_MAX(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS),
+BINS_TRACKED_PER_THREAD = _CUDA_VSTD::max(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS),
};

private:
@@ -587,7 +587,7 @@ public:
enum
{
/// Number of bin-starting offsets tracked per thread
-BINS_TRACKED_PER_THREAD = CUB_MAX(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS),
+BINS_TRACKED_PER_THREAD = _CUDA_VSTD::max(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS),
};

private:
4 changes: 2 additions & 2 deletions cub/cub/block/block_radix_sort.cuh
@@ -431,7 +431,7 @@ private:
// Radix sorting passes
while (true)
{
-int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit);
+int pass_bits = _CUDA_VSTD::min(RADIX_BITS, end_bit - begin_bit);
auto digit_extractor =
traits::template digit_extractor<fundamental_digit_extractor_t>(begin_bit, pass_bits, decomposer);

@@ -510,7 +510,7 @@ public:
// Radix sorting passes
while (true)
{
-int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit);
+int pass_bits = _CUDA_VSTD::min(RADIX_BITS, end_bit - begin_bit);
auto digit_extractor =
traits::template digit_extractor<fundamental_digit_extractor_t>(begin_bit, pass_bits, decomposer);

2 changes: 1 addition & 1 deletion cub/cub/block/block_raking_layout.cuh
@@ -81,7 +81,7 @@ struct BlockRakingLayout
SHARED_ELEMENTS = BLOCK_THREADS,

/// Maximum number of warp-synchronous raking threads
-MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(0)),
+MAX_RAKING_THREADS = _CUDA_VSTD::min(BLOCK_THREADS, CUB_WARP_THREADS(0)),

/// Number of raking elements per warp-synchronous raking thread (rounded up)
SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS,
2 changes: 1 addition & 1 deletion cub/cub/block/block_store.cuh
@@ -175,7 +175,7 @@ StoreDirectBlockedVectorized(int linear_tid, T* block_ptr, T (&items)[ITEMS_PER_
enum
{
// Maximum CUDA vector size is 4 elements
-MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD),
+MAX_VEC_SIZE = _CUDA_VSTD::min(4, ITEMS_PER_THREAD),

// Vector size must be a power of two and an even divisor of the items per thread
VEC_SIZE =
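
VEC_SIZE's initializer is cut off in this view; the constraint stated in the comment is that it be a power of two, at most MAX_VEC_SIZE, that evenly divides the items per thread. A hypothetical constexpr helper with those properties (not the expression block_store.cuh actually uses):

```cpp
// Hypothetical helper, not CUB's actual VEC_SIZE expression: largest power of
// two that is <= max_vec and evenly divides items.
constexpr int pick_vec_size(int items, int max_vec)
{
  int v = 1;
  while (v * 2 <= max_vec && items % (v * 2) == 0)
  {
    v *= 2;
  }
  return v;
}

static_assert(pick_vec_size(8, 4) == 4, "8 items -> 4-wide vectors");
static_assert(pick_vec_size(6, 4) == 2, "4 does not divide 6, fall back to 2");
static_assert(pick_vec_size(5, 4) == 1, "odd counts store scalar");

int main() { return 0; }
```
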
@@ -96,7 +96,7 @@ struct BlockReduceRakingCommutativeOnly
RAKING_THREADS = WARP_THREADS,

/// Number of threads actually sharing items with the raking threads
-SHARING_THREADS = CUB_MAX(1, BLOCK_THREADS - RAKING_THREADS),
+SHARING_THREADS = _CUDA_VSTD::max(1, BLOCK_THREADS - RAKING_THREADS),

/// Number of raking elements per warp synchronous raking thread
SEGMENT_LENGTH = SHARING_THREADS / WARP_THREADS,
@@ -84,7 +84,7 @@ struct BlockReduceWarpReductions
WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,

/// The logical warp size for warp reductions
-LOGICAL_WARP_SIZE = CUB_MIN(BLOCK_THREADS, WARP_THREADS),
+LOGICAL_WARP_SIZE = _CUDA_VSTD::min(BLOCK_THREADS, WARP_THREADS),

/// Whether or not the logical warp size evenly divides the thread block size
EVEN_WARP_MULTIPLE = (BLOCK_THREADS % LOGICAL_WARP_SIZE == 0)
6 changes: 4 additions & 2 deletions cub/cub/device/dispatch/dispatch_histogram.cuh
@@ -362,9 +362,11 @@ struct dispatch_histogram
// Get grid dimensions, trying to keep total blocks ~histogram_sweep_occupancy
int pixels_per_tile = block_threads * pixels_per_thread;
int tiles_per_row = static_cast<int>(::cuda::ceil_div(num_row_pixels, pixels_per_tile));
-int blocks_per_row = CUB_MIN(histogram_sweep_occupancy, tiles_per_row);
+int blocks_per_row = _CUDA_VSTD::min(histogram_sweep_occupancy, tiles_per_row);
int blocks_per_col =
-(blocks_per_row > 0) ? int(CUB_MIN(histogram_sweep_occupancy / blocks_per_row, num_rows)) : 0;
+(blocks_per_row > 0)
+? int(_CUDA_VSTD::min(static_cast<OffsetT>(histogram_sweep_occupancy / blocks_per_row), num_rows))
+: 0;
int num_thread_blocks = blocks_per_row * blocks_per_col;

dim3 sweep_grid_dims;
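
The casts added in this hunk (and int{sizeof(ComputeT)} earlier) exist because CUB_MIN/CUB_MAX were ternary macros, so mixed operand types went through the usual arithmetic conversions, whereas _CUDA_VSTD::min/max deduce a single template parameter and reject mixed types. An illustration with std::min, using int64_t as a stand-in for OffsetT:

```cpp
#include <algorithm>
#include <cstdint>

int main()
{
  int     occupancy = 4;
  int64_t num_rows  = 10; // stand-in for OffsetT, assumed 64-bit here
  // std::min(occupancy, num_rows);  // ill-formed: T deduced as both int and int64_t
  int64_t blocks = std::min(static_cast<int64_t>(occupancy), num_rows); // mirrors the diff's fix
  return blocks == 4 ? 0 : 1;
}
```
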
22 changes: 11 additions & 11 deletions cub/cub/device/dispatch/dispatch_radix_sort.cuh
@@ -275,7 +275,7 @@ struct DispatchRadixSort
cudaError error = cudaSuccess;
do
{
-int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));
+int pass_bits = _CUDA_VSTD::min(pass_config.radix_bits, (end_bit - current_bit));

// Log upsweep_kernel configuration
#ifdef CUB_DEBUG_LOG
@@ -447,7 +447,7 @@ struct DispatchRadixSort
max_downsweep_grid_size = (downsweep_config.sm_occupancy * sm_count) * CUB_SUBSCRIPTION_FACTOR(0);

even_share.DispatchInit(
-num_items, max_downsweep_grid_size, CUB_MAX(downsweep_config.tile_size, upsweep_config.tile_size));
+num_items, max_downsweep_grid_size, _CUDA_VSTD::max(downsweep_config.tile_size, upsweep_config.tile_size));

} while (0);
return error;
@@ -472,8 +472,8 @@
constexpr PortionOffsetT PORTION_SIZE = ((1 << 28) - 1) / ONESWEEP_TILE_ITEMS * ONESWEEP_TILE_ITEMS;
int num_passes = ::cuda::ceil_div(end_bit - begin_bit, RADIX_BITS);
OffsetT num_portions = static_cast<OffsetT>(::cuda::ceil_div(num_items, PORTION_SIZE));
-PortionOffsetT max_num_blocks =
-::cuda::ceil_div(static_cast<int>(CUB_MIN(num_items, static_cast<OffsetT>(PORTION_SIZE))), ONESWEEP_TILE_ITEMS);
+PortionOffsetT max_num_blocks = ::cuda::ceil_div(
+static_cast<int>(_CUDA_VSTD::min(num_items, static_cast<OffsetT>(PORTION_SIZE))), ONESWEEP_TILE_ITEMS);

size_t value_size = KEYS_ONLY ? 0 : sizeof(ValueT);
size_t allocation_sizes[] = {
@@ -611,11 +611,11 @@ struct DispatchRadixSort

for (int current_bit = begin_bit, pass = 0; current_bit < end_bit; current_bit += RADIX_BITS, ++pass)
{
-int num_bits = CUB_MIN(end_bit - current_bit, RADIX_BITS);
+int num_bits = _CUDA_VSTD::min(end_bit - current_bit, RADIX_BITS);
for (OffsetT portion = 0; portion < num_portions; ++portion)
{
PortionOffsetT portion_num_items = static_cast<PortionOffsetT>(
-CUB_MIN(num_items - portion * PORTION_SIZE, static_cast<OffsetT>(PORTION_SIZE)));
+_CUDA_VSTD::min(num_items - portion * PORTION_SIZE, static_cast<OffsetT>(PORTION_SIZE)));

PortionOffsetT num_blocks = ::cuda::ceil_div(portion_num_items, ONESWEEP_TILE_ITEMS);

@@ -777,7 +777,7 @@ struct DispatchRadixSort
}

// Get maximum spine length
-int max_grid_size = CUB_MAX(pass_config.max_downsweep_grid_size, alt_pass_config.max_downsweep_grid_size);
+int max_grid_size = _CUDA_VSTD::max(pass_config.max_downsweep_grid_size, alt_pass_config.max_downsweep_grid_size);
int spine_length = (max_grid_size * pass_config.radix_digits) + pass_config.scan_config.tile_size;

// Temporary storage allocation requirements
@@ -812,7 +812,7 @@ struct DispatchRadixSort
int num_passes = ::cuda::ceil_div(num_bits, pass_config.radix_bits);
bool is_num_passes_odd = num_passes & 1;
int max_alt_passes = (num_passes * pass_config.radix_bits) - num_bits;
-int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits));
+int alt_end_bit = _CUDA_VSTD::min(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits));

// Alias the temporary storage allocations
OffsetT* d_spine = static_cast<OffsetT*>(allocations[0]);
@@ -1241,7 +1241,7 @@ struct DispatchSegmentedRadixSort
cudaError error = cudaSuccess;
do
{
-int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));
+int pass_bits = _CUDA_VSTD::min(pass_config.radix_bits, (end_bit - current_bit));

// Log kernel configuration
#ifdef CUB_DEBUG_LOG
@@ -1381,10 +1381,10 @@ struct DispatchSegmentedRadixSort
int radix_bits = ActivePolicyT::SegmentedPolicy::RADIX_BITS;
int alt_radix_bits = ActivePolicyT::AltSegmentedPolicy::RADIX_BITS;
int num_bits = end_bit - begin_bit;
-int num_passes = CUB_MAX(::cuda::ceil_div(num_bits, radix_bits), 1);
+int num_passes = _CUDA_VSTD::max(::cuda::ceil_div(num_bits, radix_bits), 1);
bool is_num_passes_odd = num_passes & 1;
int max_alt_passes = (num_passes * radix_bits) - num_bits;
-int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_radix_bits));
+int alt_end_bit = _CUDA_VSTD::min(end_bit, begin_bit + (max_alt_passes * alt_radix_bits));

DoubleBuffer<KeyT> d_keys_remaining_passes(
(is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast<KeyT*>(allocations[0]),
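
Taken together, the replacements in this PR follow a handful of mechanical patterns; a summary sketch for reference (comments only; the compiled forms are shown in the hunks above):

```cpp
// CUB_MIN(a, b)                  -> _CUDA_VSTD::min(a, b)   // operands must share one type
// CUB_MAX(a, b)                  -> _CUDA_VSTD::max(a, b)
// CUB_MIN(hi, CUB_MAX(lo, v))    -> cuda::std::clamp(v, lo, hi)
// CUB_QUOTIENT_CEILING(a, b)     -> ::cuda::ceil_div(a, b)
// CUB_QUOTIENT_CEILING(a, b) * b -> ::cuda::round_up(a, b)
// anonymous enum { ... }         -> static constexpr int constants
```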