Skip to content

Commit

Permalink
Merge branch 'main' into stf_small_vector
Browse files Browse the repository at this point in the history
  • Loading branch information
caugonnet authored Feb 11, 2025
2 parents 5f3dd0a + a1e5711 commit 902021f
Show file tree
Hide file tree
Showing 1,753 changed files with 2,140 additions and 5,891 deletions.
6 changes: 3 additions & 3 deletions docs/libcudacxx/extended_api/math.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@ Math

* - :ref:`ceil_div <libcudacxx-extended-api-math-ceil-div>`
- Ceiling division
- CCCL 2.6.0 / CUDA 12.6
- CCCL 2.7.0 / CUDA 12.8

* - :ref:`round_up <libcudacxx-extended-api-math-round-up>`
- Round to the next multiple
- Round up to the next multiple
- CCCL 2.9.0 / CUDA 12.9

* - :ref:`round_down <libcudacxx-extended-api-math-round-down>`
- Round to the previous multiple
- Round down to the previous multiple
- CCCL 2.9.0 / CUDA 12.9
81 changes: 47 additions & 34 deletions docs/libcudacxx/extended_api/math/ceil_div.rst
Original file line number Diff line number Diff line change
@@ -1,54 +1,67 @@
.. _libcudacxx-extended-api-math-ceil-div:

Math
=====
``ceil_div`` Ceiling division
=============================

.. code:: cuda
template <typename T, typename U>
[[nodiscard]] __host__ __device__ inline
constexpr _CUDA_VSTD::common_type_t<_Tp, _Up> ceil_div(T a, U b) noexcept;
constexpr cuda::std::common_type_t<T, U> ceil_div(T value, U divisor) noexcept;
ceil_div
---------
The function computes the ceiling division of two integral or enumerator values :math:`ceil(\frac{value}{divisor})`.

- *Requires*: ``T`` is an integral type (including 128-bit integers) or enumerator.
- *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true.
- *Returns*: divides ``a`` by ``b``. If ``a`` is not a multiple of ``b`` rounds the result up to the next integer value.
**Parameters**

**Performance considerations**

- The function computes ``(a + b - 1) / b`` when the common type is a signed integer.
- The function computes ``min(a, 1 + ((a - 1) / b)`` when the common type is an unsigned integer in CUDA, which generates less instructions than ``(a / b) + ((a / b) * b != a)``, especially for 64-bit types.
- ``value``: The value to be divided.
- ``divisor``: The divisor.

**Example**: This API is very useful for determining the *number of thread blocks* required to process a fixed amount of work, given a fixed number of threads per block:
**Return value**

.. code:: cuda
Divides ``value`` by ``divisor``. If ``value`` is not a multiple of ``divisor`` rounds the result up to the next integer value.

#include <vector>
#include <cuda/cmath>
**Preconditions**

__global__ void vscale(int n, float s, float *x) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n) x[i] *= s;
}
- *Compile-time*: ``T`` and ``U`` are integral types (including 128-bit integers) or enumerators.
- *Run-time*: ``value >= 0`` and ``divisor > 0``.

int main() {
const int n = 100000;
const float s = 2.f;
std::vector<float> x(n, 1.f);
**Performance considerations**

// Given a fixed number of threads per block...
constexpr int threads_per_block = 256;
- The function computes ``(value + divisor - 1) / divisor`` when the common type is a signed integer.
- The function computes ``min(value, 1 + ((value - 1) / divisor))`` when the common type is an unsigned integer in CUDA, which generates fewer instructions than ``(value / divisor) + ((value / divisor) * divisor != value)``, especially for 64-bit types.

// ...dividing some "n" by "threads_per_block" may lead to a remainder,
// requiring the kernel to be launched with an extra thread block to handle it.
const int thread_blocks = cuda::ceil_div(n, threads_per_block);
Example
-------

vscale<<<thread_blocks, threads_per_block>>>(n, s, x.data());
cudaDeviceSynchronize();
This API is very useful for determining the *number of thread blocks* required to process a fixed amount of work, given a fixed number of threads per block:

return 0;
}
.. code:: cuda
`See it on Godbolt TODO`
#include <cuda/cmath>
#include <cuda/std/span>
#include <thrust/device_vector.h>
__global__ void vector_scale_kernel(cuda::std::span<float> span, float scale) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < span.size())
span[index] *= scale;
}
int main() {
int num_items = 100'000;
float scale = 2.f;
thrust::device_vector<float> d_vector(num_items, 1.f);
// Given a fixed number of threads per block...
constexpr int threads_per_block = 256;
// ...dividing some "n" by "threads_per_block" may lead to a remainder,
// requiring the kernel to be launched with an extra thread block to handle it.
auto num_thread_blocks = cuda::ceil_div(num_items, threads_per_block);
auto d_ptr = thrust::raw_pointer_cast(d_vector.data());
cuda::std::span<float> d_span(d_ptr, num_items);
vector_scale_kernel<<<num_thread_blocks, threads_per_block>>>(d_span, scale);
cudaDeviceSynchronize();
return 0;
}
`See it on Godbolt 🔗 <https://godbolt.org/z/hbxscWGT9>`_
48 changes: 31 additions & 17 deletions docs/libcudacxx/extended_api/math/round_down.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,34 +5,48 @@

.. code:: cuda
template <typename T, typename = U>
template <typename T, typename U>
[[nodiscard]] __host__ __device__ inline
constexpr cuda::std::common_type_t<T, U> round_down(T value, U base_multiple) noexcept;
``value``: The value to be rounded down.
``base_multiple``: The base multiple to which the value rounds down.
The function rounds an integral or enumerator value down to the largest multiple of the base :math:`floor(\frac{value}{base\_multiple}) * base\_multiple`

- *Requires*: ``T`` and ``U`` are integral types (including 128-bit integers) or enumerators.
- *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true.
- *Returns*: ``a`` rounded down to the largest multiple of ``b`` less than or equal to ``a``. If ``a`` is already a multiple of ``b``, return ``a``.
**Parameters**

.. note::
- ``value``: The value to be rounded down.
- ``base_multiple``: The base multiple to which the value rounds down.

The function requires C++17 onwards
**Return value**

**Performance considerations**:
``value`` rounded down to the largest multiple of ``base_multiple`` less than or equal to ``value``. If ``value`` is already a multiple of ``base_multiple``, returns ``value``.

- The function performs a truncation division followed by a multiplication. It provides better performance than ``a / b * b`` when the common type is a signed integer
**Preconditions**

**Example**:
- *Compile-time*: ``T`` and ``U`` are integral types (including 128-bit integers) or enumerators.
- *Run-time*: ``value >= 0`` and ``base_multiple > 0``.

**Performance considerations**

- The function performs a truncation division followed by a multiplication. It provides better performance than ``(value / base_multiple) * base_multiple`` when the common type is a signed integer

Example
-------

.. code:: cuda
#include <cuda/cmath>
#include <cuda/cmath>
#include <cstdio>
__global__ void round_up_kernel() {
int value = 7;
unsigned multiple = 3;
printf("%d\n", cuda::round_down(value, multiple)); // print "6"
}
__global__ void example_kernel(int a, unsigned b, unsigned* result) {
// a = 7, b = 3 -> result = 6
*result = cuda::round_down(a, b);
}
int main() {
round_up_kernel<<<1, 1>>>();
cudaDeviceSynchronize();
return 0;
}
`See it on Godbolt TODO`
`See it on Godbolt 🔗 <https://godbolt.org/z/9vcxo3d8j>`_
50 changes: 33 additions & 17 deletions docs/libcudacxx/extended_api/math/round_up.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,36 +5,52 @@

.. code:: cuda
template <typename T, typename = U>
template <typename T, typename U>
[[nodiscard]] __host__ __device__ inline
constexpr cuda::std::common_type_t<T, U> round_up(T value, U base_multiple) noexcept;
``value``: The value to be rounded up.
``base_multiple``: The base multiple to which the value rounds up.
The function rounds an integral or enumerator value up to the smallest multiple of the base :math:`ceil(\frac{value}{base\_multiple}) * base\_multiple`

- *Requires*: ``T`` and ``U`` are integral types (including 128-bit integers) or enumerators.
- *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true.
- *Returns*: ``a`` rounded up to the smallest multiple of ``b`` greater than or equal to ``a``. If ``a`` is already a multiple of ``b``, return ``a``.
- *Note*: the result can overflow if ``ceil(a / b) * b`` exceeds the maximum value of the common type of
``a`` and ``b``. The condition is checked in debug mode.
**Parameters**

- ``value``: The value to be rounded up.
- ``base_multiple``: The base multiple to which the value rounds up.

**Return value**

``value`` rounded up to the smallest multiple of ``base_multiple`` greater than or equal to ``value``. If ``value`` is already a multiple of ``base_multiple``, returns ``value``.

.. note::

The function requires C++17 onwards
The result can overflow if ``ceil(value / base_multiple) * base_multiple`` exceeds the maximum value of the common type of ``value`` and ``base_multiple``. The condition is checked in debug mode.

**Performance considerations**:
**Preconditions**

- *Compile-time*: ``T`` and ``U`` are integral types (including 128-bit integers) or enumerators.
- *Run-time*: ``value >= 0`` and ``base_multiple > 0``.

**Performance considerations**

- The function performs a ceiling division (``cuda::ceil_div()``) followed by a multiplication

**Example**:
Example
-------

.. code:: cuda
#include <cuda/cmath>
#include <cuda/cmath>
#include <cstdio>
__global__ void round_up_kernel() {
int value = 7;
unsigned multiple = 3;
printf("%d\n", cuda::round_up(value, multiple)); // print "9"
}
__global__ void example_kernel(int a, unsigned b, unsigned* result) {
// a = 7, b = 3 -> result = 9
*result = cuda::round_up(a, b);
}
int main() {
round_up_kernel<<<1, 1>>>();
cudaDeviceSynchronize();
return 0;
}
`See it on Godbolt TODO`
`See it on Godbolt 🔗 <https://godbolt.org/z/9vcxo3d8j>`_
19 changes: 8 additions & 11 deletions libcudacxx/include/cuda/__cmath/round_down.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,14 @@
# pragma system_header
#endif // no system header

#if _CCCL_STD_VER >= 2017

# include <cuda/std/__concepts/concept_macros.h>
# include <cuda/std/__type_traits/common_type.h>
# include <cuda/std/__type_traits/is_enum.h>
# include <cuda/std/__type_traits/is_integral.h>
# include <cuda/std/__type_traits/is_signed.h>
# include <cuda/std/__type_traits/make_unsigned.h>
# include <cuda/std/__utility/to_underlying.h>
# include <cuda/std/limits>
#include <cuda/std/__concepts/concept_macros.h>
#include <cuda/std/__type_traits/common_type.h>
#include <cuda/std/__type_traits/is_enum.h>
#include <cuda/std/__type_traits/is_integral.h>
#include <cuda/std/__type_traits/is_signed.h>
#include <cuda/std/__type_traits/make_unsigned.h>
#include <cuda/std/__utility/to_underlying.h>
#include <cuda/std/limits>

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

Expand Down Expand Up @@ -99,5 +97,4 @@ round_down(const _Tp __a, const _Up __b) noexcept

_LIBCUDACXX_END_NAMESPACE_CUDA

#endif // _CCCL_STD_VER >= 2017
#endif // _CUDA___CMATH_ROUND_DOWN_H
21 changes: 9 additions & 12 deletions libcudacxx/include/cuda/__cmath/round_up.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,15 @@
# pragma system_header
#endif // no system header

#if _CCCL_STD_VER >= 2017

# include <cuda/__cmath/ceil_div.h>
# include <cuda/std/__concepts/concept_macros.h>
# include <cuda/std/__type_traits/common_type.h>
# include <cuda/std/__type_traits/is_enum.h>
# include <cuda/std/__type_traits/is_integral.h>
# include <cuda/std/__type_traits/is_signed.h>
# include <cuda/std/__type_traits/make_unsigned.h>
# include <cuda/std/__utility/to_underlying.h>
# include <cuda/std/limits>
#include <cuda/__cmath/ceil_div.h>
#include <cuda/std/__concepts/concept_macros.h>
#include <cuda/std/__type_traits/common_type.h>
#include <cuda/std/__type_traits/is_enum.h>
#include <cuda/std/__type_traits/is_integral.h>
#include <cuda/std/__type_traits/is_signed.h>
#include <cuda/std/__type_traits/make_unsigned.h>
#include <cuda/std/__utility/to_underlying.h>
#include <cuda/std/limits>

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

Expand Down Expand Up @@ -101,5 +99,4 @@ round_up(const _Tp __a, const _Up __b) noexcept

_LIBCUDACXX_END_NAMESPACE_CUDA

#endif // _CCCL_STD_VER >= 2017
#endif // _CUDA___CMATH_ROUND_UP_H
25 changes: 6 additions & 19 deletions libcudacxx/include/cuda/__functional/address_stability.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,6 @@ _CCCL_INLINE_VAR constexpr bool proclaims_copyable_arguments_v = proclaims_copya
template <typename F>
struct __callable_permitting_copied_arguments : F
{
#if _CCCL_STD_VER <= 2014
template <typename G>
_LIBCUDACXX_HIDE_FROM_ABI constexpr __callable_permitting_copied_arguments(G&& g)
: F(::cuda::std::forward<G>(g))
{}
#endif // _CCCL_STD_VER <= 2014

using F::operator();
};

Expand All @@ -76,11 +69,9 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr auto proclaim_copyable_argum

// Specializations for libcu++ function objects are provided here to not pull this include into `<cuda/std/...>` headers

#if _CCCL_STD_VER >= 2017
template <typename _Fn>
struct proclaims_copyable_arguments<_CUDA_VSTD::__not_fn_t<_Fn>> : proclaims_copyable_arguments<_Fn>
{};
#endif // _CCCL_STD_VER > 2014

template <typename _Tp>
struct __has_builtin_operators
Expand Down Expand Up @@ -118,13 +109,11 @@ _LIBCUDACXX_MARK_CAN_COPY_ARGUMENTS(_CUDA_VSTD::logical_and);
_LIBCUDACXX_MARK_CAN_COPY_ARGUMENTS(_CUDA_VSTD::logical_not);
_LIBCUDACXX_MARK_CAN_COPY_ARGUMENTS(_CUDA_VSTD::logical_or);

#if _CCCL_STD_VER >= 2017

# define _LIBCUDACXX_MARK_RANGE_FUNCTOR_CAN_COPY_ARGUMENTS(functor) \
/*we do not know what equal_to etc. does, which depends on the types and their operator== it is invoked on */ \
template <> \
struct proclaims_copyable_arguments<functor> : _CUDA_VSTD::false_type \
{};
#define _LIBCUDACXX_MARK_RANGE_FUNCTOR_CAN_COPY_ARGUMENTS(functor) \
/*we do not know what equal_to etc. does, which depends on the types and their operator== it is invoked on */ \
template <> \
struct proclaims_copyable_arguments<functor> : _CUDA_VSTD::false_type \
{};

_LIBCUDACXX_MARK_RANGE_FUNCTOR_CAN_COPY_ARGUMENTS(_CUDA_VRANGES::equal_to);
_LIBCUDACXX_MARK_RANGE_FUNCTOR_CAN_COPY_ARGUMENTS(_CUDA_VRANGES::not_equal_to);
Expand All @@ -133,9 +122,7 @@ _LIBCUDACXX_MARK_RANGE_FUNCTOR_CAN_COPY_ARGUMENTS(_CUDA_VRANGES::less_equal);
_LIBCUDACXX_MARK_RANGE_FUNCTOR_CAN_COPY_ARGUMENTS(_CUDA_VRANGES::greater);
_LIBCUDACXX_MARK_RANGE_FUNCTOR_CAN_COPY_ARGUMENTS(_CUDA_VRANGES::greater_equal);

# undef _LIBCUDACXX_MARK_RANGE_FUNCTOR_CAN_COPY_ARGUMENTS

#endif // _CCCL_STD_VER >= 2017
#undef _LIBCUDACXX_MARK_RANGE_FUNCTOR_CAN_COPY_ARGUMENTS

_LIBCUDACXX_END_NAMESPACE_CUDA

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA

extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_shared_global_is_not_supported_before_SM_90__();
template <typename _Group>
inline __device__ void __cp_async_bulk_shared_global(
inline _CCCL_DEVICE void __cp_async_bulk_shared_global(
const _Group& __g, char* __dest, const char* __src, _CUDA_VSTD::size_t __size, _CUDA_VSTD::uint64_t* __bar_handle)
{
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
Expand Down
Loading

0 comments on commit 902021f

Please sign in to comment.