Skip to content

Commit

Permalink
Merge branch 'main' into stf_small_vector
Browse files Browse the repository at this point in the history
  • Loading branch information
caugonnet authored Feb 11, 2025
2 parents 5f3dd0a + a1e5711 commit 902021f
Show file tree
Hide file tree
Showing 1,753 changed files with 2,140 additions and 5,891 deletions.
6 changes: 3 additions & 3 deletions docs/libcudacxx/extended_api/math.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@ Math

* - :ref:`ceil_div <libcudacxx-extended-api-math-ceil-div>`
- Ceiling division
- CCCL 2.6.0 / CUDA 12.6
- CCCL 2.7.0 / CUDA 12.8

* - :ref:`round_up <libcudacxx-extended-api-math-round-up>`
- Round to the next multiple
- Round up to the next multiple
- CCCL 2.9.0 / CUDA 12.9

* - :ref:`round_down <libcudacxx-extended-api-math-round-down>`
- Round to the previous multiple
- Round down to the previous multiple
- CCCL 2.9.0 / CUDA 12.9
81 changes: 47 additions & 34 deletions docs/libcudacxx/extended_api/math/ceil_div.rst
Original file line number Diff line number Diff line change
@@ -1,54 +1,67 @@
.. _libcudacxx-extended-api-math-ceil-div:

Math
=====
``ceil_div`` Ceiling division
=============================

.. code:: cuda
template <typename T, typename U>
[[nodiscard]] __host__ __device__ inline
constexpr _CUDA_VSTD::common_type_t<_Tp, _Up> ceil_div(T a, U b) noexcept;
constexpr cuda::std::common_type_t<T, U> ceil_div(T value, U divisor) noexcept;
ceil_div
---------
The function computes the ceiling division of two integral or enumerator values :math:`ceil(\frac{value}{divisor})`.

- *Requires*: ``T`` is an integral type (including 128-bit integers) or enumerator.
- *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true.
- *Returns*: divides ``a`` by ``b``. If ``a`` is not a multiple of ``b`` rounds the result up to the next integer value.
**Parameters**

**Performance considerations**

- The function computes ``(a + b - 1) / b`` when the common type is a signed integer.
- The function computes ``min(a, 1 + ((a - 1) / b)`` when the common type is an unsigned integer in CUDA, which generates less instructions than ``(a / b) + ((a / b) * b != a)``, especially for 64-bit types.
- ``value``: The value to be divided.
- ``divisor``: The divisor.

**Example**: This API is very useful for determining the *number of thread blocks* required to process a fixed amount of work, given a fixed number of threads per block:
**Return value**

.. code:: cuda
Divides ``value`` by ``divisor``. If ``value`` is not a multiple of ``divisor`` rounds the result up to the next integer value.

#include <vector>
#include <cuda/cmath>
**Preconditions**

__global__ void vscale(int n, float s, float *x) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n) x[i] *= s;
}
- *Compile-time*: ``T`` and ``U`` are integral types (including 128-bit integers) or enumerators.
- *Run-time*: ``value >= 0`` and ``divisor > 0``.

int main() {
const int n = 100000;
const float s = 2.f;
std::vector<float> x(n, 1.f);
**Performance considerations**

// Given a fixed number of threads per block...
constexpr int threads_per_block = 256;
- The function computes ``(value + divisor - 1) / divisor`` when the common type is a signed integer.
- The function computes ``min(value, 1 + ((value - 1) / divisor))`` when the common type is an unsigned integer in CUDA, which generates fewer instructions than ``(value / divisor) + ((value / divisor) * divisor != value)``, especially for 64-bit types.

// ...dividing some "n" by "threads_per_block" may lead to a remainder,
// requiring the kernel to be launched with an extra thread block to handle it.
const int thread_blocks = cuda::ceil_div(n, threads_per_block);
Example
-------

vscale<<<thread_blocks, threads_per_block>>>(n, s, x.data());
cudaDeviceSynchronize();
This API is very useful for determining the *number of thread blocks* required to process a fixed amount of work, given a fixed number of threads per block:

return 0;
}
.. code:: cuda
`See it on Godbolt TODO`
#include <cuda/cmath>
#include <cuda/std/span>
#include <thrust/device_vector.h>
__global__ void vector_scale_kernel(cuda::std::span<float> span, float scale) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < span.size())
span[index] *= scale;
}
int main() {
int num_items = 100'000;
float scale = 2.f;
thrust::device_vector<float> d_vector(num_items, 1.f);
// Given a fixed number of threads per block...
constexpr int threads_per_block = 256;
// ...dividing some "n" by "threads_per_block" may lead to a remainder,
// requiring the kernel to be launched with an extra thread block to handle it.
auto num_thread_blocks = cuda::ceil_div(num_items, threads_per_block);
auto d_ptr = thrust::raw_pointer_cast(d_vector.data());
cuda::std::span<float> d_span(d_ptr, num_items);
vector_scale_kernel<<<num_thread_blocks, threads_per_block>>>(d_span, scale);
cudaDeviceSynchronize();
return 0;
}
`See it on Godbolt 🔗 <https://godbolt.org/z/hbxscWGT9>`_
48 changes: 31 additions & 17 deletions docs/libcudacxx/extended_api/math/round_down.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,34 +5,48 @@

.. code:: cuda
template <typename T, typename = U>
template <typename T, typename U>
[[nodiscard]] __host__ __device__ inline
constexpr cuda::std::common_type_t<T, U> round_down(T value, U base_multiple) noexcept;
``value``: The value to be rounded down.
``base_multiple``: The base multiple to which the value rounds down.
The function rounds an integral or enumerator value down to the largest multiple of the base :math:`floor(\frac{value}{base\_multiple}) * base\_multiple`

- *Requires*: ``T`` and ``U`` are integral types (including 128-bit integers) or enumerators.
- *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true.
- *Returns*: ``a`` rounded down to the largest multiple of ``b`` less than or equal to ``a``. If ``a`` is already a multiple of ``b``, return ``a``.
**Parameters**

.. note::
- ``value``: The value to be rounded down.
- ``base_multiple``: The base multiple to which the value rounds down.

The function requires C++17 onwards
**Return value**

**Performance considerations**:
``value`` rounded down to the largest multiple of ``base_multiple`` less than or equal to ``value``. If ``value`` is already a multiple of ``base_multiple``, returns ``value``.

- The function performs a truncation division followed by a multiplication. It provides better performance than ``a / b * b`` when the common type is a signed integer
**Preconditions**

**Example**:
- *Compile-time*: ``T`` and ``U`` are integral types (including 128-bit integers) or enumerators.
- *Run-time*: ``value >= 0`` and ``base_multiple > 0``.

**Performance considerations**

- The function performs a truncation division followed by a multiplication. It provides better performance than ``(value / base_multiple) * base_multiple`` when the common type is a signed integer

Example
-------

.. code:: cuda
#include <cuda/cmath>
#include <cuda/cmath>
#include <cstdio>
__global__ void round_up_kernel() {
int value = 7;
unsigned multiple = 3;
printf("%d\n", cuda::round_down(value, multiple)); // print "6"
}
__global__ void example_kernel(int a, unsigned b, unsigned* result) {
// a = 7, b = 3 -> result = 6
*result = cuda::round_down(a, b);
}
int main() {
round_up_kernel<<<1, 1>>>();
cudaDeviceSynchronize();
return 0;
}
`See it on Godbolt TODO`
`See it on Godbolt 🔗 <https://godbolt.org/z/9vcxo3d8j>`_
50 changes: 33 additions & 17 deletions docs/libcudacxx/extended_api/math/round_up.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,36 +5,52 @@

.. code:: cuda
template <typename T, typename = U>
template <typename T, typename U>
[[nodiscard]] __host__ __device__ inline
constexpr cuda::std::common_type_t<T, U> round_up(T value, U base_multiple) noexcept;
``value``: The value to be rounded up.
``base_multiple``: The base multiple to which the value rounds up.
The function rounds an integral or enumerator value up to the smallest multiple of the base :math:`ceil(\frac{value}{base\_multiple}) * base\_multiple`

- *Requires*: ``T`` and ``U`` are integral types (including 128-bit integers) or enumerators.
- *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true.
- *Returns*: ``a`` rounded up to the smallest multiple of ``b`` greater than or equal to ``a``. If ``a`` is already a multiple of ``b``, return ``a``.
- *Note*: the result can overflow if ``ceil(a / b) * b`` exceeds the maximum value of the common type of
``a`` and ``b``. The condition is checked in debug mode.
**Parameters**

- ``value``: The value to be rounded up.
- ``base_multiple``: The base multiple to which the value rounds up.

**Return value**

``value`` rounded up to the smallest multiple of ``base_multiple`` greater than or equal to ``value``. If ``value`` is already a multiple of ``base_multiple``, returns ``value``.

.. note::

The function requires C++17 onwards
The result can overflow if ``ceil(value / base_multiple) * base_multiple`` exceeds the maximum value of the common type of ``value`` and ``base_multiple``. The condition is checked in debug mode.

**Performance considerations**:
**Preconditions**

- *Compile-time*: ``T`` and ``U`` are integral types (including 128-bit integers) or enumerators.
- *Run-time*: ``value >= 0`` and ``base_multiple > 0``.

**Performance considerations**

- The function performs a ceiling division (``cuda::ceil_div()``) followed by a multiplication

**Example**:
Example
-------

.. code:: cuda
#include <cuda/cmath>
#include <cuda/cmath>
#include <cstdio>
__global__ void round_up_kernel() {
int value = 7;
unsigned multiple = 3;
printf("%d\n", cuda::round_up(value, multiple)); // print "9"
}
__global__ void example_kernel(int a, unsigned b, unsigned* result) {
// a = 7, b = 3 -> result = 9
*result = cuda::round_up(a, b);
}
int main() {
round_up_kernel<<<1, 1>>>();
cudaDeviceSynchronize();
return 0;
}
`See it on Godbolt TODO`
`See it on Godbolt 🔗 <https://godbolt.org/z/9vcxo3d8j>`_
19 changes: 8 additions & 11 deletions libcudacxx/include/cuda/__cmath/round_down.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,14 @@
# pragma system_header
#endif // no system header

#if _CCCL_STD_VER >= 2017

# include <cuda/std/__concepts/concept_macros.h>
# include <cuda/std/__type_traits/common_type.h>
# include <cuda/std/__type_traits/is_enum.h>
# include <cuda/std/__type_traits/is_integral.h>
# include <cuda/std/__type_traits/is_signed.h>
# include <cuda/std/__type_traits/make_unsigned.h>
# include <cuda/std/__utility/to_underlying.h>
# include <cuda/std/limits>
#include <cuda/std/__concepts/concept_macros.h>
#include <cuda/std/__type_traits/common_type.h>
#include <cuda/std/__type_traits/is_enum.h>
#include <cuda/std/__type_traits/is_integral.h>
#include <cuda/std/__type_traits/is_signed.h>
#include <cuda/std/__type_traits/make_unsigned.h>
#include <cuda/std/__utility/to_underlying.h>
#include <cuda/std/limits>

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

Expand Down Expand Up @@ -99,5 +97,4 @@ round_down(const _Tp __a, const _Up __b) noexcept

_LIBCUDACXX_END_NAMESPACE_CUDA

#endif // _CCCL_STD_VER >= 2017
#endif // _CUDA___CMATH_ROUND_DOWN_H
21 changes: 9 additions & 12 deletions libcudacxx/include/cuda/__cmath/round_up.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,15 @@
# pragma system_header
#endif // no system header

#if _CCCL_STD_VER >= 2017

# include <cuda/__cmath/ceil_div.h>
# include <cuda/std/__concepts/concept_macros.h>
# include <cuda/std/__type_traits/common_type.h>
# include <cuda/std/__type_traits/is_enum.h>
# include <cuda/std/__type_traits/is_integral.h>
# include <cuda/std/__type_traits/is_signed.h>
# include <cuda/std/__type_traits/make_unsigned.h>
# include <cuda/std/__utility/to_underlying.h>
# include <cuda/std/limits>
#include <cuda/__cmath/ceil_div.h>
#include <cuda/std/__concepts/concept_macros.h>
#include <cuda/std/__type_traits/common_type.h>
#include <cuda/std/__type_traits/is_enum.h>
#include <cuda/std/__type_traits/is_integral.h>
#include <cuda/std/__type_traits/is_signed.h>
#include <cuda/std/__type_traits/make_unsigned.h>
#include <cuda/std/__utility/to_underlying.h>
#include <cuda/std/limits>

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

Expand Down Expand Up @@ -101,5 +99,4 @@ round_up(const _Tp __a, const _Up __b) noexcept

_LIBCUDACXX_END_NAMESPACE_CUDA

#endif // _CCCL_STD_VER >= 2017
#endif // _CUDA___CMATH_ROUND_UP_H
25 changes: 6 additions & 19 deletions libcudacxx/include/cuda/__functional/address_stability.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,6 @@ _CCCL_INLINE_VAR constexpr bool proclaims_copyable_arguments_v = proclaims_copya
template <typename F>
struct __callable_permitting_copied_arguments : F
{
#if _CCCL_STD_VER <= 2014
template <typename G>
_LIBCUDACXX_HIDE_FROM_ABI constexpr __callable_permitting_copied_arguments(G&& g)
: F(::cuda::std::forward<G>(g))
{}
#endif // _CCCL_STD_VER <= 2014

using F::operator();
};

Expand All @@ -76,11 +69,9 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr auto proclaim_copyable_argum

// Specializations for libcu++ function objects are provided here to not pull this include into `<cuda/std/...>` headers

#if _CCCL_STD_VER >= 2017
template <typename _Fn>
struct proclaims_copyable_arguments<_CUDA_VSTD::__not_fn_t<_Fn>> : proclaims_copyable_arguments<_Fn>
{};
#endif // _CCCL_STD_VER > 2014

template <typename _Tp>
struct __has_builtin_operators
Expand Down Expand Up @@ -118,13 +109,11 @@ _LIBCUDACXX_MARK_CAN_COPY_ARGUMENTS(_CUDA_VSTD::logical_and);
_LIBCUDACXX_MARK_CAN_COPY_ARGUMENTS(_CUDA_VSTD::logical_not);
_LIBCUDACXX_MARK_CAN_COPY_ARGUMENTS(_CUDA_VSTD::logical_or);

#if _CCCL_STD_VER >= 2017

# define _LIBCUDACXX_MARK_RANGE_FUNCTOR_CAN_COPY_ARGUMENTS(functor) \
/*we do not know what equal_to etc. does, which depends on the types and their operator== it is invoked on */ \
template <> \
struct proclaims_copyable_arguments<functor> : _CUDA_VSTD::false_type \
{};
#define _LIBCUDACXX_MARK_RANGE_FUNCTOR_CAN_COPY_ARGUMENTS(functor) \
/*we do not know what equal_to etc. does, which depends on the types and their operator== it is invoked on */ \
template <> \
struct proclaims_copyable_arguments<functor> : _CUDA_VSTD::false_type \
{};

_LIBCUDACXX_MARK_RANGE_FUNCTOR_CAN_COPY_ARGUMENTS(_CUDA_VRANGES::equal_to);
_LIBCUDACXX_MARK_RANGE_FUNCTOR_CAN_COPY_ARGUMENTS(_CUDA_VRANGES::not_equal_to);
Expand All @@ -133,9 +122,7 @@ _LIBCUDACXX_MARK_RANGE_FUNCTOR_CAN_COPY_ARGUMENTS(_CUDA_VRANGES::less_equal);
_LIBCUDACXX_MARK_RANGE_FUNCTOR_CAN_COPY_ARGUMENTS(_CUDA_VRANGES::greater);
_LIBCUDACXX_MARK_RANGE_FUNCTOR_CAN_COPY_ARGUMENTS(_CUDA_VRANGES::greater_equal);

# undef _LIBCUDACXX_MARK_RANGE_FUNCTOR_CAN_COPY_ARGUMENTS

#endif // _CCCL_STD_VER >= 2017
#undef _LIBCUDACXX_MARK_RANGE_FUNCTOR_CAN_COPY_ARGUMENTS

_LIBCUDACXX_END_NAMESPACE_CUDA

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA

extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_shared_global_is_not_supported_before_SM_90__();
template <typename _Group>
inline __device__ void __cp_async_bulk_shared_global(
inline _CCCL_DEVICE void __cp_async_bulk_shared_global(
const _Group& __g, char* __dest, const char* __src, _CUDA_VSTD::size_t __size, _CUDA_VSTD::uint64_t* __bar_handle)
{
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
Expand Down
Loading

0 comments on commit 902021f

Please sign in to comment.