-
Notifications
You must be signed in to change notification settings - Fork 190
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'main' into stf_small_vector
- Loading branch information
Showing
1,753 changed files
with
2,140 additions
and
5,891 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,54 +1,67 @@ | ||
.. _libcudacxx-extended-api-math-ceil-div: | ||
|
||
Math | ||
===== | ||
``ceil_div`` Ceiling division | ||
============================= | ||
|
||
.. code:: cuda | ||
template <typename T, typename U> | ||
[[nodiscard]] __host__ __device__ inline | ||
constexpr _CUDA_VSTD::common_type_t<_Tp, _Up> ceil_div(T a, U b) noexcept; | ||
constexpr cuda::std::common_type_t<T, U> ceil_div(T value, U divisor) noexcept; | ||
ceil_div | ||
--------- | ||
The function computes the ceiling division between two integral or enumerator values :math:`ceil(\frac{value}{divisor})`. | ||
|
||
- *Requires*: ``T`` is an integral type (including 128-bit integers) or enumerator. | ||
- *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true. | ||
- *Returns*: divides ``a`` by ``b``. If ``a`` is not a multiple of ``b``, rounds the result up to the next integer value. | ||
**Parameters** | ||
|
||
**Performance considerations** | ||
|
||
- The function computes ``(a + b - 1) / b`` when the common type is a signed integer. | ||
- The function computes ``min(a, 1 + ((a - 1) / b))`` when the common type is an unsigned integer in CUDA, which generates fewer instructions than ``(a / b) + ((a / b) * b != a)``, especially for 64-bit types. | ||
- ``value``: The value to be divided. | ||
- ``divisor``: The divisor. | ||
|
||
**Example**: This API is very useful for determining the *number of thread blocks* required to process a fixed amount of work, given a fixed number of threads per block: | ||
**Return value** | ||
|
||
.. code:: cuda | ||
Divides ``value`` by ``divisor``. If ``value`` is not a multiple of ``divisor``, rounds the result up to the next integer value. | ||
|
||
#include <vector> | ||
#include <cuda/cmath> | ||
**Preconditions** | ||
|
||
__global__ void vscale(int n, float s, float *x) { | ||
int i = blockIdx.x * blockDim.x + threadIdx.x; | ||
if (i < n) x[i] *= s; | ||
} | ||
- *Compile-time*: ``T`` and ``U`` are integral types (including 128-bit integers) or enumerators. | ||
- *Run-time*: ``value >= 0`` and ``divisor > 0``. | ||
|
||
int main() { | ||
const int n = 100000; | ||
const float s = 2.f; | ||
std::vector<float> x(n, 1.f); | ||
**Performance considerations** | ||
|
||
// Given a fixed number of threads per block... | ||
constexpr int threads_per_block = 256; | ||
- The function computes ``(value + divisor - 1) / divisor`` when the common type is a signed integer. | ||
- The function computes ``min(value, 1 + ((value - 1) / divisor))`` when the common type is an unsigned integer in CUDA, which generates fewer instructions than ``(value / divisor) + ((value / divisor) * divisor != value)``, especially for 64-bit types. | ||
|
||
// ...dividing some "n" by "threads_per_block" may lead to a remainder, | ||
// requiring the kernel to be launched with an extra thread block to handle it. | ||
const int thread_blocks = cuda::ceil_div(n, threads_per_block); | ||
Example | ||
------- | ||
|
||
vscale<<<thread_blocks, threads_per_block>>>(n, s, x.data()); | ||
cudaDeviceSynchronize(); | ||
This API is very useful for determining the *number of thread blocks* required to process a fixed amount of work, given a fixed number of threads per block: | ||
|
||
return 0; | ||
} | ||
.. code:: cuda | ||
`See it on Godbolt TODO` | ||
#include <cuda/cmath> | ||
#include <cuda/std/span> | ||
#include <thrust/device_vector.h> | ||
__global__ void vector_scale_kernel(cuda::std::span<float> span, float scale) { | ||
int index = blockIdx.x * blockDim.x + threadIdx.x; | ||
if (index < span.size()) | ||
span[index] *= scale; | ||
} | ||
int main() { | ||
int num_items = 100'000; | ||
float scale = 2.f; | ||
thrust::device_vector<float> d_vector(num_items, 1.f); | ||
// Given a fixed number of threads per block... | ||
constexpr int threads_per_block = 256; | ||
// ...dividing "num_items" by "threads_per_block" may lead to a remainder, | ||
// requiring the kernel to be launched with an extra thread block to handle it. | ||
auto num_thread_blocks = cuda::ceil_div(num_items, threads_per_block); | ||
auto d_ptr = thrust::raw_pointer_cast(d_vector.data()); | ||
cuda::std::span<float> d_span(d_ptr, num_items); | ||
vector_scale_kernel<<<num_thread_blocks, threads_per_block>>>(d_span, scale); | ||
cudaDeviceSynchronize(); | ||
return 0; | ||
} | ||
`See it on Godbolt 🔗 <https://godbolt.org/z/hbxscWGT9>`_ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.