From f1d714d1d91ec034e1cc37fae6f3afe7afabb29e Mon Sep 17 00:00:00 2001 From: Di Nguyen Date: Thu, 5 Dec 2024 21:10:25 -0700 Subject: [PATCH] [rocPRIM]Mergeback 6.4 hotfixes (#664) * Add atomic_add overload for double (#652) The rocprim::histogram_even algorithm uses rocprim::atomic_add. There are atomic_add overloads for all of the non-half-type primitive data types we support, with the exception of double. Because of this, if you called rocprim::histogram_even and used double as the counter type, you'd get a compilation error. This changes just adds an atomic_add overload that accepts double, and tacks on a few unit test cases that exercise histogram_even with float and double counters. * Make temp_storage functions __host__ and __device__ (#658) * Enable CMake HIP language (#655) * Restart on HIP language support * linking changes * rearrange --------- Co-authored-by: Lauren Wrubleski * Fix for rocprim::batch_copy when using iterators (#662) * Fix for using iterators in batch_copy * Add indirect_iterator test to batch_memcpy * Add specific iterator test based on reproducer, with iterators in iterators. * Fix warp dpp reduction (#660) Co-authored-by: Robin Voetter --------- Co-authored-by: Wayne Franz Co-authored-by: Nick Breed <78807921+NB4444@users.noreply.github.com> Co-authored-by: Lauren Wrubleski Co-authored-by: Beatriz Navidad Vilches <61422851+Beanavil@users.noreply.github.com> Co-authored-by: Robin Voetter --- CHANGELOG.md | 1 + CMakeLists.txt | 109 +++++--- benchmark/CMakeLists.txt | 7 +- cmake/Summary.cmake | 21 +- cmake/VerifyCompiler.cmake | 12 +- example/CMakeLists.txt | 10 +- .../block/detail/block_reduce_warp_reduce.hpp | 11 +- .../include/rocprim/detail/temp_storage.hpp | 18 +- .../device/detail/device_batch_memcpy.hpp | 8 +- rocprim/include/rocprim/intrinsics/atomic.hpp | 6 + .../rocprim/warp/detail/warp_reduce_dpp.hpp | 33 ++- test/CMakeLists.txt | 10 +- test/extra/CMakeLists.txt | 23 +- test/rocprim/CMakeLists.txt | 13 +- test/rocprim/test_device_batch_memcpy.cpp | 253 +++++++++++++----- test/rocprim/test_device_histogram.cpp | 3 +- 16 files changed, 399 insertions(+), 139 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 033a2ad5a..0db0f029e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,6 +38,7 @@ Full documentation for rocPRIM is available at [https://rocm.docs.amd.com/projec * Fixed an issue where `rocprim::partial_sort_copy` would yield a compile error if the input iterator is const. * Fixed incorrect 128-bit signed and unsigned integers type traits. * Fixed compilation issue when `rocprim::radix_key_codec<...>` is specialized with a 128-bit integer. +* Fixed the warp-level reduction `rocprim::warp_reduce.reduce` DPP implementation to avoid undefined intermediate values during the reduction. ### Upcoming changes * Using the initialisation constructor of `rocprim::reverse_iterator` will throw a deprecation warning. It will be marked as explicit in the next major release. diff --git a/CMakeLists.txt b/CMakeLists.txt index 1f7fe9d9e..c20fe2bed 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,6 +29,25 @@ set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended # rocPRIM project project(rocprim LANGUAGES CXX) +# Set CXX flags +if (NOT DEFINED CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17) +endif() +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + +# Set HIP flags +set(CMAKE_HIP_STANDARD 14) +set(CMAKE_HIP_STANDARD_REQUIRED ON) +set(CMAKE_HIP_EXTENSIONS OFF) + +# Set CXX standard +if (CMAKE_CXX_STANDARD EQUAL 14) + message(WARNING "C++14 will be deprecated in the next major release") +elseif(NOT CMAKE_CXX_STANDARD EQUAL 17) + message(FATAL_ERROR "Only C++14 and C++17 are supported") +endif() + if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR) set(ROCPRIM_PROJECT_IS_TOP_LEVEL TRUE) else() @@ -42,17 +61,36 @@ else() set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation") endif() +include(CheckLanguage) +include(CMakeDependentOption) + # Build options -option(BUILD_TEST "Build tests (requires googletest)" OFF) -option(BUILD_BENCHMARK "Build benchmarks" OFF) -option(BUILD_NAIVE_BENCHMARK "Build naive benchmarks" OFF) -option(BUILD_EXAMPLE "Build examples" OFF) -option(BUILD_DOCS "Build documentation (requires sphinx)" OFF) # Disables building tests, benchmarks, examples option(ONLY_INSTALL "Only install" OFF) +cmake_dependent_option(BUILD_TEST "Build tests (requires googletest)" OFF "NOT ONLY_INSTALL" OFF) +cmake_dependent_option(BUILD_BENCHMARK "Build benchmarks" OFF "NOT ONLY_INSTALL" OFF) +cmake_dependent_option(BUILD_EXAMPLE "Build examples" OFF "NOT ONLY_INSTALL" OFF) +option(BUILD_NAIVE_BENCHMARK "Build naive benchmarks" OFF) +cmake_dependent_option(BUILD_DOCS "Build documentation (requires sphinx)" OFF "NOT ONLY_INSTALL" OFF) option(BUILD_CODE_COVERAGE "Build with code coverage enabled" OFF) option(ROCPRIM_INSTALL "Enable installation of rocPRIM (projects embedding rocPRIM may want to turn this OFF)" ON) +check_language(HIP) +cmake_dependent_option(USE_HIPCXX "Use CMake HIP language support" OFF CMAKE_HIP_COMPILER OFF) + +if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR) + set(ROCPRIM_PROJECT_IS_TOP_LEVEL TRUE) +else() + set(ROCPRIM_PROJECT_IS_TOP_LEVEL FALSE) +endif() + +#Adding CMAKE_PREFIX_PATH +if(WIN32) + set(ROCM_ROOT "$ENV{HIP_PATH}" CACHE PATH "Root directory of the ROCm installation") +else() + set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation") +endif() + # CMake modules list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake @@ -68,19 +106,6 @@ endif() set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE CACHE BOOL "Add paths to linker search and installed rpath") -# Set CXX flags -if (NOT DEFINED CMAKE_CXX_STANDARD) - set(CMAKE_CXX_STANDARD 17) -endif() -set(CMAKE_CXX_STANDARD_REQUIRED ON) -set(CMAKE_CXX_EXTENSIONS OFF) - -if (CMAKE_CXX_STANDARD EQUAL 14) - message(WARNING "C++14 will be deprecated in the next major release") -elseif(NOT CMAKE_CXX_STANDARD EQUAL 17) - message(FATAL_ERROR "Only C++14 and C++17 are supported") -endif() - if(DEFINED BUILD_SHARED_LIBS) set(PKG_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) else() @@ -91,26 +116,30 @@ set(BUILD_SHARED_LIBS OFF) # don't build client dependencies as shared # Get dependencies (required here to get rocm-cmake) include(cmake/Dependencies.cmake) # Use target ID syntax if supported for GPU_TARGETS -if (NOT DEFINED AMDGPU_TARGETS) - set(GPU_TARGETS "all" CACHE STRING "GPU architectures to compile for") +if(USE_HIPCXX) + enable_language(HIP) else() - set(GPU_TARGETS "${AMDGPU_TARGETS}" CACHE STRING "GPU architectures to compile for") -endif() -set_property(CACHE GPU_TARGETS PROPERTY STRINGS "all") - -if(GPU_TARGETS STREQUAL "all") - if(BUILD_ADDRESS_SANITIZER) - # ASAN builds require xnack - rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS - TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx942:xnack+" - ) + if (NOT DEFINED AMDGPU_TARGETS) + set(GPU_TARGETS "all" CACHE STRING "GPU architectures to compile for") else() - rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS - TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201" - ) + set(GPU_TARGETS "${AMDGPU_TARGETS}" CACHE STRING "GPU architectures to compile for") + endif() + set_property(CACHE GPU_TARGETS PROPERTY STRINGS "all") + + if(GPU_TARGETS STREQUAL "all") + if(BUILD_ADDRESS_SANITIZER) + # ASAN builds require xnack + rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS + TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx942:xnack+" + ) + else() + rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS + TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201" + ) + endif() + + set(GPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "GPU architectures to compile for" FORCE) endif() - - set(GPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "GPU architectures to compile for" FORCE) endif() # TODO: Fix VerifyCompiler for HIP on Windows @@ -147,12 +176,12 @@ print_configuration_summary() # rocPRIM library add_subdirectory(rocprim) -if(ROCPRIM_PROJECT_IS_TOP_LEVEL AND NOT ONLY_INSTALL AND (BUILD_TEST OR BUILD_BENCHMARK)) +if(ROCPRIM_PROJECT_IS_TOP_LEVEL AND (BUILD_TEST OR BUILD_BENCHMARK)) rocm_package_setup_component(clients) endif() # Tests -if(BUILD_TEST AND NOT ONLY_INSTALL) +if(BUILD_TEST) if (ROCPRIM_PROJECT_IS_TOP_LEVEL) rocm_package_setup_client_component(tests) endif() @@ -161,7 +190,7 @@ if(BUILD_TEST AND NOT ONLY_INSTALL) endif() # Benchmarks -if(BUILD_BENCHMARK AND NOT ONLY_INSTALL) +if(BUILD_BENCHMARK) if (ROCPRIM_PROJECT_IS_TOP_LEVEL) rocm_package_setup_client_component(benchmarks) endif() @@ -169,12 +198,12 @@ if(BUILD_BENCHMARK AND NOT ONLY_INSTALL) endif() # Examples -if(BUILD_EXAMPLE AND NOT ONLY_INSTALL) +if(BUILD_EXAMPLE) add_subdirectory(example) endif() # Docs -if(BUILD_DOCS AND NOT ONLY_INSTALL) +if(BUILD_DOCS) add_subdirectory(docs) endif() diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index d8ef35f90..64975916d 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -34,6 +34,10 @@ endif() function(add_rocprim_benchmark BENCHMARK_SOURCE) get_filename_component(BENCHMARK_TARGET ${BENCHMARK_SOURCE} NAME_WE) + if(USE_HIPCXX) + set_source_files_properties(${BENCHMARK_SOURCE} PROPERTIES LANGUAGE HIP) + endif() + if(BENCHMARK_CONFIG_TUNING) if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/${BENCHMARK_TARGET}.parallel.cpp.in") message(STATUS "found ${BENCHMARK_TARGET}.parallel.cpp.in file, compiling in parallel.") @@ -77,9 +81,10 @@ function(add_rocprim_benchmark BENCHMARK_SOURCE) rocprim benchmark::benchmark ) + target_link_libraries(${BENCHMARK_TARGET} PRIVATE - rocprim_hip + $,hip::host,hip::device> ) target_compile_options(${BENCHMARK_TARGET} diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 474433306..58c286b32 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -25,10 +25,17 @@ function(print_configuration_summary) message(STATUS "******** Summary ********") message(STATUS "General:") message(STATUS " System : ${CMAKE_SYSTEM_NAME}") - message(STATUS " C++ compiler : ${CMAKE_CXX_COMPILER}") - message(STATUS " C++ compiler version : ${CMAKE_CXX_COMPILER_VERSION}") - string(STRIP "${CMAKE_CXX_FLAGS}" CMAKE_CXX_FLAGS_STRIP) - message(STATUS " CXX flags : ${CMAKE_CXX_FLAGS_STRIP}") + if(USE_HIPCXX) + message(STATUS " HIP compiler : ${CMAKE_HIP_COMPILER}") + message(STATUS " HIP compiler version : ${CMAKE_HIP_COMPILER_VERSION}") + string(STRIP "${CMAKE_HIP_FLAGS}" CMAKE_HIP_FLAGS_STRIP) + message(STATUS " HIP flags : ${CMAKE_HIP_FLAGS_STRIP}") + else() + message(STATUS " C++ compiler : ${CMAKE_CXX_COMPILER}") + message(STATUS " C++ compiler version : ${CMAKE_CXX_COMPILER_VERSION}") + string(STRIP "${CMAKE_CXX_FLAGS}" CMAKE_CXX_FLAGS_STRIP) + message(STATUS " CXX flags : ${CMAKE_CXX_FLAGS_STRIP}") + endif() get_property(GENERATOR_IS_MULTI_CONFIG GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG) if(GENERATOR_IS_MULTI_CONFIG) message(STATUS " Build types : ${CMAKE_CONFIGURATION_TYPES}") @@ -36,7 +43,11 @@ function(print_configuration_summary) message(STATUS " Build type : ${CMAKE_BUILD_TYPE}") endif() message(STATUS " Install prefix : ${CMAKE_INSTALL_PREFIX}") - message(STATUS " Device targets : ${GPU_TARGETS}") + if(USE_HIPCXX) + message(STATUS " Device targets : ${CMAKE_HIP_ARCHITECTURES}") + else() + message(STATUS " Device targets : ${GPU_TARGETS}") + endif() message(STATUS "") message(STATUS " ONLY_INSTALL : ${ONLY_INSTALL}") message(STATUS " BUILD_TEST : ${BUILD_TEST}") diff --git a/cmake/VerifyCompiler.cmake b/cmake/VerifyCompiler.cmake index c9e475e6f..98bd368f5 100644 --- a/cmake/VerifyCompiler.cmake +++ b/cmake/VerifyCompiler.cmake @@ -24,10 +24,12 @@ list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH} ${ROCM_PATH}/hip ${ROCM_PATH}/llvm /opt/rocm/llvm /opt/rocm /opt/rocm/hip) find_package(hip REQUIRED CONFIG PATHS ${HIP_DIR} ${ROCM_PATH} /opt/rocm) -if(HIP_COMPILER STREQUAL "clang") - if(NOT (HIP_CXX_COMPILER MATCHES ".*hipcc" OR HIP_CXX_COMPILER MATCHES ".*clang\\+\\+")) - message(FATAL_ERROR "On ROCm platform 'hipcc' or HIP-aware Clang must be used as C++ compiler.") +if(NOT USE_HIPCXX) + if(HIP_COMPILER STREQUAL "clang") + if(NOT (HIP_CXX_COMPILER MATCHES ".*hipcc" OR HIP_CXX_COMPILER MATCHES ".*clang\\+\\+")) + message(FATAL_ERROR "On ROCm platform 'hipcc' or HIP-aware Clang must be used as C++ compiler.") + endif() + else() + message(FATAL_ERROR "HIP_COMPILER must be 'clang' (AMD ROCm platform)") endif() -else() - message(FATAL_ERROR "HIP_COMPILER must be 'clang' (AMD ROCm platform)") endif() diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index 91fd34d56..b9edcae00 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -22,14 +22,20 @@ function(add_rocprim_example EXAMPLE_SOURCE) get_filename_component(EXAMPLE_TARGET ${EXAMPLE_SOURCE} NAME_WE) + + if(USE_HIPCXX) + set_source_files_properties(${EXAMPLE_SOURCE} PROPERTIES LANGUAGES HIP) + endif() + add_executable(${EXAMPLE_TARGET} ${EXAMPLE_SOURCE}) + target_link_libraries(${EXAMPLE_TARGET} PRIVATE - rocprim_hip + rocprim ) target_link_libraries(${EXAMPLE_TARGET} PRIVATE - rocprim_hip + $,hip::host,hip::device> ) set_target_properties(${EXAMPLE_TARGET} PROPERTIES diff --git a/rocprim/include/rocprim/block/detail/block_reduce_warp_reduce.hpp b/rocprim/include/rocprim/block/detail/block_reduce_warp_reduce.hpp index 2ffc7437d..3f9d98321 100644 --- a/rocprim/include/rocprim/block/detail/block_reduce_warp_reduce.hpp +++ b/rocprim/include/rocprim/block/detail/block_reduce_warp_reduce.hpp @@ -190,14 +190,15 @@ class block_reduce_warp_reduce } ::rocprim::syncthreads(); - if(flat_tid < warps_no_) + if(warp_id == 0) { // Use warp partial to calculate the final reduce results for every thread - auto warp_partial = storage_.warp_partials[lane_id]; + auto warp_partial = storage_.warp_partials[lane_id % warps_no_]; - warp_reduce( - warp_partial, output, warps_no_, reduce_op - ); + warp_reduce(warp_partial, + output, + warps_no_, + reduce_op); } } } diff --git a/rocprim/include/rocprim/detail/temp_storage.hpp b/rocprim/include/rocprim/detail/temp_storage.hpp index f9e2a57d6..d1c27b747 100644 --- a/rocprim/include/rocprim/detail/temp_storage.hpp +++ b/rocprim/include/rocprim/detail/temp_storage.hpp @@ -65,6 +65,7 @@ struct simple_partition layout storage_layout; /// Compute the required layout for this type and return it. + ROCPRIM_HOST_DEVICE layout get_layout() const { return this->storage_layout; @@ -73,6 +74,7 @@ struct simple_partition /// \brief Assigns the final storage for this partition. `storage` is assumed to have the required /// alignment and size as described by the layout returned by `get_layout()`. /// \param storage - Base pointer to the storage to be used for this partition. + ROCPRIM_HOST_DEVICE void set_storage(void* const storage) { *this->dest = this->storage_layout.size == 0 ? nullptr : static_cast(storage); @@ -84,6 +86,7 @@ struct simple_partition /// \param dest - Pointer to where to store the final allocated pointer /// \param storage_layout - The required layout that the memory allocated to `*dest` should have. template +ROCPRIM_HOST_DEVICE simple_partition make_partition(T** dest, layout storage_layout) { return simple_partition{dest, storage_layout}; @@ -95,6 +98,7 @@ simple_partition make_partition(T** dest, layout storage_layout) /// \param size - The required size that the memory allocated to `*dest` should have. /// \param alignment - The required alignment that the memory allocated to `*dest` should have. template +ROCPRIM_HOST_DEVICE simple_partition make_partition(T** dest, size_t size, size_t alignment = default_alignment) { return make_partition(dest, {size, alignment}); @@ -106,6 +110,7 @@ simple_partition make_partition(T** dest, size_t size, size_t alignment = def /// \param dest - Pointer to where to store the final allocated pointer /// \param elements - The number of elements of `T` that the memory allocated to `dest` should consist of. template +ROCPRIM_HOST_DEVICE simple_partition ptr_aligned_array(T** dest, size_t elements) { return make_partition(dest, elements * sizeof(T), alignof(T)); @@ -126,9 +131,11 @@ struct linear_partition ::rocprim::tuple sub_partitions; /// \brief Constructor. - linear_partition(Ts... sub_partitions) : sub_partitions{sub_partitions...} {} + ROCPRIM_HOST_DEVICE linear_partition(Ts... sub_partitions) : sub_partitions{sub_partitions...} + {} /// Compute the required layout for this type and return it. + ROCPRIM_HOST_DEVICE layout get_layout() const { size_t required_alignment = 1; @@ -153,6 +160,7 @@ struct linear_partition /// \brief Assigns the final storage for this partition. `storage` is assumed to have the required /// alignment and size as described by the layout returned by `get_layout()`. /// \param storage - Base pointer to the storage to be used for this partition. + ROCPRIM_HOST_DEVICE void set_storage(void* const storage) { size_t offset = 0; @@ -175,6 +183,7 @@ struct linear_partition /// \tparam Ts - The sub-partitions to allocate temporary memory for. /// \see linear_partition template +ROCPRIM_HOST_DEVICE linear_partition make_linear_partition(Ts... ts) { return linear_partition(ts...); @@ -194,9 +203,11 @@ struct union_partition ::rocprim::tuple sub_partitions; /// \brief Constructor. - union_partition(Ts... sub_partitions) : sub_partitions{sub_partitions...} {} + ROCPRIM_HOST_DEVICE union_partition(Ts... sub_partitions) : sub_partitions{sub_partitions...} + {} /// Compute the required layout for this type and return it. + ROCPRIM_HOST_DEVICE layout get_layout() const { size_t required_alignment = 1; @@ -218,6 +229,7 @@ struct union_partition /// \brief Assigns the final storage for this partition. `storage` is assumed to have the required /// alignment and size as described by the layout returned by `get_layout()`. /// \param storage - Base pointer to the storage to be used for this partition. + ROCPRIM_HOST_DEVICE void set_storage(void* const storage) { for_each_in_tuple(this->sub_partitions, @@ -229,6 +241,7 @@ struct union_partition /// \tparam Ts - The sub-partitions to allocate temporary memory for. /// \see union_partition template +ROCPRIM_HOST_DEVICE union_partition make_union_partition(Ts... ts) { return union_partition(ts...); @@ -261,6 +274,7 @@ union_partition make_union_partition(Ts... ts) /// \param storage_size [in,out] - The size of `temporary_storage`. /// \param partition [in,out] - The root partition to allocate temporary memory to. template +ROCPRIM_HOST_DEVICE hipError_t partition(void* const temporary_storage, size_t& storage_size, TempStoragePartition partition) { diff --git a/rocprim/include/rocprim/device/detail/device_batch_memcpy.hpp b/rocprim/include/rocprim/device/detail/device_batch_memcpy.hpp index a28a1a8a1..669ad9a61 100644 --- a/rocprim/include/rocprim/device/detail/device_batch_memcpy.hpp +++ b/rocprim/include/rocprim/device/detail/device_batch_memcpy.hpp @@ -382,10 +382,10 @@ struct batch_memcpy_impl struct copyable_blev_buffers { - InputBufferItType srcs; - OutputBufferItType dsts; - BufferSizeItType sizes; - tile_offset_type* offsets; + input_buffer_type* srcs; + output_buffer_type* dsts; + buffer_size_type* sizes; + tile_offset_type* offsets; }; private: diff --git a/rocprim/include/rocprim/intrinsics/atomic.hpp b/rocprim/include/rocprim/intrinsics/atomic.hpp index 03cd5333a..16fa34644 100644 --- a/rocprim/include/rocprim/intrinsics/atomic.hpp +++ b/rocprim/include/rocprim/intrinsics/atomic.hpp @@ -45,6 +45,12 @@ namespace detail return ::atomicAdd(address, value); } + ROCPRIM_DEVICE ROCPRIM_INLINE + double atomic_add(double * address, double value) + { + return ::atomicAdd(address, value); + } + ROCPRIM_DEVICE ROCPRIM_INLINE unsigned long atomic_add(unsigned long* address, unsigned long value) { diff --git a/rocprim/include/rocprim/warp/detail/warp_reduce_dpp.hpp b/rocprim/include/rocprim/warp/detail/warp_reduce_dpp.hpp index df567ae33..649a82980 100644 --- a/rocprim/include/rocprim/warp/detail/warp_reduce_dpp.hpp +++ b/rocprim/include/rocprim/warp/detail/warp_reduce_dpp.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -49,7 +49,7 @@ class warp_reduce_dpp template ROCPRIM_DEVICE ROCPRIM_INLINE - void reduce(T input, T& output, BinaryFunction reduce_op) + void reduce_impl(T input, T& output, BinaryFunction reduce_op, std::false_type) { output = input; @@ -65,13 +65,17 @@ class warp_reduce_dpp } if(WarpSize > 4) { - // row_shr:4 - output = reduce_op(warp_move_dpp(output), output); + // row_ror:4 + // Use rotation instead of shift to avoid leaving invalid values in the destination + // registers (asume warp size of at least hardware warp-size) + output = reduce_op(warp_move_dpp(output), output); } if(WarpSize > 8) { - // row_shr:8 - output = reduce_op(warp_move_dpp(output), output); + // row_ror:8 + // Use rotation instead of shift to avoid leaving invalid values in the destination + // registers (asume warp size of at least hardware warp-size) + output = reduce_op(warp_move_dpp(output), output); } #ifdef ROCPRIM_DETAIL_HAS_DPP_BROADCAST if(WarpSize > 16) @@ -97,6 +101,23 @@ class warp_reduce_dpp output = warp_shuffle(output, WarpSize - 1, WarpSize); } + template + ROCPRIM_DEVICE ROCPRIM_INLINE + void reduce_impl(T input, T& output, BinaryFunction reduce_op, std::true_type) + { + warp_reduce_shuffle().reduce(input, output, reduce_op); + } + + template + ROCPRIM_DEVICE ROCPRIM_INLINE + void reduce(T input, T& output, BinaryFunction reduce_op) + { + reduce_impl(input, + output, + reduce_op, + std::integral_constant{}); + } + template ROCPRIM_DEVICE ROCPRIM_INLINE void reduce(T input, T& output, storage_type& storage, BinaryFunction reduce_op) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 099a80678..31538a58d 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -53,6 +53,10 @@ function(add_hip_test TEST_NAME TEST_SOURCES) get_filename_component(TEST_TARGET ${TEST_MAIN_SOURCE} NAME_WE) + if(USE_HIPCXX) + set_source_files_properties(${TEST_SOURCES} PROPERTIES LANGUAGE HIP) + endif() + add_executable(${TEST_TARGET} ${TEST_SOURCES}) if (ROCPRIM_INSTALL) rocm_install(TARGETS ${TEST_TARGET} COMPONENT tests) @@ -70,7 +74,11 @@ function(add_hip_test TEST_NAME TEST_SOURCES) ) target_link_libraries(${TEST_TARGET} PRIVATE - rocprim_hip + rocprim + ) + target_link_libraries(${TEST_TARGET} + PRIVATE + $,hip::host,hip::device> ) target_compile_options(${TEST_TARGET} diff --git a/test/extra/CMakeLists.txt b/test/extra/CMakeLists.txt index 23a9a6002..14fceaa71 100644 --- a/test/extra/CMakeLists.txt +++ b/test/extra/CMakeLists.txt @@ -22,9 +22,20 @@ cmake_minimum_required(VERSION 3.5.1 FATAL_ERROR) +include(CheckLanguage) +include(CMakeDependentOption) + +check_language(HIP) +cmake_dependent_option(USE_HIPCXX "Use CMake HIP language support" OFF CMAKE_HIP_COMPILER OFF) +if(USE_HIPCXX) + set(PROJECT_LANG HIP) +else() + set(PROJECT_LANG CXX) +endif() + # This project includes tests that should be run after # rocPRIM is installed from package or using `make install` -project(rocprim_package_install_test CXX) +project(rocprim_package_install_test ${PROJECT_LANG}) # CMake modules list(APPEND CMAKE_MODULE_PATH @@ -46,12 +57,12 @@ include(VerifyCompiler) find_package(rocprim REQUIRED CONFIG PATHS "/opt/rocm/rocprim") # Build CXX flags -if (NOT DEFINED CMAKE_CXX_STANDARD) - set(CMAKE_CXX_STANDARD 17) +if (NOT DEFINED CMAKE_${PROJECT_LANG}_STANDARD) + set(CMAKE_${PROJECT_LANG}_STANDARD 17) endif() -set(CMAKE_CXX_STANDARD_REQUIRED ON) -set(CMAKE_CXX_EXTENSIONS OFF) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Werror") +set(CMAKE_${PROJECT_LANG}_STANDARD_REQUIRED ON) +set(CMAKE_${PROJECT_LANG}_EXTENSIONS OFF) +set(CMAKE_${PROJECT_LANG}_FLAGS "${CMAKE_${PROJECT_LANG}_FLAGS} -Wall -Wextra -Werror") if (CMAKE_CXX_STANDARD EQUAL 14) message(WARNING "C++14 will be deprecated in the next major release") diff --git a/test/rocprim/CMakeLists.txt b/test/rocprim/CMakeLists.txt index 529b65d33..d31e8f9ce 100644 --- a/test/rocprim/CMakeLists.txt +++ b/test/rocprim/CMakeLists.txt @@ -35,6 +35,9 @@ function(add_rocprim_test TEST_NAME TEST_SOURCES) endfunction() function(add_rocprim_test_internal TEST_NAME TEST_SOURCES TEST_TARGET) + if(USE_HIPCXX) + set_source_files_properties(${TEST_SOURCES} PROPERTIES LANGUAGE HIP) + endif() add_executable(${TEST_TARGET} ${TEST_SOURCES}) if (ROCPRIM_INSTALL) rocm_install(TARGETS ${TEST_TARGET} COMPONENT tests) @@ -47,12 +50,14 @@ function(add_rocprim_test_internal TEST_NAME TEST_SOURCES TEST_TARGET) target_link_libraries(${TEST_TARGET} PRIVATE + rocprim GTest::GTest GTest::Main ) + target_link_libraries(${TEST_TARGET} PRIVATE - rocprim_hip + $,hip::host,hip::device> ) target_compile_options(${TEST_TARGET} @@ -60,6 +65,7 @@ function(add_rocprim_test_internal TEST_NAME TEST_SOURCES TEST_TARGET) $<$: /bigobj # number of sections exceeded object file format limit: compile with /bigobj > + $<$:$<$:/bigobj>> ) set_target_properties(${TEST_TARGET} @@ -209,6 +215,11 @@ function(add_rocprim_cpp17_test TEST_NAME TEST_SOURCES) set_target_properties(${TEST_TARGET} PROPERTIES CXX_STANDARD 17) + if(USE_HIPCXX) + set_target_properties(${TEST_TARGET} + PROPERTIES + HIP_STANDARD 17) + endif() endfunction() diff --git a/test/rocprim/test_device_batch_memcpy.cpp b/test/rocprim/test_device_batch_memcpy.cpp index ab42985d5..8527c61f1 100644 --- a/test/rocprim/test_device_batch_memcpy.cpp +++ b/test/rocprim/test_device_batch_memcpy.cpp @@ -21,6 +21,7 @@ // SOFTWARE. #include "common_test_header.hpp" +#include "indirect_iterator.hpp" #include "test_utils_assertions.hpp" #include "test_utils_custom_test_types.hpp" #include "test_utils_data_generation.hpp" @@ -30,6 +31,7 @@ #include "rocprim/device/device_copy.hpp" #include "rocprim/device/device_memcpy.hpp" #include "rocprim/intrinsics/thread.hpp" +#include "rocprim/iterator.hpp" #include #include @@ -39,51 +41,64 @@ #include #include +#include #include #include template + bool IsMemCpy, + bool Shuffled = false, + unsigned int NumBuffers = 1024, + unsigned int MaxSize = 4 * 1024, + bool UseIndirectIterator = false> struct DeviceBatchMemcpyParams { - using value_type = ValueType; - using size_type = SizeType; - static constexpr bool isMemCpy = IsMemCpy; - static constexpr bool shuffled = Shuffled; - static constexpr uint32_t num_buffers = NumBuffers; - static constexpr uint32_t max_size = MaxSize; + using value_type = ValueType; + using size_type = SizeType; + static constexpr bool isMemCpy = IsMemCpy; + static constexpr bool shuffled = Shuffled; + static constexpr unsigned int num_buffers = NumBuffers; + static constexpr unsigned int max_size = MaxSize; + static constexpr bool use_indirect_iterator = UseIndirectIterator; }; template struct RocprimDeviceBatchMemcpyTests : public ::testing::Test { - using value_type = typename Params::value_type; - using size_type = typename Params::size_type; - static constexpr bool isMemCpy = Params::isMemCpy; - static constexpr bool shuffled = Params::shuffled; - static constexpr uint32_t num_buffers = Params::num_buffers; - static constexpr uint32_t max_size = Params::max_size; + using value_type = typename Params::value_type; + using size_type = typename Params::size_type; + static constexpr bool isMemCpy = Params::isMemCpy; + static constexpr bool shuffled = Params::shuffled; + static constexpr unsigned int num_buffers = Params::num_buffers; + static constexpr unsigned int max_size = Params::max_size; + static constexpr bool use_indirect_iterator = Params::use_indirect_iterator; }; typedef ::testing::Types< // Ignore copy/move - DeviceBatchMemcpyParams, uint32_t, true, false>, - DeviceBatchMemcpyParams, uint32_t, true, false>, - DeviceBatchMemcpyParams, uint32_t, true, false>, + DeviceBatchMemcpyParams, + unsigned int, + true, + false>, + DeviceBatchMemcpyParams, + unsigned int, + true, + false>, + DeviceBatchMemcpyParams, + unsigned int, + true, + false>, // Unshuffled inputs and outputs // Variable value_type - DeviceBatchMemcpyParams, - DeviceBatchMemcpyParams, - DeviceBatchMemcpyParams, - DeviceBatchMemcpyParams, - DeviceBatchMemcpyParams, - DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, // size_type: uint16_t DeviceBatchMemcpyParams, // size_type: int64_t @@ -91,23 +106,26 @@ typedef ::testing::Types< DeviceBatchMemcpyParams, // weird amount of buffers - DeviceBatchMemcpyParams, - DeviceBatchMemcpyParams, - DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, // Shuffled inputs and outputs // Variable value_type - DeviceBatchMemcpyParams, - DeviceBatchMemcpyParams, - DeviceBatchMemcpyParams, - DeviceBatchMemcpyParams, - DeviceBatchMemcpyParams, - DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, // size_type: uint16_t DeviceBatchMemcpyParams, // size_type: int64_t DeviceBatchMemcpyParams, - DeviceBatchMemcpyParams> + DeviceBatchMemcpyParams, + + // Test iterator input for BatchCopy + DeviceBatchMemcpyParams> RocprimDeviceBatchMemcpyTestsParams; TYPED_TEST_SUITE(RocprimDeviceBatchMemcpyTests, RocprimDeviceBatchMemcpyTestsParams); @@ -214,7 +232,7 @@ void batch_copy(void* temporary_storage, InputBufferItType sources, OutputBufferItType destinations, BufferSizeItType sizes, - uint32_t num_copies, + unsigned int num_copies, hipStream_t stream) { HIP_CHECK(rocprim::batch_memcpy(temporary_storage, @@ -236,7 +254,7 @@ void batch_copy(void* temporary_storage, InputBufferItType sources, OutputBufferItType destinations, BufferSizeItType sizes, - uint32_t num_copies, + unsigned int num_copies, hipStream_t stream) { HIP_CHECK(rocprim::batch_copy(temporary_storage, @@ -261,7 +279,7 @@ void check_result(ContainerMemCpy& h_input_for_memcpy, ptr d_output, byte_offset_type total_num_bytes, byte_offset_type /*total_num_elements*/, - int32_t num_buffers, + int num_buffers, OffsetContainer& src_offsets, OffsetContainer& dst_offsets, SizesContainer& h_buffer_num_bytes) @@ -269,7 +287,7 @@ void check_result(ContainerMemCpy& h_input_for_memcpy, using value_type = typename ContainerCopy::value_type; std::vector h_output = std::vector(total_num_bytes); HIP_CHECK(hipMemcpy(h_output.data(), d_output, total_num_bytes, hipMemcpyDeviceToHost)); - for(int32_t i = 0; i < num_buffers; ++i) + for(int i = 0; i < num_buffers; ++i) { ASSERT_EQ(std::memcmp(h_input_for_memcpy.data() + src_offsets[i] * sizeof(value_type), h_output.data() + dst_offsets[i] * sizeof(value_type), @@ -292,7 +310,7 @@ void check_result(ContainerMemCpy& /*h_input_for_memcpy*/, ptr d_output, byte_offset_type total_num_bytes, byte_offset_type total_num_elements, - int32_t num_buffers, + int num_buffers, OffsetContainer& src_offsets, OffsetContainer& dst_offsets, SizesContainer& h_buffer_num_bytes) @@ -300,7 +318,7 @@ void check_result(ContainerMemCpy& /*h_input_for_memcpy*/, using value_type = typename ContainerCopy::value_type; std::vector h_output = std::vector(total_num_elements); HIP_CHECK(hipMemcpy(h_output.data(), d_output, total_num_bytes, hipMemcpyDeviceToHost)); - for(int32_t i = 0; i < num_buffers; ++i) + for(int i = 0; i < num_buffers; ++i) { ASSERT_EQ(std::memcmp(h_input_for_copy.data() + src_offsets[i], h_output.data() + dst_offsets[i], @@ -314,31 +332,30 @@ TYPED_TEST(RocprimDeviceBatchMemcpyTests, SizeAndTypeVariation) { using value_type = typename TestFixture::value_type; using buffer_size_type = typename TestFixture::size_type; - using buffer_offset_type = uint32_t; + using buffer_offset_type = unsigned int; using byte_offset_type = size_t; - constexpr int32_t num_buffers = TestFixture::num_buffers; - constexpr int32_t max_size = TestFixture::max_size; - constexpr bool shuffled = TestFixture::shuffled; - constexpr bool isMemCpy = TestFixture::isMemCpy; + constexpr int num_buffers = TestFixture::num_buffers; + constexpr int max_size = TestFixture::max_size; + constexpr bool shuffled = TestFixture::shuffled; + constexpr bool isMemCpy = TestFixture::isMemCpy; + constexpr bool use_indirect_iterator = TestFixture::use_indirect_iterator; - constexpr int32_t wlev_min_size = rocprim::batch_memcpy_config<>::wlev_size_threshold; - constexpr int32_t blev_min_size = rocprim::batch_memcpy_config<>::blev_size_threshold; + constexpr int wlev_min_size = rocprim::batch_memcpy_config<>::wlev_size_threshold; + constexpr int blev_min_size = rocprim::batch_memcpy_config<>::blev_size_threshold; - constexpr int32_t wlev_min_elems - = rocprim::detail::ceiling_div(wlev_min_size, sizeof(value_type)); - constexpr int32_t blev_min_elems - = rocprim::detail::ceiling_div(blev_min_size, sizeof(value_type)); - constexpr int32_t max_elems = max_size / sizeof(value_type); + constexpr int wlev_min_elems = rocprim::detail::ceiling_div(wlev_min_size, sizeof(value_type)); + constexpr int blev_min_elems = rocprim::detail::ceiling_div(blev_min_size, sizeof(value_type)); + constexpr int max_elems = max_size / sizeof(value_type); - constexpr int32_t enabled_size_categories + constexpr int enabled_size_categories = (blev_min_elems <= max_elems) + (wlev_min_elems <= max_elems) + 1; - constexpr int32_t num_blev + constexpr int num_blev = blev_min_elems <= max_elems ? num_buffers / enabled_size_categories : 0; - constexpr int32_t num_wlev + constexpr int num_wlev = wlev_min_elems <= max_elems ? num_buffers / enabled_size_categories : 0; - constexpr int32_t num_tlev = num_buffers - num_blev - num_wlev; + constexpr int num_tlev = num_buffers - num_blev - num_wlev; // Get random buffer sizes for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; ++seed_index) @@ -448,7 +465,7 @@ TYPED_TEST(RocprimDeviceBatchMemcpyTests, SizeAndTypeVariation) std::vector h_buffer_srcs(num_buffers); std::vector h_buffer_dsts(num_buffers); - for(int32_t i = 0; i < num_buffers; ++i) + for(int i = 0; i < num_buffers; ++i) { h_buffer_srcs[i] = d_input + src_offsets[i]; h_buffer_dsts[i] = d_output + dst_offsets[i]; @@ -487,11 +504,16 @@ TYPED_TEST(RocprimDeviceBatchMemcpyTests, SizeAndTypeVariation) h_buffer_dsts.size() * sizeof(*d_buffer_dsts), hipMemcpyHostToDevice)); + const auto input_src_it + = test_utils::wrap_in_indirect_iterator(d_buffer_srcs); + const auto output_src_it + = test_utils::wrap_in_indirect_iterator(d_buffer_dsts); + // Run batched memcpy. batch_copy(d_temp_storage, temp_storage_bytes, - d_buffer_srcs, - d_buffer_dsts, + input_src_it, + output_src_it, d_buffer_sizes, num_buffers, hipStreamDefault); @@ -515,3 +537,114 @@ TYPED_TEST(RocprimDeviceBatchMemcpyTests, SizeAndTypeVariation) HIP_CHECK(hipFree(d_input)); } } + +struct GetIteratorToRange +{ + __host__ __device__ __forceinline__ + auto operator()(unsigned int index) const + { + return rocprim::make_constant_iterator(d_data_in[index]); + } + unsigned int* d_data_in; +}; + +struct GetPtrToRange +{ + __host__ __device__ __forceinline__ + auto operator()(unsigned int index) const + { + return d_data_out + d_offsets[index]; + } + unsigned int* d_data_out; + unsigned int* d_offsets; +}; + +struct GetRunLength +{ + __host__ __device__ __forceinline__ + unsigned int + operator()(unsigned int index) const + { + return d_offsets[index + 1] - d_offsets[index]; + } + unsigned int* d_offsets; +}; + +TEST(RocprimDeviceBatchMemcpyTests, IteratorTest) +{ + // Create the data and copy it to the device. + const unsigned int num_ranges = 5; + const unsigned int num_outputs = 14; + + std::vector h_data_in = {4, 2, 7, 3, 1}; // size should be num_ranges + std::vector h_data_out(num_outputs, 0); // size should be num_outputs + std::vector h_offsets + = {0, 2, 5, 6, 9, 14}; // max value should be num_outputs, size should be (num_ranges + 1) + + unsigned int* d_data_in; // [4, 2, 7, 3, 1] + unsigned int* d_data_out; // [0, ... ] + unsigned int* d_offsets; // [0, 2, 5, 6, 9, 14] + + HIP_CHECK(hipMalloc(&d_data_in, sizeof(unsigned int) * num_ranges)); + HIP_CHECK(hipMalloc(&d_data_out, sizeof(unsigned int) * num_outputs)); + HIP_CHECK(hipMalloc(&d_offsets, sizeof(unsigned int) * (num_ranges + 1))); + + HIP_CHECK(hipMemcpy(d_data_in, + h_data_in.data(), + sizeof(unsigned int) * num_ranges, + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_data_out, + h_data_out.data(), + sizeof(unsigned int) * num_outputs, + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_offsets, + h_offsets.data(), + sizeof(unsigned int) * (num_ranges + 1), + hipMemcpyHostToDevice)); + + // Returns a constant iterator to the element of the i-th run + rocprim::counting_iterator iota(0); + auto iterators_in = rocprim::make_transform_iterator(iota, GetIteratorToRange{d_data_in}); + + // Returns the run length of the i-th run + auto sizes = rocprim::make_transform_iterator(iota, GetRunLength{d_offsets}); + + // Returns pointers to the output range for each run + auto ptrs_out = rocprim::make_transform_iterator(iota, GetPtrToRange{d_data_out, d_offsets}); + + // Determine temporary device storage requirements + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + batch_copy(d_temp_storage, + temp_storage_bytes, + iterators_in, + ptrs_out, + sizes, + num_ranges, + 0); + + // Allocate temporary storage + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); + + // Run batched copy algorithm (used to perform runlength decoding) + batch_copy(d_temp_storage, + temp_storage_bytes, + iterators_in, + ptrs_out, + sizes, + num_ranges, + 0); + + // Copy results back to host and print + HIP_CHECK( + hipMemcpy(h_data_out.data(), d_data_out, sizeof(int) * num_outputs, hipMemcpyDeviceToHost)); + + std::vector expected = {4, 4, 2, 2, 2, 7, 3, 3, 3, 1, 1, 1, 1, 1}; + test_utils::assert_eq(expected, h_data_out); + + // Clean up + HIP_CHECK(hipFree(d_temp_storage)); + HIP_CHECK(hipFree(d_data_in)); + HIP_CHECK(hipFree(d_data_out)); + HIP_CHECK(hipFree(d_offsets)); +} diff --git a/test/rocprim/test_device_histogram.cpp b/test/rocprim/test_device_histogram.cpp index 37afd47f8..926353a63 100644 --- a/test/rocprim/test_device_histogram.cpp +++ b/test/rocprim/test_device_histogram.cpp @@ -132,6 +132,7 @@ using custom_config1 = rocprim::histogram_config> typedef ::testing::Types, params1, + params1, params1, params1, params1, @@ -140,8 +141,8 @@ typedef ::testing::Types, params1, params1, params1, - params1, + params1, params1, params1, params1>