From 4a50de6b58271cddb82e08743a66e0cfb2c16d40 Mon Sep 17 00:00:00 2001 From: avinashkethineedi Date: Thu, 30 Jan 2025 23:09:32 +0000 Subject: [PATCH] Update bandwidth and latency calculations, add multi work group support - Refined bandwidth and latency calculations for improved accuracy - Added multi work group support for functional tests --- tests/functional_tests/alltoall_tester.cpp | 130 ++++++--- tests/functional_tests/alltoall_tester.hpp | 23 +- tests/functional_tests/amo_bitwise_tester.cpp | 21 +- .../functional_tests/amo_extended_tester.cpp | 67 ++--- .../functional_tests/amo_extended_tester.hpp | 2 +- .../functional_tests/amo_standard_tester.cpp | 75 ++--- .../functional_tests/amo_standard_tester.hpp | 2 +- tests/functional_tests/barrier_all_tester.cpp | 15 +- tests/functional_tests/barrier_all_tester.hpp | 1 + tests/functional_tests/empty_tester.cpp | 10 +- .../functional_tests/extended_primitives.cpp | 31 +-- .../functional_tests/extended_primitives.hpp | 1 - tests/functional_tests/fcollect_tester.cpp | 144 +++++++--- tests/functional_tests/fcollect_tester.hpp | 18 +- tests/functional_tests/ping_all_tester.cpp | 14 +- tests/functional_tests/ping_all_tester.hpp | 3 +- tests/functional_tests/ping_pong_tester.cpp | 12 +- tests/functional_tests/ping_pong_tester.hpp | 3 +- .../functional_tests/primitive_mr_tester.cpp | 22 +- tests/functional_tests/primitive_tester.cpp | 38 ++- tests/functional_tests/shmem_ptr_tester.cpp | 2 + .../signaling_operations_tester.cpp | 196 ++++++++++--- .../signaling_operations_tester.hpp | 7 +- tests/functional_tests/sync_tester.cpp | 78 ++++-- tests/functional_tests/sync_tester.hpp | 15 + .../team_broadcast_tester.cpp | 150 +++++++--- .../team_broadcast_tester.hpp | 16 +- .../team_ctx_infra_tester.cpp | 29 +- .../team_ctx_infra_tester.hpp | 11 + .../team_ctx_primitive_tester.cpp | 122 +++++---- .../team_ctx_primitive_tester.hpp | 11 + .../team_reduction_tester.cpp | 56 ++-- .../team_reduction_tester.hpp | 9 + tests/functional_tests/tester.cpp | 259 ++++++------------ tests/functional_tests/tester.hpp | 12 +- tests/functional_tests/tester_arguments.cpp | 6 + .../wave_level_primitives.cpp | 30 +- .../wave_level_primitives.hpp | 1 - 38 files changed, 1032 insertions(+), 610 deletions(-) diff --git a/tests/functional_tests/alltoall_tester.cpp b/tests/functional_tests/alltoall_tester.cpp index ffd543e..b6318a5 100644 --- a/tests/functional_tests/alltoall_tester.cpp +++ b/tests/functional_tests/alltoall_tester.cpp @@ -20,8 +20,6 @@ * IN THE SOFTWARE. *****************************************************************************/ -using namespace rocshmem; - /* Declare the template with a generic implementation */ template __device__ void wg_alltoall(rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, @@ -52,30 +50,32 @@ ALLTOALL_DEF_GEN(unsigned int, uint) ALLTOALL_DEF_GEN(unsigned long, ulong) ALLTOALL_DEF_GEN(unsigned long long, ulonglong) -rocshmem_team_t team_alltoall_world_dup; - /****************************************************************************** * DEVICE TEST KERNEL *****************************************************************************/ template -__global__ void AlltoallTest(int loop, int skip, uint64_t *timer, - T1 *source_buf, T1 *dest_buf, int size, - ShmemContextType ctx_type, rocshmem_team_t team) { +__global__ void AlltoallTest(int loop, int skip, uint64_t *start_time, + uint64_t *end_time, T1 *source_buf, T1 *dest_buf, + int size, ShmemContextType ctx_type, + rocshmem_team_t *teams) { __shared__ rocshmem_ctx_t ctx; + int wg_id = get_flat_grid_id(); rocshmem_wg_init(); - rocshmem_wg_ctx_create(ctx_type, &ctx); + rocshmem_wg_team_create_ctx(teams[wg_id], ctx_type, &ctx); int n_pes = rocshmem_ctx_n_pes(ctx); + source_buf += wg_id * n_pes * size; + dest_buf += wg_id * n_pes * size; + __syncthreads(); - uint64_t start; for (int i = 0; i < loop + skip; i++) { if (i == skip && hipThreadIdx_x == 0) { - start = rocshmem_timer(); + start_time[wg_id] = wall_clock64(); } - wg_alltoall(ctx, team, + wg_alltoall(ctx, teams[wg_id], dest_buf, // T* dest source_buf, // const T* source size); // int nelement @@ -84,7 +84,7 @@ __global__ void AlltoallTest(int loop, int skip, uint64_t *timer, __syncthreads(); if (hipThreadIdx_x == 0) { - timer[hipBlockIdx_x] = rocshmem_timer() - start; + end_time[wg_id] = wall_clock64(); } rocshmem_wg_ctx_destroy(&ctx); @@ -95,29 +95,46 @@ __global__ void AlltoallTest(int loop, int skip, uint64_t *timer, * HOST TESTER CLASS METHODS *****************************************************************************/ template -AlltoallTester::AlltoallTester( - TesterArguments args, std::function f1, - std::function(const T1 &, T1)> f2) - : Tester(args), init_buf{f1}, verify_buf{f2} { - int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD); - source_buf = (T1 *)rocshmem_malloc(args.max_msg_size * sizeof(T1) * n_pes); - dest_buf = (T1 *)rocshmem_malloc(args.max_msg_size * sizeof(T1) * n_pes); +AlltoallTester::AlltoallTester(TesterArguments args) + : Tester(args){ + my_pe = rocshmem_team_my_pe(ROCSHMEM_TEAM_WORLD); + n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD); + + int num_elems = (args.max_msg_size / sizeof(T1)) * args.num_wgs * n_pes; + int buff_size = num_elems * sizeof(T1); + + source_buf = (T1 *)rocshmem_malloc(buff_size); + dest_buf = (T1 *)rocshmem_malloc(buff_size); + + char* value{nullptr}; + if ((value = getenv("ROCSHMEM_MAX_NUM_TEAMS"))) { + num_teams = atoi(value); + } + + CHECK_HIP(hipMalloc(&team_alltoall_world_dup, + sizeof(rocshmem_team_t) * num_teams)); } template AlltoallTester::~AlltoallTester() { rocshmem_free(source_buf); rocshmem_free(dest_buf); + CHECK_HIP(hipFree(team_alltoall_world_dup)); } template void AlltoallTester::preLaunchKernel() { - int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD); - bw_factor = sizeof(T1) * n_pes; - - team_alltoall_world_dup = ROCSHMEM_TEAM_INVALID; - rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 1, n_pes, nullptr, 0, - &team_alltoall_world_dup); + bw_factor = n_pes; + + for (int team_i = 0; team_i < num_teams; team_i++) { + team_alltoall_world_dup[team_i] = ROCSHMEM_TEAM_INVALID; + rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 1, n_pes, nullptr, 0, + &team_alltoall_world_dup[team_i]); + if (team_alltoall_world_dup[team_i] == ROCSHMEM_TEAM_INVALID) { + std::cout << "Team " << team_i << " is invalid!" << std::endl; + abort(); + } + } } template @@ -125,39 +142,68 @@ void AlltoallTester::launchKernel(dim3 gridSize, dim3 blockSize, int loop, uint64_t size) { size_t shared_bytes = 0; + int num_elems = size / sizeof(T1); + hipLaunchKernelGGL(AlltoallTest, gridSize, blockSize, shared_bytes, - stream, loop, args.skip, timer, source_buf, dest_buf, size, - _shmem_context, team_alltoall_world_dup); + stream, loop, args.skip, start_time, end_time, + source_buf, dest_buf, num_elems, _shmem_context, + team_alltoall_world_dup); - num_msgs = loop + args.skip; - num_timed_msgs = loop; + num_msgs = (loop + args.skip) * gridSize.x; + num_timed_msgs = loop * gridSize.x; } template void AlltoallTester::postLaunchKernel() { - rocshmem_team_destroy(team_alltoall_world_dup); + for (int team_i = 0; team_i < num_teams; team_i++) { + rocshmem_team_destroy(team_alltoall_world_dup[team_i]); + } } template void AlltoallTester::resetBuffers(uint64_t size) { - int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD); - for (int i = 0; i < n_pes; i++) { - for (uint64_t j = 0; j < size; j++) { - init_buf(source_buf[i * size + j], dest_buf[i * size + j], (T1)i); + + int num_elems = size / sizeof(T1); + int buff_size = num_elems * sizeof(T1) * args.num_wgs * n_pes; + int idx = 0; + + for(int wg_id = 0; wg_id < args.num_wgs; wg_id++) { + for(int pe = 0; pe < n_pes; pe++) { + for(int i = 0; i < num_elems; i++) { + idx = (wg_id * n_pes + pe) * num_elems + i; + if constexpr (std::is_same::value || + std::is_same::value || + std::is_same::value) { + source_buf[idx] = static_cast('a' + my_pe + pe + wg_id); + } + else if constexpr (std::is_floating_point::value) { + source_buf[idx] = static_cast(3.14 + my_pe + pe + wg_id); + } + else if constexpr (std::is_integral::value) { + source_buf[idx] = static_cast(my_pe + pe + wg_id); + } + } } } + + memset(dest_buf, -1, buff_size); } template void AlltoallTester::verifyResults(uint64_t size) { - int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD); - for (int i = 0; i < n_pes; i++) { - for (uint64_t j = 0; j < size; j++) { - auto r = verify_buf(dest_buf[i * size + j], i); - if (r.first == false) { - fprintf(stderr, "Data validation error at idx %lu\n", j); - fprintf(stderr, "%s.\n", r.second.c_str()); - exit(-1); + int num_elems = size / sizeof(T1); + int idx = 0; + + for(int wg_id = 0; wg_id < args.num_wgs; wg_id++) { + for(int pe = 0; pe < n_pes; pe++) { + for(int i = 0; i < num_elems; i++) { + idx = (wg_id * n_pes + pe) * num_elems + i; + if (dest_buf[idx] != source_buf[idx]) { + std::cerr << "Data validation error at idx " << idx << std::endl; + std::cerr << "PE " << my_pe << " Got " << dest_buf[idx] + << ", Expected " << source_buf[idx] << std::endl; + exit(-1); + } } } } diff --git a/tests/functional_tests/alltoall_tester.hpp b/tests/functional_tests/alltoall_tester.hpp index 2e08b3b..5dec092 100644 --- a/tests/functional_tests/alltoall_tester.hpp +++ b/tests/functional_tests/alltoall_tester.hpp @@ -28,15 +28,15 @@ #include "tester.hpp" +using namespace rocshmem; + /************* ***************************************************************** * HOST TESTER CLASS *****************************************************************************/ template class AlltoallTester : public Tester { public: - explicit AlltoallTester( - TesterArguments args, std::function f1, - std::function(const T1 &, T1)> f2); + explicit AlltoallTester(TesterArguments args); virtual ~AlltoallTester(); protected: @@ -51,12 +51,19 @@ class AlltoallTester : public Tester { virtual void verifyResults(uint64_t size) override; - T1 *source_buf; - T1 *dest_buf; + T1 *source_buf = nullptr; + T1 *dest_buf = nullptr; + +private: + int my_pe = 0; + int n_pes = 0; - private: - std::function init_buf; - std::function(const T1 &, T1)> verify_buf; + /** + * This constant should equal ROCSHMEM_MAX_NUM_TEAMS - 1. + * The default value for the maximum number of teams is 40. + */ + int num_teams = 39; + rocshmem_team_t *team_alltoall_world_dup; }; #include "alltoall_tester.cpp" diff --git a/tests/functional_tests/amo_bitwise_tester.cpp b/tests/functional_tests/amo_bitwise_tester.cpp index 426fa22..31df49b 100644 --- a/tests/functional_tests/amo_bitwise_tester.cpp +++ b/tests/functional_tests/amo_bitwise_tester.cpp @@ -29,8 +29,9 @@ using namespace rocshmem; /* Declare the global kernel template with a generic implementation */ template -__global__ void AMOBitwiseTest(int loop, int skip, uint64_t *timer, char *r_buf, - T *s_buf, T *ret_val, TestType type, +__global__ void AMOBitwiseTest(int loop, int skip, uint64_t *start_time, + uint64_t *end_time, char *r_buf, T *s_buf, + T *ret_val, TestType type, ShmemContextType ctx_type) { return; } @@ -64,8 +65,8 @@ void AMOBitwiseTester::launchKernel(dim3 gridsize, dim3 blocksize, int loop, size_t shared_bytes = 0; hipLaunchKernelGGL(AMOBitwiseTest, gridsize, blocksize, shared_bytes, stream, - loop, args.skip, timer, _r_buf, _s_buf, _ret_val, _type, - _shmem_context); + loop, args.skip, start_time, end_time, _r_buf, _s_buf, + _ret_val, _type, _shmem_context); _gridSize = gridsize; num_msgs = (loop + args.skip) * gridsize.x; @@ -123,17 +124,19 @@ void AMOBitwiseTester::verifyResults(uint64_t size) { #define AMO_BITWISE_DEF_GEN(T, TNAME) \ template <> \ __global__ void AMOBitwiseTest( \ - int loop, int skip, uint64_t *timer, char *r_buf, T *s_buf, T *ret_val, \ - TestType type, ShmemContextType ctx_type) { \ + int loop, int skip, uint64_t *start_time, uint64_t *end_time, \ + char *r_buf, T *s_buf, T *ret_val, TestType type, \ + ShmemContextType ctx_type) { \ __shared__ rocshmem_ctx_t ctx; \ rocshmem_wg_init(); \ rocshmem_wg_ctx_create(ctx_type, &ctx); \ if (hipThreadIdx_x == 0) { \ - uint64_t start; \ T ret = 0; \ T cond = 0; \ for (int i = 0; i < loop + skip; i++) { \ - if (i == skip) start = rocshmem_timer(); \ + if (i == skip) { \ + start_time[hipBlockIdx_x] = wall_clock64(); \ + } \ switch (type) { \ case AMO_FetchAndTestType: \ ret = rocshmem_ctx_##TNAME##_atomic_fetch_and(ctx, (T *)r_buf, \ @@ -161,7 +164,7 @@ void AMOBitwiseTester::verifyResults(uint64_t size) { } \ } \ rocshmem_ctx_quiet(ctx); \ - timer[hipBlockIdx_x] = rocshmem_timer() - start; \ + end_time[hipBlockIdx_x] = wall_clock64(); \ ret_val[hipBlockIdx_x] = ret; \ rocshmem_ctx_getmem(ctx, &s_buf[hipBlockIdx_x], r_buf, sizeof(T), 1); \ } \ diff --git a/tests/functional_tests/amo_extended_tester.cpp b/tests/functional_tests/amo_extended_tester.cpp index 44b3f08..ddbb5f0 100644 --- a/tests/functional_tests/amo_extended_tester.cpp +++ b/tests/functional_tests/amo_extended_tester.cpp @@ -29,9 +29,10 @@ using namespace rocshmem; /* Declare the global kernel template with a generic implementation */ template -__global__ void AMOExtendedTest(int loop, int skip, uint64_t *timer, - char *r_buf, T *s_buf, T *ret_val, - TestType type, ShmemContextType ctx_type) { +__global__ void AMOExtendedTest(int loop, int skip, uint64_t *start_time, + uint64_t *end_time, char *r_buf, T *s_buf, + T *ret_val, TestType type, + ShmemContextType ctx_type) { return; } @@ -40,22 +41,23 @@ __global__ void AMOExtendedTest(int loop, int skip, uint64_t *timer, *****************************************************************************/ template AMOExtendedTester::AMOExtendedTester(TesterArguments args) : Tester(args) { - CHECK_HIP(hipMalloc((void **)&_ret_val, args.max_msg_size * args.num_wgs)); - _r_buf = (char *)rocshmem_malloc(args.max_msg_size); - _s_buf = (T *)rocshmem_malloc(args.max_msg_size * args.num_wgs); + CHECK_HIP(hipMalloc((void **)&_ret_val, sizeof(T) * args.num_wgs)); + _r_buf = (char *)rocshmem_malloc(sizeof(T) * args.num_wgs); + _s_buf = (T *)rocshmem_malloc(sizeof(T) * args.num_wgs); } template AMOExtendedTester::~AMOExtendedTester() { + rocshmem_free(_s_buf); rocshmem_free(_r_buf); CHECK_HIP(hipFree(_ret_val)); } template void AMOExtendedTester::resetBuffers(uint64_t size) { - memset(_r_buf, 0, args.max_msg_size); - memset(_ret_val, 0, args.max_msg_size * args.num_wgs); - memset(_s_buf, 0, args.max_msg_size * args.num_wgs); + memset(_r_buf, 0, sizeof(T) * args.num_wgs); + memset(_ret_val, 0, sizeof(T) * args.num_wgs); + memset(_s_buf, 0, sizeof(T) * args.num_wgs); } template @@ -64,17 +66,18 @@ void AMOExtendedTester::launchKernel(dim3 gridsize, dim3 blocksize, int loop, size_t shared_bytes = 0; hipLaunchKernelGGL(AMOExtendedTest, gridsize, blocksize, shared_bytes, stream, - loop, args.skip, timer, _r_buf, _s_buf, _ret_val, _type, - _shmem_context); + loop, args.skip, start_time, end_time, _r_buf, _s_buf, + _ret_val, _type, _shmem_context); - _gridSize = gridsize; num_msgs = (loop + args.skip) * gridsize.x; - num_timed_msgs = loop; + num_timed_msgs = loop * gridsize.x; + + total_msgs = loop + args.skip; } template void AMOExtendedTester::verifyResults(uint64_t size) { - T ret; + T *res; if (args.myid == 0) { T expected_val = 0; @@ -86,7 +89,7 @@ void AMOExtendedTester::verifyResults(uint64_t size) { expected_val = 44; break; case AMO_SwapTestType: - expected_val = num_msgs / 2; + expected_val = total_msgs / 2; break; default: break; @@ -95,15 +98,13 @@ void AMOExtendedTester::verifyResults(uint64_t size) { int fetch_op = (_type == AMO_FetchTestType || _type == AMO_SwapTestType) ? 1 : 0; - if (fetch_op == 1) { - ret = *std::max_element(_ret_val, _ret_val + args.num_wgs); - } else { - ret = *std::max_element(_s_buf, _s_buf + args.num_wgs); - } - if (ret != expected_val) { - std::cerr << "data validation error\n"; - std::cerr << "got " << ret << ", expected " << expected_val << std::endl; - exit(-1); + res = (fetch_op == 1) ? _ret_val : _s_buf; + for (int i = 0; i < args.num_wgs; i++) { + if (res[i] != expected_val) { + std::cerr << "data validation error\n"; + std::cerr << "got " << res[i] << ", expected " << expected_val << std::endl; + exit(-1); + } } } } @@ -111,17 +112,21 @@ void AMOExtendedTester::verifyResults(uint64_t size) { #define AMO_EXTENDED_DEF_GEN(T, TNAME) \ template <> \ __global__ void AMOExtendedTest( \ - int loop, int skip, uint64_t *timer, char *r_buf, T *s_buf, T *ret_val, \ - TestType type, ShmemContextType ctx_type) { \ + int loop, int skip, uint64_t *start_time, uint64_t *end_time, \ + char *r_buf, T *s_buf, T *ret_val, TestType type, \ + ShmemContextType ctx_type) { \ __shared__ rocshmem_ctx_t ctx; \ rocshmem_wg_init(); \ rocshmem_wg_ctx_create(ctx_type, &ctx); \ + int wg_id = get_flat_grid_id(); \ + r_buf = (char *)((T *)r_buf + wg_id); \ if (hipThreadIdx_x == 0) { \ - uint64_t start; \ T ret = 0; \ T cond = 0; \ for (int i = 0; i < loop + skip; i++) { \ - if (i == skip) start = rocshmem_timer(); \ + if (i == skip) { \ + start_time[wg_id] = wall_clock64(); \ + } \ switch (type) { \ case AMO_FetchTestType: \ ret = rocshmem_ctx_##TNAME##_atomic_fetch(ctx, (T *)r_buf, 1); \ @@ -138,9 +143,9 @@ void AMOExtendedTester::verifyResults(uint64_t size) { } \ } \ rocshmem_ctx_quiet(ctx); \ - timer[hipBlockIdx_x] = rocshmem_timer() - start; \ - ret_val[hipBlockIdx_x] = ret; \ - rocshmem_ctx_getmem(ctx, &s_buf[hipBlockIdx_x], r_buf, sizeof(T), 1); \ + end_time[wg_id] = wall_clock64(); \ + ret_val[wg_id] = ret; \ + rocshmem_ctx_getmem(ctx, &s_buf[wg_id], r_buf, sizeof(T), 1); \ } \ rocshmem_wg_ctx_destroy(&ctx); \ rocshmem_wg_finalize(); \ diff --git a/tests/functional_tests/amo_extended_tester.hpp b/tests/functional_tests/amo_extended_tester.hpp index 20ea489..a0fb2b7 100644 --- a/tests/functional_tests/amo_extended_tester.hpp +++ b/tests/functional_tests/amo_extended_tester.hpp @@ -42,10 +42,10 @@ class AMOExtendedTester : public Tester { virtual void verifyResults(uint64_t size) override; - dim3 _gridSize{}; char *_r_buf; T *_ret_val; T *_s_buf; + int total_msgs = 0; }; #endif diff --git a/tests/functional_tests/amo_standard_tester.cpp b/tests/functional_tests/amo_standard_tester.cpp index 93009b0..aa7ac1c 100644 --- a/tests/functional_tests/amo_standard_tester.cpp +++ b/tests/functional_tests/amo_standard_tester.cpp @@ -29,9 +29,10 @@ using namespace rocshmem; /* Declare the global kernel template with a generic implementation */ template -__global__ void AMOStandardTest(int loop, int skip, uint64_t *timer, - char *r_buf, T *s_buf, T *ret_val, - TestType type, ShmemContextType ctx_type) { +__global__ void AMOStandardTest(int loop, int skip, uint64_t *start_time, + uint64_t *end_time, char *r_buf, T *s_buf, + T *ret_val, TestType type, + ShmemContextType ctx_type) { return; } @@ -40,22 +41,23 @@ __global__ void AMOStandardTest(int loop, int skip, uint64_t *timer, *****************************************************************************/ template AMOStandardTester::AMOStandardTester(TesterArguments args) : Tester(args) { - CHECK_HIP(hipMalloc((void **)&_ret_val, args.max_msg_size * args.num_wgs)); - _r_buf = (char *)rocshmem_malloc(args.max_msg_size); - _s_buf = (T *)rocshmem_malloc(args.max_msg_size * args.num_wgs); + CHECK_HIP(hipMalloc((void **)&_ret_val, sizeof(uint64_t) * args.num_wgs)); + _r_buf = (char *)rocshmem_malloc(sizeof(uint64_t) * args.num_wgs); + _s_buf = (T *)rocshmem_malloc(sizeof(uint64_t) * args.num_wgs); } template AMOStandardTester::~AMOStandardTester() { + rocshmem_free(_s_buf); rocshmem_free(_r_buf); CHECK_HIP(hipFree(_ret_val)); } template void AMOStandardTester::resetBuffers(uint64_t size) { - memset(_r_buf, 0, args.max_msg_size); - memset(_ret_val, 0, args.max_msg_size * args.num_wgs); - memset(_s_buf, 0, args.max_msg_size * args.num_wgs); + memset(_r_buf, 0, sizeof(uint64_t) * args.num_wgs); + memset(_ret_val, 0, sizeof(uint64_t) * args.num_wgs); + memset(_s_buf, 0, sizeof(uint64_t) * args.num_wgs); } template @@ -64,35 +66,36 @@ void AMOStandardTester::launchKernel(dim3 gridsize, dim3 blocksize, int loop, size_t shared_bytes = 0; hipLaunchKernelGGL(AMOStandardTest, gridsize, blocksize, shared_bytes, stream, - loop, args.skip, timer, _r_buf, _s_buf, _ret_val, _type, - _shmem_context); + loop, args.skip, start_time, end_time, _r_buf, _s_buf, + _ret_val, _type, _shmem_context); - _gridSize = gridsize; num_msgs = (loop + args.skip) * gridsize.x; - num_timed_msgs = loop; + num_timed_msgs = loop * gridsize.x; + + total_msgs = loop + args.skip; } template void AMOStandardTester::verifyResults(uint64_t size) { - T ret; + T *res; if (args.myid == 0) { T expected_val = 0; switch (_type) { case AMO_FAddTestType: - expected_val = 2 * (num_msgs - 1); + expected_val = 2 * (total_msgs - 1); break; case AMO_FIncTestType: - expected_val = num_msgs - 1; + expected_val = total_msgs - 1; break; case AMO_AddTestType: - expected_val = 2 * num_msgs; + expected_val = 2 * total_msgs; break; case AMO_IncTestType: - expected_val = num_msgs; + expected_val = total_msgs; break; case AMO_FCswapTestType: - expected_val = (num_msgs - 2) / _gridSize.x; + expected_val = (total_msgs - 2); break; default: break; @@ -103,15 +106,13 @@ void AMOStandardTester::verifyResults(uint64_t size) { ? 1 : 0; - if (fetch_op == 1) { - ret = *std::max_element(_ret_val, _ret_val + args.num_wgs); - } else { - ret = *std::max_element(_s_buf, _s_buf + args.num_wgs); - } - if (ret != expected_val) { - std::cerr << "data validation error\n"; - std::cerr << "got " << ret << ", expected " << expected_val << std::endl; - exit(-1); + res = (fetch_op == 1) ? _ret_val : _s_buf; + for (int i = 0; i < args.num_wgs; i++) { + if (res[i] != expected_val) { + std::cerr << "data validation error\n"; + std::cerr << "got " << res[i] << ", expected " << expected_val << std::endl; + exit(-1); + } } } } @@ -119,17 +120,21 @@ void AMOStandardTester::verifyResults(uint64_t size) { #define AMO_STANDARD_DEF_GEN(T, TNAME) \ template <> \ __global__ void AMOStandardTest( \ - int loop, int skip, uint64_t *timer, char *r_buf, T *s_buf, T *ret_val, \ - TestType type, ShmemContextType ctx_type) { \ + int loop, int skip, uint64_t *start_time, uint64_t *end_time, \ + char *r_buf, T *s_buf, T *ret_val, TestType type, \ + ShmemContextType ctx_type) { \ __shared__ rocshmem_ctx_t ctx; \ rocshmem_wg_init(); \ rocshmem_wg_ctx_create(ctx_type, &ctx); \ + int wg_id = get_flat_grid_id(); \ + r_buf = (char *)((T *)r_buf + wg_id); \ if (hipThreadIdx_x == 0) { \ - uint64_t start; \ T ret = 0; \ T cond = 0; \ for (int i = 0; i < loop + skip; i++) { \ - if (i == skip) start = rocshmem_timer(); \ + if (i == skip) { \ + start_time[wg_id] = wall_clock64(); \ + } \ switch (type) { \ case AMO_FAddTestType: \ ret = rocshmem_ctx_##TNAME##_atomic_fetch_add(ctx, (T *)r_buf, 2, \ @@ -155,9 +160,9 @@ void AMOStandardTester::verifyResults(uint64_t size) { } \ } \ rocshmem_ctx_quiet(ctx); \ - timer[hipBlockIdx_x] = rocshmem_timer() - start; \ - ret_val[hipBlockIdx_x] = ret; \ - rocshmem_ctx_getmem(ctx, &s_buf[hipBlockIdx_x], r_buf, sizeof(T), 1); \ + end_time[wg_id] = wall_clock64(); \ + ret_val[wg_id] = ret; \ + rocshmem_ctx_getmem(ctx, &s_buf[wg_id], r_buf, sizeof(T), 1); \ } \ rocshmem_wg_ctx_destroy(&ctx); \ rocshmem_wg_finalize(); \ diff --git a/tests/functional_tests/amo_standard_tester.hpp b/tests/functional_tests/amo_standard_tester.hpp index 61c3da8..33e1c18 100644 --- a/tests/functional_tests/amo_standard_tester.hpp +++ b/tests/functional_tests/amo_standard_tester.hpp @@ -42,10 +42,10 @@ class AMOStandardTester : public Tester { virtual void verifyResults(uint64_t size) override; - dim3 _gridSize{}; char *_r_buf; T *_ret_val; T *_s_buf; + int total_msgs = 0; }; #endif diff --git a/tests/functional_tests/barrier_all_tester.cpp b/tests/functional_tests/barrier_all_tester.cpp index 54faaa9..151d118 100644 --- a/tests/functional_tests/barrier_all_tester.cpp +++ b/tests/functional_tests/barrier_all_tester.cpp @@ -29,26 +29,27 @@ using namespace rocshmem; /****************************************************************************** * DEVICE TEST KERNEL *****************************************************************************/ -__global__ void BarrierAllTest(int loop, int skip, uint64_t *timer) { +__global__ void BarrierAllTest(int loop, int skip, uint64_t *start_time, + uint64_t *end_time) { __shared__ rocshmem_ctx_t ctx; rocshmem_wg_init(); rocshmem_wg_ctx_create(ROCSHMEM_CTX_WG_PRIVATE, &ctx); - uint64_t start; for (int i = 0; i < loop + skip; i++) { if (hipThreadIdx_x == 0 && i == skip) { - start = rocshmem_timer(); + start_time[hipBlockIdx_x] = wall_clock64(); } __syncthreads(); - rocshmem_ctx_wg_barrier_all(ctx); + if (is_block_zero_in_grid()) + rocshmem_ctx_wg_barrier_all(ctx); } __syncthreads(); if (hipThreadIdx_x == 0) { - timer[hipBlockIdx_x] = rocshmem_timer() - start; + end_time[hipBlockIdx_x] = wall_clock64(); } rocshmem_wg_ctx_destroy(&ctx); @@ -67,9 +68,9 @@ void BarrierAllTester::launchKernel(dim3 gridSize, dim3 blockSize, int loop, size_t shared_bytes = 0; hipLaunchKernelGGL(BarrierAllTest, gridSize, blockSize, shared_bytes, stream, - loop, args.skip, timer); + loop, args.skip, start_time, end_time); - num_msgs = (loop + args.skip) * gridSize.x; + num_msgs = loop + args.skip; num_timed_msgs = loop; } diff --git a/tests/functional_tests/barrier_all_tester.hpp b/tests/functional_tests/barrier_all_tester.hpp index da864b7..1ddd511 100644 --- a/tests/functional_tests/barrier_all_tester.hpp +++ b/tests/functional_tests/barrier_all_tester.hpp @@ -24,6 +24,7 @@ #define _BARRIER_ALL_TESTER_HPP_ #include "tester.hpp" +#include "../src/util.hpp" /****************************************************************************** * DEVICE TEST KERNEL diff --git a/tests/functional_tests/empty_tester.cpp b/tests/functional_tests/empty_tester.cpp index 5bc9fac..6ffd5ae 100644 --- a/tests/functional_tests/empty_tester.cpp +++ b/tests/functional_tests/empty_tester.cpp @@ -29,8 +29,9 @@ using namespace rocshmem; /****************************************************************************** * DEVICE TEST KERNEL *****************************************************************************/ -__global__ void EmptyTest(int loop, int skip, uint64_t *timer, int size, - TestType type, ShmemContextType ctx_type) { +__global__ void EmptyTest(int loop, int skip, uint64_t *start_time, + uint64_t *end_time, int size, TestType type, + ShmemContextType ctx_type) { __shared__ rocshmem_ctx_t ctx; rocshmem_wg_init(); rocshmem_wg_ctx_create(ctx_type, &ctx); @@ -52,8 +53,9 @@ void EmptyTester::launchKernel(dim3 gridSize, dim3 blockSize, int loop, uint64_t size) { size_t shared_bytes = 0; - hipLaunchKernelGGL(EmptyTest, gridSize, blockSize, shared_bytes, stream, loop, - args.skip, timer, size, _type, _shmem_context); + hipLaunchKernelGGL(EmptyTest, gridSize, blockSize, shared_bytes, stream, + loop, args.skip, start_time, end_time, size, _type, + _shmem_context); } void EmptyTester::verifyResults(uint64_t size) {} diff --git a/tests/functional_tests/extended_primitives.cpp b/tests/functional_tests/extended_primitives.cpp index d3a4a7b..bf57f99 100644 --- a/tests/functional_tests/extended_primitives.cpp +++ b/tests/functional_tests/extended_primitives.cpp @@ -31,26 +31,25 @@ using namespace rocshmem; /****************************************************************************** * DEVICE TEST KERNEL *****************************************************************************/ -__global__ void ExtendedPrimitiveTest(int loop, int skip, uint64_t *timer, - char *s_buf, char *r_buf, int size, - TestType type, +__global__ void ExtendedPrimitiveTest(int loop, int skip, uint64_t *start_time, + uint64_t *end_time, char *s_buf, + char *r_buf, int size, TestType type, ShmemContextType ctx_type) { __shared__ rocshmem_ctx_t ctx; rocshmem_wg_init(); rocshmem_wg_ctx_create(ctx_type, &ctx); - /** - * Calculate start index for each work group for tiled version - * If the number of work groups is greater than 1, this kernel performs a - * tiled functional test - */ - uint64_t start; - uint64_t idx = size * get_flat_grid_id(); - s_buf += idx; - r_buf += idx; + + // Calculate start index for each work group + int wg_id = get_flat_grid_id(); + uint64_t offset = size * wg_id; + s_buf += offset; + r_buf += offset; for (int i = 0; i < loop + skip; i++) { - if (i == skip) start = rocshmem_timer(); + if (i == skip) { + start_time[wg_id] = wall_clock64(); + } switch (type) { case WGGetTestType: @@ -73,7 +72,7 @@ __global__ void ExtendedPrimitiveTest(int loop, int skip, uint64_t *timer, rocshmem_ctx_quiet(ctx); if (hipThreadIdx_x == 0) { - timer[hipBlockIdx_x] = rocshmem_timer() - start; + end_time[wg_id] = wall_clock64(); } rocshmem_wg_ctx_destroy(&ctx); @@ -105,8 +104,8 @@ void ExtendedPrimitiveTester::launchKernel(dim3 gridSize, dim3 blockSize, size_t shared_bytes = 0; hipLaunchKernelGGL(ExtendedPrimitiveTest, gridSize, blockSize, shared_bytes, - stream, loop, args.skip, timer, (char*)s_buf, - (char*)r_buf, size, _type, _shmem_context); + stream, loop, args.skip, start_time, end_time, + (char*)s_buf, (char*)r_buf, size, _type, _shmem_context); num_msgs = (loop + args.skip) * gridSize.x; num_timed_msgs = loop * gridSize.x; diff --git a/tests/functional_tests/extended_primitives.hpp b/tests/functional_tests/extended_primitives.hpp index 76225f0..090be45 100644 --- a/tests/functional_tests/extended_primitives.hpp +++ b/tests/functional_tests/extended_primitives.hpp @@ -24,7 +24,6 @@ #define _EXTENDED_PRIMITIVES_HPP_ #include "tester.hpp" -#include "../src/util.hpp" /****************************************************************************** * HOST TESTER CLASS diff --git a/tests/functional_tests/fcollect_tester.cpp b/tests/functional_tests/fcollect_tester.cpp index 3eaa8fb..2f8503a 100644 --- a/tests/functional_tests/fcollect_tester.cpp +++ b/tests/functional_tests/fcollect_tester.cpp @@ -20,10 +20,6 @@ * IN THE SOFTWARE. *****************************************************************************/ -using namespace rocshmem; - -rocshmem_team_t team_fcollect_world_dup; - /* Declare the template with a generic implementation */ template __device__ void wg_fcollect(rocshmem_ctx_t ctx, rocshmem_team_t team, T *dest, @@ -58,23 +54,27 @@ FCOLLECT_DEF_GEN(unsigned long long, ulonglong) * DEVICE TEST KERNEL *****************************************************************************/ template -__global__ void FcollectTest(int loop, int skip, uint64_t *timer, - T1 *source_buf, T1 *dest_buf, int size, - ShmemContextType ctx_type, rocshmem_team_t team) { +__global__ void FcollectTest(int loop, int skip, uint64_t *start_time, + uint64_t *end_time, T1 *source_buf, T1 *dest_buf, + int size, ShmemContextType ctx_type, + rocshmem_team_t *teams) { __shared__ rocshmem_ctx_t ctx; + int wg_id = get_flat_grid_id(); rocshmem_wg_init(); - rocshmem_wg_ctx_create(ctx_type, &ctx); + rocshmem_wg_team_create_ctx(teams[wg_id], ctx_type, &ctx); int n_pes = rocshmem_ctx_n_pes(ctx); + source_buf += wg_id * size; + dest_buf += wg_id * size * n_pes; + __syncthreads(); - uint64_t start; for (int i = 0; i < loop + skip; i++) { if (i == skip && hipThreadIdx_x == 0) { - start = rocshmem_timer(); + start_time[wg_id] = wall_clock64(); } - wg_fcollect(ctx, team, + wg_fcollect(ctx, teams[wg_id], dest_buf, // T* dest source_buf, // const T* source size); // int nelement @@ -83,7 +83,7 @@ __global__ void FcollectTest(int loop, int skip, uint64_t *timer, __syncthreads(); if (hipThreadIdx_x == 0) { - timer[hipBlockIdx_x] = rocshmem_timer() - start; + end_time[wg_id] = wall_clock64(); } rocshmem_wg_ctx_destroy(&ctx); @@ -94,29 +94,65 @@ __global__ void FcollectTest(int loop, int skip, uint64_t *timer, * HOST TESTER CLASS METHODS *****************************************************************************/ template -FcollectTester::FcollectTester( - TesterArguments args, std::function f1, - std::function(const T1 &, T1)> f2) - : Tester(args), init_buf{f1}, verify_buf{f2} { +FcollectTester::FcollectTester(TesterArguments args) + : Tester(args) { + int my_pe = rocshmem_team_my_pe(ROCSHMEM_TEAM_WORLD); int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD); - source_buf = (T1 *)rocshmem_malloc(args.max_msg_size * sizeof(T1)); - dest_buf = (T1 *)rocshmem_malloc(args.max_msg_size * sizeof(T1) * n_pes); + + int num_elems = (args.max_msg_size / sizeof(T1)) * args.num_wgs ; + int buff_size = num_elems * sizeof(T1); + + source_buf = (T1 *)rocshmem_malloc(buff_size); + dest_buf = (T1 *)rocshmem_malloc(buff_size * n_pes); + + if constexpr (std::is_same::value || + std::is_same::value || + std::is_same::value) { + for (int i = 0; i < num_elems; ++i) { + source_buf[i] = static_cast('a' + my_pe); + } + } + else if constexpr (std::is_floating_point::value) { + for (int i = 0; i < num_elems; ++i) { + source_buf[i] = static_cast(3.14 + my_pe); + } + } + else if constexpr (std::is_integral::value) { + for (int i = 0; i < num_elems; i++) { + source_buf[i] = static_cast(my_pe); + } + } + + char* value{nullptr}; + if ((value = getenv("ROCSHMEM_MAX_NUM_TEAMS"))) { + num_teams = atoi(value); + } + + CHECK_HIP(hipMalloc(&team_fcollect_world_dup, + sizeof(rocshmem_team_t) * num_teams)); } template FcollectTester::~FcollectTester() { rocshmem_free(source_buf); rocshmem_free(dest_buf); + CHECK_HIP(hipFree(team_fcollect_world_dup)); } template void FcollectTester::preLaunchKernel() { int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD); - bw_factor = sizeof(T1) * n_pes; - - team_fcollect_world_dup = ROCSHMEM_TEAM_INVALID; - rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 1, n_pes, nullptr, 0, - &team_fcollect_world_dup); + bw_factor = n_pes; + + for (int team_i = 0; team_i < num_teams; team_i++) { + team_fcollect_world_dup[team_i] = ROCSHMEM_TEAM_INVALID; + rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 1, n_pes, nullptr, 0, + &team_fcollect_world_dup[team_i]); + if (team_fcollect_world_dup[team_i] == ROCSHMEM_TEAM_INVALID) { + std::cout << "Team " << team_i << " is invalid!" << std::endl; + abort(); + } + } } template @@ -124,42 +160,66 @@ void FcollectTester::launchKernel(dim3 gridSize, dim3 blockSize, int loop, uint64_t size) { size_t shared_bytes = 0; + int num_elems = size / sizeof(T1); + + int my_pe = rocshmem_team_my_pe(ROCSHMEM_TEAM_WORLD); + int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD); + hipLaunchKernelGGL(FcollectTest, gridSize, blockSize, shared_bytes, - stream, loop, args.skip, timer, source_buf, dest_buf, size, + stream, loop, args.skip, start_time, end_time, + source_buf, dest_buf, num_elems, _shmem_context, team_fcollect_world_dup); - num_msgs = loop + args.skip; - num_timed_msgs = loop; + num_msgs = (loop + args.skip) * gridSize.x; + num_timed_msgs = loop * gridSize.x; } template void FcollectTester::postLaunchKernel() { - rocshmem_team_destroy(team_fcollect_world_dup); + for (int team_i = 0; team_i < num_teams; team_i++) { + rocshmem_team_destroy(team_fcollect_world_dup[team_i]); + } } template void FcollectTester::resetBuffers(uint64_t size) { int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD); - for (int i = 0; i < n_pes; i++) { - for (uint64_t j = 0; j < size; j++) { - // Note: This is redundant work, - // source is being reinitialized multiple times - init_buf(source_buf[j], dest_buf[i * size + j]); - } - } + int num_elems = (size / sizeof(T1)); + int buff_size = num_elems * sizeof(T1) * args.num_wgs * n_pes; + + memset(dest_buf, -1, buff_size); } template void FcollectTester::verifyResults(uint64_t size) { + int my_pe = rocshmem_team_my_pe(ROCSHMEM_TEAM_WORLD); int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD); - for (int i = 0; i < n_pes; i++) { - for (uint64_t j = 0; j < size; j++) { - auto r = verify_buf(dest_buf[i * size + j], i); - if (r.first == false) { - fprintf(stderr, "Data validation error at idx %lu\n", j); - fprintf(stderr, "%s.\n", r.second.c_str()); - // exit(-1); - return; + + int num_elems = size / sizeof(T1); + int idx = 0; + T1 expected; + + for(int wg_id = 0; wg_id < args.num_wgs; wg_id++) { + for(int pe = 0; pe < n_pes; pe++) { + for(int i = 0; i < num_elems; i++) { + idx = (wg_id * n_pes + pe) * num_elems + i; + if constexpr (std::is_same::value || + std::is_same::value || + std::is_same::value) { + expected = static_cast('a' + pe); + } + else if constexpr (std::is_floating_point::value) { + expected = static_cast(3.14 + pe); + } + else if constexpr (std::is_integral::value) { + expected = pe; + } + if (dest_buf[idx] != expected) { + std::cerr << "Data validation error at idx " << idx << std::endl; + std::cerr << "PE " << my_pe << " Got " << dest_buf[idx] + << ", Expected " << expected << std::endl; + exit(-1); + } } } } diff --git a/tests/functional_tests/fcollect_tester.hpp b/tests/functional_tests/fcollect_tester.hpp index d7cb73a..ffe3873 100644 --- a/tests/functional_tests/fcollect_tester.hpp +++ b/tests/functional_tests/fcollect_tester.hpp @@ -25,18 +25,20 @@ #include #include +#include +#include #include "tester.hpp" +using namespace rocshmem; + /************* ***************************************************************** * HOST TESTER CLASS *****************************************************************************/ template class FcollectTester : public Tester { public: - explicit FcollectTester( - TesterArguments args, std::function f1, - std::function(const T1 &, T1)> f2); + explicit FcollectTester(TesterArguments args); virtual ~FcollectTester(); protected: @@ -54,9 +56,13 @@ class FcollectTester : public Tester { T1 *source_buf; T1 *dest_buf; - private: - std::function init_buf; - std::function(const T1 &, T1)> verify_buf; +private: + /** + * This constant should equal ROCSHMEM_MAX_NUM_TEAMS - 1. + * The default value for the maximum number of teams is 40. + */ + int num_teams = 39; + rocshmem_team_t *team_fcollect_world_dup; }; #include "fcollect_tester.cpp" diff --git a/tests/functional_tests/ping_all_tester.cpp b/tests/functional_tests/ping_all_tester.cpp index 858882c..5f0b5f2 100644 --- a/tests/functional_tests/ping_all_tester.cpp +++ b/tests/functional_tests/ping_all_tester.cpp @@ -29,7 +29,8 @@ using namespace rocshmem; /****************************************************************************** * DEVICE TEST KERNEL *****************************************************************************/ -__global__ void PingAllTest(int loop, int skip, uint64_t *timer, int *r_buf, +__global__ void PingAllTest(int loop, int skip, uint64_t *start_time, + uint64_t *end_time, int *r_buf, ShmemContextType ctx_type) { __shared__ rocshmem_ctx_t ctx; @@ -38,18 +39,18 @@ __global__ void PingAllTest(int loop, int skip, uint64_t *timer, int *r_buf, int pe = rocshmem_ctx_my_pe(ctx); int num_pe = rocshmem_ctx_n_pes(ctx); + int wg_id = get_flat_grid_id(); int status[1024]; for (int j{0}; j < num_pe; j++) { status[j] = 0; } if (hipThreadIdx_x == 0) { - uint64_t start; auto blk_pe_off {hipBlockIdx_x * num_pe}; for (int i = 0; i < loop + skip; i++) { if (i == skip) { - start = rocshmem_timer(); + start_time[wg_id] = wall_clock64(); } for (int j{0}; j < num_pe; j++) { @@ -57,7 +58,7 @@ __global__ void PingAllTest(int loop, int skip, uint64_t *timer, int *r_buf, } rocshmem_int_wait_until_all(&r_buf[blk_pe_off], num_pe, status, ROCSHMEM_CMP_EQ, 1); } - timer[hipBlockIdx_x] = rocshmem_timer() - start; + end_time[wg_id] = wall_clock64(); } rocshmem_wg_ctx_destroy(&ctx); rocshmem_wg_finalize(); @@ -83,10 +84,11 @@ void PingAllTester::launchKernel(dim3 gridSize, dim3 blockSize, int loop, size_t shared_bytes = 0; hipLaunchKernelGGL(PingAllTest, gridSize, blockSize, shared_bytes, stream, - loop, args.skip, timer, r_buf, _shmem_context); + loop, args.skip, start_time, end_time, r_buf, + _shmem_context); num_msgs = (loop + args.skip) * gridSize.x; - num_timed_msgs = loop; + num_timed_msgs = loop * gridSize.x; } void PingAllTester::verifyResults(uint64_t size) {} diff --git a/tests/functional_tests/ping_all_tester.hpp b/tests/functional_tests/ping_all_tester.hpp index 33f5f19..5d552f7 100644 --- a/tests/functional_tests/ping_all_tester.hpp +++ b/tests/functional_tests/ping_all_tester.hpp @@ -28,7 +28,8 @@ /****************************************************************************** * DEVICE TEST KERNEL *****************************************************************************/ -__global__ void PingAllTest(int loop, int skip, uint64_t *timer, int *r_buf); +__global__ void PingAllTest(int loop, int skip, uint64_t *start_time, + uint64_t *end_time, int *r_buf); /****************************************************************************** * HOST TESTER CLASS diff --git a/tests/functional_tests/ping_pong_tester.cpp b/tests/functional_tests/ping_pong_tester.cpp index f0e6ebb..6cc4f4a 100644 --- a/tests/functional_tests/ping_pong_tester.cpp +++ b/tests/functional_tests/ping_pong_tester.cpp @@ -29,21 +29,22 @@ using namespace rocshmem; /****************************************************************************** * DEVICE TEST KERNEL *****************************************************************************/ -__global__ void PingPongTest(int loop, int skip, uint64_t *timer, int *r_buf, +__global__ void PingPongTest(int loop, int skip, uint64_t *start_time, + uint64_t *end_time, int *r_buf, ShmemContextType ctx_type) { __shared__ rocshmem_ctx_t ctx; rocshmem_wg_init(); rocshmem_wg_ctx_create(ctx_type, &ctx); + int wg_id = get_flat_grid_id(); int pe = rocshmem_ctx_my_pe(ctx); if (hipThreadIdx_x == 0) { - uint64_t start; for (int i = 0; i < loop + skip; i++) { if (i == skip) { - start = rocshmem_timer(); + start_time[wg_id] = wall_clock64(); } if (pe == 0) { @@ -56,7 +57,7 @@ __global__ void PingPongTest(int loop, int skip, uint64_t *timer, int *r_buf, rocshmem_ctx_int_p(ctx, &r_buf[hipBlockIdx_x], i + 1, 0); } } - timer[hipBlockIdx_x] = rocshmem_timer() - start; + end_time[wg_id] = wall_clock64(); } rocshmem_wg_ctx_destroy(&ctx); rocshmem_wg_finalize(); @@ -80,7 +81,8 @@ void PingPongTester::launchKernel(dim3 gridSize, dim3 blockSize, int loop, size_t shared_bytes = 0; hipLaunchKernelGGL(PingPongTest, gridSize, blockSize, shared_bytes, stream, - loop, args.skip, timer, r_buf, _shmem_context); + loop, args.skip, start_time, end_time, r_buf, + _shmem_context); num_msgs = (loop + args.skip) * gridSize.x; num_timed_msgs = loop; diff --git a/tests/functional_tests/ping_pong_tester.hpp b/tests/functional_tests/ping_pong_tester.hpp index 0669e78..b893bf0 100644 --- a/tests/functional_tests/ping_pong_tester.hpp +++ b/tests/functional_tests/ping_pong_tester.hpp @@ -28,7 +28,8 @@ /****************************************************************************** * DEVICE TEST KERNEL *****************************************************************************/ -__global__ void PingPongTest(int loop, int skip, uint64_t *timer, int *r_buf); +__global__ void PingPongTest(int loop, int skip, uint64_t *start_time, + uint64_t *end_time, int *r_buf); /****************************************************************************** * HOST TESTER CLASS diff --git a/tests/functional_tests/primitive_mr_tester.cpp b/tests/functional_tests/primitive_mr_tester.cpp index bec7a1c..9d43aa5 100644 --- a/tests/functional_tests/primitive_mr_tester.cpp +++ b/tests/functional_tests/primitive_mr_tester.cpp @@ -29,17 +29,17 @@ using namespace rocshmem; /****************************************************************************** * DEVICE TEST KERNEL *****************************************************************************/ -__global__ void PrimitiveMRTest(int loop, uint64_t *timer, char *s_buf, - char *r_buf, int size, - ShmemContextType ctx_type) { +__global__ void PrimitiveMRTest(int loop, uint64_t *start_time, + uint64_t *end_time, char *s_buf, char *r_buf, + int size, ShmemContextType ctx_type) { __shared__ rocshmem_ctx_t ctx; rocshmem_wg_init(); rocshmem_wg_ctx_create(ctx_type, &ctx); + int wg_id = get_flat_grid_id(); if (hipThreadIdx_x == 0) { - uint64_t start; - start = rocshmem_timer(); + start_time[wg_id] = wall_clock64(); for (int win_i = 0; win_i < 64 * loop; win_i++) { for (int i = 0; i < 64; i++) { @@ -48,7 +48,7 @@ __global__ void PrimitiveMRTest(int loop, uint64_t *timer, char *s_buf, rocshmem_ctx_quiet(ctx); } - timer[hipBlockIdx_x] = rocshmem_timer() - start; + end_time[wg_id] = wall_clock64(); } __syncthreads(); @@ -80,12 +80,14 @@ void PrimitiveMRTester::launchKernel(dim3 gridSize, dim3 blockSize, int loop, size_t shared_bytes = 0; /* Warmup */ - hipLaunchKernelGGL(PrimitiveMRTest, gridSize, blockSize, shared_bytes, stream, - loop, timer, s_buf, r_buf, size, _shmem_context); + hipLaunchKernelGGL(PrimitiveMRTest, gridSize, blockSize, shared_bytes, + stream, loop, start_time, end_time, s_buf, r_buf, size, + _shmem_context); /* Benchmark */ - hipLaunchKernelGGL(PrimitiveMRTest, gridSize, blockSize, shared_bytes, stream, - loop, timer, s_buf, r_buf, size, _shmem_context); + hipLaunchKernelGGL(PrimitiveMRTest, gridSize, blockSize, shared_bytes, + stream, loop, start_time, end_time, s_buf, r_buf, size, + _shmem_context); CHECK_HIP(hipDeviceSynchronize()); diff --git a/tests/functional_tests/primitive_tester.cpp b/tests/functional_tests/primitive_tester.cpp index 1b66664..a5d58e3 100644 --- a/tests/functional_tests/primitive_tester.cpp +++ b/tests/functional_tests/primitive_tester.cpp @@ -29,19 +29,26 @@ using namespace rocshmem; /****************************************************************************** * DEVICE TEST KERNEL *****************************************************************************/ -__global__ void PrimitiveTest(int loop, int skip, uint64_t *timer, char *s_buf, - char *r_buf, int size, TestType type, +__global__ void PrimitiveTest(int loop, int skip, uint64_t *start_time, + uint64_t *end_time, char *s_buf, char *r_buf, + int size, TestType type, ShmemContextType ctx_type) { __shared__ rocshmem_ctx_t ctx; rocshmem_wg_init(); rocshmem_wg_ctx_create(ctx_type, &ctx); + int wg_id = get_flat_grid_id(); - uint64_t start; + /** + * Calculate start index for each thread within the grid + */ + uint64_t idx = size * get_flat_id(); + s_buf += idx; + r_buf += idx; for (int i = 0; i < loop + skip; i++) { if (i == skip) { __syncthreads(); - start = rocshmem_timer(); + start_time[wg_id] = wall_clock64(); } switch (type) { @@ -79,7 +86,7 @@ __global__ void PrimitiveTest(int loop, int skip, uint64_t *timer, char *s_buf, __syncthreads(); if (hipThreadIdx_x == 0) { - timer[hipBlockIdx_x] = rocshmem_timer() - start; + end_time[wg_id] = wall_clock64(); } rocshmem_wg_ctx_destroy(&ctx); @@ -90,8 +97,9 @@ __global__ void PrimitiveTest(int loop, int skip, uint64_t *timer, char *s_buf, * HOST TESTER CLASS METHODS *****************************************************************************/ PrimitiveTester::PrimitiveTester(TesterArguments args) : Tester(args) { - s_buf = (char *)rocshmem_malloc(args.max_msg_size * args.wg_size); - r_buf = (char *)rocshmem_malloc(args.max_msg_size * args.wg_size); + size_t buff_size = args.max_msg_size * args.wg_size * args.num_wgs; + s_buf = (char *)rocshmem_malloc(buff_size); + r_buf = (char *)rocshmem_malloc(buff_size); } PrimitiveTester::~PrimitiveTester() { @@ -100,8 +108,9 @@ PrimitiveTester::~PrimitiveTester() { } void PrimitiveTester::resetBuffers(uint64_t size) { - memset(s_buf, '0', args.max_msg_size * args.wg_size); - memset(r_buf, '1', args.max_msg_size * args.wg_size); + size_t buff_size = size * args.wg_size * args.num_wgs; + memset(s_buf, '0', buff_size); + memset(r_buf, '1', buff_size); } void PrimitiveTester::launchKernel(dim3 gridSize, dim3 blockSize, int loop, @@ -109,11 +118,11 @@ void PrimitiveTester::launchKernel(dim3 gridSize, dim3 blockSize, int loop, size_t shared_bytes = 0; hipLaunchKernelGGL(PrimitiveTest, gridSize, blockSize, shared_bytes, stream, - loop, args.skip, timer, s_buf, r_buf, size, _type, - _shmem_context); + loop, args.skip, start_time, end_time, s_buf, r_buf, + size, _type, _shmem_context); - num_msgs = (loop + args.skip) * gridSize.x; - num_timed_msgs = loop; + num_msgs = (loop + args.skip) * gridSize.x * blockSize.x; + num_timed_msgs = loop * gridSize.x * blockSize.x; } void PrimitiveTester::verifyResults(uint64_t size) { @@ -123,7 +132,8 @@ void PrimitiveTester::verifyResults(uint64_t size) { : 1; if (args.myid == check_id) { - for (uint64_t i = 0; i < size; i++) { + size_t buff_size = size * args.wg_size * args.num_wgs; + for (uint64_t i = 0; i < buff_size; i++) { if (r_buf[i] != '0') { fprintf(stderr, "Data validation error at idx %lu\n", i); fprintf(stderr, "Got %c, Expected %c\n", r_buf[i], '0'); diff --git a/tests/functional_tests/shmem_ptr_tester.cpp b/tests/functional_tests/shmem_ptr_tester.cpp index 5643548..59f856a 100644 --- a/tests/functional_tests/shmem_ptr_tester.cpp +++ b/tests/functional_tests/shmem_ptr_tester.cpp @@ -76,11 +76,13 @@ void ShmemPtrTester::verifyResults(uint64_t size) { if (args.myid == 0) { if (*_available == 0) { fprintf(stderr, "SHMEM_PTR NOT AVAILBLE \n"); + exit(-1); } } else { if (r_buf[4] != '1') { fprintf(stderr, "Data validation error \n"); fprintf(stderr, "Got %c, Expected %c\n", r_buf[4], '1'); + exit(-1); } } } diff --git a/tests/functional_tests/signaling_operations_tester.cpp b/tests/functional_tests/signaling_operations_tester.cpp index b15f383..52e218a 100644 --- a/tests/functional_tests/signaling_operations_tester.cpp +++ b/tests/functional_tests/signaling_operations_tester.cpp @@ -29,42 +29,89 @@ using namespace rocshmem; /****************************************************************************** * DEVICE TEST KERNEL *****************************************************************************/ -__global__ void SignalingOperationsTest(int loop, int skip, uint64_t *timer, char *s_buf, - char *r_buf, int size, uint64_t *sig_addr, - uint64_t *fetched_value, +__global__ void SignalingOperationsTest(int loop, int skip, + uint64_t *start_time, uint64_t *end_time, + int wf_size, char *s_buf, char *r_buf, int size, + uint64_t *sig_addr, uint64_t *fetched_value, TestType type, ShmemContextType ctx_type) { __shared__ rocshmem_ctx_t ctx; rocshmem_wg_init(); rocshmem_wg_ctx_create(ctx_type, &ctx); - uint64_t start; uint64_t signal = 0; int sig_op = ROCSHMEM_SIGNAL_SET; + int idx = 0; + int wf_id = 0; + int wg_id = get_flat_grid_id(); + int wg_offset = 0; + + switch (type) { + case PutSignalTestType: + case PutSignalNBITestType: + case SignalFetchTestType: + /** + * Calculate start index for each thread within the grid + */ + idx = get_flat_id(); + break; + case WGPutSignalTestType: + case WGPutSignalNBITestType: + case WGSignalFetchTestType: + /** + * Calculate start index for each work group + */ + idx = get_flat_grid_id(); + break; + case WAVEPutSignalTestType: + case WAVEPutSignalNBITestType: + case WAVESignalFetchTestType: + /** + * Calculate start index for each wavefront + */ + wf_id = get_flat_block_id() / wf_size; + wg_offset = get_flat_grid_id() * + ((get_flat_block_size() - 1) / wf_size + 1); + idx = wf_id + wg_offset; + break; + default: + break; + } + + s_buf += size * idx; + r_buf += size * idx; + sig_addr += idx; + fetched_value += idx; for (int i = 0; i < loop + skip; i++) { if (i == skip) { __syncthreads(); - start = rocshmem_timer(); + start_time[wg_id] = wall_clock64(); } switch (type) { case PutSignalTestType: - rocshmem_ctx_putmem_signal(ctx, r_buf, s_buf, size, sig_addr, signal, sig_op, 1); + rocshmem_ctx_putmem_signal(ctx, r_buf, s_buf, size, sig_addr, + signal, sig_op, 1); break; case WGPutSignalTestType: - rocshmem_ctx_putmem_signal_wg(ctx, r_buf, s_buf, size, sig_addr, signal, sig_op, 1); + rocshmem_ctx_putmem_signal_wg(ctx, r_buf, s_buf, size, sig_addr, + signal, sig_op, 1); break; case WAVEPutSignalTestType: - rocshmem_ctx_putmem_signal_wave(ctx, r_buf, s_buf, size, sig_addr, signal, sig_op, 1); + rocshmem_ctx_putmem_signal_wave(ctx, r_buf, s_buf, size, sig_addr, + signal, sig_op, 1); break; case PutSignalNBITestType: - rocshmem_ctx_putmem_signal_nbi(ctx, r_buf, s_buf, size, sig_addr, signal, sig_op, 1); + rocshmem_ctx_putmem_signal_nbi(ctx, r_buf, s_buf, size, sig_addr, + signal, sig_op, 1); break; case WGPutSignalNBITestType: - rocshmem_ctx_putmem_signal_nbi_wg(ctx, r_buf, s_buf, size, sig_addr, signal, sig_op, 1); + rocshmem_ctx_putmem_signal_nbi_wg(ctx, r_buf, s_buf, size, sig_addr, + signal, sig_op, 1); break; case WAVEPutSignalNBITestType: - rocshmem_ctx_putmem_signal_nbi_wave(ctx, r_buf, s_buf, size, sig_addr, signal, sig_op, 1); + rocshmem_ctx_putmem_signal_nbi_wave(ctx, r_buf, s_buf, size, sig_addr, + signal, sig_op, 1); break; case SignalFetchTestType: *fetched_value = rocshmem_signal_fetch(sig_addr); @@ -85,7 +132,7 @@ __global__ void SignalingOperationsTest(int loop, int skip, uint64_t *timer, cha __syncthreads(); if (hipThreadIdx_x == 0) { - timer[hipBlockIdx_x] = rocshmem_timer() - start; + end_time[wg_id] = wall_clock64(); } rocshmem_wg_ctx_destroy(&ctx); @@ -95,11 +142,46 @@ __global__ void SignalingOperationsTest(int loop, int skip, uint64_t *timer, cha /****************************************************************************** * HOST TESTER CLASS METHODS *****************************************************************************/ -SignalingOperationsTester::SignalingOperationsTester(TesterArguments args) : Tester(args) { - s_buf = (char *)rocshmem_malloc(args.max_msg_size * args.wg_size); - r_buf = (char *)rocshmem_malloc(args.max_msg_size * args.wg_size); - sig_addr = (uint64_t *)rocshmem_malloc(sizeof(uint64_t)); - CHECK_HIP(hipMallocManaged(&fetched_value, sizeof(uint64_t), hipMemAttachHost)); +SignalingOperationsTester::SignalingOperationsTester(TesterArguments args) + : Tester(args) { + + type = (TestType)args.algorithm; + + switch (type) { + case PutSignalTestType: + case PutSignalNBITestType: + buff_size = args.max_msg_size * args.num_wgs * args.wg_size; + num_signals = args.num_wgs * args.wg_size; + break; + case WAVEPutSignalTestType: + case WAVEPutSignalNBITestType: + buff_size = args.max_msg_size * args.num_wgs * num_warps; + num_signals = args.num_wgs * num_warps; + break; + case WGPutSignalTestType: + case WGPutSignalNBITestType: + buff_size = args.max_msg_size * args.num_wgs; + num_signals = args.num_wgs; + break; + case SignalFetchTestType: + num_signals = args.num_wgs * args.wg_size; + break; + case WAVESignalFetchTestType: + num_signals = args.num_wgs * num_warps; + break; + case WGSignalFetchTestType: + num_signals = args.num_wgs; + break; + default: + break; + } + + s_buf = (char *)rocshmem_malloc(buff_size); + r_buf = (char *)rocshmem_malloc(buff_size); + sig_addr = (uint64_t *)rocshmem_malloc(sizeof(uint64_t) * num_signals); + CHECK_HIP(hipMallocManaged(&fetched_value, sizeof(uint64_t) * num_signals, + hipMemAttachHost)); + } SignalingOperationsTester::~SignalingOperationsTester() { @@ -110,22 +192,50 @@ SignalingOperationsTester::~SignalingOperationsTester() { } void SignalingOperationsTester::resetBuffers(uint64_t size) { - memset(s_buf, '0', args.max_msg_size * args.wg_size); - memset(r_buf, '1', args.max_msg_size * args.wg_size); - *fetched_value = -1; - *sig_addr = args.myid + 123; + memset(s_buf, '0', buff_size); + memset(r_buf, '1', buff_size); + for (int i = 0; i < num_signals; i++) { + fetched_value[i] = -1; + sig_addr[i] = args.myid + 123; + } } -void SignalingOperationsTester::launchKernel(dim3 gridSize, dim3 blockSize, int loop, - uint64_t size) { +void SignalingOperationsTester::launchKernel(dim3 gridSize, dim3 blockSize, + int loop, uint64_t size) { size_t shared_bytes = 0; - hipLaunchKernelGGL(SignalingOperationsTest, gridSize, blockSize, shared_bytes, stream, - loop, args.skip, timer, s_buf, r_buf, size, sig_addr, fetched_value, - _type, _shmem_context); + hipLaunchKernelGGL(SignalingOperationsTest, gridSize, blockSize, + shared_bytes, stream, loop, args.skip, start_time, + end_time, wf_size, s_buf, r_buf, size, sig_addr, + fetched_value, _type, _shmem_context); - num_msgs = (loop + args.skip) * gridSize.x; + num_msgs = (loop + args.skip); num_timed_msgs = loop; + + switch (type) { + case PutSignalTestType: + case PutSignalNBITestType: + case SignalFetchTestType: + num_msgs *= gridSize.x * blockSize.x; + num_timed_msgs *= gridSize.x * blockSize.x; + break; + case WAVEPutSignalTestType: + case WAVEPutSignalNBITestType: + case WAVESignalFetchTestType: + num_msgs *= gridSize.x * num_warps; + num_timed_msgs *= gridSize.x * num_warps; + break; + case WGPutSignalTestType: + case WGPutSignalNBITestType: + case WGSignalFetchTestType: + num_msgs *= gridSize.x; + num_timed_msgs *= gridSize.x; + break; + default: + num_msgs = (loop + args.skip); + num_timed_msgs = loop; + break; + } } void SignalingOperationsTester::verifyResults(uint64_t size) { @@ -142,6 +252,23 @@ void SignalingOperationsTester::verifyResults(uint64_t size) { _type == WGSignalFetchTestType) ? 0 : -1; // do not check if it doesn't match a test + switch (type) { + case PutSignalTestType: + case PutSignalNBITestType: + size *= args.num_wgs * args.wg_size; + break; + case WAVEPutSignalTestType: + case WAVEPutSignalNBITestType: + size *= args.num_wgs * num_warps; + break; + case WGPutSignalTestType: + case WGPutSignalNBITestType: + size *= args.num_wgs; + break; + default: + break; + } + if (args.myid == check_data_id) { for (uint64_t i = 0; i < size; i++) { if (r_buf[i] != '0') { @@ -150,14 +277,21 @@ void SignalingOperationsTester::verifyResults(uint64_t size) { exit(-1); } } + for (int i = 0; i < num_signals; i++) { + if (sig_addr[i] != 0) { + fprintf(stderr, "Signal Value %lu, Expected 0\n", sig_addr[i]); + exit(-1); + } + } } if (args.myid == check_fetched_value_id) { - uint64_t value = *fetched_value; uint64_t expected_value = (args.myid + 123); - if (value != expected_value) { - fprintf(stderr, "Fetched Value %lu, Expected %lu\n", value, expected_value); - exit(-1); + for (int i = 0; i < num_signals; i++) { + if (fetched_value[i] != expected_value) { + fprintf(stderr, "Fetched Value %lu, Expected %lu\n", fetched_value[i], expected_value); + exit(-1); + } } } } diff --git a/tests/functional_tests/signaling_operations_tester.hpp b/tests/functional_tests/signaling_operations_tester.hpp index acff9ea..cec8d49 100644 --- a/tests/functional_tests/signaling_operations_tester.hpp +++ b/tests/functional_tests/signaling_operations_tester.hpp @@ -43,8 +43,11 @@ class SignalingOperationsTester : public Tester { char *s_buf = nullptr; char *r_buf = nullptr; - uint64_t *sig_addr; - uint64_t *fetched_value; + uint64_t *sig_addr = nullptr; + uint64_t *fetched_value = nullptr; + size_t buff_size = 0; + int num_signals = 0; + TestType type; }; #endif diff --git a/tests/functional_tests/sync_tester.cpp b/tests/functional_tests/sync_tester.cpp index d481862..dc243d7 100644 --- a/tests/functional_tests/sync_tester.cpp +++ b/tests/functional_tests/sync_tester.cpp @@ -22,33 +22,31 @@ #include "sync_tester.hpp" -#include - -using namespace rocshmem; -rocshmem_team_t team_sync_world_dup; - /****************************************************************************** * DEVICE TEST KERNEL *****************************************************************************/ -__global__ void SyncTest(int loop, int skip, uint64_t *timer, TestType type, - ShmemContextType ctx_type, rocshmem_team_t team) { +__global__ void SyncTest(int loop, int skip, uint64_t *start_time, + uint64_t *end_time, TestType type, + ShmemContextType ctx_type, rocshmem_team_t *teams) { __shared__ rocshmem_ctx_t ctx; + int wg_id = get_flat_grid_id(); + rocshmem_wg_init(); - rocshmem_wg_ctx_create(ctx_type, &ctx); + rocshmem_wg_team_create_ctx(teams[wg_id], ctx_type, &ctx); - uint64_t start; for (int i = 0; i < loop + skip; i++) { + __syncthreads(); if (hipThreadIdx_x == 0 && i == skip) { - start = rocshmem_timer(); + start_time[wg_id] = wall_clock64(); } - __syncthreads(); switch (type) { case SyncAllTestType: - rocshmem_ctx_wg_sync_all(ctx); + if (is_block_zero_in_grid()) + rocshmem_ctx_wg_sync_all(ctx); break; case SyncTestType: - rocshmem_ctx_wg_team_sync(ctx, team); + rocshmem_ctx_wg_team_sync(ctx, teams[wg_id]); break; default: break; @@ -57,7 +55,7 @@ __global__ void SyncTest(int loop, int skip, uint64_t *timer, TestType type, __syncthreads(); if (hipThreadIdx_x == 0) { - timer[hipBlockIdx_x] = rocshmem_timer() - start; + end_time[wg_id] = wall_clock64(); } rocshmem_wg_ctx_destroy(&ctx); @@ -67,28 +65,62 @@ __global__ void SyncTest(int loop, int skip, uint64_t *timer, TestType type, /****************************************************************************** * HOST TESTER CLASS METHODS *****************************************************************************/ -SyncTester::SyncTester(TesterArguments args) : Tester(args) {} +SyncTester::SyncTester(TesterArguments args) : Tester(args) { -SyncTester::~SyncTester() {} + char* value{nullptr}; + if ((value = getenv("ROCSHMEM_MAX_NUM_TEAMS"))) { + num_teams = atoi(value); + } + + CHECK_HIP(hipMalloc(&team_sync_world_dup, + sizeof(rocshmem_team_t) * num_teams)); +} + +SyncTester::~SyncTester() { + CHECK_HIP(hipFree(team_sync_world_dup)); +} void SyncTester::resetBuffers(uint64_t size) {} +void SyncTester::preLaunchKernel() { + int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD); + + for (int team_i = 0; team_i < num_teams; team_i++) { + team_sync_world_dup[team_i] = ROCSHMEM_TEAM_INVALID; + rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 1, n_pes, nullptr, 0, + &team_sync_world_dup[team_i]); + if (team_sync_world_dup[team_i] == ROCSHMEM_TEAM_INVALID) { + printf("Team %d is invalid!\n", team_i); + abort(); + } + } +} + void SyncTester::launchKernel(dim3 gridSize, dim3 blockSize, int loop, uint64_t size) { size_t shared_bytes = 0; int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD); - team_sync_world_dup = ROCSHMEM_TEAM_INVALID; - rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 1, n_pes, nullptr, 0, - &team_sync_world_dup); + hipLaunchKernelGGL(SyncTest, gridSize, blockSize, shared_bytes, stream, + loop, args.skip, start_time, end_time, _type, + _shmem_context, team_sync_world_dup); - hipLaunchKernelGGL(SyncTest, gridSize, blockSize, shared_bytes, stream, loop, - args.skip, timer, _type, _shmem_context, - team_sync_world_dup); - num_msgs = (loop + args.skip) * gridSize.x; + num_msgs = (loop + args.skip); num_timed_msgs = loop; + + TestType type = (TestType)args.algorithm; + if( type == SyncTestType) { + num_msgs = (loop + args.skip) * gridSize.x; + num_timed_msgs = loop * gridSize.x; + } +} + +void SyncTester::postLaunchKernel() { + for (int team_i = 0; team_i < num_teams; team_i++) { + rocshmem_team_destroy(team_sync_world_dup[team_i]); + } } void SyncTester::verifyResults(uint64_t size) {} diff --git a/tests/functional_tests/sync_tester.hpp b/tests/functional_tests/sync_tester.hpp index 764fd5e..ec4b09d 100644 --- a/tests/functional_tests/sync_tester.hpp +++ b/tests/functional_tests/sync_tester.hpp @@ -24,6 +24,9 @@ #define _SYNC_TESTER_HPP_ #include "tester.hpp" +#include + +using namespace rocshmem; /****************************************************************************** * HOST TESTER CLASS @@ -36,10 +39,22 @@ class SyncTester : public Tester { protected: virtual void resetBuffers(uint64_t size) override; + virtual void preLaunchKernel() override; + virtual void launchKernel(dim3 gridSize, dim3 blockSize, int loop, uint64_t size) override; + virtual void postLaunchKernel() override; + virtual void verifyResults(uint64_t size) override; + +private: + /** + * This constant should equal ROCSHMEM_MAX_NUM_TEAMS - 1. + * The default value for the maximum number of teams is 40. + */ + int num_teams = 39; + rocshmem_team_t *team_sync_world_dup; }; #endif diff --git a/tests/functional_tests/team_broadcast_tester.cpp b/tests/functional_tests/team_broadcast_tester.cpp index d9e1db4..2a677e4 100644 --- a/tests/functional_tests/team_broadcast_tester.cpp +++ b/tests/functional_tests/team_broadcast_tester.cpp @@ -20,8 +20,6 @@ * IN THE SOFTWARE. *****************************************************************************/ -using namespace rocshmem; - /* Declare the template with a generic implementation */ template __device__ void wg_team_broadcast(rocshmem_ctx_t ctx, rocshmem_team_t team, @@ -33,8 +31,8 @@ __device__ void wg_team_broadcast(rocshmem_ctx_t ctx, rocshmem_team_t team, /* Define templates to call ROCSHMEM */ #define TEAM_BROADCAST_DEF_GEN(T, TNAME) \ template <> \ - __device__ void wg_team_broadcast( \ - rocshmem_ctx_t ctx, rocshmem_team_t team, T * dest, const T *source, \ + __device__ void wg_team_broadcast(rocshmem_ctx_t ctx, \ + rocshmem_team_t team, T * dest, const T *source, \ int nelem, int pe_root) { \ rocshmem_ctx_##TNAME##_wg_broadcast(ctx, team, dest, source, nelem, \ pe_root); \ @@ -55,32 +53,34 @@ TEAM_BROADCAST_DEF_GEN(unsigned int, uint) TEAM_BROADCAST_DEF_GEN(unsigned long, ulong) TEAM_BROADCAST_DEF_GEN(unsigned long long, ulonglong) -rocshmem_team_t team_bcast_world_dup; - /****************************************************************************** * DEVICE TEST KERNEL *****************************************************************************/ template -__global__ void TeamBroadcastTest(int loop, int skip, uint64_t *timer, - T1 *source_buf, T1 *dest_buf, int size, +__global__ void TeamBroadcastTest(int loop, int skip, uint64_t *start_time, + uint64_t *end_time, T1 *source_buf, + T1 *dest_buf, int size, ShmemContextType ctx_type, - rocshmem_team_t team) { + rocshmem_team_t *teams) { __shared__ rocshmem_ctx_t ctx; + int wg_id = get_flat_grid_id(); rocshmem_wg_init(); - rocshmem_wg_ctx_create(ctx_type, &ctx); + rocshmem_wg_team_create_ctx(teams[wg_id], ctx_type, &ctx); int n_pes = rocshmem_ctx_n_pes(ctx); + source_buf += wg_id * size; + dest_buf += wg_id * size; __syncthreads(); uint64_t start; - for (int i = 0; i < loop; i++) { + for (int i = 0; i < loop + skip; i++) { if (i == skip && hipThreadIdx_x == 0) { - start = rocshmem_timer(); + start_time[wg_id] = wall_clock64(); } - wg_team_broadcast(ctx, team, + wg_team_broadcast(ctx, teams[wg_id], dest_buf, // T* dest source_buf, // const T* source size, // int nelement @@ -91,7 +91,7 @@ __global__ void TeamBroadcastTest(int loop, int skip, uint64_t *timer, __syncthreads(); if (hipThreadIdx_x == 0) { - timer[hipBlockIdx_x] = rocshmem_timer() - start; + end_time[wg_id] = wall_clock64(); } rocshmem_wg_ctx_destroy(&ctx); @@ -102,27 +102,44 @@ __global__ void TeamBroadcastTest(int loop, int skip, uint64_t *timer, * HOST TESTER CLASS METHODS *****************************************************************************/ template -TeamBroadcastTester::TeamBroadcastTester( - TesterArguments args, std::function f1, - std::function(const T1 &)> f2) - : Tester(args), init_buf{f1}, verify_buf{f2} { - source_buf = (T1 *)rocshmem_malloc(args.max_msg_size * sizeof(T1)); - dest_buf = (T1 *)rocshmem_malloc(args.max_msg_size * sizeof(T1)); +TeamBroadcastTester::TeamBroadcastTester(TesterArguments args) + : Tester(args){ + + int num_elems = (args.max_msg_size / sizeof(T1)) * args.num_wgs ; + int buff_size = num_elems * sizeof(T1); + + source_buf = (T1 *)rocshmem_malloc(buff_size); + dest_buf = (T1 *)rocshmem_malloc(buff_size); + + char* value{nullptr}; + if ((value = getenv("ROCSHMEM_MAX_NUM_TEAMS"))) { + num_teams = atoi(value); + } + + CHECK_HIP(hipMalloc(&team_bcast_world_dup, + sizeof(rocshmem_team_t) * num_teams)); } template TeamBroadcastTester::~TeamBroadcastTester() { rocshmem_free(source_buf); rocshmem_free(dest_buf); + CHECK_HIP(hipFree(team_bcast_world_dup)); } template void TeamBroadcastTester::preLaunchKernel() { int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD); - team_bcast_world_dup = ROCSHMEM_TEAM_INVALID; - rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 1, n_pes, nullptr, 0, - &team_bcast_world_dup); + for (int team_i = 0; team_i < num_teams; team_i++) { + team_bcast_world_dup[team_i] = ROCSHMEM_TEAM_INVALID; + rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 1, n_pes, nullptr, 0, + &team_bcast_world_dup[team_i]); + if (team_bcast_world_dup[team_i] == ROCSHMEM_TEAM_INVALID) { + printf("Team %d is invalid!\n", team_i); + abort(); + } + } } template @@ -130,34 +147,93 @@ void TeamBroadcastTester::launchKernel(dim3 gridSize, dim3 blockSize, int loop, uint64_t size) { size_t shared_bytes = 0; - hipLaunchKernelGGL(TeamBroadcastTest, gridSize, blockSize, shared_bytes, - stream, loop, args.skip, timer, source_buf, dest_buf, size, - _shmem_context, team_bcast_world_dup); + int num_elems = size / sizeof(T1); + + hipLaunchKernelGGL(TeamBroadcastTest, gridSize, blockSize, + shared_bytes, stream, loop, args.skip, + start_time, end_time, source_buf, dest_buf, + num_elems, _shmem_context, team_bcast_world_dup); - num_msgs = loop + args.skip; - num_timed_msgs = loop; + num_msgs = (loop + args.skip) * gridSize.x; + num_timed_msgs = loop * gridSize.x; } template void TeamBroadcastTester::postLaunchKernel() { - rocshmem_team_destroy(team_bcast_world_dup); + for (int team_i = 0; team_i < num_teams; team_i++) { + rocshmem_team_destroy(team_bcast_world_dup[team_i]); + } } template void TeamBroadcastTester::resetBuffers(uint64_t size) { - for (uint64_t i = 0; i < args.max_msg_size; i++) { - init_buf(source_buf[i], dest_buf[i]); + int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD); + + int num_elems = size / sizeof(T1); + int buff_size = num_elems * sizeof(T1) * args.num_wgs; + int idx = 0; + + for (int wg_id = 0; wg_id < args.num_wgs; wg_id++) { + for (int i = 0; i < num_elems; i++) { + idx = wg_id * num_elems + i; + if constexpr (std::is_same::value || + std::is_same::value || + std::is_same::value) { + source_buf[idx] = static_cast('a' + n_pes + wg_id); + dest_buf[idx] = static_cast('a' + wg_id); + } + else if constexpr (std::is_floating_point::value) { + source_buf[idx] = static_cast(3.14 + n_pes + wg_id); + dest_buf[idx] = static_cast(3.14 + wg_id); + } + else if constexpr (std::is_integral::value) { + source_buf[idx] = static_cast(n_pes + wg_id); + dest_buf[idx] = static_cast(wg_id); + } + } } } template void TeamBroadcastTester::verifyResults(uint64_t size) { - for (uint64_t i = 0; i < size; i++) { - auto r = verify_buf(dest_buf[i]); - if (r.first == false) { - fprintf(stderr, "Data validation error at idx %lu\n", i); - fprintf(stderr, "%s.\n", r.second.c_str()); - exit(-1); + int my_pe = rocshmem_team_my_pe(ROCSHMEM_TEAM_WORLD); + int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD); + + int num_elems = size / sizeof(T1); + int idx = 0; + T1 expected; + + /** + * The verification routine here requires that the + * PE_root value is 0 which denotes that the + * sending processing element is rank 0. + * + * The difference in expected values arises from + * the specification for broadcast where the + * PE_root processing element does not copy the + * contents from its own source to dest during + * the broadcast. + */ + for (int wg_id = 0; wg_id < args.num_wgs; wg_id++) { + for (int i = 0; i < num_elems; i++) { + idx = wg_id * num_elems + i; + if constexpr (std::is_same::value || + std::is_same::value || + std::is_same::value) { + expected = static_cast('a' + wg_id + (my_pe ? n_pes : 0)); + } + else if constexpr (std::is_floating_point::value) { + expected = static_cast(3.14 + wg_id + (my_pe ? n_pes : 0)); + } + else if constexpr (std::is_integral::value) { + expected = static_cast(wg_id + (my_pe ? n_pes : 0)); + } + if (dest_buf[idx] != expected) { + std::cerr << "Data validation error at idx " << idx << std::endl; + std::cerr << "PE " << my_pe << " Got " << dest_buf[idx] + << ", Expected " << expected << std::endl; + exit(-1); + } } } } diff --git a/tests/functional_tests/team_broadcast_tester.hpp b/tests/functional_tests/team_broadcast_tester.hpp index c2bd005..a82af51 100644 --- a/tests/functional_tests/team_broadcast_tester.hpp +++ b/tests/functional_tests/team_broadcast_tester.hpp @@ -28,15 +28,15 @@ #include "tester.hpp" +using namespace rocshmem; + /************* ***************************************************************** * HOST TESTER CLASS *****************************************************************************/ template class TeamBroadcastTester : public Tester { public: - explicit TeamBroadcastTester( - TesterArguments args, std::function f1, - std::function(const T1 &)> f2); + explicit TeamBroadcastTester(TesterArguments args); virtual ~TeamBroadcastTester(); protected: @@ -54,9 +54,13 @@ class TeamBroadcastTester : public Tester { T1 *source_buf; T1 *dest_buf; - private: - std::function init_buf; - std::function(const T1 &)> verify_buf; +private: + /** + * This constant should equal ROCSHMEM_MAX_NUM_TEAMS - 1. + * The default value for the maximum number of teams is 40. + */ + int num_teams = 39; + rocshmem_team_t *team_bcast_world_dup; }; #include "team_broadcast_tester.cpp" diff --git a/tests/functional_tests/team_ctx_infra_tester.cpp b/tests/functional_tests/team_ctx_infra_tester.cpp index 7920ade..0197cd2 100644 --- a/tests/functional_tests/team_ctx_infra_tester.cpp +++ b/tests/functional_tests/team_ctx_infra_tester.cpp @@ -91,16 +91,27 @@ __global__ void TeamCtxInfraTest(ShmemContextType ctx_type, /****************************************************************************** * HOST TESTER CLASS METHODS *****************************************************************************/ -TeamCtxInfraTester::TeamCtxInfraTester(TesterArguments args) : Tester(args) {} +TeamCtxInfraTester::TeamCtxInfraTester(TesterArguments args) : Tester(args) { -TeamCtxInfraTester::~TeamCtxInfraTester() {} + char* value{nullptr}; + if ((value = getenv("ROCSHMEM_MAX_NUM_TEAMS"))) { + num_teams = atoi(value); + } + + CHECK_HIP(hipMalloc(&team_world_dup, + sizeof(rocshmem_team_t) * num_teams)); +} + +TeamCtxInfraTester::~TeamCtxInfraTester() { + CHECK_HIP(hipFree(team_world_dup)); +} void TeamCtxInfraTester::resetBuffers(uint64_t size) {} void TeamCtxInfraTester::preLaunchKernel() { int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD); - for (int team_i = 0; team_i < NUM_TEAMS; team_i++) { + for (int team_i = 0; team_i < num_teams; team_i++) { team_world_dup[team_i] = ROCSHMEM_TEAM_INVALID; rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 1, n_pes, nullptr, 0, &team_world_dup[team_i]); @@ -124,20 +135,12 @@ void TeamCtxInfraTester::launchKernel(dim3 gridSize, dim3 blockSize, int loop, uint64_t size) { size_t shared_bytes = 0; - /* Copy array of teams to device */ - rocshmem_team_t *teams_on_device; - CHECK_HIP(hipMalloc(&teams_on_device, sizeof(rocshmem_team_t) * NUM_TEAMS)); - CHECK_HIP(hipMemcpy(teams_on_device, team_world_dup, - sizeof(rocshmem_team_t) * NUM_TEAMS, hipMemcpyHostToDevice)); - hipLaunchKernelGGL(TeamCtxInfraTest, gridSize, blockSize, shared_bytes, - stream, _shmem_context, teams_on_device); - - CHECK_HIP(hipFree(teams_on_device)); + stream, _shmem_context, team_world_dup); } void TeamCtxInfraTester::postLaunchKernel() { - for (int team_i = 0; team_i < NUM_TEAMS; team_i++) { + for (int team_i = 0; team_i < num_teams; team_i++) { rocshmem_team_destroy(team_world_dup[team_i]); } } diff --git a/tests/functional_tests/team_ctx_infra_tester.hpp b/tests/functional_tests/team_ctx_infra_tester.hpp index b59d1ba..475d0d6 100644 --- a/tests/functional_tests/team_ctx_infra_tester.hpp +++ b/tests/functional_tests/team_ctx_infra_tester.hpp @@ -24,6 +24,9 @@ #define _TEAM_CTX_INFRA_TESTER_HPP_ #include "tester.hpp" +#include + +using namespace rocshmem; /****************************************************************************** * HOST TESTER CLASS @@ -47,6 +50,14 @@ class TeamCtxInfraTester : public Tester { char *s_buf = nullptr; char *r_buf = nullptr; + +private: + /** + * This constant should equal ROCSHMEM_MAX_NUM_TEAMS - 1. + * The default value for the maximum number of teams is 40. + */ + int num_teams = 39; + rocshmem_team_t *team_world_dup; }; #endif diff --git a/tests/functional_tests/team_ctx_primitive_tester.cpp b/tests/functional_tests/team_ctx_primitive_tester.cpp index 7f04eb7..29042ab 100644 --- a/tests/functional_tests/team_ctx_primitive_tester.cpp +++ b/tests/functional_tests/team_ctx_primitive_tester.cpp @@ -22,50 +22,55 @@ #include "team_ctx_primitive_tester.hpp" -#include - -using namespace rocshmem; - -rocshmem_team_t team_primitive_world_dup; - /****************************************************************************** * DEVICE TEST KERNEL *****************************************************************************/ -__global__ void TeamCtxPrimitiveTest(int loop, int skip, uint64_t *timer, - char *s_buf, char *r_buf, int size, - TestType type, ShmemContextType ctx_type, - rocshmem_team_t team) { +__global__ void TeamCtxPrimitiveTest(int loop, int skip, uint64_t *start_time, + uint64_t *end_time, char *s_buf, + char *r_buf, int size, TestType type, + ShmemContextType ctx_type, + rocshmem_team_t *teams) { __shared__ rocshmem_ctx_t ctx; + int wg_id = get_flat_grid_id(); rocshmem_wg_init(); - rocshmem_wg_team_create_ctx(team, ctx_type, &ctx); + rocshmem_wg_team_create_ctx(teams[wg_id], ctx_type, &ctx); + + + // Calculate start index for each thread within the grid + uint64_t idx = size * get_flat_id(); + s_buf += idx; + r_buf += idx; + + for (int i = 0; i < loop + skip; i++) { + if (i == skip) { + __syncthreads(); + start_time[wg_id] = wall_clock64(); + } - if (hipThreadIdx_x == 0) { - uint64_t start; - - for (int i = 0; i < loop + skip; i++) { - if (i == skip) start = rocshmem_timer(); - - switch (type) { - case TeamCtxGetTestType: - rocshmem_ctx_getmem(ctx, r_buf, s_buf, size, 1); - break; - case TeamCtxGetNBITestType: - rocshmem_ctx_getmem_nbi(ctx, r_buf, s_buf, size, 1); - break; - case TeamCtxPutTestType: - rocshmem_ctx_putmem(ctx, r_buf, s_buf, size, 1); - break; - case TeamCtxPutNBITestType: - rocshmem_ctx_putmem_nbi(ctx, r_buf, s_buf, size, 1); - break; - default: - break; - } + switch (type) { + case TeamCtxGetTestType: + rocshmem_ctx_getmem(ctx, r_buf, s_buf, size, 1); + break; + case TeamCtxGetNBITestType: + rocshmem_ctx_getmem_nbi(ctx, r_buf, s_buf, size, 1); + break; + case TeamCtxPutTestType: + rocshmem_ctx_putmem(ctx, r_buf, s_buf, size, 1); + break; + case TeamCtxPutNBITestType: + rocshmem_ctx_putmem_nbi(ctx, r_buf, s_buf, size, 1); + break; + default: + break; } + } + + rocshmem_ctx_quiet(ctx); - rocshmem_ctx_quiet(ctx); + __syncthreads(); - timer[hipBlockIdx_x] = rocshmem_timer() - start; + if (hipThreadIdx_x == 0) { + end_time[wg_id] = wall_clock64(); } rocshmem_wg_ctx_destroy(&ctx); @@ -77,42 +82,62 @@ __global__ void TeamCtxPrimitiveTest(int loop, int skip, uint64_t *timer, *****************************************************************************/ TeamCtxPrimitiveTester::TeamCtxPrimitiveTester(TesterArguments args) : Tester(args) { - s_buf = (char *)rocshmem_malloc(args.max_msg_size * args.wg_size); - r_buf = (char *)rocshmem_malloc(args.max_msg_size * args.wg_size); + size_t buff_size = args.max_msg_size * args.wg_size * args.num_wgs; + s_buf = (char *)rocshmem_malloc(buff_size); + r_buf = (char *)rocshmem_malloc(buff_size); + + char* value{nullptr}; + if ((value = getenv("ROCSHMEM_MAX_NUM_TEAMS"))) { + num_teams = atoi(value); + } + + CHECK_HIP(hipMalloc(&team_primitive_world_dup, + sizeof(rocshmem_team_t) * num_teams)); } TeamCtxPrimitiveTester::~TeamCtxPrimitiveTester() { rocshmem_free(s_buf); rocshmem_free(r_buf); + CHECK_HIP(hipFree(team_primitive_world_dup)); } void TeamCtxPrimitiveTester::resetBuffers(uint64_t size) { - memset(s_buf, '0', args.max_msg_size * args.wg_size); - memset(r_buf, '1', args.max_msg_size * args.wg_size); + size_t buff_size = size * args.wg_size * args.num_wgs; + memset(s_buf, '0', buff_size); + memset(r_buf, '1', buff_size); } void TeamCtxPrimitiveTester::preLaunchKernel() { int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD); - team_primitive_world_dup = ROCSHMEM_TEAM_INVALID; - rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 1, n_pes, nullptr, 0, - &team_primitive_world_dup); + for (int team_i = 0; team_i < num_teams; team_i++) { + team_primitive_world_dup[team_i] = ROCSHMEM_TEAM_INVALID; + rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 1, n_pes, nullptr, 0, + &team_primitive_world_dup[team_i]); + if (team_primitive_world_dup[team_i] == ROCSHMEM_TEAM_INVALID) { + printf("Team %d is invalid!\n", team_i); + abort(); + } + } } void TeamCtxPrimitiveTester::launchKernel(dim3 gridSize, dim3 blockSize, int loop, uint64_t size) { size_t shared_bytes = 0; - hipLaunchKernelGGL(TeamCtxPrimitiveTest, gridSize, blockSize, shared_bytes, - stream, loop, args.skip, timer, s_buf, r_buf, size, _type, + hipLaunchKernelGGL(TeamCtxPrimitiveTest, gridSize, blockSize, + shared_bytes, stream, loop, args.skip, start_time, + end_time, s_buf, r_buf, size, _type, _shmem_context, team_primitive_world_dup); - num_msgs = (loop + args.skip) * gridSize.x; - num_timed_msgs = loop * gridSize.x; + num_msgs = (loop + args.skip) * gridSize.x * blockSize.x; + num_timed_msgs = loop * gridSize.x * blockSize.x; } void TeamCtxPrimitiveTester::postLaunchKernel() { - rocshmem_team_destroy(team_primitive_world_dup); + for (int team_i = 0; team_i < num_teams; team_i++) { + rocshmem_team_destroy(team_primitive_world_dup[team_i]); + } } void TeamCtxPrimitiveTester::verifyResults(uint64_t size) { @@ -120,7 +145,8 @@ void TeamCtxPrimitiveTester::verifyResults(uint64_t size) { (_type == TeamCtxGetTestType || _type == TeamCtxGetNBITestType) ? 0 : 1; if (args.myid == check_id) { - for (uint64_t i = 0; i < size; i++) { + size_t buff_size = size * args.wg_size * args.num_wgs; + for (uint64_t i = 0; i < buff_size; i++) { if (r_buf[i] != '0') { fprintf(stderr, "Data validation error at idx %lu\n", i); fprintf(stderr, "Got %c, Expected %c\n", r_buf[i], '0'); diff --git a/tests/functional_tests/team_ctx_primitive_tester.hpp b/tests/functional_tests/team_ctx_primitive_tester.hpp index fa4c6ac..440ef4f 100644 --- a/tests/functional_tests/team_ctx_primitive_tester.hpp +++ b/tests/functional_tests/team_ctx_primitive_tester.hpp @@ -24,6 +24,9 @@ #define _TEAM_CTX_PRIMITIVE_TESTER_HPP_ #include "tester.hpp" +#include + +using namespace rocshmem; /****************************************************************************** * HOST TESTER CLASS @@ -47,6 +50,14 @@ class TeamCtxPrimitiveTester : public Tester { char *s_buf = nullptr; char *r_buf = nullptr; + +private: + /** + * This constant should equal ROCSHMEM_MAX_NUM_TEAMS - 1. + * The default value for the maximum number of teams is 40. + */ + int num_teams = 39; + rocshmem_team_t *team_primitive_world_dup; }; #endif diff --git a/tests/functional_tests/team_reduction_tester.cpp b/tests/functional_tests/team_reduction_tester.cpp index 683c329..1f3b2bd 100644 --- a/tests/functional_tests/team_reduction_tester.cpp +++ b/tests/functional_tests/team_reduction_tester.cpp @@ -20,8 +20,6 @@ * IN THE SOFTWARE. *****************************************************************************/ -using namespace rocshmem; - /* Declare the template with a generic implementation */ template __device__ int wg_team_reduce(rocshmem_ctx_t ctx, rocshmem_team_t, T *dest, @@ -67,38 +65,37 @@ TEAM_FLOAT_REDUCTION_DEF_GEN(double, double) // so disable it for now. // FLOAT_REDUCTION_DEF_GEN(long double, longdouble) -rocshmem_team_t team_reduce_world_dup; - /****************************************************************************** * DEVICE TEST KERNEL *****************************************************************************/ template -__global__ void TeamReductionTest(int loop, int skip, uint64_t *timer, - T1 *s_buf, T1 *r_buf, int size, TestType type, +__global__ void TeamReductionTest(int loop, int skip, uint64_t *start_time, + uint64_t *end_time, T1 *s_buf, T1 *r_buf, + int size, TestType type, ShmemContextType ctx_type, - rocshmem_team_t team) { + rocshmem_team_t *teams) { __shared__ rocshmem_ctx_t ctx; + int wg_id = get_flat_grid_id(); rocshmem_wg_init(); - rocshmem_wg_ctx_create(ctx_type, &ctx); + rocshmem_wg_team_create_ctx(teams[wg_id], ctx_type, &ctx); int n_pes = rocshmem_ctx_n_pes(ctx); __syncthreads(); - uint64_t start; for (int i = 0; i < loop + skip; i++) { if (i == skip && hipThreadIdx_x == 0) { - start = rocshmem_timer(); + start_time[wg_id] = wall_clock64(); } - wg_team_reduce(ctx, team, r_buf, s_buf, size); + wg_team_reduce(ctx, teams[wg_id], r_buf, s_buf, size); rocshmem_ctx_wg_barrier_all(ctx); } __syncthreads(); if (hipThreadIdx_x == 0) { - timer[hipBlockIdx_x] = rocshmem_timer() - start; + end_time[wg_id] = wall_clock64(); } rocshmem_wg_ctx_destroy(&ctx); @@ -115,21 +112,36 @@ TeamReductionTester::TeamReductionTester( : Tester(args), init_buf{f1}, verify_buf{f2} { s_buf = (T1 *)rocshmem_malloc(args.max_msg_size * sizeof(T1)); r_buf = (T1 *)rocshmem_malloc(args.max_msg_size * sizeof(T1)); + + char* value{nullptr}; + if ((value = getenv("ROCSHMEM_MAX_NUM_TEAMS"))) { + num_teams = atoi(value); + } + + CHECK_HIP(hipMalloc(&team_reduce_world_dup, + sizeof(rocshmem_team_t) * num_teams)); } template TeamReductionTester::~TeamReductionTester() { rocshmem_free(s_buf); rocshmem_free(r_buf); + CHECK_HIP(hipFree(team_reduce_world_dup)); } template void TeamReductionTester::preLaunchKernel() { int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD); - team_reduce_world_dup = ROCSHMEM_TEAM_INVALID; - rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 1, n_pes, nullptr, 0, - &team_reduce_world_dup); + for (int team_i = 0; team_i < num_teams; team_i++) { + team_reduce_world_dup[team_i] = ROCSHMEM_TEAM_INVALID; + rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 1, n_pes, nullptr, 0, + &team_reduce_world_dup[team_i]); + if (team_reduce_world_dup[team_i] == ROCSHMEM_TEAM_INVALID) { + printf("Team %d is invalid!\n", team_i); + abort(); + } + } } template @@ -138,17 +150,19 @@ void TeamReductionTester::launchKernel(dim3 gridSize, dim3 blockSize, size_t shared_bytes = 0; hipLaunchKernelGGL(HIP_KERNEL_NAME(TeamReductionTest), gridSize, - blockSize, shared_bytes, stream, loop, args.skip, timer, - s_buf, r_buf, size, _type, _shmem_context, - team_reduce_world_dup); + blockSize, shared_bytes, stream, loop, args.skip, + start_time, end_time, s_buf, r_buf, size, _type, + _shmem_context, team_reduce_world_dup); - num_msgs = loop + args.skip; - num_timed_msgs = loop; + num_msgs = (loop + args.skip) * gridSize.x; + num_timed_msgs = loop * gridSize.x; } template void TeamReductionTester::postLaunchKernel() { - rocshmem_team_destroy(team_reduce_world_dup); + for (int team_i = 0; team_i < num_teams; team_i++) { + rocshmem_team_destroy(team_reduce_world_dup[team_i]); + } } template diff --git a/tests/functional_tests/team_reduction_tester.hpp b/tests/functional_tests/team_reduction_tester.hpp index 9b7f946..d373aa3 100644 --- a/tests/functional_tests/team_reduction_tester.hpp +++ b/tests/functional_tests/team_reduction_tester.hpp @@ -28,6 +28,8 @@ #include "tester.hpp" +using namespace rocshmem; + /****************************************************************************** * HOST TESTER CLASS *****************************************************************************/ @@ -60,6 +62,13 @@ class TeamReductionTester : public Tester { std::function init_buf; std::function(const T1 &, const T1 &)> verify_buf; + + /** + * This constant should equal ROCSHMEM_MAX_NUM_TEAMS - 1. + * The default value for the maximum number of teams is 40. + */ + int num_teams = 39; + rocshmem_team_t *team_reduce_world_dup; }; #include "team_reduction_tester.cpp" diff --git a/tests/functional_tests/tester.cpp b/tests/functional_tests/tester.cpp index 18efa2f..4c454aa 100644 --- a/tests/functional_tests/tester.cpp +++ b/tests/functional_tests/tester.cpp @@ -59,13 +59,32 @@ Tester::Tester(TesterArguments args) : args(args) { CHECK_HIP(hipGetDevice(&device_id)); CHECK_HIP(hipGetDeviceProperties(&deviceProps, device_id)); num_warps = (args.wg_size - 1) / deviceProps.warpSize + 1; + wf_size = deviceProps.warpSize; CHECK_HIP(hipStreamCreate(&stream)); CHECK_HIP(hipEventCreate(&start_event)); CHECK_HIP(hipEventCreate(&stop_event)); - CHECK_HIP(hipMalloc((void**)&timer, sizeof(uint64_t) * args.num_wgs)); + CHECK_HIP(hipDeviceGetAttribute(&wall_clk_rate, + hipDeviceAttributeWallClockRate, device_id)); + num_timers = args.num_wgs; + TestType type = (TestType)args.algorithm; + switch (type) { + case WAVEGetTestType: + case WAVEGetNBITestType: + case WAVEPutTestType: + case WAVEPutNBITestType: + num_timers = args.num_wgs * num_warps; + break; + default: + break; + } + CHECK_HIP(hipMalloc((void**)&timer, sizeof(uint64_t) * num_timers)); + CHECK_HIP(hipMalloc((void**)&start_time, sizeof(uint64_t) * num_timers)); + CHECK_HIP(hipMalloc((void**)&end_time, sizeof(uint64_t) * num_timers)); } Tester::~Tester() { + CHECK_HIP(hipFree(end_time)); + CHECK_HIP(hipFree(start_time)); CHECK_HIP(hipFree(timer)); CHECK_HIP(hipEventDestroy(stop_event)); CHECK_HIP(hipEventDestroy(start_event)); @@ -75,11 +94,6 @@ Tester::~Tester() { std::vector Tester::create(TesterArguments args) { int rank = args.myid; std::vector testers; - hipDeviceProp_t deviceProps; - int device_id, numWarps; - CHECK_HIP(hipGetDevice(&device_id)); - CHECK_HIP(hipGetDeviceProperties(&deviceProps, device_id)); - numWarps = (args.wg_size - 1) / deviceProps.warpSize + 1; if (rank == 0) std::cout << "### Creating Test: "; @@ -159,85 +173,38 @@ std::vector Tester::create(TesterArguments args) { if (rank == 0) { std::cout << "Team Broadcast Test ###" << std::endl; } - testers.push_back(new TeamBroadcastTester( - args, - [](long& f1, long& f2) { - f1 = 1; - f2 = 2; - }, - [rank](long v) { - long expected_val; - /** - * The verification routine here requires that the - * PE_root value is 0 which denotes that the - * sending processing element is rank 0. - * - * The difference in expected values arises from - * the specification for broadcast where the - * PE_root processing element does not copy the - * contents from its own source to dest during - * the broadcast. - */ - if (rank == 0) { - expected_val = 2; - } else { - expected_val = 1; - } - - return (v == expected_val) - ? std::make_pair(true, "") - : std::make_pair( - false, "Rank " + std::to_string(rank) + ", Got " + - std::to_string(v) + ", Expect " + - std::to_string(expected_val)); - })); + testers.push_back(new TeamBroadcastTester(args)); + testers.push_back(new TeamBroadcastTester(args)); + testers.push_back(new TeamBroadcastTester(args)); + testers.push_back(new TeamBroadcastTester(args)); + testers.push_back(new TeamBroadcastTester(args)); + testers.push_back(new TeamBroadcastTester(args)); + testers.push_back(new TeamBroadcastTester(args)); + return testers; case AllToAllTestType: if (rank == 0) { std::cout << "Alltoall Test ###" << std::endl; } - testers.push_back(new AlltoallTester( - args, - [rank](int64_t& f1, int64_t& f2, int64_t dest_pe) { - const long SRC_SHIFT = 16; - // Make value for each src, dst pair unique - // by shifting src by SRC_SHIFT bits - f1 = (rank << SRC_SHIFT) + dest_pe; - f2 = -1; - }, - [rank](int64_t v, int64_t src_pe) { - const long SRC_SHIFT = 16; - // See if we obtained unique value - long expected_val = (src_pe << SRC_SHIFT) + rank; - - return (v == expected_val) - ? std::make_pair(true, "") - : std::make_pair( - false, "Rank " + std::to_string(rank) + ", Got " + - std::to_string(v) + ", Expect " + - std::to_string(expected_val)); - })); + testers.push_back(new AlltoallTester(args)); + testers.push_back(new AlltoallTester(args)); + testers.push_back(new AlltoallTester(args)); + testers.push_back(new AlltoallTester(args)); + testers.push_back(new AlltoallTester(args)); + testers.push_back(new AlltoallTester(args)); + testers.push_back(new AlltoallTester(args)); return testers; case FCollectTestType: if (rank == 0) { std::cout << "Fcollect Test ###" << std::endl; } - testers.push_back(new FcollectTester( - args, - [rank](int64_t& f1, int64_t& f2) { - f1 = rank; - f2 = -1; - }, - [rank](int64_t v, int64_t src_pe) { - int64_t expected_val = src_pe; - - return (v == expected_val) - ? std::make_pair(true, "") - : std::make_pair( - false, "Rank " + std::to_string(rank) + ", Got " + - std::to_string(v) + ", Expect " + - std::to_string(expected_val)); - })); + testers.push_back(new FcollectTester(args)); + testers.push_back(new FcollectTester(args)); + testers.push_back(new FcollectTester(args)); + testers.push_back(new FcollectTester(args)); + testers.push_back(new FcollectTester(args)); + testers.push_back(new FcollectTester(args)); + testers.push_back(new FcollectTester(args)); return testers; case AMO_FAddTestType: if (rank == 0) std::cout << "AMO Fetch_Add ###" << std::endl; @@ -352,35 +319,22 @@ std::vector Tester::create(TesterArguments args) { testers.push_back(new ShmemPtrTester(args)); return testers; case WGGetTestType: - if (rank == 0) { - if (args.num_wgs > 1) - std::cout << "Tiled Blocking WG level Gets ###" << std::endl; - else std::cout << "Blocking WG level Gets ###" << std::endl; - } + if (rank == 0) + std::cout << "Blocking WG level Gets ###" << std::endl; testers.push_back(new ExtendedPrimitiveTester(args)); return testers; case WGGetNBITestType: - if (rank == 0) { - if (args.num_wgs > 1) - std::cout << "Tiled Non-Blocking WG level Gets ###" << std::endl; - else std::cout << "Non-Blocking WG level Gets ###" << std::endl; - } + if (rank == 0) + std::cout << "Non-Blocking WG level Gets ###" << std::endl; testers.push_back(new ExtendedPrimitiveTester(args)); return testers; case WGPutTestType: - if (rank == 0) { - if (args.num_wgs > 1) - std::cout << "Tiled Blocking WG level Puts ###" << std::endl; - else std::cout << "Blocking WG level Puts ###" << std::endl; - } + if (rank == 0) std::cout << "Blocking WG level Puts ###" << std::endl; testers.push_back(new ExtendedPrimitiveTester(args)); return testers; case WGPutNBITestType: - if (rank == 0) { - if (args.num_wgs > 1) - std::cout << "Tiled Non-Blocking WG level Puts ###" << std::endl; - else std::cout << "Non-Blocking WG level Puts ###" << std::endl; - } + if (rank == 0) + std::cout << "Non-Blocking WG level Puts ###" << std::endl; testers.push_back(new ExtendedPrimitiveTester(args)); return testers; case PutNBIMRTestType: @@ -389,35 +343,22 @@ std::vector Tester::create(TesterArguments args) { testers.push_back(new PrimitiveMRTester(args)); return testers; case WAVEGetTestType: - if (rank == 0) { - if (args.num_wgs > 1 || numWarps > 1) - std::cout << "Tiled Blocking WAVE level Gets ###" << std::endl; - else std::cout << "Blocking WAVE level Gets ###" << std::endl; - } + if (rank == 0) + std::cout << "Blocking WAVE level Gets ###" << std::endl; testers.push_back(new WaveLevelPrimitiveTester(args)); return testers; case WAVEGetNBITestType: - if (rank == 0) { - if (args.num_wgs > 1 || numWarps > 1) - std::cout << "Tiled Non-Blocking WAVE level Gets ###" << std::endl; - else std::cout << "Non-Blocking WAVE level Gets ###" << std::endl; - } + if (rank == 0) + std::cout << "Non-Blocking WAVE level Gets ###" << std::endl; testers.push_back(new WaveLevelPrimitiveTester(args)); return testers; case WAVEPutTestType: - if (rank == 0) { - if (args.num_wgs > 1 || numWarps > 1) - std::cout << "Tiled Blocking WAVE level Puts ###" << std::endl; - else std::cout << "Blocking WAVE level Puts ###" << std::endl; - } + if (rank == 0) std::cout << "Blocking WAVE level Puts ###" << std::endl; testers.push_back(new WaveLevelPrimitiveTester(args)); return testers; case WAVEPutNBITestType: - if (rank == 0) { - if (args.num_wgs > 1 || numWarps > 1) - std::cout << "Tiled Non-Blocking WAVE level Puts ###" << std::endl; - else std::cout << "Non-Blocking WAVE level Puts ###" << std::endl; - } + if (rank == 0) + std::cout << "Non-Blocking WAVE level Puts ###" << std::endl; testers.push_back(new WaveLevelPrimitiveTester(args)); return testers; case PutSignalTestType: @@ -527,32 +468,10 @@ void Tester::execute() { // data validation verifyResults(size); - /** - * Adjust size for *_wg and *_wave functions - */ - uint64_t size_ = size; - TestType type = (TestType)args.algorithm; - switch (type) { - case WAVEGetTestType: - case WAVEGetNBITestType: - case WAVEPutTestType: - case WAVEPutNBITestType: - size_ *= (args.num_wgs * num_warps); - break; - case WGGetTestType: - case WGGetNBITestType: - case WGPutTestType: - case WGPutNBITestType: - size_ *= args.num_wgs; - break; - default: - break; - } - barrier(); if (_type != TeamCtxInfraTestType) { - print(size_); + print(size); } } } @@ -583,6 +502,10 @@ void Tester::print(uint64_t size) { return; } + /** + * Calculate total amount of data transfered + */ + uint64_t total_size = size * num_timed_msgs; uint64_t timer_avg = timerAvgInMicroseconds(); double latency_avg = static_cast(timer_avg) / num_timed_msgs; double avg_msg_rate = num_timed_msgs / (timer_avg / 1e6); @@ -590,26 +513,32 @@ void Tester::print(uint64_t size) { float total_kern_time_ms; CHECK_HIP(hipEventElapsedTime(&total_kern_time_ms, start_event, stop_event)); float total_kern_time_s = total_kern_time_ms / 1000; + + double time_ms = gpuCyclesToMicroseconds(max_end_time - min_start_time); + double time_s = time_ms / 1e6; double bandwidth_avg_gbs = - num_msgs * size * bw_factor / total_kern_time_s / pow(2, 30); + static_cast(total_size * bw_factor) / time_s / pow(2, 30); int field_width = 20; int float_precision = 2; if (_print_header) { - printf("%-*s%*s%*s%*s", - 10, "# Size (B)", + printf("%-*s%-*s%*s%*s%*s", + 15, "# Size (B)", + 15, "# of timed Msgs", field_width, "Latency (us)", field_width, "Bandwidth (GB/s)", field_width + 1, "Msg Rate (Msg/s)\n"); + _print_header = 0; } - printf("%-*lu%*.*f%*.*f%*.*f\n", - 10, size, - field_width, float_precision, latency_avg, - field_width, float_precision, bandwidth_avg_gbs, - field_width, float_precision, avg_msg_rate); + printf("%-*lu%-*d%*.*f%*.*f%*.*f\n", + 15, size, + 15, num_timed_msgs, + field_width, float_precision, latency_avg, + field_width, float_precision, bandwidth_avg_gbs, + field_width, float_precision, avg_msg_rate); fflush(stdout); } @@ -628,33 +557,25 @@ void Tester::barrier() { flush_hdp(); } -uint64_t Tester::gpuCyclesToMicroseconds(uint64_t cycles) { - /** - * The dGPU asm core timer runs at 27MHz. This is different from the - * core clock returned by HIP. For an APU, this is different and might - * need adjusting. - */ - uint64_t gpu_frequency_MHz = 27; - - /** - * hipDeviceGetAttribute(&gpu_frequency_khz, - * hipDeviceAttributeClockRate, - * 0); - */ - - return cycles / gpu_frequency_MHz; +double Tester::gpuCyclesToMicroseconds(uint64_t cycles) { + return static_cast(cycles) / (wall_clk_rate * 1e-3); } -uint64_t Tester::timerAvgInMicroseconds() { - uint64_t sum = 0; +double Tester::timerAvgInMicroseconds() { + double sum = 0; + min_start_time = 0xFFFFFFFFFFFFFFFF; + max_end_time = 0; - /** - * TODO: (bpotter/avinash) Modify the calcuation for the Tiled version of - * puts and gets at wavefront level - */ - for (uint64_t i = 0; i < args.num_wgs; i++) { + for (uint64_t i = 0; i < num_timers; i++) { + timer[i] = end_time[i] - start_time[i]; sum += gpuCyclesToMicroseconds(timer[i]); + min_start_time = (start_time[i] < min_start_time) + ? start_time[i] + : min_start_time; + max_end_time = (end_time[i] > max_end_time) + ? end_time[i] + : max_end_time; } - return sum / args.num_wgs; + return sum / num_timers; } diff --git a/tests/functional_tests/tester.hpp b/tests/functional_tests/tester.hpp index f506441..ce89cee 100644 --- a/tests/functional_tests/tester.hpp +++ b/tests/functional_tests/tester.hpp @@ -27,6 +27,7 @@ #include #include "tester_arguments.hpp" +#include "../src/util.hpp" /****************************************************************************** * TESTER CLASS TYPES @@ -126,6 +127,8 @@ class Tester { int num_warps = 0; int bw_factor = 1; int device_id = 0; + int wall_clk_rate = 0; //in kilohertz + int wf_size = 0; TesterArguments args; @@ -136,6 +139,11 @@ class Tester { hipDeviceProp_t deviceProps; uint64_t *timer = nullptr; + uint64_t *start_time = nullptr; + uint64_t *end_time = nullptr; + uint64_t min_start_time = 0; + uint64_t max_end_time = 0; + uint32_t num_timers = 0; private: bool _print_header = 1; @@ -143,9 +151,9 @@ class Tester { void barrier(); - uint64_t gpuCyclesToMicroseconds(uint64_t cycles); + double gpuCyclesToMicroseconds(uint64_t cycles); - uint64_t timerAvgInMicroseconds(); + double timerAvgInMicroseconds(); bool peLaunchesKernel(); diff --git a/tests/functional_tests/tester_arguments.cpp b/tests/functional_tests/tester_arguments.cpp index a9fe9d0..a39e7ad 100644 --- a/tests/functional_tests/tester_arguments.cpp +++ b/tests/functional_tests/tester_arguments.cpp @@ -97,6 +97,12 @@ TesterArguments::TesterArguments(int argc, char *argv[]) { case RandomAccessTestType: min_msg_size = 4; break; + case FCollectTestType: + case AllToAllTestType: + case TeamBroadcastTestType: + case TeamReductionTestType: + min_msg_size = 8; + break; case TeamCtxInfraTestType: max_msg_size = min_msg_size; break; diff --git a/tests/functional_tests/wave_level_primitives.cpp b/tests/functional_tests/wave_level_primitives.cpp index c434fae..61365f5 100644 --- a/tests/functional_tests/wave_level_primitives.cpp +++ b/tests/functional_tests/wave_level_primitives.cpp @@ -31,7 +31,8 @@ using namespace rocshmem; /****************************************************************************** * DEVICE TEST KERNEL *****************************************************************************/ -__global__ void WaveLevelPrimitiveTest(int loop, int skip, uint64_t *timer, +__global__ void WaveLevelPrimitiveTest(int loop, int skip, + uint64_t *start_time, uint64_t *end_time, char *s_buf, char *r_buf, int size, TestType type, ShmemContextType ctx_type, int wf_size) { @@ -39,20 +40,17 @@ __global__ void WaveLevelPrimitiveTest(int loop, int skip, uint64_t *timer, rocshmem_wg_init(); rocshmem_wg_ctx_create(ctx_type, &ctx); - /** - * Calculate start index for each wavefront for tiled version - * If the number of wavefronts is greater than 1, this kernel performs a - * tiled functional test - */ - uint64_t start; + // Calculate start index for each wavefront int wf_id = get_flat_block_id() / wf_size; - int wg_offset = size * get_flat_grid_id() * (get_flat_block_size() / wf_size); - int idx = wf_id * size + wg_offset; - s_buf += idx; - r_buf += idx; + int wg_offset = get_flat_grid_id() * + ((get_flat_block_size() - 1 ) / wf_size + 1); + int idx = wf_id + wg_offset; + int offset = size * idx; + s_buf += offset; + r_buf += offset; for (int i = 0; i < loop + skip; i++) { - if (i == skip) start = rocshmem_timer(); + if (i == skip) start_time[idx] = wall_clock64(); switch (type) { case WAVEGetTestType: @@ -74,9 +72,7 @@ __global__ void WaveLevelPrimitiveTest(int loop, int skip, uint64_t *timer, rocshmem_ctx_quiet(ctx); - if (hipThreadIdx_x == 0) { - timer[hipBlockIdx_x] = rocshmem_timer() - start; - } + end_time[idx] = wall_clock64(); rocshmem_wg_ctx_destroy(&ctx); rocshmem_wg_finalize(); @@ -109,8 +105,8 @@ void WaveLevelPrimitiveTester::launchKernel(dim3 gridSize, dim3 blockSize, size_t shared_bytes = 0; hipLaunchKernelGGL(WaveLevelPrimitiveTest, gridSize, blockSize, shared_bytes, - stream, loop, args.skip, timer, (char*)s_buf, - (char*)r_buf, size, _type, _shmem_context, + stream, loop, args.skip, start_time, end_time, + (char*)s_buf, (char*)r_buf, size, _type, _shmem_context, deviceProps.warpSize); num_msgs = (loop + args.skip) * gridSize.x * num_warps; diff --git a/tests/functional_tests/wave_level_primitives.hpp b/tests/functional_tests/wave_level_primitives.hpp index af65ac0..47ef72d 100644 --- a/tests/functional_tests/wave_level_primitives.hpp +++ b/tests/functional_tests/wave_level_primitives.hpp @@ -24,7 +24,6 @@ #define _WAVE_LEVEL_PRIMITIVE_TEST_HPP_ #include "tester.hpp" -#include "../src/util.hpp" /****************************************************************************** * HOST TESTER CLASS