From 633e5a27c7e861e536a4e896dc865482792dd8f6 Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Fri, 24 Feb 2023 17:28:42 -0600 Subject: [PATCH] SWDEV-385653 - Disable rocmtools Temporarily disable rocmtools because of hsa_shut_down issues Change-Id: I5e8b6729b8200ccdd5c399862bfc632ba69f884c Signed-off-by: Galantsev, Dmitrii --- CMakeLists.txt | 2 +- README.md | 17 ----------------- cmake_modules/rdc-backward-compat.cmake | 4 +++- common/rdc_field.data | 7 +++---- include/rdc_lib/impl/RdcModuleMgrImpl.h | 2 -- include/rdc_lib/impl/RdcTelemetryModule.h | 4 +--- rdc_libs/rdc/src/RdcModuleMgrImpl.cc | 7 +------ rdc_libs/rdc/src/RdcTelemetryModule.cc | 6 +----- tests/rdc_tests/CMakeLists.txt | 5 ++++- 9 files changed, 14 insertions(+), 40 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index af8b85c..0525696 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -51,7 +51,7 @@ option(BUILD_ROCRTEST "Build targets for librdc_rocr.so" ON) # When cmake -DBUILD_ROCPTEST=off, it will not build the librdc_rocp.so # which requires the Rocm profiler. -option(BUILD_ROCPTEST "Build targets for librdc_rocp.so" ON) +option(BUILD_ROCPTEST "Build targets for librdc_rocp.so" OFF) # When cmake -DBUILD_TESTS=off, it will not build RDC tests. option(BUILD_TESTS "Build test suite" OFF) diff --git a/README.md b/README.md index 6810a67..ff96557 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,6 @@ RDC can run on AMD ROCm supported platforms, please refer to the **List of Suppo Latex (pdfTeX 3.14159265-2.6-1.40.16) ## required to build the latest documentation gRPC and protoc ## required for communication libcap-dev ## required to manage the privileges. - rocmtools ## required for profiler metrics AMD ROCm platform (https://github.com/RadeonOpenCompute/ROCm) * It is recommended to install the complete AMD ROCm platform. 
@@ -31,8 +30,6 @@ RDC can run on AMD ROCm supported platforms, please refer to the **List of Suppo * At the minimum, these two components are required (i) AMD ROCm SMI Library (https://github.com/RadeonOpenCompute/rocm_smi_lib) (ii) AMD ROCk Kernel driver (https://github.com/RadeonOpenCompute/ROCK-Kernel-Driver) - * For profiler metrics, this component is required: - (i) AMD ROCm Tools (https://github.com/ROCm-Developer-Tools/rocmtools) ## Building gRPC and protoc **NOTE:** gRPC and protoc compiler must be built when building RDC from source as pre-built packages are not available. When installing RDC from a package, gRPC and protoc will be installed from the package. @@ -157,10 +154,6 @@ cd /opt/rocm/rdc/bin ./rdci dmon -u --list-all ## list all GPU counters ./rdci dmon -u -i 0 -c 1 -e 100 ## monitor field 100 on gpu 0 for count of 1 ./rdci dmon -u -i 0 -c 1 -e 1,2 ## monitor fields 1,2 on gpu 0 for count of 1 -# below requires rocmtools to be installed -./rdci dmon -u -i 0 -c 5 -e 700 ## monitor field 700 on gpu 0 for count of 5 -# below is only likely to work on MI series GPUs -./rdci dmon -u -i 0 -c 5 -e 700,701,702 ## monitor fields 700,701,702 ``` ## Troubleshooting rdcd @@ -181,13 +174,3 @@ RDC_LOG=DEBUG /opt/rocm/rdc/bin/rdcd RDC_LOG=DEBUG also works on rdci ERROR, INFO, DEBUG logging levels are supported - -- Reading `RDC_FI_PROF_*` crashes rdcd -- All `RDC_FI_PROF_*` metrics return N/A - - 0. ROCMTools support is in beta. - Reading registers beyond 700-702 range is not guaranteed to work. - 1. Does your GPU support selected fields? - Field 700 (`RDC_FI_PROF_ELAPSED_CYCLES`) is supposed to be accessible on most GPUs. - Others are mostly intended for MI series. - 2. Is rocmtools installed? Can you find `librocmtools.so`? 
diff --git a/cmake_modules/rdc-backward-compat.cmake b/cmake_modules/rdc-backward-compat.cmake index 25ba07c..58495f0 100644 --- a/cmake_modules/rdc-backward-compat.cmake +++ b/cmake_modules/rdc-backward-compat.cmake @@ -62,7 +62,9 @@ function(create_library_symlink) set(LIB_RDC_RAS "librdc_ras.so") set(LIB_RDC_CLIENT_SMI "librdc_client_smi.so") set(library_files "${LIB_RDC_ROCR}" "${LIB_RDC_ROCR}.${MAJ_VERSION}" "${LIB_RDC_ROCR}.${SO_VERSION}" ) - set(library_files "${LIB_RDC_ROCP}" "${LIB_RDC_ROCP}.${MAJ_VERSION}" "${LIB_RDC_ROCP}.${SO_VERSION}" ) + if(BUILD_ROCPTEST) + set(library_files "${library_files}" "${LIB_RDC_ROCP}" "${LIB_RDC_ROCP}.${MAJ_VERSION}" "${LIB_RDC_ROCP}.${SO_VERSION}" ) + endif() set(library_files "${library_files}" "${LIB_RDC_CLIENT_SMI}" "${LIB_RDC_CLIENT_SMI}.${MAJ_VERSION}" "${LIB_RDC_CLIENT_SMI}.${SO_VERSION}" ) set(library_files "${library_files}" "${LIB_RDC_RAS}") diff --git a/common/rdc_field.data b/common/rdc_field.data index 6985f73..668a704 100644 --- a/common/rdc_field.data +++ b/common/rdc_field.data @@ -81,10 +81,9 @@ FLD_DESC_ENT(RDC_FI_ECC_UMC_DED, "UMC Double Error Detection", // This doesn't map to rocprofiler counters directly // See counter_map in rdc/include/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.h // See metrics.xml in rocmtools -// TODO: uncomment rest of the fields when implemented -FLD_DESC_ENT(RDC_FI_PROF_ELAPSED_CYCLES, "Number of Elapsed Cycles over all SMs", "PROF_ELAPSED_COUNT", false) -FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_WAVES, "Number of Active Waves", "PROF_ACTIVE_WAVES", false) -FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_CYCLES, "Number of Active Cycles", "PROF_ACTIVE_CYCLES", false) +//FLD_DESC_ENT(RDC_FI_PROF_ELAPSED_CYCLES, "Number of Elapsed Cycles over all SMs", "PROF_ELAPSED_COUNT", false) +//FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_WAVES, "Number of Active Waves", "PROF_ACTIVE_WAVES", false) +//FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_CYCLES, "Number of Active Cycles", "PROF_ACTIVE_CYCLES", false) //FLD_DESC_ENT(RDC_FI_PROF_CU_OCCUPANCY, "Active Waves 
/ maximum Active Waves per CU", "PROF_CU_OCCUPANCY", false) //FLD_DESC_ENT(RDC_FI_PROF_CU_UTILIZATION, "Active Cycles / total Elapsed Cycles", "PROF_CU_UTILIZATION", false) //FLD_DESC_ENT(RDC_FI_PROF_FETCH_SIZE, "kb fetched from video memory", "PROF_FETCH_SIZE", false) diff --git a/include/rdc_lib/impl/RdcModuleMgrImpl.h b/include/rdc_lib/impl/RdcModuleMgrImpl.h index 0f76272..fe5c625 100644 --- a/include/rdc_lib/impl/RdcModuleMgrImpl.h +++ b/include/rdc_lib/impl/RdcModuleMgrImpl.h @@ -28,7 +28,6 @@ THE SOFTWARE. #include "rdc_lib/RdcModuleMgr.h" #include "rdc_lib/RdcTelemetry.h" #include "rdc_lib/impl/RdcRasLib.h" -#include "rdc_lib/impl/RdcRocpLib.h" #include "rdc_lib/impl/RdcRocrLib.h" #include "rdc_lib/impl/RdcSmiLib.h" @@ -51,7 +50,6 @@ class RdcModuleMgrImpl : public RdcModuleMgr { RdcSmiLibPtr smi_lib_; RdcMetricFetcherPtr fetcher_; RdcRocrLibPtr rocr_lib_; - RdcRocpLibPtr rocp_lib_; }; } // namespace rdc diff --git a/include/rdc_lib/impl/RdcTelemetryModule.h b/include/rdc_lib/impl/RdcTelemetryModule.h index 59d66ce..c9255d6 100644 --- a/include/rdc_lib/impl/RdcTelemetryModule.h +++ b/include/rdc_lib/impl/RdcTelemetryModule.h @@ -29,7 +29,6 @@ THE SOFTWARE. #include "rdc_lib/RdcTelemetry.h" #include "rdc_lib/impl/RdcRasLib.h" #include "rdc_lib/impl/RdcSmiLib.h" -#include "rdc_lib/impl/RdcRocpLib.h" #include "rdc_lib/RdcMetricFetcher.h" namespace amd { @@ -51,8 +50,7 @@ class RdcTelemetryModule : public RdcTelemetry { uint32_t fields_count); RdcTelemetryModule(const RdcSmiLibPtr& smi_lib, - const RdcRasLibPtr& ras_module, - const RdcRocpLibPtr& rocp_module); + const RdcRasLibPtr& ras_module); private: //< Helper function to dispatch fields to module diff --git a/rdc_libs/rdc/src/RdcModuleMgrImpl.cc b/rdc_libs/rdc/src/RdcModuleMgrImpl.cc index b257637..c61e517 100644 --- a/rdc_libs/rdc/src/RdcModuleMgrImpl.cc +++ b/rdc_libs/rdc/src/RdcModuleMgrImpl.cc @@ -23,7 +23,6 @@ THE SOFTWARE. 
#include "rdc_lib/impl/RdcDiagnosticModule.h" #include "rdc_lib/impl/RdcRasLib.h" -#include "rdc_lib/impl/RdcRocpLib.h" #include "rdc_lib/impl/RdcRocrLib.h" #include "rdc_lib/impl/RdcTelemetryModule.h" @@ -45,13 +44,9 @@ RdcTelemetryPtr RdcModuleMgrImpl::get_telemetry_module() { ras_lib_.reset(new RdcRasLib("librdc_ras.so")); } - if (!rocp_lib_) { - rocp_lib_.reset(new RdcRocpLib("librdc_rocp.so")); - } - if (!rdc_telemetry_module_) { rdc_telemetry_module_.reset( - new RdcTelemetryModule(smi_lib_, ras_lib_, rocp_lib_)); + new RdcTelemetryModule(smi_lib_, ras_lib_)); } return rdc_telemetry_module_; diff --git a/rdc_libs/rdc/src/RdcTelemetryModule.cc b/rdc_libs/rdc/src/RdcTelemetryModule.cc index 264c224..e89f503 100644 --- a/rdc_libs/rdc/src/RdcTelemetryModule.cc +++ b/rdc_libs/rdc/src/RdcTelemetryModule.cc @@ -94,15 +94,11 @@ rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_unwatch( RdcTelemetryModule::RdcTelemetryModule( const RdcSmiLibPtr& smi_lib, - const RdcRasLibPtr& ras_module, - const RdcRocpLibPtr& rocp_module) { + const RdcRasLibPtr& ras_module) { telemetry_modules_.push_back(smi_lib); if (ras_module) { telemetry_modules_.push_back(ras_module); } - if (rocp_module) { - telemetry_modules_.push_back(rocp_module); - } auto ite = telemetry_modules_.begin(); for (; ite != telemetry_modules_.end(); ite++) { diff --git a/tests/rdc_tests/CMakeLists.txt b/tests/rdc_tests/CMakeLists.txt index 09aa466..74a73a8 100755 --- a/tests/rdc_tests/CMakeLists.txt +++ b/tests/rdc_tests/CMakeLists.txt @@ -87,11 +87,14 @@ target_link_libraries(${RDCTST} PUBLIC rdc_bootstrap PUBLIC rdc PUBLIC rdc_ras - PUBLIC rdc_rocp PUBLIC c PUBLIC stdc++ PUBLIC pthread) +if(BUILD_ROCPTEST) + target_link_libraries(${RDCTST} PUBLIC rdc_rocp) +endif() + install(TARGETS ${RDCTST} DESTINATION ${RDC_SHARE_INSTALL_PREFIX}/rdctst_tests COMPONENT ${TESTS_COMPONENT})