Skip to content

Commit

Permalink
SWDEV-385653 - Disable rocmtools
Browse files Browse the repository at this point in the history
Temporarily disable rocmtools because of hsa_shut_down issues

Change-Id: I5e8b6729b8200ccdd5c399862bfc632ba69f884c
Signed-off-by: Galantsev, Dmitrii <[email protected]>
  • Loading branch information
dmitrii-galantsev committed Feb 27, 2023
1 parent 81f3126 commit 633e5a2
Show file tree
Hide file tree
Showing 9 changed files with 14 additions and 40 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ option(BUILD_ROCRTEST "Build targets for librdc_rocr.so" ON)

# When configured with cmake -DBUILD_ROCPTEST=OFF, librdc_rocp.so (which
# requires the ROCm profiler) is not built.
option(BUILD_ROCPTEST "Build targets for librdc_rocp.so" ON)
option(BUILD_ROCPTEST "Build targets for librdc_rocp.so" OFF)

# When cmake -DBUILD_TESTS=off, it will not build RDC tests.
option(BUILD_TESTS "Build test suite" OFF)
Expand Down
17 changes: 0 additions & 17 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,13 @@ RDC can run on AMD ROCm supported platforms, please refer to the **List of Suppo
Latex (pdfTeX 3.14159265-2.6-1.40.16) ## required to build the latest documentation
gRPC and protoc ## required for communication
libcap-dev ## required to manage the privileges.
rocmtools ## required for profiler metrics

AMD ROCm platform (https://github.com/RadeonOpenCompute/ROCm)
* It is recommended to install the complete AMD ROCm platform.
For installation instructions, see https://docs.amd.com/category/Release%20Documentation
* At the minimum, these two components are required
(i) AMD ROCm SMI Library (https://github.com/RadeonOpenCompute/rocm_smi_lib)
(ii) AMD ROCk Kernel driver (https://github.com/RadeonOpenCompute/ROCK-Kernel-Driver)
* For profiler metrics, this component is required:
(i) AMD ROCm Tools (https://github.com/ROCm-Developer-Tools/rocmtools)

## Building gRPC and protoc
**NOTE:** gRPC and protoc compiler must be built when building RDC from source as pre-built packages are not available. When installing RDC from a package, gRPC and protoc will be installed from the package.
Expand Down Expand Up @@ -157,10 +154,6 @@ cd /opt/rocm/rdc/bin
./rdci dmon -u --list-all ## list all GPU counters
./rdci dmon -u -i 0 -c 1 -e 100 ## monitor field 100 on gpu 0 for count of 1
./rdci dmon -u -i 0 -c 1 -e 1,2 ## monitor fields 1,2 on gpu 0 for count of 1
# below requires rocmtools to be installed
./rdci dmon -u -i 0 -c 5 -e 700 ## monitor field 700 on gpu 0 for count of 5
# below is only likely to work on MI series GPUs
./rdci dmon -u -i 0 -c 5 -e 700,701,702 ## monitor fields 700,701,702
```

## Troubleshooting rdcd
Expand All @@ -181,13 +174,3 @@ RDC_LOG=DEBUG /opt/rocm/rdc/bin/rdcd
RDC_LOG=DEBUG also works on rdci

ERROR, INFO, DEBUG logging levels are supported

- Reading `RDC_FI_PROF_*` crashes rdcd
- All `RDC_FI_PROF_*` metrics return N/A

0. ROCMTools support is in beta.
Reading registers beyond 700-702 range is not guaranteed to work.
1. Does your GPU support selected fields?
Field 700 (`RDC_FI_PROF_ELAPSED_CYCLES`) is supposed to be accessible on most GPUs.
Others are mostly intended for MI series.
2. Is rocmtools installed? Can you find `librocmtools.so`?
4 changes: 3 additions & 1 deletion cmake_modules/rdc-backward-compat.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,9 @@ function(create_library_symlink)
set(LIB_RDC_RAS "librdc_ras.so")
set(LIB_RDC_CLIENT_SMI "librdc_client_smi.so")
set(library_files "${LIB_RDC_ROCR}" "${LIB_RDC_ROCR}.${MAJ_VERSION}" "${LIB_RDC_ROCR}.${SO_VERSION}" )
set(library_files "${LIB_RDC_ROCP}" "${LIB_RDC_ROCP}.${MAJ_VERSION}" "${LIB_RDC_ROCP}.${SO_VERSION}" )
if(BUILD_ROCPTEST)
    # Add the librdc_rocp.so names only when the profiler library is built.
    # Append instead of re-assigning: a plain set() here would discard the
    # librdc_rocr entries already stored in library_files.
    list(APPEND library_files
        "${LIB_RDC_ROCP}"
        "${LIB_RDC_ROCP}.${MAJ_VERSION}"
        "${LIB_RDC_ROCP}.${SO_VERSION}")
endif()
set(library_files "${library_files}" "${LIB_RDC_CLIENT_SMI}" "${LIB_RDC_CLIENT_SMI}.${MAJ_VERSION}" "${LIB_RDC_CLIENT_SMI}.${SO_VERSION}" )
set(library_files "${library_files}" "${LIB_RDC_RAS}")

Expand Down
7 changes: 3 additions & 4 deletions common/rdc_field.data
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,9 @@ FLD_DESC_ENT(RDC_FI_ECC_UMC_DED, "UMC Double Error Detection",
// This doesn't map to rocprofiler counters directly
// See counter_map in rdc/include/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.h
// See metrics.xml in rocmtools
// TODO: uncomment rest of the fields when implemented
FLD_DESC_ENT(RDC_FI_PROF_ELAPSED_CYCLES, "Number of Elapsed Cycles over all SMs", "PROF_ELAPSED_COUNT", false)
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_WAVES, "Number of Active Waves", "PROF_ACTIVE_WAVES", false)
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_CYCLES, "Number of Active Cycles", "PROF_ACTIVE_CYCLES", false)
//FLD_DESC_ENT(RDC_FI_PROF_ELAPSED_CYCLES, "Number of Elapsed Cycles over all SMs", "PROF_ELAPSED_COUNT", false)
//FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_WAVES, "Number of Active Waves", "PROF_ACTIVE_WAVES", false)
//FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_CYCLES, "Number of Active Cycles", "PROF_ACTIVE_CYCLES", false)
//FLD_DESC_ENT(RDC_FI_PROF_CU_OCCUPANCY, "Active Waves / maximum Active Waves per CU", "PROF_CU_OCCUPANCY", false)
//FLD_DESC_ENT(RDC_FI_PROF_CU_UTILIZATION, "Active Cycles / total Elapsed Cycles", "PROF_CU_UTILIZATION", false)
//FLD_DESC_ENT(RDC_FI_PROF_FETCH_SIZE, "kb fetched from video memory", "PROF_FETCH_SIZE", false)
Expand Down
2 changes: 0 additions & 2 deletions include/rdc_lib/impl/RdcModuleMgrImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ THE SOFTWARE.
#include "rdc_lib/RdcModuleMgr.h"
#include "rdc_lib/RdcTelemetry.h"
#include "rdc_lib/impl/RdcRasLib.h"
#include "rdc_lib/impl/RdcRocpLib.h"
#include "rdc_lib/impl/RdcRocrLib.h"
#include "rdc_lib/impl/RdcSmiLib.h"

Expand All @@ -51,7 +50,6 @@ class RdcModuleMgrImpl : public RdcModuleMgr {
RdcSmiLibPtr smi_lib_;
RdcMetricFetcherPtr fetcher_;
RdcRocrLibPtr rocr_lib_;
RdcRocpLibPtr rocp_lib_;
};

} // namespace rdc
Expand Down
4 changes: 1 addition & 3 deletions include/rdc_lib/impl/RdcTelemetryModule.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ THE SOFTWARE.
#include "rdc_lib/RdcTelemetry.h"
#include "rdc_lib/impl/RdcRasLib.h"
#include "rdc_lib/impl/RdcSmiLib.h"
#include "rdc_lib/impl/RdcRocpLib.h"
#include "rdc_lib/RdcMetricFetcher.h"

namespace amd {
Expand All @@ -51,8 +50,7 @@ class RdcTelemetryModule : public RdcTelemetry {
uint32_t fields_count);

RdcTelemetryModule(const RdcSmiLibPtr& smi_lib,
const RdcRasLibPtr& ras_module,
const RdcRocpLibPtr& rocp_module);
const RdcRasLibPtr& ras_module);

private:
//< Helper function to dispatch fields to module
Expand Down
7 changes: 1 addition & 6 deletions rdc_libs/rdc/src/RdcModuleMgrImpl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ THE SOFTWARE.

#include "rdc_lib/impl/RdcDiagnosticModule.h"
#include "rdc_lib/impl/RdcRasLib.h"
#include "rdc_lib/impl/RdcRocpLib.h"
#include "rdc_lib/impl/RdcRocrLib.h"
#include "rdc_lib/impl/RdcTelemetryModule.h"

Expand All @@ -45,13 +44,9 @@ RdcTelemetryPtr RdcModuleMgrImpl::get_telemetry_module() {
ras_lib_.reset(new RdcRasLib("librdc_ras.so"));
}

if (!rocp_lib_) {
rocp_lib_.reset(new RdcRocpLib("librdc_rocp.so"));
}

if (!rdc_telemetry_module_) {
rdc_telemetry_module_.reset(
new RdcTelemetryModule(smi_lib_, ras_lib_, rocp_lib_));
new RdcTelemetryModule(smi_lib_, ras_lib_));
}

return rdc_telemetry_module_;
Expand Down
6 changes: 1 addition & 5 deletions rdc_libs/rdc/src/RdcTelemetryModule.cc
Original file line number Diff line number Diff line change
Expand Up @@ -94,15 +94,11 @@ rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_unwatch(

RdcTelemetryModule::RdcTelemetryModule(
const RdcSmiLibPtr& smi_lib,
const RdcRasLibPtr& ras_module,
const RdcRocpLibPtr& rocp_module) {
const RdcRasLibPtr& ras_module) {
telemetry_modules_.push_back(smi_lib);
if (ras_module) {
telemetry_modules_.push_back(ras_module);
}
if (rocp_module) {
telemetry_modules_.push_back(rocp_module);
}

auto ite = telemetry_modules_.begin();
for (; ite != telemetry_modules_.end(); ite++) {
Expand Down
5 changes: 4 additions & 1 deletion tests/rdc_tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -87,11 +87,14 @@ target_link_libraries(${RDCTST}
PUBLIC rdc_bootstrap
PUBLIC rdc
PUBLIC rdc_ras
PUBLIC rdc_rocp
PUBLIC c
PUBLIC stdc++
PUBLIC pthread)

# Link the test binary against rdc_rocp only when BUILD_ROCPTEST is enabled.
if(BUILD_ROCPTEST)
    target_link_libraries(
        ${RDCTST}
        PUBLIC rdc_rocp
    )
endif()

# Install the ${RDCTST} test executable under the rdctst_tests directory of the
# RDC share prefix, as part of the tests install component.
install(
    TARGETS ${RDCTST}
    COMPONENT ${TESTS_COMPONENT}
    DESTINATION ${RDC_SHARE_INSTALL_PREFIX}/rdctst_tests
)
Expand Down

0 comments on commit 633e5a2

Please sign in to comment.