From 5cfe2b41690441f2eb12a8406ca169619250baf9 Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Mon, 5 Feb 2024 10:06:31 -0600 Subject: [PATCH] Fallback to junction temperature and socket power If the card does not have edge temperature, fall back to junction temperature. If the card only has socket power, then use socket power instead. Change-Id: I053a67a89cf3b29a34e82123f522c08d7dd68916 --- python_binding/rdc_collectd.py | 2 +- python_binding/rdc_prometheus.py | 2 +- rdc_libs/rdc/src/RdcMetricFetcherImpl.cc | 14 +++++++++++++- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/python_binding/rdc_collectd.py b/python_binding/rdc_collectd.py index 346dce8..9ce8089 100644 --- a/python_binding/rdc_collectd.py +++ b/python_binding/rdc_collectd.py @@ -65,7 +65,7 @@ def config_func(config): if key == 'field_ids': field_ids = [] for f in node.values: - field_id = rdc.get_field_id_from_name(f) + field_id = rdc.get_field_id_from_name(str.encode(f)) if field_id.value == rdc_field_t.RDC_FI_INVALID: print("Invalid field '%s' will be ignored." % (f)) else: diff --git a/python_binding/rdc_prometheus.py b/python_binding/rdc_prometheus.py index 77975b2..fe486c5 100644 --- a/python_binding/rdc_prometheus.py +++ b/python_binding/rdc_prometheus.py @@ -85,7 +85,7 @@ def get_field_ids(args): if len(field_id_str)> 0 : for f in field_id_str: - field_id = rdc.get_field_id_from_name(f) + field_id = rdc.get_field_id_from_name(str.encode(f)) if field_id.value == rdc_field_t.RDC_FI_INVALID: print("Invalid field '%s' will be ignored." 
% (f)) else: diff --git a/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc b/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc index fbbccdd..6f48376 100644 --- a/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc +++ b/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc @@ -364,12 +364,16 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field } break; case RDC_FI_POWER_USAGE: - value->status = rsmi_dev_power_ave_get(gpu_index, RSMI_TEMP_CURRENT, &i64); + { + RSMI_POWER_TYPE power_type = RSMI_CURRENT_POWER; + // below call should handle both socket power and regular power + value->status = rsmi_dev_power_get(gpu_index, &i64, &power_type); value->type = INTEGER; if (value->status == RSMI_STATUS_SUCCESS) { value->value.l_int = static_cast(i64); } break; + } case RDC_FI_GPU_CLOCK: case RDC_FI_MEM_CLOCK: rsmi_frequencies_t f; @@ -404,6 +408,14 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field } value->status = rsmi_dev_temp_metric_get(gpu_index, sensor_type, RSMI_TEMP_CURRENT, &val_i64); + // fallback to hotspot temperature as some card may not have edge temperature. + if (sensor_type == RSMI_TEMP_TYPE_EDGE + && value->status == RSMI_STATUS_NOT_SUPPORTED) { + sensor_type = RSMI_TEMP_TYPE_JUNCTION; + value->status = rsmi_dev_temp_metric_get(gpu_index, sensor_type, + RSMI_TEMP_CURRENT, &val_i64); + } + value->type = INTEGER; if (value->status == RSMI_STATUS_SUCCESS) { value->value.l_int = val_i64;