Skip to content

Commit

Permalink
Fall back to junction temperature and socket power
Browse files Browse the repository at this point in the history
If the card does not have an edge temperature, fall back to the junction
temperature. If the card only has socket power, then use socket
power instead.

Change-Id: I053a67a89cf3b29a34e82123f522c08d7dd68916
  • Loading branch information
bill-shuzhou-liu committed Feb 5, 2024
1 parent adf0d70 commit 5cfe2b4
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 3 deletions.
2 changes: 1 addition & 1 deletion python_binding/rdc_collectd.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def config_func(config):
if key == 'field_ids':
field_ids = []
for f in node.values:
field_id = rdc.get_field_id_from_name(f)
field_id = rdc.get_field_id_from_name(str.encode(f))
if field_id.value == rdc_field_t.RDC_FI_INVALID:
print("Invalid field '%s' will be ignored." % (f))
else:
Expand Down
2 changes: 1 addition & 1 deletion python_binding/rdc_prometheus.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def get_field_ids(args):

if len(field_id_str)> 0 :
for f in field_id_str:
field_id = rdc.get_field_id_from_name(f)
field_id = rdc.get_field_id_from_name(str.encode(f))
if field_id.value == rdc_field_t.RDC_FI_INVALID:
print("Invalid field '%s' will be ignored." % (f))
else:
Expand Down
14 changes: 13 additions & 1 deletion rdc_libs/rdc/src/RdcMetricFetcherImpl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -364,12 +364,16 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
}
break;
case RDC_FI_POWER_USAGE:
value->status = rsmi_dev_power_ave_get(gpu_index, RSMI_TEMP_CURRENT, &i64);
{
RSMI_POWER_TYPE power_type = RSMI_CURRENT_POWER;
// below call should handle both socket power and regular power
value->status = rsmi_dev_power_get(gpu_index, &i64, &power_type);
value->type = INTEGER;
if (value->status == RSMI_STATUS_SUCCESS) {
value->value.l_int = static_cast<int64_t>(i64);
}
break;
}
case RDC_FI_GPU_CLOCK:
case RDC_FI_MEM_CLOCK:
rsmi_frequencies_t f;
Expand Down Expand Up @@ -404,6 +408,14 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
}
value->status = rsmi_dev_temp_metric_get(gpu_index, sensor_type, RSMI_TEMP_CURRENT, &val_i64);

// fallback to hotspot temperature as some card may not have edge temperature.
if (sensor_type == RSMI_TEMP_TYPE_EDGE
&& value->status == RSMI_STATUS_NOT_SUPPORTED) {
sensor_type = RSMI_TEMP_TYPE_JUNCTION;
value->status = rsmi_dev_temp_metric_get(gpu_index, sensor_type,
RSMI_TEMP_CURRENT, &val_i64);
}

value->type = INTEGER;
if (value->status == RSMI_STATUS_SUCCESS) {
value->value.l_int = val_i64;
Expand Down

0 comments on commit 5cfe2b4

Please sign in to comment.