Skip to content

Commit

Permalink
Add support for junction, edge and memory temperature sensors (#42)
Browse files Browse the repository at this point in the history
* If vendor/device/subsystem name is not found, use device ID string

* Update documentation for get-name functions

* Add support for junction, edge and memory temperature sensors
  • Loading branch information
cfreehill authored May 24, 2019
1 parent 59538cd commit 11f7143
Show file tree
Hide file tree
Showing 6 changed files with 128 additions and 42 deletions.
Binary file modified docs/ROCm_SMI_Manual.pdf
Binary file not shown.
29 changes: 22 additions & 7 deletions include/rocm_smi/rocm_smi.h
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,21 @@ typedef enum {
typedef rsmi_temperature_metric_t rsmi_temperature_metric;
/// \endcond

/**
* @brief This enumeration is used to indicate from which part of the device a
* temperature reading should be obtained.
*/
typedef enum {
RSMI_TEMP_TYPE_FIRST = 0, //!< Lowest valid sensor type; usable as
//!< the lower bound when iterating

RSMI_TEMP_TYPE_EDGE = RSMI_TEMP_TYPE_FIRST, //!< Edge GPU temperature
RSMI_TEMP_TYPE_JUNCTION, //!< Junction/hotspot
//!< temperature
RSMI_TEMP_TYPE_MEMORY, //!< VRAM temperature

RSMI_TEMP_TYPE_LAST = RSMI_TEMP_TYPE_MEMORY //!< Highest valid sensor type;
//!< usable as the inclusive
//!< upper bound when iterating
} rsmi_temperature_type_t;

/**
* @brief Pre-set Profile Selections. These bitmasks can be AND'd with the
* ::rsmi_power_profile_status_t.available_profiles returned from
Expand Down Expand Up @@ -1096,15 +1111,15 @@ rsmi_status_t rsmi_dev_fan_speed_max_get(uint32_t dv_ind,
* @brief Get the temperature metric value for the specified metric, from the
* specified temperature sensor on the specified device.
*
* @details Given a device index @p dv_ind, a 0-based sensor index
* @p sensor_ind, a ::rsmi_temperature_metric_t @p metric and a pointer to an
* int64_t @p temperature, this function will write the value of the metric
* indicated by @p metric to the memory location @p temperature.
* @details Given a device index @p dv_ind, a sensor type @p sensor_type, a
* ::rsmi_temperature_metric_t @p metric and a pointer to an int64_t @p
* temperature, this function will write the value of the metric indicated by
* @p metric and @p sensor_type to the memory location @p temperature.
*
* @param[in] dv_ind a device index
*
* @param[in] sensor_ind a 0-based sensor index. Normally, this will be 0.
* If a device has more than one sensor, it could be greater than 0.
* @param[in] sensor_type part of device from which temperature should be
* obtained. This should come from the enum ::rsmi_temperature_type_t
*
* @param[in] metric enum indicating which temperature value should be
* retrieved
Expand All @@ -1115,7 +1130,7 @@ rsmi_status_t rsmi_dev_fan_speed_max_get(uint32_t dv_ind,
* @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
*
*/
rsmi_status_t rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_ind,
rsmi_status_t rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type,
rsmi_temperature_metric_t metric, int64_t *temperature);
/** @} */ // end of PhysQuer

Expand Down
7 changes: 7 additions & 0 deletions include/rocm_smi/rocm_smi_monitor.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,10 @@

#include <string>
#include <cstdint>
#include <map>

#include "rocm_smi/rocm_smi_common.h"
#include "rocm_smi/rocm_smi.h"

namespace amd {
namespace smi {
Expand Down Expand Up @@ -77,6 +79,7 @@ enum MonitorTypes {
kMonTempOffset,
kMonTempLowest,
kMonTempHighest,
kMonTempLabel,

kMonInvalid = 0xFFFFFFFF,
};
Expand All @@ -89,10 +92,14 @@ class Monitor {
const std::string path(void) const {return path_;}
int readMonitor(MonitorTypes type, uint32_t sensor_ind, std::string *val);
int writeMonitor(MonitorTypes type, uint32_t sensor_ind, std::string val);
uint32_t setSensorLabelMap(void);
uint32_t getSensorIndex(rsmi_temperature_type_t type);

private:
std::string MakeMonitorPath(MonitorTypes type, int32_t sensor_id);
std::string path_;
const RocmSMI_env_vars *env_;
std::map<rsmi_temperature_type_t, uint32_t> temp_type_index_map_;
};

} // namespace smi
Expand Down
31 changes: 17 additions & 14 deletions src/rocm_smi.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1180,8 +1180,7 @@ get_id_name_str_from_line(uint64_t id, std::string ln,
return ret_str;
}

static rsmi_status_t get_backup_name(uint16_t id, char *name,
size_t len, eNameStrType typ) {
static rsmi_status_t get_backup_name(uint16_t id, char *name, size_t len) {
std::string name_str;

name_str += "0x";
Expand Down Expand Up @@ -1291,7 +1290,7 @@ static rsmi_status_t get_dev_name_from_id(uint32_t dv_ind, char *name,
val_str.clear();

return get_backup_name(typ == NAME_STR_DEVICE ?
device_id : subsys_id, name, len, typ);
device_id : subsys_id, name, len);
}

val_str = get_id_name_str_from_line(vendor_id, ln, &ln_str);
Expand All @@ -1315,7 +1314,7 @@ static rsmi_status_t get_dev_name_from_id(uint32_t dv_ind, char *name,
// We should have already returned if we were looking for
// device or subdevice
assert(typ == NAME_STR_VENDOR);
return get_backup_name(vendor_id, name, len, typ);
return get_backup_name(vendor_id, name, len);
}
size_t ct = val_str.copy(name, len);

Expand Down Expand Up @@ -1467,7 +1466,7 @@ rsmi_dev_pci_throughput_get(uint32_t dv_ind, uint64_t *sent,
}

rsmi_status_t
rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_ind,
rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type,
rsmi_temperature_metric_t metric, int64_t *temperature) {
TRY

Expand All @@ -1478,14 +1477,6 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_ind,
rsmi_status_t ret;
amd::smi::MonitorTypes mon_type;


// Make any adjustments to sensor_ind here, if index is not a 0 based. For
// rocm_smi we are using a 0-based index. However, most of the Linux sysfs
// monitor files are 1-based, so we will increment by 1 and make adjustments
// for exceptions later.
// See https://www.kernel.org/doc/Documentation/hwmon/sysfs-interface
++sensor_ind;

switch (metric) {
case RSMI_TEMP_CURRENT:
mon_type = amd::smi::kMonTemp;
Expand Down Expand Up @@ -1535,7 +1526,19 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_ind,

DEVICE_MUTEX

ret = get_dev_mon_value(mon_type, dv_ind, sensor_ind, temperature);
GET_DEV_FROM_INDX

assert(dev->monitor() != nullptr);
std::shared_ptr<amd::smi::Monitor> m = dev->monitor();

uint32_t err = m->setSensorLabelMap();
if (err) {
return errno_to_rsmi_status(err);
}

uint32_t sensor_index =
m->getSensorIndex(static_cast<rsmi_temperature_type_t>(sensor_type));
ret = get_dev_mon_value(mon_type, dv_ind, sensor_index, temperature);

return ret;
CATCH
Expand Down
46 changes: 46 additions & 0 deletions src/rocm_smi_monitor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,18 @@ static const char *kMonTempCritMinHystName = "temp#_lcrit_hyst";
static const char *kMonTempOffsetName = "temp#_offset";
static const char *kMonTempLowestName = "temp#_lowest";
static const char *kMonTempHighestName = "temp#_highest";
static const char *kMonTempLabelName = "temp#_label";

// Label strings recognised in the contents of a temp#_label monitor file.
// Each one identifies which part of the device a temperature sensor measures.
static const char *kTempSensorTypeMemoryName = "mem";
static const char *kTempSensorTypeJunctionName = "junction";
static const char *kTempSensorTypeEdgeName = "edge";

// Maps a sensor label string to the corresponding rsmi_temperature_type_t.
// Monitor::setSensorLabelMap() looks up the text it reads from each
// temp#_label file in this table.
static const std::map<std::string, rsmi_temperature_type_t>
kTempSensorNameMap = {
{kTempSensorTypeMemoryName, RSMI_TEMP_TYPE_MEMORY},
{kTempSensorTypeJunctionName, RSMI_TEMP_TYPE_JUNCTION},
{kTempSensorTypeEdgeName, RSMI_TEMP_TYPE_EDGE},
};

static const std::map<MonitorTypes, const char *> kMonitorNameMap = {
{kMonName, kMonNameFName},
Expand All @@ -111,6 +123,7 @@ static const std::map<MonitorTypes, const char *> kMonitorNameMap = {
{kMonTempOffset, kMonTempOffsetName},
{kMonTempLowest, kMonTempLowestName},
{kMonTempHighest, kMonTempHighestName},
{kMonTempLabel, kMonTempLabelName},
};

Monitor::Monitor(std::string path, RocmSMI_env_vars const *e) :
Expand Down Expand Up @@ -152,6 +165,39 @@ int Monitor::readMonitor(MonitorTypes type, uint32_t sensor_id,
return ReadSysfsStr(sysfs_path, val);
}

uint32_t
Monitor::setSensorLabelMap(void) {
std::string type_str;
int ret;

if (temp_type_index_map_.size() > 0) {
return 0; // We've already filled in the map
}
auto add_temp_sensor_entry = [&](uint32_t file_index) {
ret = readMonitor(kMonTempLabel, file_index, &type_str);
if (ret) {
return ret;
}

rsmi_temperature_type_t t_type = kTempSensorNameMap.at(type_str);
temp_type_index_map_.insert({t_type, file_index});
return 0;
};

for (uint32_t i = 1; i <= 3; ++i) {
ret = add_temp_sensor_entry(i);
if (ret) {
return ret;
}
}
return 0;
}

// Translate a temperature sensor type into the monitor file index recorded
// for it in temp_type_index_map_. The lookup uses std::map::at(), so asking
// for a type that has no entry raises std::out_of_range.
uint32_t
Monitor::getSensorIndex(rsmi_temperature_type_t type) {
  const uint32_t file_index = temp_type_index_map_.at(type);
  return file_index;
}


} // namespace smi
} // namespace amd
57 changes: 36 additions & 21 deletions tests/rocm_smi_test/functional/temp_read.cc
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,19 @@

#include <iostream>
#include <string>
#include <map>

#include "gtest/gtest.h"
#include "rocm_smi/rocm_smi.h"
#include "rocm_smi_test/functional/temp_read.h"
#include "rocm_smi_test/test_common.h"


// Display names for each rsmi_temperature_type_t value. TestTempRead::Run()
// uses these when printing the header for each sensor's block of readings.
// The key is uint32_t because Run() iterates over the sensor types with a
// uint32_t loop variable.
static const std::map<uint32_t, std::string> kTempSensorNameMap = {
{RSMI_TEMP_TYPE_MEMORY, "Memory"},
{RSMI_TEMP_TYPE_JUNCTION, "Junction"},
{RSMI_TEMP_TYPE_EDGE, "Edge"},
};
TestTempRead::TestTempRead() : TestBase() {
set_title("RSMI Temp Read Test");
set_description("The Temperature Read tests verifies that the temperature "
Expand Down Expand Up @@ -91,12 +98,14 @@ void TestTempRead::Run(void) {

TestBase::Run();

uint32_t type;

for (uint32_t i = 0; i < num_monitor_devs(); ++i) {
PrintDeviceHeader(i);

auto print_temp_metric = [&](rsmi_temperature_metric_t met,
std::string label) {
err = rsmi_dev_temp_metric_get(i, 0, met, &val_i64);
err = rsmi_dev_temp_metric_get(i, type, met, &val_i64);

if (err != RSMI_STATUS_SUCCESS) {
if (err == RSMI_STATUS_NOT_SUPPORTED) {
Expand All @@ -115,25 +124,31 @@ void TestTempRead::Run(void) {
"C" << std::endl;
}
};
print_temp_metric(RSMI_TEMP_CURRENT, "Current Temp.");
print_temp_metric(RSMI_TEMP_MAX, "Temperature max value");
print_temp_metric(RSMI_TEMP_MIN, "Temperature min value");
print_temp_metric(RSMI_TEMP_MAX_HYST,
"Temperature hysteresis value for max limit");
print_temp_metric(RSMI_TEMP_MIN_HYST,
"Temperature hysteresis value for min limit");
print_temp_metric(RSMI_TEMP_CRITICAL, "Temperature critical max value");
print_temp_metric(RSMI_TEMP_CRITICAL_HYST,
"Temperature hysteresis value for critical limit");
print_temp_metric(RSMI_TEMP_EMERGENCY,
"Temperature emergency max value");
print_temp_metric(RSMI_TEMP_EMERGENCY_HYST,
"Temperature hysteresis value for emergency limit");
print_temp_metric(RSMI_TEMP_CRIT_MIN, "Temperature critical min value");
print_temp_metric(RSMI_TEMP_CRIT_MIN_HYST,
"Temperature hysteresis value for critical min value");
print_temp_metric(RSMI_TEMP_OFFSET, "Temperature offset");
print_temp_metric(RSMI_TEMP_LOWEST, "Historical minimum temperature");
print_temp_metric(RSMI_TEMP_HIGHEST, "Historical maximum temperature");
for (type = RSMI_TEMP_TYPE_FIRST; type <= RSMI_TEMP_TYPE_LAST; ++type) {
IF_VERB(STANDARD) {
std::cout << "\t** **********" << kTempSensorNameMap.at(type) <<
" Temperatures **********" << std::endl;
}
print_temp_metric(RSMI_TEMP_CURRENT, "Current Temp.");
print_temp_metric(RSMI_TEMP_MAX, "Temperature max value");
print_temp_metric(RSMI_TEMP_MIN, "Temperature min value");
print_temp_metric(RSMI_TEMP_MAX_HYST,
"Temperature hysteresis value for max limit");
print_temp_metric(RSMI_TEMP_MIN_HYST,
"Temperature hysteresis value for min limit");
print_temp_metric(RSMI_TEMP_CRITICAL, "Temperature critical max value");
print_temp_metric(RSMI_TEMP_CRITICAL_HYST,
"Temperature hysteresis value for critical limit");
print_temp_metric(RSMI_TEMP_EMERGENCY,
"Temperature emergency max value");
print_temp_metric(RSMI_TEMP_EMERGENCY_HYST,
"Temperature hysteresis value for emergency limit");
print_temp_metric(RSMI_TEMP_CRIT_MIN, "Temperature critical min value");
print_temp_metric(RSMI_TEMP_CRIT_MIN_HYST,
"Temperature hysteresis value for critical min value");
print_temp_metric(RSMI_TEMP_OFFSET, "Temperature offset");
print_temp_metric(RSMI_TEMP_LOWEST, "Historical minimum temperature");
print_temp_metric(RSMI_TEMP_HIGHEST, "Historical maximum temperature");
}
}
}

0 comments on commit 11f7143

Please sign in to comment.