Skip to content

Commit

Permalink
Format new file.
Browse files Browse the repository at this point in the history
  • Loading branch information
bethune-bryant committed Aug 2, 2024
1 parent 800bd0d commit bf1a00a
Showing 1 changed file with 46 additions and 22 deletions.
68 changes: 46 additions & 22 deletions gpustat/rocml.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,25 +11,24 @@

from collections import namedtuple





try:
# Check for amdsmi.
from amdsmi import *
except (ImportError, SyntaxError, RuntimeError) as e:
_amdsmi = sys.modules.get('amdsmi', None)
_amdsmi = sys.modules.get("amdsmi", None)

raise ImportError(textwrap.dedent(
"""\
raise ImportError(
textwrap.dedent(
"""\
amdsmi is missing or an outdated version is installed.
The root cause: """ + str(e) +
"""
The root cause: """
+ str(e)
+ """
Your pynvml installation: """ + repr(_amdsmi) +
"""
Your pynvml installation: """
+ repr(_amdsmi)
+ """
-----------------------------------------------------------
(Suggested Fix) Please install amdsmi.
Expand All @@ -40,20 +39,25 @@
cd /opt/rocm/share/amd_smi
python3 -m pip install --upgrade pip
python3 -m pip install --user .
""")) from e
"""
)
) from e

# Default value of the `loc` parameter of nvmlDeviceGetTemperature, which
# accepts it but does not read it.
# NOTE(review): presumably kept only for pynvml API parity — confirm.
NVML_TEMPERATURE_GPU = 1


class NVMLError(Exception):
    """Base exception of this module's NVML-style error classes.

    Attributes:
        message: Human-readable description; also passed to ``Exception``.
    """

    def __init__(self, message="ROCM Error"):
        self.message = message
        super().__init__(self.message)


class NVMLError_Unknown(NVMLError):
    """Error raised for an unknown ROCM failure.

    Derives from NVMLError so that a generic ``except NVMLError`` handler
    also catches it; it is still an ``Exception`` for existing callers.
    """

    def __init__(self, message="An unknown ROCM Error has occurred"):
        # NVMLError.__init__ stores ``message`` and forwards it to Exception.
        super().__init__(message)


class NVMLError_GpuIsLost(Exception):
def __init__(self, message="ROCM Device is lost."):
self.message = message
Expand All @@ -64,6 +68,7 @@ def __init__(self, message="ROCM Device is lost."):
# Duplicate of file descriptor 2 (stderr); silent_run copies it back onto
# fd 2 after redirecting output.
_stderr_dup = os.dup(2)
# Write-only descriptor on the null device; silent_run points fds 1 and 2
# at it to discard output.
_silent_pipe = os.open(os.devnull, os.O_WRONLY)


def silent_run(to_call, *args, **kwargs):
os.dup2(_silent_pipe, 1)
os.dup2(_silent_pipe, 2)
Expand All @@ -72,30 +77,38 @@ def silent_run(to_call, *args, **kwargs):
os.dup2(_stderr_dup, 2)
return retval


def nvmlDeviceGetCount():
    """Return how many processor handles amdsmi currently reports."""
    handles = amdsmi_get_processor_handles()
    return len(handles)


def nvmlDeviceGetHandleByIndex(dev):
    """Return the amdsmi processor handle stored at position ``dev``.

    ``dev`` indexes the sequence returned by amdsmi_get_processor_handles();
    whatever that sequence raises for a bad index propagates unchanged.
    """
    handles = amdsmi_get_processor_handles()
    return handles[dev]


def nvmlDeviceGetIndex(dev):
    """Return the position of ``dev`` among the amdsmi processor handles.

    Handles are matched by the value amdsmi_get_gpu_device_bdf reports for
    them, not by handle identity.

    Returns:
        The index of the first handle whose BDF equals that of ``dev``,
        or -1 when no handle matches.
    """
    # The BDF of ``dev`` is the same for every comparison, so query it once
    # instead of once per enumerated handle.
    target_bdf = amdsmi_get_gpu_device_bdf(dev)
    for i, handle in enumerate(amdsmi_get_processor_handles()):
        if target_bdf == amdsmi_get_gpu_device_bdf(handle):
            return i
    return -1


def nvmlDeviceGetName(dev):
    """Return the ``"product_name"`` entry of the board info for ``dev``."""
    board_info = amdsmi_get_gpu_board_info(dev)
    return board_info["product_name"]


def nvmlDeviceGetUUID(dev):
    """Return the UUID amdsmi reports for ``dev``."""
    device_uuid = amdsmi_get_gpu_device_uuid(dev)
    return device_uuid


def nvmlDeviceGetTemperature(dev, loc=NVML_TEMPERATURE_GPU):
    """Return the current hotspot temperature amdsmi reports for ``dev``.

    ``loc`` is accepted but not used: the HOTSPOT sensor is always queried.
    """
    sensor = AmdSmiTemperatureType.HOTSPOT
    metric = AmdSmiTemperatureMetric.CURRENT
    return amdsmi_get_temp_metric(dev, sensor, metric)


def nvmlSystemGetDriverVersion():
    """Return the ``"driver_version"`` reported for the first processor handle."""
    first_handle = amdsmi_get_processor_handles()[0]
    driver_info = amdsmi_get_gpu_driver_info(first_handle)
    return driver_info["driver_version"]


def check_driver_nvml_version(driver_version_str: str):
"""Show warnings when an incompatible driver is used."""

Expand All @@ -105,42 +118,51 @@ def safeint(v) -> int:
except (ValueError, TypeError):
return 0

driver_version = tuple(safeint(v) for v in
driver_version_str.strip().split("."))
driver_version = tuple(safeint(v) for v in driver_version_str.strip().split("."))

if driver_version < (6, 7, 8):
warnings.warn(f"This version of ROCM Driver {driver_version_str} is untested, ")


def nvmlDeviceGetFanSpeed(dev):
    """Return the fan speed amdsmi reports for ``dev``, or None on failure.

    The query passes 0 as its second argument (presumably the sensor index —
    confirm against the amdsmi docs). Any exception raised by the amdsmi call
    is caught and reported as None.
    """
    try:
        speed = amdsmi_get_gpu_fan_speed(dev, 0)
    except Exception:
        return None
    return speed

# Record type returned by nvmlDeviceGetMemoryInfo.
MemoryInfo = namedtuple("MemoryInfo", ["total", "used"])


def nvmlDeviceGetMemoryInfo(dev):
    """Return the VRAM total and VRAM usage of ``dev`` as a MemoryInfo."""
    total = amdsmi_get_gpu_memory_total(dev, AmdSmiMemoryType.VRAM)
    used = amdsmi_get_gpu_memory_usage(dev, AmdSmiMemoryType.VRAM)
    return MemoryInfo(total=total, used=used)


# Record type returned by nvmlDeviceGetUtilizationRates.
UtilizationRates = namedtuple("UtilizationRates", ["gpu"])


def nvmlDeviceGetUtilizationRates(dev):
    """Return the ``"gfx_activity"`` reading of ``dev`` as UtilizationRates."""
    activity = amdsmi_get_gpu_activity(dev)
    gfx_activity = activity["gfx_activity"]
    return UtilizationRates(gpu=gfx_activity)


def nvmlDeviceGetEncoderUtilization(dev):
    """Return None for any ``dev``; no encoder utilization is queried here."""
    return None


def nvmlDeviceGetDecoderUtilization(dev):
    """Return None for any ``dev``; no decoder utilization is queried here."""
    return None


def nvmlDeviceGetPowerUsage(dev):
    """Return the ``"current_socket_power"`` of ``dev`` multiplied by 1000.

    NOTE(review): the factor presumably converts watts to milliwatts —
    confirm against the amdsmi documentation.
    """
    power_info = amdsmi_get_power_info(dev)
    socket_power = power_info["current_socket_power"]
    return socket_power * 1000


def nvmlDeviceGetEnforcedPowerLimit(dev):
    """Return the ``"power_limit"`` of ``dev`` multiplied by 1000.

    NOTE(review): the factor presumably converts watts to milliwatts —
    confirm against the amdsmi documentation.
    """
    power_info = amdsmi_get_power_info(dev)
    limit = power_info["power_limit"]
    return limit * 1000

ComputeProcess = namedtuple('ComputeProcess', ['pid', 'usedGpuMemory'])

ComputeProcess = namedtuple("ComputeProcess", ["pid", "usedGpuMemory"])


def nvmlDeviceGetComputeRunningProcesses(dev):
try:
Expand All @@ -149,20 +171,24 @@ def nvmlDeviceGetComputeRunningProcesses(dev):
except Exception:
return []


def nvmlDeviceGetGraphicsRunningProcesses(dev):
    """Return None for any ``dev``; no graphics process list is queried here."""
    return None


def nvmlDeviceGetClockInfo(dev, clk_type=AmdSmiClkType.SYS):
    """Return the current clock value amdsmi reports for ``dev``.

    The clock info is looked up under ``"clk"`` when that key is present and
    under ``"cur_clk"`` otherwise.
    """
    clock_info = amdsmi_get_clock_info(dev, clk_type)
    key = "clk" if "clk" in clock_info else "cur_clk"
    return clock_info[key]


def nvmlDeviceGetMaxClockInfo(dev, clk_type=AmdSmiClkType.SYS):
    """Return the ``"max_clk"`` entry of the clock info for ``dev``."""
    return amdsmi_get_clock_info(dev, clk_type)["max_clk"]


# Upon importing this module, let amdsmi be initialized and remain active
# throughout the lifespan of the python process (until gpustat exits).
_initialized: bool
Expand All @@ -173,6 +199,7 @@ def nvmlDeviceGetMaxClockInfo(dev, clk_type=AmdSmiClkType.SYS):

def _shutdown():
amdsmi_shut_down()

atexit.register(_shutdown)

except Exception as exc:
Expand All @@ -183,6 +210,3 @@ def _shutdown():
def ensure_initialized():
    """Raise the module-level ``_init_error`` when ``_initialized`` is falsy.

    Does nothing when ``_initialized`` is truthy.
    """
    if _initialized:
        return
    raise _init_error  # type: ignore



0 comments on commit bf1a00a

Please sign in to comment.