Skip to content

Commit

Permalink
[SWDEV-488276] Add partition 2.0 functionality (#44)
Browse files Browse the repository at this point in the history
Changes:
* CLI:
  - Updated amd-smi partition
  - Updated amd-smi partition -c
  - Updated amd-smi partition -m
  - Updated amd-smi partition -a
  - Updated amd-smi set -M <NPS1/NPS2/NPS4/NPS8>
  - Updated amd-smi set -C <SPX/DPX/QPX/TPX/CPX>
  - Updated amd-smi set -C <ACCELERATOR_TYPE> or <PROFILE_INDEX>
    Where PROFILE_INDEX = available ACCELERATOR_TYPES
  - Updated amd-smi set --help, now includes more detail for
    amd-smi set -C <ACCELERATOR_TYPE> or <PROFILE_INDEX>

* API:
  - Added amdsmi_get_gpu_memory_partition_config
  - Added amdsmi_set_gpu_memory_partition_mode
  - Added amdsmi_get_gpu_accelerator_partition_profile_config
  - Updated amdsmi_get_gpu_accelerator_partition_profile_config
  - Added amdsmi_set_gpu_accelerator_partition_profile

Signed-off-by: Charis Poag <[email protected]>
  • Loading branch information
charis-poag-amd authored and Maisam Arif committed Jan 16, 2025
1 parent c6bb6ca commit c1cd2b4
Show file tree
Hide file tree
Showing 18 changed files with 2,639 additions and 449 deletions.
369 changes: 231 additions & 138 deletions amdsmi_cli/amdsmi_commands.py

Large diffs are not rendered by default.

31 changes: 25 additions & 6 deletions amdsmi_cli/amdsmi_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import time
import re
import multiprocessing
import json

from typing import List, Union
from enum import Enum
Expand Down Expand Up @@ -681,12 +682,30 @@ def get_perf_levels(self):
perf_levels_int = list(set(clock.value for clock in amdsmi_interface.AmdSmiDevPerfLevel))
return perf_levels_str, perf_levels_int


def get_compute_partition_types(self):
    """Return the names of the compute partition types, without 'INVALID'.

    The names are read from the members of
    ``amdsmi_interface.AmdSmiComputePartitionType`` in iteration order.

    return:
        list of str - member names with the first 'INVALID' entry removed
    """
    partition_names = []
    for member in amdsmi_interface.AmdSmiComputePartitionType:
        partition_names.append(member.name)
    try:
        partition_names.remove('INVALID')
    except ValueError:
        # No 'INVALID' member is present, so there is nothing to strip
        pass
    return partition_names
def get_accelerator_partition_profile_config(self):
    """Collect the accelerator partition profiles reported by the first device.

    Queries ``amdsmi_get_gpu_accelerator_partition_profile_config`` for the
    first processor handle and flattens the returned profiles into three
    parallel lists.

    return:
        dict with keys:
            'profile_indices' (list of str) - each profile_index as a string
            'profile_types' (list) - each profile_type as returned
            'memory_caps' (list) - each memory_caps entry as returned
        All three lists stay empty when there are no processor handles or
        when the query raises AmdSmiLibraryException.
    """
    collected = {'profile_indices': [], 'profile_types': [], 'memory_caps': []}
    for handle in amdsmi_interface.amdsmi_get_processor_handles():
        try:
            config = amdsmi_interface.amdsmi_get_gpu_accelerator_partition_profile_config(handle)
            profile_count = config['num_profiles']
            for idx in range(profile_count):
                entry = config['profiles'][idx]
                collected['profile_indices'].append(str(entry['profile_index']))
                collected['profile_types'].append(entry['profile_type'])
                collected['memory_caps'].append(entry['memory_caps'])
        except amdsmi_interface.AmdSmiLibraryException:
            # A failed query ends the search; no further device is tried
            break
        # Only the profiles of one device are needed
        break
    return collected

def get_accelerator_choices_types_indices(self):
    """Build the printable list of accepted accelerator partition values.

    return:
        tuple (choices, profiles):
            choices (str) - the profile types followed by the profile
                indices, joined with ", "; "N/A" when no profile types
                are available
            profiles (dict) - the dict returned by
                get_accelerator_partition_profile_config(); when no profile
                types are available, a dict holding only empty
                'profile_indices' and 'profile_types' lists
    """
    profiles = self.get_accelerator_partition_profile_config()
    if not len(profiles['profile_types']):
        # Nothing reported: signal "not supported" to the caller
        return ("N/A", {'profile_indices':[], 'profile_types':[]})

    # Types come first, then the indices, in the order they were reported
    accepted_values = profiles['profile_types'] + profiles['profile_indices']
    return (", ".join(accepted_values), profiles)

def get_memory_partition_types(self):
memory_partitions_str = [partition.name for partition in amdsmi_interface.AmdSmiMemoryPartitionType]
Expand Down
54 changes: 21 additions & 33 deletions amdsmi_cli/amdsmi_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,14 +102,24 @@ def _capitalize_keys(self, input_dict):
return output_dict


def _convert_json_to_tabular(self, json_object: Dict[str, any]):
# TODO make dynamic
def _convert_json_to_tabular(self, json_object: Dict[str, any], dynamic=False):
# TODO make dynamic - convert other python CLI outputs to use (as needed)
# Update: using dynamic=true provides dynamic re-sizing based on key name length

table_values = ''
stored_gpu = ''
stored_timestamp = ''
for key, value in json_object.items():
string_value = str(value)
if key == 'gpu':
if key == 'partition_id':
# Special case for partition_id: 8 partitions + 7 comma + 2 spaces = 17
table_values += string_value.ljust(17)
continue
key_length = len(key) + 2
if dynamic and len(key) > 0:
stored_gpu = string_value
table_values += string_value.ljust(key_length)
elif key == 'gpu':
stored_gpu = string_value
table_values += string_value.rjust(3)
elif key == 'timestamp':
Expand Down Expand Up @@ -144,30 +154,6 @@ def _convert_json_to_tabular(self, json_object: Dict[str, any]):
elif key == "link_status":
for i in value:
table_values += str(i).ljust(3)
elif key == "memory":
table_values += string_value.ljust(8)
elif key == "accelerator_type":
table_values += string_value.ljust(18)
elif key == "partition_id":
table_values += string_value.ljust(14)
elif key == "accelerator_profile_index":
table_values += string_value.ljust(27)
elif key == "profile_index":
table_values += string_value.ljust(15)
elif key == "memory_partition_caps":
table_values += string_value.ljust(23)
elif key == "num_partitions":
table_values += string_value.ljust(16)
elif key == "num_resources":
table_values += string_value.ljust(15)
elif key == "resource_index":
table_values += string_value.ljust(16)
elif key == "resource_type":
table_values += string_value.ljust(15)
elif key == "resource_instances":
table_values += string_value.ljust(20)
elif key == "resources_shared":
table_values += string_value.ljust(18)
elif key == "RW":
table_values += string_value.ljust(57)
elif key in ('pviol', 'tviol'):
Expand Down Expand Up @@ -494,12 +480,14 @@ def store_watch_output(self, multiple_device_enabled=False):
self.output = {}


def print_output(self, multiple_device_enabled=False, watching_output=False, tabular=False, dual_csv_output=False):
def print_output(self, multiple_device_enabled=False, watching_output=False, tabular=False, dual_csv_output=False, dynamic=False):
""" Print current output acording to format and then destination
params:
multiple_device_enabled (bool) - True if printing output from
multiple devices
watching_output (bool) - True if printing watch output
dynamic (bool) - Defaults to False. True turns on dynamic resizing for
left justified table output
return:
Nothing
"""
Expand All @@ -516,7 +504,7 @@ def print_output(self, multiple_device_enabled=False, watching_output=False, tab
elif self.is_human_readable_format():
# If tabular output is enabled, redirect to _print_tabular_output
if tabular:
self._print_tabular_output(multiple_device_enabled=multiple_device_enabled, watching_output=watching_output)
self._print_tabular_output(multiple_device_enabled=multiple_device_enabled, watching_output=watching_output, dynamic=dynamic)
else:
self._print_human_readable_output(multiple_device_enabled=multiple_device_enabled,
watching_output=watching_output)
Expand Down Expand Up @@ -788,7 +776,7 @@ def _print_human_readable_output(self, multiple_device_enabled=False, watching_o
output_file.write(human_readable_output + '\n')


def _print_tabular_output(self, multiple_device_enabled=False, watching_output=False):
def _print_tabular_output(self, multiple_device_enabled=False, watching_output=False, dynamic=False):
primary_table = ''
secondary_table = ''

Expand All @@ -808,7 +796,7 @@ def _print_tabular_output(self, multiple_device_enabled=False, watching_output=F
for key, value in device_output.items():
if key != 'process_list':
primary_table_output[key] = value
primary_table += self._convert_json_to_tabular(primary_table_output) + '\n'
primary_table += self._convert_json_to_tabular(primary_table_output, dynamic=dynamic) + '\n'
else: # Single device output
if 'process_list' in self.output:
process_table_dict = {}
Expand All @@ -822,7 +810,7 @@ def _print_tabular_output(self, multiple_device_enabled=False, watching_output=F
for key, value in self.output.items():
if key != 'process_list':
primary_table_output[key] = value
primary_table += self._convert_json_to_tabular(primary_table_output) + '\n'
primary_table += self._convert_json_to_tabular(primary_table_output, dynamic=dynamic) + '\n'
primary_table = primary_table.rstrip()
secondary_table = secondary_table.rstrip()

Expand Down Expand Up @@ -879,7 +867,7 @@ def _print_tabular_output(self, multiple_device_enabled=False, watching_output=F
for key, value in device_output.items():
if key != 'process_list':
primary_table_output[key] = value
primary_table += self._convert_json_to_tabular(primary_table_output) + '\n'
primary_table += self._convert_json_to_tabular(primary_table_output, dynamic=dynamic) + '\n'
primary_table = primary_table.rstrip() # Remove trailing new line
secondary_table = secondary_table.rstrip()

Expand Down
42 changes: 25 additions & 17 deletions amdsmi_cli/amdsmi_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,14 @@ def _is_valid_string(self, string_value, sub_arg=None):
else:
raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(string_value, outputformat)

def _is_command_supported(self, user_input, acceptable_values, command_name):
if acceptable_values == "N/A":
raise amdsmi_cli_exceptions.AmdSmiCommandNotSupportedException(command_name, self.helpers.get_output_format())
elif str(user_input).upper() not in acceptable_values:
print(f"Valid inputs are {acceptable_values}")
raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(str(user_input).upper(), self.helpers.get_output_format())
else:
return str(user_input).upper()

def _limit_select(self):
"""Custom action for setting clock limits"""
Expand Down Expand Up @@ -401,7 +409,7 @@ def __call__(self, parser, args, values, option_string=None):
return _CoreSelectAction


def _add_command_modifiers(self, subcommand_parser):
def _add_command_modifiers(self, subcommand_parser: argparse.ArgumentParser):
json_help = "Displays output in JSON format (human readable by default)."
csv_help = "Displays output in CSV format (human readable by default)."
file_help = "Saves output into a file on the provided path (stdout by default)."
Expand Down Expand Up @@ -460,7 +468,7 @@ def _validate_cpu_core(self, value):
return value


def _add_device_arguments(self, subcommand_parser, required=False):
def _add_device_arguments(self, subcommand_parser: argparse.ArgumentParser, required=False):
# Device arguments help text
gpu_help = f"Select a GPU ID, BDF, or UUID from the possible choices:\n{self.gpu_choices_str}"
vf_help = "Gets general information about the specified VF (timeslice, fb info, …).\
Expand Down Expand Up @@ -583,7 +591,7 @@ def __call__(self, parser, args, values, option_string=None):
return _ValidateOverdrivePercent


def _add_version_parser(self, subparsers, func):
def _add_version_parser(self, subparsers: argparse._SubParsersAction, func):
# Subparser help text
version_help = "Display version information"

Expand All @@ -597,7 +605,7 @@ def _add_version_parser(self, subparsers, func):
self._add_command_modifiers(version_parser)


def _add_list_parser(self, subparsers, func):
def _add_list_parser(self, subparsers: argparse._SubParsersAction, func):
if not self.helpers.is_amdgpu_initialized():
# The list subcommand is only applicable to systems with amdgpu initialized
return
Expand All @@ -619,7 +627,7 @@ def _add_list_parser(self, subparsers, func):
self._add_device_arguments(list_parser, required=False)


def _add_static_parser(self, subparsers, func):
def _add_static_parser(self, subparsers: argparse._SubParsersAction, func):
# Subparser help text
static_help = "Gets static information about the specified GPU"
static_subcommand_help = "If no GPU is specified, returns static information for all GPUs on the system.\
Expand Down Expand Up @@ -925,7 +933,7 @@ def _add_metric_parser(self, subparsers, func):
self._add_command_modifiers(metric_parser)


def _add_process_parser(self, subparsers, func):
def _add_process_parser(self, subparsers: argparse._SubParsersAction, func):
if self.helpers.is_hypervisor():
# Don't add this subparser on Hypervisors
# This subparser is only available to Guest and Baremetal systems
Expand Down Expand Up @@ -969,7 +977,7 @@ def _add_process_parser(self, subparsers, func):
process_parser.add_argument('-n', '--name', action='store', type=lambda value: self._is_valid_string(value, '--name'), required=False, help=name_help)


def _add_profile_parser(self, subparsers, func):
def _add_profile_parser(self, subparsers: argparse._SubParsersAction, func):
if not (self.helpers.is_windows() and self.helpers.is_hypervisor()):
# This subparser only applies to Hypervisors
return
Expand All @@ -990,7 +998,7 @@ def _add_profile_parser(self, subparsers, func):
self._add_device_arguments(profile_parser, required=False)


def _add_event_parser(self, subparsers, func):
def _add_event_parser(self, subparsers: argparse._SubParsersAction, func):
if not self.helpers.is_amdgpu_initialized():
# The event subcommand is only applicable to systems with amdgpu initialized
return
Expand All @@ -1011,7 +1019,7 @@ def _add_event_parser(self, subparsers, func):
self._add_device_arguments(event_parser, required=False)


def _add_topology_parser(self, subparsers, func):
def _add_topology_parser(self, subparsers: argparse._SubParsersAction, func):
if not(self.helpers.is_baremetal() and self.helpers.is_linux()):
# This subparser is only applicable to Baremetal Linux
return
Expand Down Expand Up @@ -1059,7 +1067,7 @@ def _add_topology_parser(self, subparsers, func):
topology_parser.add_argument('-z', '--bi-dir', action='store_true', required=False, help=bi_dir_help)


def _add_set_value_parser(self, subparsers, func):
def _add_set_value_parser(self, subparsers: argparse._SubParsersAction, func):
if not self.helpers.is_linux():
# This subparser is only applicable to Linux
return
Expand All @@ -1078,9 +1086,9 @@ def _add_set_value_parser(self, subparsers, func):
set_profile_help = f"Set power profile level (#) or choose one of available profiles:\n\t{power_profile_choices_str}"
perf_det_choices_str = ", ".join(self.helpers.get_perf_det_levels())
set_perf_det_help = f"Set performance determinism and select one of the corresponding performance levels:\n\t{perf_det_choices_str}"
compute_partition_choices_str = ", ".join(self.helpers.get_compute_partition_types())
(accelerator_set_choices, _) = self.helpers.get_accelerator_choices_types_indices()
memory_partition_choices_str = ", ".join(self.helpers.get_memory_partition_types())
set_compute_partition_help = f"Set one of the following the compute partition modes:\n\t{compute_partition_choices_str}"
set_compute_partition_help = f"Set one of the following the accelerator type or profile index:\n\t{accelerator_set_choices}.\n\tUse `sudo amd-smi partition --accelerator` to find acceptable values."
set_memory_partition_help = f"Set one of the following the memory partition modes:\n\t{memory_partition_choices_str}"
power_cap_min, power_cap_max = self.helpers.get_power_caps()
power_cap_max = self.helpers.convert_SI_unit(power_cap_max, AMDSMIHelpers.SI_Unit.MICRO)
Expand Down Expand Up @@ -1128,7 +1136,7 @@ def _add_set_value_parser(self, subparsers, func):
set_value_exclusive_group.add_argument('-l', '--perf-level', action='store', choices=self.helpers.get_perf_levels()[0], type=str.upper, required=False, help=set_perf_level_help, metavar='LEVEL')
set_value_exclusive_group.add_argument('-P', '--profile', action='store', required=False, help=set_profile_help, metavar='SETPROFILE')
set_value_exclusive_group.add_argument('-d', '--perf-determinism', action='store', type=lambda value: self._not_negative_int(value, '--perf-determinism'), required=False, help=set_perf_det_help, metavar='SCLKMAX')
set_value_exclusive_group.add_argument('-C', '--compute-partition', action='store', choices=self.helpers.get_compute_partition_types(), type=str.upper, required=False, help=set_compute_partition_help, metavar='PARTITION')
set_value_exclusive_group.add_argument('-C', '--compute-partition', action='store', choices=accelerator_set_choices, type=lambda value: self._is_command_supported(value, accelerator_set_choices, '--compute-partition'), required=False, help=set_compute_partition_help, metavar='<ACCELERATOR_TYPE> or <PROFILE_INDEX>')
set_value_exclusive_group.add_argument('-M', '--memory-partition', action='store', choices=self.helpers.get_memory_partition_types(), type=str.upper, required=False, help=set_memory_partition_help, metavar='PARTITION')
set_value_exclusive_group.add_argument('-o', '--power-cap', action='store', type=lambda value: self._positive_int(value, '--power-cap'), required=False, help=set_power_cap_help, metavar='WATTS')
set_value_exclusive_group.add_argument('-p', '--soc-pstate', action='store', required=False, type=lambda value: self._not_negative_int(value, '--soc-pstate'), help=set_soc_pstate_help, metavar='POLICY_ID')
Expand Down Expand Up @@ -1162,7 +1170,7 @@ def _add_set_value_parser(self, subparsers, func):
self._add_command_modifiers(set_value_parser)


def _add_reset_parser(self, subparsers, func):
def _add_reset_parser(self, subparsers: argparse._SubParsersAction, func):
if not self.helpers.is_linux():
# This subparser is only applicable to Linux
return
Expand Down Expand Up @@ -1215,7 +1223,7 @@ def _add_reset_parser(self, subparsers, func):
reset_exclusive_group.add_argument('-l', '--clean-local-data', action='store_true', required=False, help=reset_gpu_clean_local_data_help)


def _add_monitor_parser(self, subparsers, func):
def _add_monitor_parser(self, subparsers: argparse._SubParsersAction, func):
if not self.helpers.is_linux():
# This subparser is only applicable to Linux
return
Expand Down Expand Up @@ -1314,7 +1322,7 @@ def _add_rocm_smi_parser(self, subparsers, func):
rocm_smi_parser.add_argument('-f', '--showclkfrq', action='store_true', required=False, help=showclkfrq_help)


def _add_xgmi_parser(self, subparsers, func):
def _add_xgmi_parser(self, subparsers: argparse._SubParsersAction, func):
if not self.helpers.is_amdgpu_initialized():
# The xgmi subcommand is only applicable to systems with amdgpu initialized
return
Expand Down Expand Up @@ -1344,7 +1352,7 @@ def _add_xgmi_parser(self, subparsers, func):
xgmi_parser.add_argument('-l', '--link-status', action='store_true', required=False, help=xgmi_link_status_help)


def _add_partition_parser(self, subparsers, func):
def _add_partition_parser(self, subparsers: argparse._SubParsersAction, func):
if not self.helpers.is_amdgpu_initialized():
# The partition subcommand is only applicable to systems with amdgpu initialized
return
Expand Down
Loading

0 comments on commit c1cd2b4

Please sign in to comment.