[SWDEV-488276] Add partition 2.0 functionality (#44)

Changes:
* CLI:
  - Updated amd-smi partition
  - Updated amd-smi partition -c
  - Updated amd-smi partition -m
  - Updated amd-smi partition -a
  - Updated amd-smi set -M <NPS1/NPS2/NPS4/NPS8>
  - Updated amd-smi set -C <SPX/DPX/QPX/TPX/CPX>
  - Updated amd-smi set -C <ACCELERATOR_TYPE> or <PROFILE_INDEX>
    Where PROFILE_INDEX = available ACCELERATOR_TYPES
  - Updated amd-smi set --help, now includes more detail for amd-smi set -C <ACCELERATOR_TYPE> or <PROFILE_INDEX>
* API:
  - Added amdsmi_get_gpu_memory_partition_config
  - Added amdsmi_set_gpu_memory_partition_mode
  - Added amdsmi_get_gpu_accelerator_partition_profile_config
  - Updated amdsmi_get_gpu_accelerator_partition_profile_config
  - Added amdsmi_set_gpu_accelerator_partition_profile

Signed-off-by: Charis Poag <[email protected]>
ROCm · Jan 16, 2025 · c1cd2b4 · c1cd2b4
1 parent c6bb6ca
commit c1cd2b4
Show file tree

Hide file tree

Showing 18 changed files with 2,639 additions and 449 deletions.
diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py
diff --git a/amdsmi_cli/amdsmi_helpers.py b/amdsmi_cli/amdsmi_helpers.py
@@ -27,6 +27,7 @@
 import time
 import re
 import multiprocessing
+import json
 
 from typing import List, Union
 from enum import Enum
@@ -681,12 +682,30 @@ def get_perf_levels(self):
         perf_levels_int = list(set(clock.value for clock in amdsmi_interface.AmdSmiDevPerfLevel))
         return perf_levels_str, perf_levels_int
 
-
-    def get_compute_partition_types(self):
-        compute_partitions_str = [partition.name for partition in amdsmi_interface.AmdSmiComputePartitionType]
-        if 'INVALID' in compute_partitions_str:
-            compute_partitions_str.remove('INVALID')
-        return compute_partitions_str
+    def get_accelerator_partition_profile_config(self):
+        """Collect the accelerator partition profiles reported by one device.
+
+        Only the first processor handle is ever queried: the loop exits after
+        its first iteration whether the query succeeds or raises
+        AmdSmiLibraryException.
+
+        return:
+            dict with three parallel lists:
+                'profile_indices' - profile index of each profile, as str
+                'profile_types'   - profile type of each profile
+                'memory_caps'     - memory caps of each profile
+            All three lists are empty when there are no handles or the
+            library call fails.
+        """
+        collected = {'profile_indices': [], 'profile_types': [], 'memory_caps': []}
+        for handle in amdsmi_interface.amdsmi_get_processor_handles():
+            try:
+                config = amdsmi_interface.amdsmi_get_gpu_accelerator_partition_profile_config(handle)
+                for slot in range(config['num_profiles']):
+                    entry = config['profiles'][slot]
+                    collected['profile_indices'].append(str(entry['profile_index']))
+                    collected['profile_types'].append(entry['profile_type'])
+                    collected['memory_caps'].append(entry['memory_caps'])
+            except amdsmi_interface.AmdSmiLibraryException:
+                # Best effort: a library failure leaves the lists as they are.
+                pass
+            # Only need to get the profiles for one device
+            break
+        return collected
+
+    def get_accelerator_choices_types_indices(self):
+        """Build the displayable list of accelerator partition choices.
+
+        return:
+            tuple (choices, profiles):
+                choices  - ", "-joined string of every profile type followed by
+                           every profile index, or "N/A" when no profile types
+                           were reported
+                profiles - the dict from get_accelerator_partition_profile_config(),
+                           or a dict with empty 'profile_indices' and
+                           'profile_types' lists in the "N/A" case
+        """
+        profiles = self.get_accelerator_partition_profile_config()
+        profile_types = profiles['profile_types']
+        if len(profile_types) == 0:
+            # Nothing reported: hand back the placeholder pair.
+            return ("N/A", {'profile_indices':[], 'profile_types':[]})
+        choices = ", ".join(profile_types + profiles['profile_indices'])
+        return (choices, profiles)
 
     def get_memory_partition_types(self):
         memory_partitions_str = [partition.name for partition in amdsmi_interface.AmdSmiMemoryPartitionType]

diff --git a/amdsmi_cli/amdsmi_logger.py b/amdsmi_cli/amdsmi_logger.py
@@ -102,14 +102,24 @@ def _capitalize_keys(self, input_dict):
         return output_dict
 
 
-    def _convert_json_to_tabular(self, json_object: Dict[str, any]):
-        # TODO make dynamic
+    def _convert_json_to_tabular(self, json_object: Dict[str, any], dynamic=False):
+        # TODO make dynamic - convert other python CLI outputs to use (as needed)
+        # Update: using dynamic=true provides dynamic re-sizing based on key name length
+
         table_values = ''
         stored_gpu = ''
         stored_timestamp = ''
         for key, value in json_object.items():
             string_value = str(value)
-            if key == 'gpu':
+            if key == 'partition_id':
+                # Special case for partition_id: 8 partitions + 7 comma + 2 spaces = 17
+                table_values += string_value.ljust(17)
+                continue
+            key_length = len(key) + 2
+            if dynamic and len(key) > 0:
+                stored_gpu = string_value
+                table_values += string_value.ljust(key_length)
+            elif key == 'gpu':
                 stored_gpu = string_value
                 table_values += string_value.rjust(3)
             elif key == 'timestamp':
@@ -144,30 +154,6 @@ def _convert_json_to_tabular(self, json_object: Dict[str, any]):
             elif key == "link_status":
                 for i in value:
                     table_values += str(i).ljust(3)
-            elif key == "memory":
-                table_values += string_value.ljust(8)
-            elif key == "accelerator_type":
-                table_values += string_value.ljust(18)
-            elif key == "partition_id":
-                table_values += string_value.ljust(14)
-            elif key == "accelerator_profile_index":
-                table_values += string_value.ljust(27)
-            elif key == "profile_index":
-                table_values += string_value.ljust(15)
-            elif key == "memory_partition_caps":
-                table_values += string_value.ljust(23)
-            elif key == "num_partitions":
-                table_values += string_value.ljust(16)
-            elif key == "num_resources":
-                table_values += string_value.ljust(15)
-            elif key == "resource_index":
-                table_values += string_value.ljust(16)
-            elif key == "resource_type":
-                table_values += string_value.ljust(15)
-            elif key == "resource_instances":
-                table_values += string_value.ljust(20)
-            elif key == "resources_shared":
-                table_values += string_value.ljust(18)
             elif key == "RW":
                 table_values += string_value.ljust(57)
             elif key in ('pviol', 'tviol'):
@@ -494,12 +480,14 @@ def store_watch_output(self, multiple_device_enabled=False):
             self.output = {}
 
 
-    def print_output(self, multiple_device_enabled=False, watching_output=False, tabular=False, dual_csv_output=False):
+    def print_output(self, multiple_device_enabled=False, watching_output=False, tabular=False, dual_csv_output=False, dynamic=False):
         """ Print current output acording to format and then destination
             params:
                 multiple_device_enabled (bool) - True if printing output from
                     multiple devices
                 watching_output (bool) - True if printing watch output
+                dynamic (bool) - Defaults to False. True turns on dynamic resizing for
+                    left justified table output
             return:
                 Nothing
         """
@@ -516,7 +504,7 @@ def print_output(self, multiple_device_enabled=False, watching_output=False, tab
         elif self.is_human_readable_format():
             # If tabular output is enabled, redirect to _print_tabular_output
             if tabular:
-                self._print_tabular_output(multiple_device_enabled=multiple_device_enabled, watching_output=watching_output)
+                self._print_tabular_output(multiple_device_enabled=multiple_device_enabled, watching_output=watching_output, dynamic=dynamic)
             else:
                 self._print_human_readable_output(multiple_device_enabled=multiple_device_enabled,
                                                    watching_output=watching_output)
@@ -788,7 +776,7 @@ def _print_human_readable_output(self, multiple_device_enabled=False, watching_o
                     output_file.write(human_readable_output + '\n')
 
 
-    def _print_tabular_output(self, multiple_device_enabled=False, watching_output=False):
+    def _print_tabular_output(self, multiple_device_enabled=False, watching_output=False, dynamic=False):
         primary_table = ''
         secondary_table = ''
 
@@ -808,7 +796,7 @@ def _print_tabular_output(self, multiple_device_enabled=False, watching_output=F
                 for key, value in device_output.items():
                     if key != 'process_list':
                         primary_table_output[key] = value
-                primary_table += self._convert_json_to_tabular(primary_table_output) + '\n'
+                primary_table += self._convert_json_to_tabular(primary_table_output, dynamic=dynamic) + '\n'
         else: # Single device output
             if 'process_list' in self.output:
                 process_table_dict = {}
@@ -822,7 +810,7 @@ def _print_tabular_output(self, multiple_device_enabled=False, watching_output=F
             for key, value in self.output.items():
                 if key != 'process_list':
                     primary_table_output[key] = value
-            primary_table += self._convert_json_to_tabular(primary_table_output) + '\n'
+            primary_table += self._convert_json_to_tabular(primary_table_output, dynamic=dynamic) + '\n'
         primary_table = primary_table.rstrip()
         secondary_table = secondary_table.rstrip()
 
@@ -879,7 +867,7 @@ def _print_tabular_output(self, multiple_device_enabled=False, watching_output=F
                         for key, value in device_output.items():
                             if key != 'process_list':
                                 primary_table_output[key] = value
-                        primary_table += self._convert_json_to_tabular(primary_table_output) + '\n'
+                        primary_table += self._convert_json_to_tabular(primary_table_output, dynamic=dynamic) + '\n'
                     primary_table = primary_table.rstrip() # Remove trailing new line
                     secondary_table = secondary_table.rstrip()
 

diff --git a/amdsmi_cli/amdsmi_parser.py b/amdsmi_cli/amdsmi_parser.py
@@ -173,6 +173,14 @@ def _is_valid_string(self, string_value, sub_arg=None):
         else:
             raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(string_value, outputformat)
 
+    def _is_command_supported(self, user_input, acceptable_values, command_name):
+        """Validate a user-supplied option value and return it upper-cased.
+
+        params:
+            user_input - raw value from the command line; compared after
+                str(...).upper()
+            acceptable_values - either the sentinel "N/A" (command not
+                supported), a comma-separated string of valid values (as
+                produced by ", ".join(...)), or any container of valid values
+            command_name - option name reported when the command is unsupported
+        return:
+            str - the upper-cased user input when it is an acceptable value
+        raises:
+            AmdSmiCommandNotSupportedException - acceptable_values is "N/A"
+            AmdSmiInvalidParameterValueException - input is not an acceptable value
+        """
+        if acceptable_values == "N/A":
+            raise amdsmi_cli_exceptions.AmdSmiCommandNotSupportedException(command_name, self.helpers.get_output_format())
+        normalized_input = str(user_input).upper()
+        if isinstance(acceptable_values, str):
+            # A joined string must be split into tokens first: `in` on a str is
+            # a substring test, which would accept "", "P" or "X, D".
+            valid_values = [token.strip() for token in acceptable_values.split(",")]
+        else:
+            valid_values = acceptable_values
+        if normalized_input not in valid_values:
+            print(f"Valid inputs are {acceptable_values}")
+            raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(normalized_input, self.helpers.get_output_format())
+        return normalized_input
 
     def _limit_select(self):
         """Custom action for setting clock limits"""
@@ -401,7 +409,7 @@ def __call__(self, parser, args, values, option_string=None):
         return _CoreSelectAction
 
 
-    def _add_command_modifiers(self, subcommand_parser):
+    def _add_command_modifiers(self, subcommand_parser: argparse.ArgumentParser):
         json_help = "Displays output in JSON format (human readable by default)."
         csv_help = "Displays output in CSV format (human readable by default)."
         file_help = "Saves output into a file on the provided path (stdout by default)."
@@ -460,7 +468,7 @@ def _validate_cpu_core(self, value):
         return value
 
 
-    def _add_device_arguments(self, subcommand_parser, required=False):
+    def _add_device_arguments(self, subcommand_parser: argparse.ArgumentParser, required=False):
         # Device arguments help text
         gpu_help = f"Select a GPU ID, BDF, or UUID from the possible choices:\n{self.gpu_choices_str}"
         vf_help = "Gets general information about the specified VF (timeslice, fb info, …).\
@@ -583,7 +591,7 @@ def __call__(self, parser, args, values, option_string=None):
         return _ValidateOverdrivePercent
 
 
-    def _add_version_parser(self, subparsers, func):
+    def _add_version_parser(self, subparsers: argparse._SubParsersAction, func):
         # Subparser help text
         version_help = "Display version information"
 
@@ -597,7 +605,7 @@ def _add_version_parser(self, subparsers, func):
         self._add_command_modifiers(version_parser)
 
 
-    def _add_list_parser(self, subparsers, func):
+    def _add_list_parser(self, subparsers: argparse._SubParsersAction, func):
         if not self.helpers.is_amdgpu_initialized():
             # The list subcommand is only applicable to systems with amdgpu initialized
             return
@@ -619,7 +627,7 @@ def _add_list_parser(self, subparsers, func):
         self._add_device_arguments(list_parser, required=False)
 
 
-    def _add_static_parser(self, subparsers, func):
+    def _add_static_parser(self, subparsers: argparse._SubParsersAction, func):
         # Subparser help text
         static_help = "Gets static information about the specified GPU"
         static_subcommand_help = "If no GPU is specified, returns static information for all GPUs on the system.\
@@ -925,7 +933,7 @@ def _add_metric_parser(self, subparsers, func):
         self._add_command_modifiers(metric_parser)
 
 
-    def _add_process_parser(self, subparsers, func):
+    def _add_process_parser(self, subparsers: argparse._SubParsersAction, func):
         if self.helpers.is_hypervisor():
             # Don't add this subparser on Hypervisors
             # This subparser is only available to Guest and Baremetal systems
@@ -969,7 +977,7 @@ def _add_process_parser(self, subparsers, func):
         process_parser.add_argument('-n', '--name', action='store', type=lambda value: self._is_valid_string(value, '--name'), required=False, help=name_help)
 
 
-    def _add_profile_parser(self, subparsers, func):
+    def _add_profile_parser(self, subparsers: argparse._SubParsersAction, func):
         if not (self.helpers.is_windows() and self.helpers.is_hypervisor()):
             # This subparser only applies to Hypervisors
             return
@@ -990,7 +998,7 @@ def _add_profile_parser(self, subparsers, func):
         self._add_device_arguments(profile_parser, required=False)
 
 
-    def _add_event_parser(self, subparsers, func):
+    def _add_event_parser(self, subparsers: argparse._SubParsersAction, func):
         if not self.helpers.is_amdgpu_initialized():
             # The event subcommand is only applicable to systems with amdgpu initialized
             return
@@ -1011,7 +1019,7 @@ def _add_event_parser(self, subparsers, func):
         self._add_device_arguments(event_parser, required=False)
 
 
-    def _add_topology_parser(self, subparsers, func):
+    def _add_topology_parser(self, subparsers: argparse._SubParsersAction, func):
         if not(self.helpers.is_baremetal() and self.helpers.is_linux()):
             # This subparser is only applicable to Baremetal Linux
             return
@@ -1059,7 +1067,7 @@ def _add_topology_parser(self, subparsers, func):
         topology_parser.add_argument('-z', '--bi-dir', action='store_true', required=False, help=bi_dir_help)
 
 
-    def _add_set_value_parser(self, subparsers, func):
+    def _add_set_value_parser(self, subparsers: argparse._SubParsersAction, func):
         if not self.helpers.is_linux():
             # This subparser is only applicable to Linux
             return
@@ -1078,9 +1086,9 @@ def _add_set_value_parser(self, subparsers, func):
         set_profile_help = f"Set power profile level (#) or choose one of available profiles:\n\t{power_profile_choices_str}"
         perf_det_choices_str = ", ".join(self.helpers.get_perf_det_levels())
         set_perf_det_help = f"Set performance determinism and select one of the corresponding performance levels:\n\t{perf_det_choices_str}"
-        compute_partition_choices_str = ", ".join(self.helpers.get_compute_partition_types())
+        (accelerator_set_choices, _) = self.helpers.get_accelerator_choices_types_indices()
         memory_partition_choices_str = ", ".join(self.helpers.get_memory_partition_types())
-        set_compute_partition_help = f"Set one of the following the compute partition modes:\n\t{compute_partition_choices_str}"
+        set_compute_partition_help = f"Set one of the following the accelerator type or profile index:\n\t{accelerator_set_choices}.\n\tUse `sudo amd-smi partition --accelerator` to find acceptable values."
         set_memory_partition_help = f"Set one of the following the memory partition modes:\n\t{memory_partition_choices_str}"
         power_cap_min, power_cap_max = self.helpers.get_power_caps()
         power_cap_max = self.helpers.convert_SI_unit(power_cap_max, AMDSMIHelpers.SI_Unit.MICRO)
@@ -1128,7 +1136,7 @@ def _add_set_value_parser(self, subparsers, func):
                 set_value_exclusive_group.add_argument('-l', '--perf-level', action='store', choices=self.helpers.get_perf_levels()[0], type=str.upper, required=False, help=set_perf_level_help, metavar='LEVEL')
                 set_value_exclusive_group.add_argument('-P', '--profile', action='store', required=False, help=set_profile_help, metavar='SETPROFILE')
                 set_value_exclusive_group.add_argument('-d', '--perf-determinism', action='store', type=lambda value: self._not_negative_int(value, '--perf-determinism'), required=False, help=set_perf_det_help, metavar='SCLKMAX')
-                set_value_exclusive_group.add_argument('-C', '--compute-partition', action='store', choices=self.helpers.get_compute_partition_types(), type=str.upper, required=False, help=set_compute_partition_help, metavar='PARTITION')
+                set_value_exclusive_group.add_argument('-C', '--compute-partition', action='store', choices=accelerator_set_choices, type=lambda value: self._is_command_supported(value, accelerator_set_choices, '--compute-partition'), required=False, help=set_compute_partition_help, metavar='<ACCELERATOR_TYPE> or <PROFILE_INDEX>')
                 set_value_exclusive_group.add_argument('-M', '--memory-partition', action='store', choices=self.helpers.get_memory_partition_types(), type=str.upper, required=False, help=set_memory_partition_help, metavar='PARTITION')
                 set_value_exclusive_group.add_argument('-o', '--power-cap', action='store', type=lambda value: self._positive_int(value, '--power-cap'), required=False, help=set_power_cap_help, metavar='WATTS')
                 set_value_exclusive_group.add_argument('-p', '--soc-pstate', action='store', required=False, type=lambda value: self._not_negative_int(value, '--soc-pstate'), help=set_soc_pstate_help, metavar='POLICY_ID')
@@ -1162,7 +1170,7 @@ def _add_set_value_parser(self, subparsers, func):
         self._add_command_modifiers(set_value_parser)
 
 
-    def _add_reset_parser(self, subparsers, func):
+    def _add_reset_parser(self, subparsers: argparse._SubParsersAction, func):
         if not self.helpers.is_linux():
             # This subparser is only applicable to Linux
             return
@@ -1215,7 +1223,7 @@ def _add_reset_parser(self, subparsers, func):
         reset_exclusive_group.add_argument('-l', '--clean-local-data', action='store_true', required=False, help=reset_gpu_clean_local_data_help)
 
 
-    def _add_monitor_parser(self, subparsers, func):
+    def _add_monitor_parser(self, subparsers: argparse._SubParsersAction, func):
         if not self.helpers.is_linux():
             # This subparser is only applicable to Linux
             return
@@ -1314,7 +1322,7 @@ def _add_rocm_smi_parser(self, subparsers, func):
         rocm_smi_parser.add_argument('-f', '--showclkfrq', action='store_true', required=False, help=showclkfrq_help)
 
 
-    def _add_xgmi_parser(self, subparsers, func):
+    def _add_xgmi_parser(self, subparsers: argparse._SubParsersAction, func):
         if not self.helpers.is_amdgpu_initialized():
             # The xgmi subcommand is only applicable to systems with amdgpu initialized
             return
@@ -1344,7 +1352,7 @@ def _add_xgmi_parser(self, subparsers, func):
         xgmi_parser.add_argument('-l', '--link-status', action='store_true', required=False, help=xgmi_link_status_help)
 
 
-    def _add_partition_parser(self, subparsers, func):
+    def _add_partition_parser(self, subparsers: argparse._SubParsersAction, func):
         if not self.helpers.is_amdgpu_initialized():
             # The partition subcommand is only applicable to systems with amdgpu initialized
             return