From 48cb5529d2e56ae9eb5a0fd8f19ee2d1c25eb11f Mon Sep 17 00:00:00 2001 From: "Poag, Charis" Date: Tue, 11 Mar 2025 16:38:46 -0500 Subject: [PATCH] [SWDEV-493274/SWDEV-514998] Add AMD SMI partition tests + Add Guest amd-smi static --partition (#127) * [SWDEV-493274/SWDEV-514998] Add AMD SMI partition tests + Add Guest amd-smi static --partition Changes: - Added amd-smi static --partition for guest systems - Added C++ tests for memory and compute (accelerator) partitions - Added Python tests for amdsmi_get_gpu_vram_info(), amdsmi_get_gpu_accelerator_partition_profile_config() - Updated Python tests for amdsmi_get_gpu_accelerator_partition_profile() Now includes more profile and resource detail - Added amdsmi_get_gpu_xcd_counter(); Tests provided for both C++/Python APIs - Added AmdSmiVramType & AmdSmiVramVendor: they were missing python testing required adding. Change-Id: Ib6549d8ccc5fb68726f38745b87c78f890186022 Signed-off-by: Charis Poag --- amdsmi_cli/amdsmi_commands.py | 22 +- amdsmi_cli/amdsmi_helpers.py | 14 +- amdsmi_cli/amdsmi_parser.py | 2 +- example/amd_smi_drm_example.cc | 248 +++- include/amd_smi/amdsmi.h | 21 +- include/amd_smi/impl/amd_smi_gpu_device.h | 1 - include/amd_smi/impl/amd_smi_utils.h | 67 +- py-interface/__init__.py | 3 + py-interface/amdsmi_interface.py | 56 +- py-interface/amdsmi_wrapper.py | 5 +- rocm_smi/include/rocm_smi/rocm_smi_device.h | 10 + rocm_smi/src/rocm_smi.cc | 16 +- rocm_smi/src/rocm_smi_device.cc | 35 +- rocm_smi/src/rocm_smi_gpu_metrics.cc | 2 +- src/amd_smi/amd_smi.cc | 617 +++++++--- src/amd_smi/amd_smi_drm.cc | 35 +- src/amd_smi/amd_smi_gpu_device.cc | 35 +- src/amd_smi/amd_smi_lib_loader.cc | 2 +- src/amd_smi/amd_smi_system.cc | 17 +- src/amd_smi/amd_smi_utils.cc | 384 ++++-- .../functional/computepartition_read_write.cc | 1089 +++++++++++++++++ .../functional/computepartition_read_write.h | 51 + .../functional/gpu_metrics_read.cc | 18 +- tests/amd_smi_test/functional/id_info_read.cc | 28 +- 
.../functional/memorypartition_read_write.cc | 744 +++++++++++ .../functional/memorypartition_read_write.h | 51 + tests/amd_smi_test/main.cc | 15 +- tests/amd_smi_test/test_base.cc | 160 ++- tests/amd_smi_test/test_base.h | 41 + tests/python_unittest/integration_test.py | 115 ++ 30 files changed, 3505 insertions(+), 399 deletions(-) create mode 100755 tests/amd_smi_test/functional/computepartition_read_write.cc create mode 100755 tests/amd_smi_test/functional/computepartition_read_write.h create mode 100755 tests/amd_smi_test/functional/memorypartition_read_write.cc create mode 100755 tests/amd_smi_test/functional/memorypartition_read_write.h diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index 2ea3624460..02b7401734 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -387,6 +387,8 @@ class AMDSMICommands(): args.cache = cache if process_isolation: args.process_isolation = process_isolation + if partition: + args.partition = partition if clock: args.clock = clock # args.clock defaults to False so if it was overwritten to empty list, that indicates that it was given as an arguments but with an empty list @@ -396,24 +398,22 @@ class AMDSMICommands(): # Store args that are applicable to the current platform current_platform_args = ["asic", "bus", "vbios", "driver", "ras", "vram", "cache", "board", "process_isolation", - "clock"] + "clock", "partition"] current_platform_values = [args.asic, args.bus, args.vbios, args.driver, args.ras, args.vram, args.cache, args.board, args.process_isolation, - args.clock] + args.clock, args.partition] self.helpers.check_required_groups() if self.helpers.is_linux() and self.helpers.is_baremetal(): - if partition: - args.partition = partition if limit: args.limit = limit if soc_pstate: args.soc_pstate = soc_pstate if xgmi_plpd: args.xgmi_plpd = xgmi_plpd - current_platform_args += ["ras", "limit", "partition", "soc_pstate", "xgmi_plpd"] - current_platform_values += [args.ras, 
args.limit, args.partition, args.soc_pstate, args.xgmi_plpd] + current_platform_args += ["ras", "limit", "soc_pstate", "xgmi_plpd"] + current_platform_values += [args.ras, args.limit, args.soc_pstate, args.xgmi_plpd] if self.helpers.is_linux() and not self.helpers.is_virtual_os(): if numa: @@ -4240,7 +4240,7 @@ class AMDSMICommands(): if args.compute_partition in accelerator_profiles['profile_types']: compute_partition = amdsmi_interface.AmdSmiComputePartitionType[args.compute_partition] index = accelerator_profiles['profile_types'].index(args.compute_partition) - attempted_to_set = f"Attempted to set accelerator partition to {args.compute_partition} (profile #{accelerator_profiles['profile_indices'][int(index)]} on {gpu_string}" + attempted_to_set = f"Attempted to set accelerator partition to {args.compute_partition} (profile #{accelerator_profiles['profile_indices'][int(index)]}) on {gpu_string}" amdsmi_interface.amdsmi_set_gpu_compute_partition(args.gpu, compute_partition) self.logger.store_output(args.gpu, 'accelerator_partition', f"Successfully set accelerator partition to {args.compute_partition} (profile #{accelerator_profiles['profile_indices'][int(index)]})") elif args.compute_partition in accelerator_profiles['profile_indices']: @@ -4294,7 +4294,7 @@ class AMDSMICommands(): threads = [] k140secs = 140 - string_out = f"Updating memory partition for gpu {gpu_id}" + string_out = f"Updating memory partition for GPU: {gpu_id}" timesToRetryRestartErr = 1 self.helpers.increment_set_count() @@ -4305,9 +4305,9 @@ class AMDSMICommands(): while timesToRetryRestartErr >= 0: timesToRetryRestartErr -= 1 try: - if showProgressBar: # only show reload warning on 1st set + if showProgressBar: # we want to overwrite the previous progress bar t1 = multiprocessing.Process(target=self.helpers.showProgressbar, - args=(string_out, k140secs,)) + args=(string_out, k140secs, True,)) threads.append(t1) t1.start() memory_partition = 
amdsmi_interface.AmdSmiMemoryPartitionType[args.memory_partition] @@ -4342,7 +4342,7 @@ class AMDSMICommands(): return if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_AMDGPU_RESTART_ERR: # Try again on a failure -> work around for not being able to close libdrm - string_out = f"Trying again - Updating memory partition for gpu {gpu_id}" + string_out = f"Trying again - Updating memory partition for GPU: {gpu_id} " for thread in threads: thread.terminate() thread.join() diff --git a/amdsmi_cli/amdsmi_helpers.py b/amdsmi_cli/amdsmi_helpers.py index 1f0e22f631..05cd5abcf3 100644 --- a/amdsmi_cli/amdsmi_helpers.py +++ b/amdsmi_cli/amdsmi_helpers.py @@ -969,11 +969,15 @@ class AMDSMIHelpers(): continue return pci_devices - def progressbar(self, it, prefix="", size=60, out=sys.stdout): + def progressbar(self, it, prefix="", size=60, out=sys.stdout, add_newline=False): count = len(it) + if (add_newline): + print("{}\n".format(prefix),end='\r', file=out, flush=False) + else: + print("{}".format(prefix),end='\r', file=out, flush=False) def show(j): x = int(size*j/count) - print("{}[{}{}] {}/{} secs remain".format(prefix, u"█"*x, "."*(size-x), j, count), + print("[{}{}] {}/{} secs remain".format(u"█"*x, "."*(size-x), j, count), end='\r', file=out, flush=True) show(0) for i, item in enumerate(it): @@ -981,10 +985,10 @@ class AMDSMIHelpers(): show(i+1) print("\n\n", end='\r', flush=True, file=out) - def showProgressbar(self, title="", timeInSeconds=13): + def showProgressbar(self, title="", timeInSeconds=13, add_newline=False): if title != "": - title += ": " - for i in self.progressbar(range(timeInSeconds), title, 40): + title += " " + for i in self.progressbar(range(timeInSeconds), title, 40, add_newline=add_newline): time.sleep(1) def check_required_groups(self): diff --git a/amdsmi_cli/amdsmi_parser.py b/amdsmi_cli/amdsmi_parser.py index 0e5ac166ad..019e97dd8f 100644 --- a/amdsmi_cli/amdsmi_parser.py +++ b/amdsmi_cli/amdsmi_parser.py @@ -695,10 +695,10 
@@ class AMDSMIParser(argparse.ArgumentParser): static_parser.add_argument('-R', '--process-isolation', action='store_true', required=False, help=process_isolation_help) static_parser.add_argument('-r', '--ras', action='store_true', required=False, help=ras_help) static_parser.add_argument('-C', '--clock', action='store', default=False, nargs='*', type=str, required=False, help=clock_help) + static_parser.add_argument('-p', '--partition', action='store_true', required=False, help=partition_help) # Options to display on Hypervisors and Baremetal if self.helpers.is_hypervisor() or self.helpers.is_baremetal(): - static_parser.add_argument('-p', '--partition', action='store_true', required=False, help=partition_help) static_parser.add_argument('-l', '--limit', action='store_true', required=False, help=limit_help) static_parser.add_argument('-P', '--soc-pstate', action='store_true', required=False, help=soc_pstate_help) static_parser.add_argument('-x', '--xgmi-plpd', action='store_true', required=False, help=xgmi_plpd_help) diff --git a/example/amd_smi_drm_example.cc b/example/amd_smi_drm_example.cc index d58df30180..52c22f5590 100644 --- a/example/amd_smi_drm_example.cc +++ b/example/amd_smi_drm_example.cc @@ -23,17 +23,18 @@ #include #include #include +#include #include #include #include #include #include -#include #include #include #include "amd_smi/amdsmi.h" +#include "amd_smi/impl/amd_smi_utils.h" #define CHK_AMDSMI_RET(RET) \ @@ -201,8 +202,62 @@ std::string print_unsigned_int(T value) { return ss.str(); } +static const std::string +computePartitionString(amdsmi_compute_partition_type_t computeParitionType) { + switch (computeParitionType) { + case AMDSMI_COMPUTE_PARTITION_SPX: + return "SPX"; + case AMDSMI_COMPUTE_PARTITION_DPX: + return "DPX"; + case AMDSMI_COMPUTE_PARTITION_TPX: + return "TPX"; + case AMDSMI_COMPUTE_PARTITION_QPX: + return "QPX"; + case AMDSMI_COMPUTE_PARTITION_CPX: + return "CPX"; + default: + return "N/A"; + } +} + +static const std::map 
+mapStringToSMIComputePartitionTypes { + {"SPX", AMDSMI_COMPUTE_PARTITION_SPX}, + {"DPX", AMDSMI_COMPUTE_PARTITION_DPX}, + {"TPX", AMDSMI_COMPUTE_PARTITION_TPX}, + {"QPX", AMDSMI_COMPUTE_PARTITION_QPX}, + {"CPX", AMDSMI_COMPUTE_PARTITION_CPX}, + {"N/A", AMDSMI_COMPUTE_PARTITION_INVALID} +}; + +static const std::string +memoryPartitionString(amdsmi_memory_partition_type_t memoryParitionType) { + switch (memoryParitionType) { + case AMDSMI_MEMORY_PARTITION_NPS1: + return "NPS1"; + case AMDSMI_MEMORY_PARTITION_NPS2: + return "NPS2"; + case AMDSMI_MEMORY_PARTITION_NPS4: + return "NPS4"; + case AMDSMI_MEMORY_PARTITION_NPS8: + return "NPS8"; + default: + return "N/A"; + } +} + +static const std::map +mapStringToSMIMemoryPartitionTypes { + {"NPS1", AMDSMI_MEMORY_PARTITION_NPS1}, + {"NPS2", AMDSMI_MEMORY_PARTITION_NPS2}, + {"NPS4", AMDSMI_MEMORY_PARTITION_NPS4}, + {"NPS8", AMDSMI_MEMORY_PARTITION_NPS8}, + {"N/A", AMDSMI_MEMORY_PARTITION_UNKNOWN} +}; + int main() { - amdsmi_status_t ret; + amdsmi_status_t ret, ret_set; + const char *err_str; // Init amdsmi for sockets and devices. // Here we are only interested in AMD_GPUS. @@ -248,6 +303,20 @@ int main() { // For each device of the socket, get name and temperature. for (uint32_t j = 0; j < device_count; j++) { + uint32_t device_cnt = 0; + ret = smi_amdgpu_get_device_count(&device_cnt); + CHK_AMDSMI_RET(ret) + std::cout << "Device Count: " << device_cnt << std::endl; + + // Get device index + uint32_t device_index = 0; + ret = smi_amdgpu_get_device_index(processor_handles[j], &device_index); + CHK_AMDSMI_RET(ret) + std::cout << "Device Index: " << device_index << std::endl; + + std::vector p_handles(device_cnt); + ret = smi_amdgpu_get_processor_handle_by_index(device_index, &p_handles[j]); + // Get device type. Since the amdsmi is initialized with // AMD_SMI_INIT_AMD_GPUS, the processor_type must be AMDSMI_PROCESSOR_TYPE_AMD_GPU. 
processor_type_t processor_type = {}; @@ -286,6 +355,173 @@ int main() { printf("\tAsic serial: 0x%s\n", asic_info.asic_serial); printf("\tNum of Computes: %d\n\n", asic_info.num_of_compute_units); + bool is_power_management_enabled = false; + ret = amdsmi_is_gpu_power_management_enabled(processor_handles[j], + &is_power_management_enabled); + CHK_AMDSMI_RET(ret) + printf(" Output of amdsmi_is_gpu_power_management_enabled:\n"); + printf("\tPower Management Enabled: %s\n\n", + (is_power_management_enabled ? "TRUE" : "FALSE")); + + std::cout << " **Version 1: Accelerator/Compute Partition API Examples**\n"; + char original_compute_partition[AMDSMI_MAX_STRING_LENGTH]; + ret = amdsmi_get_gpu_compute_partition(processor_handles[j], original_compute_partition, + static_cast(AMDSMI_MAX_STRING_LENGTH)); + + amdsmi_status_code_to_string(ret, &err_str); + if (ret == AMDSMI_STATUS_SUCCESS) { + CHK_AMDSMI_RET(ret) + std::cout << " Output of amdsmi_get_gpu_compute_partition:\n"; + std::cout << "\tamdsmi_get_gpu_compute_partition(" << j << ", " + << mapStringToSMIComputePartitionTypes.at(original_compute_partition) << "): " + << err_str << "\n\n"; + std::cout << "\tCompute Partition (original): " + << original_compute_partition << "\n\n"; + } else { + std::cout << "\tamdsmi_get_gpu_compute_partition(" << j << ", " + << computePartitionString(AMDSMI_COMPUTE_PARTITION_INVALID) << "): " + << err_str << "\n\n"; + } + + for (int partition = static_cast(AMDSMI_COMPUTE_PARTITION_SPX); + partition <= static_cast(AMDSMI_COMPUTE_PARTITION_CPX); + partition++) { + amdsmi_compute_partition_type_t updatePartition + = static_cast(partition); + ret_set = amdsmi_set_gpu_compute_partition(processor_handles[j], + updatePartition); + amdsmi_status_code_to_string(ret_set, &err_str); + if (ret_set == AMDSMI_STATUS_SUCCESS) { + CHK_AMDSMI_RET(ret_set) + } + std::cout << "\tamdsmi_set_gpu_compute_partition(" << j << ", " + << computePartitionString(updatePartition) << "): " + << err_str << "\n\n"; + + 
// Get the current compute partition + char current_compute_partition[AMDSMI_MAX_STRING_LENGTH]; + ret = amdsmi_get_gpu_compute_partition(processor_handles[j], + current_compute_partition, + static_cast(AMDSMI_MAX_STRING_LENGTH)); + amdsmi_status_code_to_string(ret, &err_str); + if (ret == AMDSMI_STATUS_SUCCESS) { + CHK_AMDSMI_RET(ret) + std::cout << " Output of amdsmi_get_gpu_compute_partition:\n"; + std::cout << "\tamdsmi_get_gpu_compute_partition(" << j << ", " + << computePartitionString(updatePartition) << "): " + << err_str << "\n\n"; + std::cout << "\tCompute Partition (current): " + << current_compute_partition << "\n\n"; + } else { + std::cout << "\tamdsmi_get_gpu_compute_partition(" << j << ", " + << computePartitionString(AMDSMI_COMPUTE_PARTITION_INVALID) << "): " + << err_str << "\n\n"; + } + } + // return to original compute partition + amdsmi_compute_partition_type_t original_compute_partition_type; + if (ret == AMDSMI_STATUS_SUCCESS) { + original_compute_partition_type + = mapStringToSMIComputePartitionTypes.at(original_compute_partition); + } else { + original_compute_partition_type = AMDSMI_COMPUTE_PARTITION_INVALID; + } + std::cout << " Returning to original compute partition (" + << computePartitionString(original_compute_partition_type) << ")\n"; + auto ret_set = amdsmi_set_gpu_compute_partition(processor_handles[j], + original_compute_partition_type); + amdsmi_status_code_to_string(ret_set, &err_str); + if (ret_set == AMDSMI_STATUS_SUCCESS) { + CHK_AMDSMI_RET(ret_set) + } + std::cout << "\tamdsmi_set_gpu_compute_partition(" << j << ", " + << computePartitionString(original_compute_partition_type) << "): " + << err_str << "\n\n"; + + std::cout << " **Version 1: Memory Partition API Examples**\n"; + char original_memory_partition[AMDSMI_MAX_STRING_LENGTH]; + ret = amdsmi_get_gpu_memory_partition(processor_handles[j], original_memory_partition, + static_cast(AMDSMI_MAX_STRING_LENGTH)); + amdsmi_status_code_to_string(ret, &err_str); + if (ret == 
AMDSMI_STATUS_SUCCESS) { + CHK_AMDSMI_RET(ret) + std::cout << " Output of amdsmi_get_gpu_memory_partition:\n"; + std::cout << "\tamdsmi_get_gpu_memory_partition(" << j << ", " + << mapStringToSMIMemoryPartitionTypes.at(original_memory_partition) << "): " + << err_str << "\n\n"; + std::cout << "\tMemory Partition (original): " + << original_memory_partition << "\n\n"; + } else { + std::cout << "\tamdsmi_get_gpu_memory_partition(" << j << ", " + << memoryPartitionString(AMDSMI_MEMORY_PARTITION_UNKNOWN) << "): " + << err_str << "\n\n"; + } + + for (int partition = static_cast(AMDSMI_MEMORY_PARTITION_NPS1); + partition <= static_cast(AMDSMI_MEMORY_PARTITION_NPS8); + partition++) { + if (partition != static_cast(AMDSMI_MEMORY_PARTITION_NPS1) + && partition != static_cast(AMDSMI_MEMORY_PARTITION_NPS2) + && partition != static_cast(AMDSMI_MEMORY_PARTITION_NPS4) + && partition != static_cast(AMDSMI_MEMORY_PARTITION_NPS8)) { + continue; + } + amdsmi_memory_partition_type_t updatePartition + = static_cast(partition); + auto ret_set = amdsmi_set_gpu_memory_partition(processor_handles[j], + updatePartition); + amdsmi_status_code_to_string(ret_set, &err_str); + if (ret_set == AMDSMI_STATUS_SUCCESS) { + CHK_AMDSMI_RET(ret_set) + std::cout << " Output of amdsmi_set_gpu_memory_partition:\n"; + } + std::cout << "\tamdsmi_set_gpu_memory_partition(" << j << ", " + << memoryPartitionString(updatePartition) << "): " + << err_str << "\n\n"; + + // Get the current memory partition + char current_memory_partition[AMDSMI_MAX_STRING_LENGTH]; + ret = amdsmi_get_gpu_memory_partition(processor_handles[j], + current_memory_partition, + static_cast(AMDSMI_MAX_STRING_LENGTH)); + + amdsmi_status_code_to_string(ret, &err_str); + if (ret == AMDSMI_STATUS_SUCCESS) { + CHK_AMDSMI_RET(ret) + std::cout << "\tamdsmi_get_gpu_memory_partition(" << j << ", " + << memoryPartitionString(updatePartition) << "): " + << err_str << "\n\n"; + std::cout << "\tMemory Partition (current): " + << 
current_memory_partition << "\n\n"; + } else { + std::cout << "\tamdsmi_get_gpu_memory_partition(" << j << ", " + << memoryPartitionString(AMDSMI_MEMORY_PARTITION_UNKNOWN) << "): " + << err_str << "\n\n"; + } + } + // return to original memory partition + amdsmi_memory_partition_type_t original_memory_partition_type; + if (ret == AMDSMI_STATUS_SUCCESS) { + original_memory_partition_type + = mapStringToSMIMemoryPartitionTypes.at(original_memory_partition); + } else { + original_memory_partition_type = AMDSMI_MEMORY_PARTITION_UNKNOWN; + } + std::cout << " Returning to original memory partition (" + << memoryPartitionString(original_memory_partition_type) + << ")\n"; + ret_set = amdsmi_set_gpu_memory_partition(processor_handles[j], + original_memory_partition_type); + amdsmi_status_code_to_string(ret_set, &err_str); + if (ret_set == AMDSMI_STATUS_SUCCESS) { + CHK_AMDSMI_RET(ret_set) + } + std::cout << "\tamdsmi_set_gpu_compute_partition(" << j << ", " + << memoryPartitionString(original_memory_partition_type) << "): " + << err_str << "\n\n"; + + // TODO(amdsmi_team): Add V2 partition APIs + // Get VRAM info amdsmi_vram_info_t vram_info = {}; ret = amdsmi_get_gpu_vram_info(processor_handles[j], &vram_info); @@ -478,7 +714,7 @@ int main() { block = (amdsmi_gpu_block_t)(block * 2)) { ret = amdsmi_get_gpu_ras_block_features_enabled(processor_handles[j], block, &state); - if (ret != AMDSMI_STATUS_API_FAILED) { + if (ret != AMDSMI_STATUS_API_FAILED && ret != AMDSMI_STATUS_NOT_SUPPORTED) { CHK_AMDSMI_RET(ret) } @@ -520,7 +756,9 @@ int main() { // Get ECC error counts amdsmi_error_count_t err_cnt_info = {}; ret = amdsmi_get_gpu_total_ecc_count(processor_handles[j], &err_cnt_info); - CHK_AMDSMI_RET(ret) + if (ret != AMDSMI_STATUS_NOT_SUPPORTED) { + CHK_AMDSMI_RET(ret) + } printf(" Output of amdsmi_get_gpu_total_ecc_count:\n"); printf("\tCorrectable errors: %lu\n", err_cnt_info.correctable_count); printf("\tUncorrectable errors: %lu\n\n", @@ -530,7 +768,7 @@ int main() { ret = 
amdsmi_get_gpu_process_list(processor_handles[j], &num_process, nullptr); CHK_AMDSMI_RET(ret) if (!num_process) { - printf("No processes found.\n"); + printf("amdsmi_get_gpu_process_list(): No processes found.\n\n"); } else { std::cout << "Processes found: " << num_process << "\n"; amdsmi_proc_info_t process_info_list[num_process]; diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h index 1d6e9036c6..6f04c0ee9e 100644 --- a/include/amd_smi/amdsmi.h +++ b/include/amd_smi/amdsmi.h @@ -407,7 +407,7 @@ typedef enum { //!< work together with shared memory AMDSMI_COMPUTE_PARTITION_QPX, //!< Quad GPU mode (QPX)- Quarter XCCs //!< work together with shared memory - AMDSMI_COMPUTE_PARTITION_CPX, //!< Core mode (CPX)- Per-chip XCC with + AMDSMI_COMPUTE_PARTITION_CPX //!< Core mode (CPX)- Per-chip XCC with //!< shared memory } amdsmi_compute_partition_type_t; @@ -5847,6 +5847,25 @@ amdsmi_get_power_cap_info(amdsmi_processor_handle processor_handle, uint32_t sen */ amdsmi_status_t amdsmi_get_pcie_info(amdsmi_processor_handle processor_handle, amdsmi_pcie_info_t *info); +/** + * @brief Returns the 'xcd_counter' from the GPU metrics associated with the device + * + * @ingroup tagAsicBoardInfo + * + * @platform{gpu_bm_linux} @platform{guest_1vf} @platform{guest_mvf} + * + * @param[in] processor_handle Device which to query + * + * @param[inout] xcd_count a pointer to uint16_t to which the device gpu + * metric unit will be stored. Must be allocated by user. + * + * @retval ::AMDSMI_STATUS_SUCCESS is returned upon successful call. + * ::AMDSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device. 
+ */ +amdsmi_status_t amdsmi_get_gpu_xcd_counter(amdsmi_processor_handle processor_handle, + uint16_t *xcd_count); + /** @} End tagAsicBoardInfo */ /*****************************************************************************/ diff --git a/include/amd_smi/impl/amd_smi_gpu_device.h b/include/amd_smi/impl/amd_smi_gpu_device.h index 568ca60f22..ebe1756537 100644 --- a/include/amd_smi/impl/amd_smi_gpu_device.h +++ b/include/amd_smi/impl/amd_smi_gpu_device.h @@ -27,7 +27,6 @@ #include "amd_smi/impl/amd_smi_processor.h" #include "amd_smi/impl/amd_smi_drm.h" #include "shared_mutex.h" // NOLINT -#include "rocm_smi/rocm_smi_logger.h" namespace amd { namespace smi { diff --git a/include/amd_smi/impl/amd_smi_utils.h b/include/amd_smi/impl/amd_smi_utils.h index 97b9fa2756..8a774988fb 100644 --- a/include/amd_smi/impl/amd_smi_utils.h +++ b/include/amd_smi/impl/amd_smi_utils.h @@ -26,10 +26,10 @@ #include #include #include +#include #include "amd_smi/amdsmi.h" #include "amd_smi/impl/amd_smi_gpu_device.h" -#include "rocm_smi/rocm_smi_utils.h" #define SMIGPUDEVICE_MUTEX(MUTEX) \ @@ -55,6 +55,63 @@ amdsmi_status_t smi_amdgpu_is_gpu_power_management_enabled(amd::smi::AMDSmiGPUDe std::string smi_split_string(std::string str, char delim); std::string smi_amdgpu_get_status_string(amdsmi_status_t ret, bool fullStatus); +/** + * @brief Get the device index given the processor handle. + * + * @details Given a processor handle @p processor_handle + * and a pointer to a uint32_t @p device_index will be returned. + * + * @param[in] processor_handle Device which to query + * + * @param[inout] device_index a pointer to uint32_t to which the matching device + * index will be stored + * + * @retval ::AMDSMI_STATUS_SUCCESS is returned upon successful call. + * ::AMDSMI_STATUS_INVAL is returned if user provides a null pointer + * for device_index. + * ::AMDSMI_STATUS_API_FAILED is returned if the corresponding device + * index for the processor handle cannot be found. 
+ */ +amdsmi_status_t smi_amdgpu_get_device_index(amdsmi_processor_handle processor_handle, + uint32_t* device_index); + +/** + * @brief Get total number of devices + * + * @details Given a pointer to a uint32_t @p total_num_devices will be returned + * + * @param[inout] total_num_devices a pointer to uint32_t to which the total number + * of devices will be stored + * + * @retval ::AMDSMI_STATUS_SUCCESS is returned upon successful call. + * ::AMDSMI_STATUS_INVAL is returned if user provides a null pointer + * for total_num_devices. + */ +amdsmi_status_t smi_amdgpu_get_device_count(uint32_t *total_num_devices); + +/** + * @brief Get the processor handle given the device index. + * + * @details Given a uint32_t @p device_index and a pointer to + * a processor handle @p processor_handle, the device index will be used to + * find the processor handle of the device and store it in the provided pointer + * + * @param[in] device_index a uint32_t value used to find the corresponding + * processor handle + * + * @param[inout] processor_handle a pointer to amdsmi_processor_handle + * in which the corresponding processor_handle will be stored + * + * @retval ::AMDSMI_STATUS_SUCCESS is returned upon successful call. + * ::AMDSMI_STATUS_INVAL is returned if user provides a null pointer + * for processor_handle. + * ::AMDSMI_STATUS_API_FAILED is returned if the device_index cannot + * be found. 
+ */ +amdsmi_status_t smi_amdgpu_get_processor_handle_by_index( + uint32_t device_index, + amdsmi_processor_handle *processor_handle); + template constexpr bool is_dependent_false_v = false; @@ -72,8 +129,7 @@ constexpr T get_std_num_limit() { if constexpr (is_supported_type_v) { return std::numeric_limits::max(); - } - else { + } else { return std::numeric_limits::min(); static_assert(is_dependent_false_v, "Error: Type not supported..."); } @@ -98,12 +154,11 @@ constexpr T translate_umax_or_assign_value(U source_value, V target_value) } return result; - } - else { + } else { static_assert(is_dependent_false_v, "Error: Type not supported..."); } return result; } -#endif // +#endif // AMD_SMI_INCLUDE_AMD_SMI_UTILS_H_ diff --git a/py-interface/__init__.py b/py-interface/__init__.py index e1c89f942f..1148cd2f42 100644 --- a/py-interface/__init__.py +++ b/py-interface/__init__.py @@ -94,6 +94,7 @@ from .amdsmi_interface import amdsmi_get_gpu_kfd_info from .amdsmi_interface import amdsmi_get_power_cap_info from .amdsmi_interface import amdsmi_get_gpu_vram_info from .amdsmi_interface import amdsmi_get_gpu_cache_info +from .amdsmi_interface import amdsmi_get_gpu_xcd_counter # # Microcode and VBIOS Information from .amdsmi_interface import amdsmi_get_gpu_vbios_info @@ -272,6 +273,8 @@ from .amdsmi_interface import AmdSmiLinkType from .amdsmi_interface import AmdSmiUtilizationCounterType from .amdsmi_interface import AmdSmiProcessorType from .amdsmi_interface import AmdSmiVirtualizationMode +from .amdsmi_interface import AmdSmiVramType +from .amdsmi_interface import AmdSmiVramVendor # Exceptions from .amdsmi_exception import AmdSmiLibraryException diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py index 8912981ca3..6e4dbe132e 100644 --- a/py-interface/amdsmi_interface.py +++ b/py-interface/amdsmi_interface.py @@ -460,6 +460,36 @@ class AmdSmiVirtualizationMode(IntEnum): GUEST = amdsmi_wrapper.AMDSMI_VIRTUALIZATION_MODE_GUEST PASSTHROUGH = 
amdsmi_wrapper.AMDSMI_VIRTUALIZATION_MODE_PASSTHROUGH +class AmdSmiVramType(IntEnum): + UNKNOWN = amdsmi_wrapper.AMDSMI_VRAM_TYPE_UNKNOWN + HBM = amdsmi_wrapper.AMDSMI_VRAM_TYPE_HBM + HBM2 = amdsmi_wrapper.AMDSMI_VRAM_TYPE_HBM2 + HBM2E = amdsmi_wrapper.AMDSMI_VRAM_TYPE_HBM2E + HBM3 = amdsmi_wrapper.AMDSMI_VRAM_TYPE_HBM3 + DDR2 = amdsmi_wrapper.AMDSMI_VRAM_TYPE_DDR2 + DDR3 = amdsmi_wrapper.AMDSMI_VRAM_TYPE_DDR3 + DDR4 = amdsmi_wrapper.AMDSMI_VRAM_TYPE_DDR4 + GDDR1 = amdsmi_wrapper.AMDSMI_VRAM_TYPE_GDDR1 + GDDR2 = amdsmi_wrapper.AMDSMI_VRAM_TYPE_GDDR2 + GDDR3 = amdsmi_wrapper.AMDSMI_VRAM_TYPE_GDDR3 + GDDR4 = amdsmi_wrapper.AMDSMI_VRAM_TYPE_GDDR4 + GDDR5 = amdsmi_wrapper.AMDSMI_VRAM_TYPE_GDDR5 + GDDR6 = amdsmi_wrapper.AMDSMI_VRAM_TYPE_GDDR6 + GDDR7 = amdsmi_wrapper.AMDSMI_VRAM_TYPE_GDDR7 + MAX = amdsmi_wrapper.AMDSMI_VRAM_TYPE__MAX + +class AmdSmiVramVendor(IntEnum): + SAMSUNG = amdsmi_wrapper.AMDSMI_VRAM_VENDOR_SAMSUNG + INFINEON = amdsmi_wrapper.AMDSMI_VRAM_VENDOR_INFINEON + ELPIDA = amdsmi_wrapper.AMDSMI_VRAM_VENDOR_ELPIDA + ETRON = amdsmi_wrapper.AMDSMI_VRAM_VENDOR_ETRON + NANYA = amdsmi_wrapper.AMDSMI_VRAM_VENDOR_NANYA + HYNIX = amdsmi_wrapper.AMDSMI_VRAM_VENDOR_HYNIX + MOSEL = amdsmi_wrapper.AMDSMI_VRAM_VENDOR_MOSEL + WINBOND = amdsmi_wrapper.AMDSMI_VRAM_VENDOR_WINBOND + ESMT = amdsmi_wrapper.AMDSMI_VRAM_VENDOR_ESMT + MICRON = amdsmi_wrapper.AMDSMI_VRAM_VENDOR_MICRON + UNKNOWN = amdsmi_wrapper.AMDSMI_VRAM_VENDOR_UNKNOWN class AmdSmiEventReader: def __init__( @@ -2525,6 +2555,18 @@ def amdsmi_get_pcie_info( return pcie_info_dict +def amdsmi_get_gpu_xcd_counter(processor_handle: amdsmi_wrapper.amdsmi_processor_handle) -> Dict[str, Any]: + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise AmdSmiParameterException(processor_handle, amdsmi_wrapper.amdsmi_processor_handle) + + xcd_counter = ctypes.c_uint16() + _check_res( + amdsmi_wrapper.amdsmi_get_gpu_xcd_counter( + processor_handle, ctypes.byref(xcd_counter) + ) + ) + + return 
xcd_counter.value def amdsmi_get_processor_handle_from_bdf(bdf): bdf = _parse_bdf(bdf) @@ -2958,15 +3000,11 @@ def amdsmi_get_gpu_accelerator_partition_profile( length = profile.num_partitions partition_ids = [] - for i in range(profile.num_partitions): - partition_ids.append(partition_id_list[i]) - - last_element = 0 - if length > 0: - last_element = length - 1 - if ((partition_ids[last_element] == 0) - and not((profile_type_ret == str("SPX")) or (profile_type_ret == str("N/A")))): - partition_ids = "N/A" + + #partition_id[0] will contain the partition id of each device + #BM/Guest will include this logic. Host will only display primary partition ids. + kPOSITION_OF_PARTITION_ID = 0 + partition_ids.append(partition_id_list[kPOSITION_OF_PARTITION_ID]) mem_caps_list = [] if profile.memory_caps.nps_flags.nps1_cap == 1: diff --git a/py-interface/amdsmi_wrapper.py b/py-interface/amdsmi_wrapper.py index 6b9d770c8d..08febae9ba 100644 --- a/py-interface/amdsmi_wrapper.py +++ b/py-interface/amdsmi_wrapper.py @@ -2603,6 +2603,9 @@ amdsmi_get_power_cap_info.argtypes = [amdsmi_processor_handle, uint32_t, ctypes. 
amdsmi_get_pcie_info = _libraries['libamd_smi.so'].amdsmi_get_pcie_info amdsmi_get_pcie_info.restype = amdsmi_status_t amdsmi_get_pcie_info.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_pcie_info_t)] +amdsmi_get_gpu_xcd_counter = _libraries['libamd_smi.so'].amdsmi_get_gpu_xcd_counter +amdsmi_get_gpu_xcd_counter.restype = amdsmi_status_t +amdsmi_get_gpu_xcd_counter.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.c_uint16)] amdsmi_get_fw_info = _libraries['libamd_smi.so'].amdsmi_get_fw_info amdsmi_get_fw_info.restype = amdsmi_status_t amdsmi_get_fw_info.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_fw_info_t)] @@ -3050,7 +3053,7 @@ __all__ = \ 'amdsmi_get_gpu_virtualization_mode', 'amdsmi_get_gpu_volt_metric', 'amdsmi_get_gpu_vram_info', 'amdsmi_get_gpu_vram_usage', 'amdsmi_get_gpu_vram_vendor', - 'amdsmi_get_gpu_xgmi_link_status', + 'amdsmi_get_gpu_xcd_counter', 'amdsmi_get_gpu_xgmi_link_status', 'amdsmi_get_hsmp_metrics_table', 'amdsmi_get_hsmp_metrics_table_version', 'amdsmi_get_lib_version', 'amdsmi_get_link_metrics', 'amdsmi_get_link_topology_nearest', diff --git a/rocm_smi/include/rocm_smi/rocm_smi_device.h b/rocm_smi/include/rocm_smi/rocm_smi_device.h index a74ff680c2..70049644d3 100644 --- a/rocm_smi/include/rocm_smi/rocm_smi_device.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_device.h @@ -200,6 +200,14 @@ class Device { public: explicit Device(std::string path, RocmSMI_env_vars const *e); ~Device(void); + typedef struct { + uint32_t card_index; + uint32_t drm_render_minor; + uint64_t bdfid; + uint64_t kfd_gpu_id; + uint32_t partition_id; + uint32_t smi_device_id; + } rsmi_device_identifiers_t; void set_monitor(std::shared_ptr m) {monitor_ = m;} std::string path(void) const {return path_;} @@ -266,6 +274,8 @@ class Device { void set_smi_device_id(uint32_t device_id) { m_device_id = device_id; } void set_smi_partition_id(uint32_t partition_id) { m_partition_id = partition_id; } static const char* 
get_type_string(DevInfoTypes type); + rsmi_status_t get_smi_device_identifiers(uint32_t device_id, + rsmi_device_identifiers_t *device_identifiers); private: std::shared_ptr monitor_; diff --git a/rocm_smi/src/rocm_smi.cc b/rocm_smi/src/rocm_smi.cc index 83c28cd160..7eaa39c8db 100644 --- a/rocm_smi/src/rocm_smi.cc +++ b/rocm_smi/src/rocm_smi.cc @@ -6577,6 +6577,16 @@ rsmi_dev_partition_id_get(uint32_t dv_ind, uint32_t *partition_id) { if (ret == RSMI_STATUS_SUCCESS) { *partition_id = static_cast((pci_id >> 28) & 0xf); } + std::ostringstream bdf_sstream; + bdf_sstream << std::hex << std::setfill('0') << std::setw(4) + << ((pci_id >> 32) & 0xFFFFFFFF) << ":"; + bdf_sstream << std::hex << std::setfill('0') << std::setw(2) << ((pci_id >> 8) & 0xFF) << ":"; + bdf_sstream << std::hex << std::setfill('0') << std::setw(2) << ((pci_id >> 3) & 0xF8) << "."; + bdf_sstream << std::hex << std::setfill('0') << +(pci_id & 0x7); + bdf_sstream << "\nPartition ID ((pci_id >> 28) & 0xf): " << std::dec + << static_cast((pci_id >> 28) & 0xf); + bdf_sstream << "\nPartition ID (pci_id & 0x7): " << std::dec << static_cast(pci_id & 0x7); + // std::cout << __PRETTY_FUNCTION__ << " BDF: " << bdf_sstream.str() << std::endl; /** * Fall back is required due to driver changes within KFD. 
@@ -6603,9 +6613,11 @@ rsmi_dev_partition_id_get(uint32_t dv_ind, uint32_t *partition_id) { << " | Success" << " | Device #: " << dv_ind << " | Type: partition_id" - << " | Data: " << *partition_id + << " | Data: " << static_cast(*partition_id) << " | Returning = " - << getRSMIStatusString(RSMI_STATUS_SUCCESS) << " |"; + << getRSMIStatusString(RSMI_STATUS_SUCCESS) << " |" + << "\n BDF: " << bdf_sstream.str() << std::endl; + // std::cout << ss.str() << std::endl; LOG_INFO(ss); return ret; CATCH diff --git a/rocm_smi/src/rocm_smi_device.cc b/rocm_smi/src/rocm_smi_device.cc index 590a8f965f..80f89f08e3 100644 --- a/rocm_smi/src/rocm_smi_device.cc +++ b/rocm_smi/src/rocm_smi_device.cc @@ -1020,7 +1020,7 @@ int Device::readDevInfoLine(DevInfoTypes type, std::string *line) { << get_type_string(type) << "), returning *line = " << *line; LOG_INFO(ss); - + fs.close(); return 0; } @@ -1103,6 +1103,7 @@ int Device::readDevInfoMultiLineStr(DevInfoTypes type, while (std::getline(fs, line)) { retVec->push_back(line); } + fs.close(); if (retVec->empty()) { ss << "Read devInfoMultiLineStr for DevInfoType (" @@ -1771,6 +1772,38 @@ std::string Device::readBootPartitionState( return boot_state; } +rsmi_status_t Device::get_smi_device_identifiers(uint32_t device_id, + rsmi_device_identifiers_t *device_identifiers) { + bool found_device = false; + rsmi_status_t ret = RSMI_STATUS_NOT_SUPPORTED; + if (device_identifiers == nullptr) { + return RSMI_STATUS_INVALID_ARGS; + } + + amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); + auto devices = smi.devices(); + + for (uint32_t i = 0; i < devices.size(); i++) { + if (i != device_id) { + continue; + } + rsmi_device_identifiers_t smi_device; + smi_device.card_index = devices[i]->index(); + smi_device.drm_render_minor = devices[i]->drm_render_minor(); + smi_device.bdfid = devices[i]->bdfid(); + smi_device.kfd_gpu_id = devices[i]->kfd_gpu_id(); + smi_device.partition_id = devices[i]->m_partition_id; + smi_device.smi_device_id = i; + 
*device_identifiers = smi_device; + found_device = true; + break; + } + if (found_device) { + ret = RSMI_STATUS_SUCCESS; + } + return ret; +} + #undef RET_IF_NONZERO } // namespace smi diff --git a/rocm_smi/src/rocm_smi_gpu_metrics.cc b/rocm_smi/src/rocm_smi_gpu_metrics.cc index 8c9913775c..68e2c4b2ce 100644 --- a/rocm_smi/src/rocm_smi_gpu_metrics.cc +++ b/rocm_smi/src/rocm_smi_gpu_metrics.cc @@ -3387,7 +3387,7 @@ AMGpuMetricsPublicLatestTupl_t GpuMetricsBase_v13_t::copy_internal_to_external_m metrics_public_init.vcn_activity[0] = metrics_public_init.average_mm_activity; } // average_mm_activity needs to not be UIN16_MAX and - // metrics_public_init.xcp_stats->vcn_busy[0] should also be UIN16_MAX + // metrics_public_init.xcp_stats->vcn_busy[0] should also be UINT16_MAX if (metrics_public_init.average_mm_activity != UINT16_MAX && metrics_public_init.xcp_stats->vcn_busy[0] == UINT16_MAX) { metrics_public_init.xcp_stats->vcn_busy[0] = metrics_public_init.average_mm_activity; diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc index 767fd257dd..769484626b 100644 --- a/src/amd_smi/amd_smi.cc +++ b/src/amd_smi/amd_smi.cc @@ -25,6 +25,9 @@ #include #include #include +#include +#include + #include #include #include @@ -38,7 +41,7 @@ #include #include #include -#include + #include "amd_smi/amdsmi.h" #include "amd_smi/impl/fdinfo.h" #include "amd_smi/impl/amd_smi_common.h" @@ -106,11 +109,15 @@ static const std::mapcheck_if_drm_is_supported()) { - // Populate product_serial, product_name, & product_number from sysfs - status = smi_amdgpu_get_board_info(gpu_device, board_info); - } else { - // ignore the errors so that it can populate as many fields as possible. 
- // call rocm-smi which search multiple places for device name - status = rsmi_wrapper(rsmi_dev_name_get, processor_handle, 0, - board_info->product_name, AMDSMI_256_LENGTH); - + status = smi_amdgpu_get_board_info(gpu_device, board_info); + if (board_info->product_serial[0] == '\0') { status = rsmi_wrapper(rsmi_dev_serial_number_get, processor_handle, 0, - board_info->product_serial, AMDSMI_MAX_STRING_LENGTH); + board_info->product_serial, AMDSMI_MAX_STRING_LENGTH); + if (status != AMDSMI_STATUS_SUCCESS) { + memset(board_info->product_serial, 0, + AMDSMI_MAX_STRING_LENGTH * sizeof(board_info->product_serial[0])); + } + } + if (board_info->product_name[0] == '\0') { + status = rsmi_wrapper(rsmi_dev_name_get, processor_handle, 0, + board_info->product_name, AMDSMI_256_LENGTH); + // Check if the value is in hex format + if (status == AMDSMI_STATUS_SUCCESS) { + if (board_info->product_name[0] == '0' && board_info->product_name[1] == 'x') { + memset(board_info->product_name, 0, + AMDSMI_256_LENGTH * sizeof(board_info->product_name[0])); + } + } + if (status != AMDSMI_STATUS_SUCCESS) { + memset(board_info->product_name, 0, + AMDSMI_256_LENGTH * sizeof(board_info->product_name[0])); + } } std::ostringstream ss; @@ -815,7 +838,6 @@ amdsmi_status_t amdsmi_get_temp_metric(amdsmi_processor_handle processor_handle amdsmi_status_t amdsmi_get_gpu_vram_usage(amdsmi_processor_handle processor_handle, amdsmi_vram_usage_t *vram_info) { - AMDSMI_CHECK_INIT(); if (vram_info == nullptr) { @@ -842,15 +864,21 @@ amdsmi_status_t amdsmi_get_gpu_vram_usage(amdsmi_processor_handle processor_hand struct drm_amdgpu_info_vram_gtt gtt; uint64_t vram_used = 0; + amd::smi::AMDSmiSystem::getInstance().init_drm(); r = gpu_device->amdgpu_query_info(AMDGPU_INFO_VRAM_GTT, sizeof(struct drm_amdgpu_memory_info), >t); - if (r != AMDSMI_STATUS_SUCCESS) return r; + if (r != AMDSMI_STATUS_SUCCESS) { + amd::smi::AMDSmiSystem::getInstance().clean_up_drm(); + return r; + } vram_info->vram_total = static_cast( 
gtt.vram_size / (1024 * 1024)); r = gpu_device->amdgpu_query_info(AMDGPU_INFO_VRAM_USAGE, sizeof(vram_used), &vram_used); + amd::smi::AMDSmiSystem::getInstance().clean_up_drm(); + if (r != AMDSMI_STATUS_SUCCESS) { return r; } @@ -943,7 +971,8 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha // default to 0xffffffff as not supported uint32_t partitition_id = std::numeric_limits::max(); auto tmp_partition_id = uint32_t(0); - amdsmi_status_t status = rsmi_wrapper(rsmi_dev_partition_id_get, processor_handle, 0, &(tmp_partition_id)); + amdsmi_status_t status = rsmi_wrapper(rsmi_dev_partition_id_get, processor_handle, 0, + &(tmp_partition_id)); // Do not return early if this value fails // continue to try getting all info if (status == AMDSMI_STATUS_SUCCESS) { @@ -955,7 +984,8 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha processor_handle, &metric_info_a); if (status != AMDSMI_STATUS_SUCCESS) { std::ostringstream ss; - ss << __PRETTY_FUNCTION__ << " | amdsmi_get_gpu_metrics_info failed with status = " << smi_amdgpu_get_status_string(status, false); + ss << __PRETTY_FUNCTION__ << " | amdsmi_get_gpu_metrics_info failed with status = " + << smi_amdgpu_get_status_string(status, false); LOG_ERROR(ss); return status; } @@ -1279,7 +1309,6 @@ amdsmi_status_t amdsmi_get_fw_info(amdsmi_processor_handle processor_handle, amdsmi_status_t amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_info_t *info) { - AMDSMI_CHECK_INIT(); if (info == nullptr) { @@ -1290,15 +1319,26 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i uint16_t vendor_id = 0; uint16_t subvendor_id = 0; + std::ostringstream ss; amd::smi::AMDSmiGPUDevice* gpu_device = nullptr; amdsmi_status_t r = get_gpu_device_from_handle(processor_handle, &gpu_device); - if (r != AMDSMI_STATUS_SUCCESS) + if (r != AMDSMI_STATUS_SUCCESS) { return r; + } amdsmi_status_t status; - if 
(gpu_device->check_if_drm_is_supported()){ - status = gpu_device->amdgpu_query_info(AMDGPU_INFO_DEV_INFO, sizeof(struct drm_amdgpu_info_device), &dev_info); - if (status != AMDSMI_STATUS_SUCCESS) return status; + amd::smi::AMDSmiSystem::getInstance().init_drm(); + if (gpu_device->check_if_drm_is_supported()) { + status = gpu_device->amdgpu_query_info(AMDGPU_INFO_DEV_INFO, + sizeof(struct drm_amdgpu_info_device), &dev_info); + ss << __PRETTY_FUNCTION__ + << " | amdgpu_query_info(): " + << smi_amdgpu_get_status_string(status, true); + LOG_INFO(ss); + if (status != AMDSMI_STATUS_SUCCESS) { + amd::smi::AMDSmiSystem::getInstance().clean_up_drm(); + return status; + } SMIGPUDEVICE_MUTEX(gpu_device->get_mutex()) @@ -1318,12 +1358,13 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i info->device_id = dev_info.device_id; info->rev_id = dev_info.pci_rev; info->vendor_id = gpu_device->get_vendor_id(); - } - else { + } else { uint64_t dv_uid = 0; status = rsmi_wrapper(rsmi_dev_unique_id_get, processor_handle, 0, &dv_uid); - if (status == AMDSMI_STATUS_SUCCESS) snprintf(info->asic_serial, sizeof(info->asic_serial), "%lu", dv_uid); + if (status == AMDSMI_STATUS_SUCCESS) { + snprintf(info->asic_serial, sizeof(info->asic_serial), "%lu", dv_uid); + } status = rsmi_wrapper(rsmi_dev_brand_get, processor_handle, 0, info->market_name, AMDSMI_256_LENGTH); @@ -1371,6 +1412,7 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i if (status == amdsmi_status_t::AMDSMI_STATUS_SUCCESS) { info->target_graphics_version = tmp_target_gfx_version; } + amd::smi::AMDSmiSystem::getInstance().clean_up_drm(); return AMDSMI_STATUS_SUCCESS; } @@ -1494,8 +1536,9 @@ amdsmi_status_t amdsmi_get_gpu_vram_info( amd::smi::AMDSmiGPUDevice* gpu_device = nullptr; amdsmi_status_t r = get_gpu_device_from_handle(processor_handle, &gpu_device); - if (r != AMDSMI_STATUS_SUCCESS) + if (r != AMDSMI_STATUS_SUCCESS) { return r; + } // init the info structure 
with default value info->vram_type = AMDSMI_VRAM_TYPE_UNKNOWN; @@ -1504,6 +1547,7 @@ amdsmi_status_t amdsmi_get_gpu_vram_info( info->vram_bit_width = std::numeric_limitsvram_bit_width)>::max(); info->vram_max_bandwidth = std::numeric_limitsvram_max_bandwidth)>::max(); + amd::smi::AMDSmiSystem::getInstance().init_drm(); // Only can read vram type from libdrm if (gpu_device->check_if_drm_is_supported()) { struct drm_amdgpu_info_device dev_info = {}; @@ -1559,6 +1603,7 @@ amdsmi_status_t amdsmi_get_gpu_vram_info( if (r == AMDSMI_STATUS_SUCCESS) { info->vram_size = total / (1024 * 1024); } + amd::smi::AMDSmiSystem::getInstance().clean_up_drm(); return AMDSMI_STATUS_SUCCESS; } @@ -1780,16 +1825,23 @@ amdsmi_status_t amdsmi_get_gpu_compute_partition(amdsmi_processor_handle processor_handle, char *compute_partition, uint32_t len) { AMDSMI_CHECK_INIT(); - return rsmi_wrapper(rsmi_dev_compute_partition_get, processor_handle, 0, + std::ostringstream ss; + + auto status = rsmi_wrapper(rsmi_dev_compute_partition_get, processor_handle, 0, compute_partition, len); + ss << __PRETTY_FUNCTION__ << " | rsmi_dev_compute_partition_get() returned: " + << smi_amdgpu_get_status_string(status, false); + LOG_INFO(ss); + return status; } amdsmi_status_t amdsmi_set_gpu_compute_partition(amdsmi_processor_handle processor_handle, amdsmi_compute_partition_type_t compute_partition) { AMDSMI_CHECK_INIT(); - return rsmi_wrapper(rsmi_dev_compute_partition_set, processor_handle, 0, + auto ret_resp = rsmi_wrapper(rsmi_dev_compute_partition_set, processor_handle, 0, static_cast(compute_partition)); + return ret_resp; } // Memory Partition functions @@ -1797,14 +1849,22 @@ amdsmi_status_t amdsmi_get_gpu_memory_partition(amdsmi_processor_handle processor_handle, char *memory_partition, uint32_t len) { AMDSMI_CHECK_INIT(); - return rsmi_wrapper(rsmi_dev_memory_partition_get, processor_handle, 0, + amdsmi_status_t ret = rsmi_wrapper(rsmi_dev_memory_partition_get, processor_handle, 0, memory_partition, 
len); + return ret; } amdsmi_status_t amdsmi_set_gpu_memory_partition(amdsmi_processor_handle processor_handle, amdsmi_memory_partition_type_t memory_partition) { AMDSMI_CHECK_INIT(); + if (memory_partition != AMDSMI_MEMORY_PARTITION_UNKNOWN + && memory_partition != AMDSMI_MEMORY_PARTITION_NPS1 + && memory_partition != AMDSMI_MEMORY_PARTITION_NPS2 + && memory_partition != AMDSMI_MEMORY_PARTITION_NPS4 + && memory_partition != AMDSMI_MEMORY_PARTITION_NPS8) { + return AMDSMI_STATUS_INVAL; + } std::ostringstream ss; std::lock_guard g(myMutex); @@ -1813,13 +1873,6 @@ amdsmi_set_gpu_memory_partition(amdsmi_processor_handle processor_handle, std::string current_partition_str = "UNKNOWN"; std::string req_user_partition = "UNKNOWN"; - // open libdrm connections prevents the ability to unload driver - amd::smi::AMDSmiSystem::getInstance().clean_up_drm(); - ss << __PRETTY_FUNCTION__ << " | \n" - << "**************************************\n" - << "* Cleaned up - clean_up_drm() *\n" - << "**************************************\n"; - LOG_INFO(ss); req_user_partition.clear(); switch (memory_partition) { case AMDSMI_MEMORY_PARTITION_NPS1: @@ -1843,12 +1896,6 @@ amdsmi_set_gpu_memory_partition(amdsmi_processor_handle processor_handle, if (it != nps_amdsmi_to_RSMI.end()) { rsmi_type = it->second; } else if (it == nps_amdsmi_to_RSMI.end()) { - amd::smi::AMDSmiSystem::getInstance().init_drm(); - ss << __PRETTY_FUNCTION__ << " | Could not find " << req_user_partition << "\n" - << "**************************************\n" - << "* Re-Initialized libdrm - init_drm() *\n" - << "**************************************\n"; - LOG_INFO(ss); return AMDSMI_STATUS_INVAL; } amdsmi_status_t ret = rsmi_wrapper(rsmi_dev_memory_partition_set, processor_handle, 0, @@ -1862,38 +1909,11 @@ amdsmi_set_gpu_memory_partition(amdsmi_processor_handle processor_handle, current_partition_str = current_partition; } - // WORKAROUND: Re-initialize libdrm connection - // Only re-initialize if the memory partition 
was correctly set - // otherwise, we can re-try through the CLI. - // This is a workaround for cases which we cannot properly remove libdrm - // connection. - bool drm_reinit = (req_user_partition == current_partition_str - || ret == AMDSMI_STATUS_INVAL - || ret == AMDSMI_STATUS_NOT_SUPPORTED); - if (drm_reinit) { - amd::smi::AMDSmiSystem::getInstance().init_drm(); - ss << __PRETTY_FUNCTION__ << " | \n" - << "**************************************\n" - << "* Re-Initialized libdrm - init_drm() *\n" - << "**************************************\n"; - LOG_INFO(ss); - } - ss << __PRETTY_FUNCTION__ << " | After attepting to set memory partition to " << req_user_partition << "\n" << " | Current memory partition is " << current_partition_str << "\n" - << " | " << (drm_reinit ? - "We were successfully able to restart libdrm" : "We are unable to restart libdrm") << "\n" << " | Returning: " << smi_amdgpu_get_status_string(ret, false); LOG_INFO(ss); - - // TODO(amdsmi_team): issue completely closing -> reopening libdrm on 1st try (workaround above) - // amd::smi::AMDSmiSystem::getInstance().init_drm(); - // ss << __PRETTY_FUNCTION__ << " | \n" - // << "***********************************\n" - // << "* Initialized libdrm - init_drm() *\n" - // << "***********************************\n"; - // LOG_INFO(ss); return ret; } @@ -1902,6 +1922,9 @@ amdsmi_get_gpu_memory_partition_config(amdsmi_processor_handle processor_handle, amdsmi_memory_partition_config_t *config) { AMDSMI_CHECK_INIT(); std::ostringstream ss; + if (config == nullptr) { + return AMDSMI_STATUS_INVAL; + } // initialization for devices which do not support partitions amdsmi_nps_caps_t flags; @@ -1911,6 +1934,8 @@ amdsmi_get_gpu_memory_partition_config(amdsmi_processor_handle processor_handle, flags.nps_flags.nps8_cap = 0; config->partition_caps = flags; config->mp_mode = AMDSMI_MEMORY_PARTITION_UNKNOWN; + // TODO(amdsmi_team): Will BM/guest VMs have numa ranges? 
+ config->num_numa_ranges = 0; // current memory partition constexpr uint32_t kCurrentPartitionSize = 5; @@ -1937,7 +1962,7 @@ amdsmi_get_gpu_memory_partition_config(amdsmi_processor_handle processor_handle, // Add memory partition capabilities here constexpr uint32_t kLenCapsSize = 30; char memory_caps[kLenCapsSize]; - status = rsmi_wrapper(rsmi_dev_memory_partition_capabilities_get, + auto status_mem_caps = rsmi_wrapper(rsmi_dev_memory_partition_capabilities_get, processor_handle, 0, memory_caps, kLenCapsSize); ss << __PRETTY_FUNCTION__ @@ -1947,7 +1972,7 @@ amdsmi_get_gpu_memory_partition_config(amdsmi_processor_handle processor_handle, << " | Data: " << memory_caps; LOG_DEBUG(ss); std::string memory_caps_str = "N/A"; - if (status == AMDSMI_STATUS_SUCCESS) { + if (status_mem_caps == AMDSMI_STATUS_SUCCESS) { // older kernels may not support this memory_caps_str = std::string(memory_caps); if (memory_caps_str.find("NPS1") != std::string::npos) { flags.nps_flags.nps1_cap = 1; @@ -1985,6 +2010,8 @@ amdsmi_get_gpu_accelerator_partition_profile_config(amdsmi_processor_handle proc LOG_DEBUG(ss); if (profile_config == nullptr) { + ss << __PRETTY_FUNCTION__ << " | profile_config is nullptr" << std::endl; + LOG_ERROR(ss); return AMDSMI_STATUS_INVAL; } @@ -2011,6 +2038,7 @@ amdsmi_get_gpu_accelerator_partition_profile_config(amdsmi_processor_handle proc // get supported xcp_configs (this will tell use # of profiles/index's) // /sys/class/drm/../device/compute_partition_config/supported_xcp_configs + // otherwise fall back to use /sys/class/drm/../device/available_compute_partition // ex. 
SPX, DPX, QPX, CPX std::string accelerator_caps_str = "N/A"; constexpr uint32_t kLenXCPConfigSize = 30; @@ -2022,11 +2050,12 @@ amdsmi_get_gpu_accelerator_partition_profile_config(amdsmi_processor_handle proc if (return_status == AMDSMI_STATUS_SUCCESS) { accelerator_caps_str.clear(); accelerator_caps_str = std::string(supported_xcp_configs); + accelerator_caps_str = amd::smi::trimAllWhiteSpace(accelerator_caps_str); use_xcp_config = true; - } else if (return_status == AMDSMI_STATUS_NO_PERM) { // initialize what we can + } else { // initialize what we can ss << __PRETTY_FUNCTION__ << "\n | rsmi_dev_compute_partition_supported_xcp_configs_get()" - << " failed due to no permission" + << " returned: " << smi_amdgpu_get_status_string(return_status, false) << "\n | Defaulting to use rsmi_dev_compute_partition_capabilities_get"; // std::cout << ss.str() << std::endl; LOG_DEBUG(ss); @@ -2036,6 +2065,7 @@ amdsmi_get_gpu_accelerator_partition_profile_config(amdsmi_processor_handle proc if (return_status == AMDSMI_STATUS_SUCCESS) { accelerator_caps_str.clear(); accelerator_caps_str = std::string(supported_xcp_configs); + accelerator_caps_str = amd::smi::trimAllWhiteSpace(accelerator_caps_str); } else { ss << __PRETTY_FUNCTION__ << "\n | rsmi_dev_compute_partition_capabilities_get() failed, " @@ -2124,6 +2154,7 @@ amdsmi_get_gpu_accelerator_partition_profile_config(amdsmi_processor_handle proc auto resource_index = 0; // get resource info for each profile for (auto i = 0U; i < profile_config->num_profiles; i++) { + profile_config->profiles[i].num_resources = 0; // start at 0 resources and increment auto it = partition_types_map.find(profile_config->profiles[i].profile_type); std::string partition_type_str = "UNKNOWN"; if (it != partition_types_map.end()) { @@ -2202,9 +2233,35 @@ amdsmi_get_gpu_accelerator_partition_profile_config(amdsmi_processor_handle proc = profile.partition_resource; profile_config->resource_profiles[resource_index].num_partitions_share_resource = 
profile.num_partitions_share_resource; + auto it3 = + resource_types_map.find( + profile_config->resource_profiles[resource_index].resource_type); + std::string resource_type_str = "UNKNOWN"; + if (it3 != resource_types_map.end()) { + resource_type_str.clear(); + resource_type_str = it3->second; + } + ss << __PRETTY_FUNCTION__ << " | profile_debug 1 " + << "\n profile type: " << partition_type_str + << "\n resource_index: " << resource_index + << "\n profile_index: " << i + << "\n resource_type: " << resource_type_str + << "\n partition_resource: " << profile.partition_resource + << "\n num_partitions_share_resource: " << profile.num_partitions_share_resource + << std::endl; + LOG_DEBUG(ss); resource_index += 1; - profile_config->profiles[i].num_resources - = profile_config->profiles[i].num_resources + 1; + + uint32_t inc_resources = + profile_config->profiles[i].num_resources + 1; + if (inc_resources < static_cast(RSMI_ACCELERATOR_MAX)) { + profile_config->profiles[i].num_resources = inc_resources; + } + ss << __PRETTY_FUNCTION__ << " | profile_debug 2 " + << "\n profile_config->profiles[i].num_resources: " + << profile_config->profiles[i].num_resources + << std::endl; + LOG_DEBUG(ss); } it = partition_types_map.find(profile_config->profiles[i].profile_type); @@ -2255,6 +2312,7 @@ amdsmi_get_gpu_accelerator_partition_profile_config(amdsmi_processor_handle proc } } ss << __PRETTY_FUNCTION__ + << " | Detailed output" << "\n | profile_config->num_profiles: " << profile_config->num_profiles << "\n | profile_num (i): " << i << "\n | resource_num (r): " << r @@ -2263,8 +2321,10 @@ amdsmi_get_gpu_accelerator_partition_profile_config(amdsmi_processor_handle proc << profile_config->resource_profiles[current_resource_idx].profile_index << "\n | profile_config->profiles[i].memory_caps: " << nps_caps + << "\n***********************************************" << "\n | profile_config->profiles[i].num_resources: " << profile_config->profiles[i].num_resources + << 
"\n***********************************************" << "\n | profile_type: " << partition_type_str << "\n | resource_type: " << resource_type_str << "\n | partition_resource: " << profile.partition_resource @@ -2282,6 +2342,81 @@ amdsmi_get_gpu_accelerator_partition_profile_config(amdsmi_processor_handle proc LOG_DEBUG(ss); } // END resources loop } // END profile loop + + int res_ind = 0; + for (uint32_t i = 0; i < profile_config->num_profiles; i++) { + auto current_profile = profile_config->profiles[i]; + std::string profile_type_str = "N/A"; + if (current_profile.profile_type == AMDSMI_ACCELERATOR_PARTITION_SPX) { + profile_type_str = "SPX"; + } else if (current_profile.profile_type == AMDSMI_ACCELERATOR_PARTITION_DPX) { + profile_type_str = "DPX"; + } else if (current_profile.profile_type == AMDSMI_ACCELERATOR_PARTITION_TPX) { + profile_type_str = "TPX"; + } else if (current_profile.profile_type == AMDSMI_ACCELERATOR_PARTITION_QPX) { + profile_type_str = "QPX"; + } else if (current_profile.profile_type == AMDSMI_ACCELERATOR_PARTITION_CPX) { + profile_type_str = "CPX"; + } + + std::string nps_caps_str = ""; + if ((current_profile.memory_caps.nps_flags.nps1_cap == 0 + && current_profile.memory_caps.nps_flags.nps2_cap == 0 + && current_profile.memory_caps.nps_flags.nps4_cap == 0 + && current_profile.memory_caps.nps_flags.nps8_cap == 0)) { + nps_caps_str = "N/A"; + } else { + nps_caps_str.clear(); + if (current_profile.memory_caps.nps_flags.nps1_cap) { + (nps_caps_str.empty()) ? nps_caps_str += "NPS1" : nps_caps_str += ", NPS1"; + } + if (current_profile.memory_caps.nps_flags.nps2_cap) { + (nps_caps_str.empty()) ? nps_caps_str += "NPS2" : nps_caps_str += ", NPS2"; + } + if (current_profile.memory_caps.nps_flags.nps4_cap) { + (nps_caps_str.empty()) ? nps_caps_str += "NPS4" : nps_caps_str += ", NPS4"; + } + if (current_profile.memory_caps.nps_flags.nps8_cap) { + (nps_caps_str.empty()) ? 
nps_caps_str += "NPS8" : nps_caps_str += ", NPS8"; + } + } + + ss << __PRETTY_FUNCTION__ << " | profile_debug; after compiling info p1 " + << "\n\t**profile_config.profiles[" << i << "]:\n" + << "\t\tprofile_type: " << profile_type_str + << "\n\t\tnum_partitions: " << current_profile.num_partitions + << "\n\t\tmemory_caps: " << nps_caps_str + << "\n\t\tcurrent_profile.num_resources: " << current_profile.num_resources + << std::endl; + // std::cout << ss.str() << std::endl; + LOG_DEBUG(ss); + + for (uint32_t j = 0; j < current_profile.num_resources; j++) { + auto rp = profile_config->resource_profiles[res_ind]; + + auto it2 = resource_types_map.find(rp.resource_type); + std::string resource_type_str = "UNKNOWN"; + if (it2 != resource_types_map.end()) { + resource_type_str.clear(); + resource_type_str = it2->second; + } + ss << __PRETTY_FUNCTION__ << " | profile_debug; after compiling info p2 " + << "\n\t\t\tprofile_index: " << current_profile.profile_index + << "\n\t\t\tres_ind: " << res_ind + << "\n\t\t\tprofile_config.resource_profiles[" << res_ind + << "].resource_type: " + << resource_type_str + << "\n\t\t\tprofile_config.resource_profiles[" << res_ind + << "].partition_resource: " + << rp.partition_resource + << "\n\t\t\tprofile_config.resource_profiles[" << res_ind + << "].num_partitions_share_resource: " + << rp.num_partitions_share_resource + << std::endl; + LOG_DEBUG(ss); + res_ind++; + } + } ss << __PRETTY_FUNCTION__ << " | END returning " << smi_amdgpu_get_status_string(return_status, false); LOG_INFO(ss); @@ -2294,7 +2429,6 @@ amdsmi_get_gpu_accelerator_partition_profile(amdsmi_processor_handle processor_h amdsmi_accelerator_partition_profile_t *profile, uint32_t *partition_id) { std::ostringstream ss; - AMDSMI_CHECK_INIT(); if (profile == nullptr || partition_id == nullptr) { return AMDSMI_STATUS_INVAL; @@ -2318,29 +2452,39 @@ amdsmi_get_gpu_accelerator_partition_profile(amdsmi_processor_handle processor_h auto tmp_partition_id = uint32_t(0); 
amdsmi_status_t status = AMDSMI_STATUS_NOT_SUPPORTED; - // get xcp config info (this will tell use # of profiles/index's) + // TODO(amdsmi_team): should we do fallback? + // Info doesn't populate properly if missing other files - CLI FIX? + // Reason: older kernels do not support xcp_configs + + // get supported xcp_configs (this will tell use # of profiles/index's) // /sys/class/drm/../device/compute_partition_config/supported_xcp_configs + // otherwise fall back to use /sys/class/drm/../device/available_compute_partition // ex. SPX, DPX, QPX, CPX // Depending on what is available, we can determine the profile index // ex. SPX = 0, DPX = 1, QPX = 2, CPX = 3; other devices may have different values std::string accelerator_capabilities = "N/A"; - constexpr uint32_t kLenSupportedXCPConfigSize = 30; - char xcp_supported_configs[kLenSupportedXCPConfigSize]; - status = rsmi_wrapper(rsmi_dev_compute_partition_supported_xcp_configs_get, processor_handle, 0, - xcp_supported_configs, kLenSupportedXCPConfigSize); + constexpr uint32_t kLenXCPConfigSize = 30; + char supported_xcp_configs[kLenXCPConfigSize]; + bool use_xcp_config = false; + status + = rsmi_wrapper(rsmi_dev_compute_partition_supported_xcp_configs_get, processor_handle, 0, + supported_xcp_configs, kLenXCPConfigSize); if (status == AMDSMI_STATUS_SUCCESS) { accelerator_capabilities.clear(); - accelerator_capabilities = std::string(xcp_supported_configs); - // remove leading/trailing spaces + whitespace - accelerator_capabilities = amd::smi::trimAllWhiteSpace(accelerator_capabilities); + accelerator_capabilities = std::string(supported_xcp_configs); + use_xcp_config = true; } + ss << __PRETTY_FUNCTION__ - << "\n | rsmi_dev_compute_partition_supported_xcp_configs_get Returning: " - << smi_amdgpu_get_status_string(status, false) + << (use_xcp_config ? 
"\n | Used rsmi_dev_compute_partition_supported_xcp_configs_get()" : + "\n | Used rsmi_dev_compute_partition_capabilities_get()") + << "\n | Returned: " << smi_amdgpu_get_status_string(status, false) << "\n | Type: " - << amd::smi::Device::get_type_string(amd::smi::kDevSupportedXcpConfigs) - << "\n | Data (accelerator_capabilities/supported_xcp_configs): " - << accelerator_capabilities; + << (use_xcp_config ? amd::smi::Device::get_type_string(amd::smi::kDevSupportedXcpConfigs): + amd::smi::Device::get_type_string(amd::smi::kDevAvailableComputePartition)) + << "\n | Data: " << accelerator_capabilities; + + // std::cout << ss.str() << std::endl; LOG_DEBUG(ss); // get index by comma and place into a string vector @@ -2349,9 +2493,16 @@ amdsmi_get_gpu_accelerator_partition_profile(amdsmi_processor_handle processor_h std::string temp; std::vector tokens; while (getline(ss_obj, temp, delimiter)) { + temp = amd::smi::trimAllWhiteSpace(temp); tokens.push_back(temp); } + // hold all current available compute partition values within tokens vector + std::ostringstream ss_1; + std::copy(std::begin(tokens), + std::end(tokens), + amd::smi::make_ostream_joiner(&ss_1, ", ")); + constexpr uint32_t kCurrentPartitionSize = 5; char current_partition[kCurrentPartitionSize]; std::string current_partition_str = "N/A"; @@ -2401,31 +2552,15 @@ amdsmi_get_gpu_accelerator_partition_profile(amdsmi_processor_handle processor_h profile->num_partitions = metric_info.num_partition; } - bool isPrimaryNode = false; - for (uint32_t partition_num = 0; partition_num < profile->num_partitions; partition_num++) { - amdsmi_status_t status = rsmi_wrapper(rsmi_dev_partition_id_get, processor_handle, - partition_num, &tmp_partition_id); - if (status == AMDSMI_STATUS_SUCCESS) { - // only create list from primary partition, rest should be array* = {0} - if ((partition_num == 0 && tmp_partition_id == 0) - || (profile->profile_type == AMDSMI_ACCELERATOR_PARTITION_SPX) - || (profile->profile_type == 
AMDSMI_ACCELERATOR_PARTITION_INVALID)) { - isPrimaryNode = true; - partition_id[partition_num] = tmp_partition_id; - ss << __PRETTY_FUNCTION__ - << " | [PRIMARY node confirmed] partition_id[" - << partition_num << "]: " << tmp_partition_id; - LOG_DEBUG(ss); - } else if (isPrimaryNode) { - partition_id[partition_num] = tmp_partition_id; - ss << __PRETTY_FUNCTION__ - << " | [PRIMARY node confirmed - remaining node list] partition_id[" - << partition_num << "]: " << tmp_partition_id; - LOG_DEBUG(ss); - } - } else { - break; - } + status = rsmi_wrapper(rsmi_dev_partition_id_get, processor_handle, 0, + &tmp_partition_id); + const uint32_t partition_num = 0; // Each partition should show the their respective + // partition_id at positon 0 of the array. + // We are no longer populating only the primary partition + // for BM/Guest. + + if (status == AMDSMI_STATUS_SUCCESS) { + partition_id[partition_num] = tmp_partition_id; } std::ostringstream ss_2; @@ -2435,9 +2570,16 @@ amdsmi_get_gpu_accelerator_partition_profile(amdsmi_processor_handle processor_h std::copy(std::begin(copy_partition_ids), std::end(copy_partition_ids), amd::smi::make_ostream_joiner(&ss_2, ", ")); + + auto it_profile_type = partition_types_map.find(profile->profile_type); + std::string partition_type_str = "N/A"; + if (it_profile_type != partition_types_map.end()) { + partition_type_str.clear(); + partition_type_str = it_profile_type->second; + } ss << __PRETTY_FUNCTION__ << " | Num_partitions: " << profile->num_partitions - << "; profile->profile_type: " << profile->profile_type + << "; profile->profile_type: " << profile->profile_type << " (" << partition_type_str << ")" << "; partition_id: " << ss_2.str() << "\n"; LOG_DEBUG(ss); @@ -2470,6 +2612,23 @@ amdsmi_get_gpu_accelerator_partition_profile(amdsmi_processor_handle processor_h } profile->memory_caps = flags; + ss << __PRETTY_FUNCTION__ + << " | END returning " << smi_amdgpu_get_status_string(status, false) << "\n" + << " | 
accelerator_capabilities: " << accelerator_capabilities << "\n" + << " | current_partition_str: " << current_partition_str << "\n" + << " | std::vector tokens: " << ss_1.str() << "\n" + << " | profile->num_partitions: " << profile->num_partitions << "\n" + << " | profile->profile_type: " << partition_type_str << "\n" + << " | profile->profile_index: " << profile->profile_index << "\n" + << " | profile->num_resources: " << profile->num_resources << "\n" + << " | profile->memory_caps: " << "\n" + << " | nps1_cap: " << profile->memory_caps.nps_flags.nps1_cap << "\n" + << " | nps2_cap: " << profile->memory_caps.nps_flags.nps2_cap << "\n" + << " | nps4_cap: " << profile->memory_caps.nps_flags.nps4_cap << "\n" + << " | nps8_cap: " << profile->memory_caps.nps_flags.nps8_cap << "\n" + << " | partition_id: " << ss_2.str(); + LOG_INFO(ss); + return status; } @@ -2488,6 +2647,22 @@ amdsmi_set_gpu_accelerator_partition_profile(amdsmi_processor_handle processor_h std::map mp_prof_indx_to_accel_type; + ss << __PRETTY_FUNCTION__ << " | Invalid profile_index: " << profile_index + << "\n| Max profile_index: " << config.num_profiles - 1 + << "\n| config.num_profiles: " << config.num_profiles + << "\n| profile_index: " << profile_index + << "\n| Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_INVAL, false); + // std::cout << ss.str() << std::endl; + LOG_DEBUG(ss); + if (profile_index >= config.num_profiles) { + ss << __PRETTY_FUNCTION__ << " | Invalid profile_index: " << profile_index + << "\n| Max profile_index: " << config.num_profiles - 1 + << "\n| Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_INVAL, false); + // std::cout << ss.str() << std::endl; + LOG_DEBUG(ss); + return AMDSMI_STATUS_INVAL; + } + for (uint32_t i = 0; i < config.num_profiles; i++) { auto it = partition_types_map.find(config.profiles[i].profile_type); std::string partition_type_str = "N/A"; @@ -2499,17 +2674,24 @@ amdsmi_set_gpu_accelerator_partition_profile(amdsmi_processor_handle 
processor_h ss << __PRETTY_FUNCTION__ << " | " << "config.profiles[" << i << "].profile_type: " << static_cast(config.profiles[i].profile_type) << "\n" - << " | config.profiles[" << i << "].profile_type (str): " + << "| config.profiles[" << i << "].profile_type (str): " << partition_type_str << "\n" << "| config.profiles[" << i << "].profile_index: " << static_cast(config.profiles[i].profile_index) << "\n"; + // std::cout << ss.str() << std::endl; LOG_DEBUG(ss); mp_prof_indx_to_accel_type[config.profiles[i].profile_index] = config.profiles[i].profile_type; } auto return_status = amdsmi_set_gpu_compute_partition(processor_handle, static_cast(mp_prof_indx_to_accel_type[profile_index])); + ss << __PRETTY_FUNCTION__ << " | User requested profile_index: " << profile_index + << "\n| Accelerator Type: " + << partition_types_map.at(mp_prof_indx_to_accel_type[profile_index]) + << "\n| Returning: " << smi_amdgpu_get_status_string(return_status, false); + // std::cout << ss.str() << std::endl; + LOG_INFO(ss); return return_status; } @@ -2667,30 +2849,27 @@ amdsmi_get_power_cap_info(amdsmi_processor_handle processor_handle, amdsmi_status_t status; status = get_gpu_device_from_handle(processor_handle, &gpudevice); - if (status != AMDSMI_STATUS_SUCCESS) - { + if (status != AMDSMI_STATUS_SUCCESS) { return status; } // Ignore errors to get as much as possible info. 
memset(info, 0, sizeof(amdsmi_power_cap_info_t)); - if (gpudevice->check_if_drm_is_supported()) { - // Get power_cap and dpm - int power_cap = 0; - int dpm = 0; - status = smi_amdgpu_get_power_cap(gpudevice, &power_cap); - if ((status == AMDSMI_STATUS_SUCCESS) && !set_ret_success) - set_ret_success = true; + int power_cap = 0; + int dpm = 0; + auto smi_power_cap_status = smi_amdgpu_get_power_cap(gpudevice, &power_cap); + if ((smi_power_cap_status == AMDSMI_STATUS_SUCCESS) && !set_ret_success) + set_ret_success = true; + info->power_cap = power_cap; + status = smi_amdgpu_get_ranges(gpudevice, AMDSMI_CLK_TYPE_GFX, + NULL, NULL, &dpm, NULL); + if ((status == AMDSMI_STATUS_SUCCESS) && !set_ret_success) + set_ret_success = true; + info->dpm_cap = dpm; - info->power_cap = power_cap; - status = smi_amdgpu_get_ranges(gpudevice, AMDSMI_CLK_TYPE_GFX, - NULL, NULL, &dpm, NULL); - if ((status == AMDSMI_STATUS_SUCCESS) && !set_ret_success) - set_ret_success = true; - info->dpm_cap = dpm; - } else { + if (smi_power_cap_status != AMDSMI_STATUS_SUCCESS) { status = rsmi_wrapper(rsmi_dev_power_cap_get, processor_handle, 0, - sensor_ind, &(info->power_cap)); + sensor_ind, &(info->power_cap)); if ((status == AMDSMI_STATUS_SUCCESS) && !set_ret_success) set_ret_success = true; } @@ -3079,17 +3258,19 @@ amdsmi_get_gpu_vbios_info(amdsmi_processor_handle processor_handle, amdsmi_vbios if (status != AMDSMI_STATUS_SUCCESS) return status; - + amd::smi::AMDSmiSystem::getInstance().init_drm(); + // requires libdrm being active & available, if not defaults to rocm_smi if (gpu_device->check_if_drm_is_supported()) { status = gpu_device->amdgpu_query_vbios(&vbios); if (status == AMDSMI_STATUS_SUCCESS) { - strncpy(info->name, (char *) vbios.name, AMDSMI_MAX_STRING_LENGTH); - strncpy(info->build_date, (char *) vbios.date, AMDSMI_MAX_DATE_LENGTH); - strncpy(info->part_number, (char *) vbios.vbios_pn, AMDSMI_MAX_STRING_LENGTH); - strncpy(info->version, (char *) vbios.vbios_ver_str, 
AMDSMI_MAX_STRING_LENGTH); + strncpy(info->name, reinterpret_cast(vbios.name), AMDSMI_MAX_STRING_LENGTH); + strncpy(info->build_date, reinterpret_cast(vbios.date), AMDSMI_MAX_DATE_LENGTH); + strncpy(info->part_number, reinterpret_cast(vbios.vbios_pn), + AMDSMI_MAX_STRING_LENGTH); + strncpy(info->version, reinterpret_cast(vbios.vbios_ver_str), + AMDSMI_MAX_STRING_LENGTH); } - } - else { + } else { // get vbios version string from rocm_smi char vbios_version[AMDSMI_MAX_STRING_LENGTH]; status = rsmi_wrapper(rsmi_dev_vbios_version_get, processor_handle, 0, @@ -3102,6 +3283,7 @@ amdsmi_get_gpu_vbios_info(amdsmi_processor_handle processor_handle, amdsmi_vbios vbios_version, AMDSMI_MAX_STRING_LENGTH); } } + amd::smi::AMDSmiSystem::getInstance().clean_up_drm(); return status; } @@ -3255,14 +3437,9 @@ amdsmi_get_gpu_bad_page_info(amdsmi_processor_handle processor_handle, uint32_t return r; amdsmi_status_t status; - if (gpu_device->check_if_drm_is_supported()){ - status = smi_amdgpu_get_bad_page_info(gpu_device, num_pages, info); - if (status != AMDSMI_STATUS_SUCCESS) { - return status; - } - } - else { - // rocm + status = smi_amdgpu_get_bad_page_info(gpu_device, num_pages, info); + if (status != AMDSMI_STATUS_SUCCESS) { + return status; } return AMDSMI_STATUS_SUCCESS; @@ -3343,30 +3520,25 @@ amdsmi_get_gpu_total_ecc_count(amdsmi_processor_handle processor_handle, amdsmi_ if (status != AMDSMI_STATUS_SUCCESS) return status; - if (gpu_device->check_if_drm_is_supported()){ - amdsmi_ras_err_state_t state = {}; - // Iterate through the ecc blocks - for (auto block = AMDSMI_GPU_BLOCK_FIRST; block <= AMDSMI_GPU_BLOCK_LAST; - block = (amdsmi_gpu_block_t)(block * 2)) { - // Clear the previous ecc block counts - amdsmi_error_count_t block_ec = {}; - // Check if the current ecc block is enabled - status = amdsmi_get_gpu_ras_block_features_enabled(processor_handle, block, &state); - if (status == AMDSMI_STATUS_SUCCESS && state == AMDSMI_RAS_ERR_STATE_ENABLED) { - // Increment the total 
ecc counts by the ecc block counts - status = amdsmi_get_gpu_ecc_count(processor_handle, block, &block_ec); - if (status == AMDSMI_STATUS_SUCCESS) { - // Increase the total ecc counts - ec->correctable_count += block_ec.correctable_count; - ec->uncorrectable_count += block_ec.uncorrectable_count; - ec->deferred_count += block_ec.deferred_count; - } + amdsmi_ras_err_state_t state = {}; + // Iterate through the ecc blocks + for (auto block = AMDSMI_GPU_BLOCK_FIRST; block <= AMDSMI_GPU_BLOCK_LAST; + block = (amdsmi_gpu_block_t)(block * 2)) { + // Clear the previous ecc block counts + amdsmi_error_count_t block_ec = {}; + // Check if the current ecc block is enabled + status = amdsmi_get_gpu_ras_block_features_enabled(processor_handle, block, &state); + if (status == AMDSMI_STATUS_SUCCESS && state == AMDSMI_RAS_ERR_STATE_ENABLED) { + // Increment the total ecc counts by the ecc block counts + status = amdsmi_get_gpu_ecc_count(processor_handle, block, &block_ec); + if (status == AMDSMI_STATUS_SUCCESS) { + // Increase the total ecc counts + ec->correctable_count += block_ec.correctable_count; + ec->uncorrectable_count += block_ec.uncorrectable_count; + ec->deferred_count += block_ec.deferred_count; } } } - else { - return AMDSMI_STATUS_NOT_SUPPORTED; - } return AMDSMI_STATUS_SUCCESS; } @@ -3472,14 +3644,17 @@ amdsmi_status_t amdsmi_get_gpu_driver_info(amdsmi_processor_handle processor_han int length = AMDSMI_MAX_STRING_LENGTH; // Get the driver version + amd::smi::AMDSmiSystem::getInstance().init_drm(); status = smi_amdgpu_get_driver_version(gpu_device, &length, info->driver_version); // Get the driver date std::string driver_date; status = gpu_device->amdgpu_query_driver_date(driver_date); - if (status != AMDSMI_STATUS_SUCCESS) + if (status != AMDSMI_STATUS_SUCCESS) { + amd::smi::AMDSmiSystem::getInstance().clean_up_drm(); return r; + } // Reformat the driver date from 20150101 to 2015/01/01 00:00 if (driver_date.length() == 8) { driver_date = driver_date.substr(0, 4) + 
"/" + driver_date.substr(4, 2) @@ -3490,6 +3665,7 @@ amdsmi_status_t amdsmi_get_gpu_driver_info(amdsmi_processor_handle processor_han // Get the driver name std::string driver_name; status = gpu_device->amdgpu_query_driver_name(driver_name); + amd::smi::AMDSmiSystem::getInstance().clean_up_drm(); if (status != AMDSMI_STATUS_SUCCESS) return r; strncpy(info->driver_name, driver_name.c_str(), AMDSMI_MAX_STRING_LENGTH-1); @@ -3641,16 +3817,18 @@ amdsmi_status_t amdsmi_get_pcie_info(amdsmi_processor_handle processor_handle, a return AMDSMI_STATUS_SUCCESS; } +amdsmi_status_t amdsmi_get_gpu_xcd_counter(amdsmi_processor_handle processor_handle, + uint16_t *xcd_count) { + return rsmi_wrapper(rsmi_dev_metrics_xcd_counter_get, processor_handle, 0, xcd_count); +} + amdsmi_status_t amdsmi_get_processor_handle_from_bdf(amdsmi_bdf_t bdf, amdsmi_processor_handle* processor_handle) { amdsmi_status_t status; uint32_t socket_count = 0; - uint32_t device_count = AMDSMI_MAX_DEVICES; - amdsmi_processor_handle devs[AMDSMI_MAX_DEVICES]; - - AMDSMI_CHECK_INIT(); + AMDSMI_CHECK_INIT(); if (processor_handle == nullptr) { return AMDSMI_STATUS_INVAL; @@ -3661,31 +3839,61 @@ amdsmi_status_t amdsmi_get_processor_handle_from_bdf(amdsmi_bdf_t bdf, return status; } - amdsmi_socket_handle sockets[socket_count]; + std::vector sockets(socket_count); status = amdsmi_get_socket_handles(&socket_count, &sockets[0]); if (status != AMDSMI_STATUS_SUCCESS) { return status; } + std::ostringstream bdf_sstream; + bdf_sstream << __PRETTY_FUNCTION__ + << " | [bdf] domain_number:" << "bus_number:" << "device_number." 
+ << "function_number = "; + bdf_sstream << std::hex << std::setfill('0') << std::setw(4) << bdf.domain_number << ":"; + bdf_sstream << std::hex << std::setfill('0') << std::setw(2) << bdf.bus_number << ":"; + bdf_sstream << std::hex << std::setfill('0') << std::setw(2) << bdf.device_number << "."; + bdf_sstream << std::hex << std::setfill('0') << +bdf.function_number; + // std::cout << __PRETTY_FUNCTION__ << " BDF: " << bdf_sstream.str() << std::endl; + LOG_DEBUG(bdf_sstream); for (unsigned int i = 0; i < socket_count; i++) { - status = amdsmi_get_processor_handles(sockets[i], &device_count, devs); + // Get the processor count available for the socket. + uint32_t processor_count = 0; + status = amdsmi_get_processor_handles(sockets[i], &processor_count, nullptr); + + // Allocate the memory for the device handlers on the socket + std::vector processor_handles(processor_count); + // Get all processors of the socket + status = amdsmi_get_processor_handles(sockets[i], &processor_count, &processor_handles[0]); if (status != AMDSMI_STATUS_SUCCESS) { return status; } - - for (uint32_t idx = 0; idx < device_count; idx++) { + for (uint32_t idx = 0; idx < processor_count; idx++) { amd::smi::AMDSmiGPUDevice* gpu_device = nullptr; - status = get_gpu_device_from_handle(devs[idx], &gpu_device); + status = get_gpu_device_from_handle(processor_handles[idx], &gpu_device); if (status != AMDSMI_STATUS_SUCCESS) { return status; } amdsmi_bdf_t found_bdf = gpu_device->get_bdf(); + bdf_sstream << __PRETTY_FUNCTION__ + << " | [found_bdf] domain_number:" << "bus_number:" << "device_number." 
+ << "function_number = "; + bdf_sstream << std::hex << std::setfill('0') << std::setw(4) + << found_bdf.domain_number << ":"; + bdf_sstream << std::hex << std::setfill('0') << std::setw(2) + << found_bdf.bus_number << ":"; + bdf_sstream << std::hex << std::setfill('0') << std::setw(2) + << found_bdf.device_number << "."; + bdf_sstream << std::hex << std::setfill('0') + << +found_bdf.function_number; + // std::cout << __PRETTY_FUNCTION__ << " BDF: " << bdf_sstream.str() << std::endl; + LOG_DEBUG(bdf_sstream); + if ((bdf.bus_number == found_bdf.bus_number) && (bdf.device_number == found_bdf.device_number) && (bdf.domain_number == found_bdf.domain_number) && (bdf.function_number == found_bdf.function_number)) { - *processor_handle = devs[idx]; + *processor_handle = processor_handles[idx]; return AMDSMI_STATUS_SUCCESS; } } @@ -3852,8 +4060,8 @@ amdsmi_get_link_topology_nearest(amdsmi_processor_handle processor_handle, } amdsmi_status_t -amdsmi_get_gpu_virtualization_mode(amdsmi_processor_handle processor_handle, amdsmi_virtualization_mode_t *mode) { - +amdsmi_get_gpu_virtualization_mode(amdsmi_processor_handle processor_handle, + amdsmi_virtualization_mode_t *mode) { AMDSMI_CHECK_INIT(); if (mode == nullptr) { @@ -3865,19 +4073,35 @@ amdsmi_get_gpu_virtualization_mode(amdsmi_processor_handle processor_handle, amd amd::smi::AMDSmiGPUDevice* gpu_device = nullptr; amdsmi_status_t r = get_gpu_device_from_handle(processor_handle, &gpu_device); - if (r != AMDSMI_STATUS_SUCCESS) + if (r != AMDSMI_STATUS_SUCCESS) { return r; + } amdsmi_status_t status; - if (gpu_device->check_if_drm_is_supported()){ - status = gpu_device->amdgpu_query_info(AMDGPU_INFO_DEV_INFO, sizeof(struct drm_amdgpu_info_device), &dev_info); - if (status != AMDSMI_STATUS_SUCCESS) return status; + amd::smi::AMDSmiSystem::getInstance().init_drm(); + // requires libdrm being active + if (gpu_device->check_if_drm_is_supported()) { + status = gpu_device->amdgpu_query_info(AMDGPU_INFO_DEV_INFO, + sizeof(struct 
drm_amdgpu_info_device), &dev_info); + if (status != AMDSMI_STATUS_SUCCESS) { + amd::smi::AMDSmiSystem::getInstance().clean_up_drm(); + return status; + } SMIGPUDEVICE_MUTEX(gpu_device->get_mutex()) // get drm version. If it's older than 3.62.0, then say not supported and exit. drmVersionPtr drm_version; - int drm_fd = gpu_device->get_gpu_fd(); + std::string render_name = gpu_device->get_gpu_path(); + int drm_fd = -1; + std::string path = "/dev/dri/" + render_name; + if (render_name != "") { + drm_fd = open(path.c_str(), O_RDWR | O_CLOEXEC); + } else { + close(drm_fd); + return AMDSMI_STATUS_NOT_SUPPORTED; + } + drm_version = drmGetVersion(drm_fd); // minimum version that supports getting of virtualization mode @@ -3885,20 +4109,27 @@ amdsmi_get_gpu_virtualization_mode(amdsmi_processor_handle processor_handle, amd int minor_version = 62; int patch_version = 0; - if ((drm_version->version_major <= major_version) && (drm_version->version_minor <= minor_version) && (drm_version->version_patchlevel < patch_version)){ + if ((drm_version->version_major <= major_version) + && (drm_version->version_minor <= minor_version) + && (drm_version->version_patchlevel < patch_version)) { + amd::smi::AMDSmiSystem::getInstance().clean_up_drm(); + close(drm_fd); return AMDSMI_STATUS_NOT_SUPPORTED; } - uint32_t ids_flag = (dev_info.ids_flags & AMDGPU_IDS_FLAGS_MODE_MASK) >> AMDGPU_IDS_FLAGS_MODE_SHIFT; - switch (ids_flag){ + uint32_t ids_flag = ((dev_info.ids_flags & AMDGPU_IDS_FLAGS_MODE_MASK) + >> AMDGPU_IDS_FLAGS_MODE_SHIFT); + switch (ids_flag) { case 0: *mode = AMDSMI_VIRTUALIZATION_MODE_BAREMETAL; break; case 1: *mode = AMDSMI_VIRTUALIZATION_MODE_GUEST; break; case 2: *mode = AMDSMI_VIRTUALIZATION_MODE_PASSTHROUGH; break; default: *mode = AMDSMI_VIRTUALIZATION_MODE_UNKNOWN; break; } free(drm_version); - } - else { + close(drm_fd); + amd::smi::AMDSmiSystem::getInstance().clean_up_drm(); + } else { + amd::smi::AMDSmiSystem::getInstance().clean_up_drm(); return 
AMDSMI_STATUS_DRM_ERROR; } diff --git a/src/amd_smi/amd_smi_drm.cc b/src/amd_smi/amd_smi_drm.cc index 211c5c1bed..046db9b9f0 100644 --- a/src/amd_smi/amd_smi_drm.cc +++ b/src/amd_smi/amd_smi_drm.cc @@ -38,24 +38,23 @@ namespace amd { namespace smi { -void closedir(DIR* /*ptr*/) {} std::string AMDSmiDrm::find_file_in_folder(const std::string& folder, const std::string& regex) { std::string file_name; - using dir_ptr = std::unique_ptr; - - struct dirent *dir = nullptr; + DIR *drm_dir; + struct dirent *dir; std::regex file_regex(regex); - auto drm_dir = dir_ptr(opendir(folder.c_str()), &closedir); + drm_dir = opendir(folder.c_str()); if (drm_dir == nullptr) return file_name; std::cmatch m; - while ((dir = readdir(drm_dir.get())) != NULL) { - if (std::regex_search(dir->d_name, m, file_regex)) { - file_name = dir->d_name; - break; - } + while ((dir = readdir(drm_dir)) != nullptr) { + if (std::regex_search(dir->d_name, m, file_regex)) { + file_name = dir->d_name; + break; + } } + closedir(drm_dir); return file_name; } @@ -197,9 +196,9 @@ amdsmi_status_t AMDSmiDrm::cleanup() { close(drm_fds_[i]); } - drm_fds_.clear(); - drm_paths_.clear(); - drm_bdfs_.clear(); + if (!drm_fds_.empty()) {drm_fds_.clear();} + if (!drm_paths_.empty()) {drm_paths_.clear();} + if (!drm_bdfs_.empty()) {drm_bdfs_.clear();} lib_loader_.unload(); return AMDSMI_STATUS_SUCCESS; } @@ -306,9 +305,15 @@ amdsmi_status_t AMDSmiDrm::get_drm_fd_by_index(uint32_t gpu_index, uint32_t *fd_ } amdsmi_status_t AMDSmiDrm::get_bdf_by_index(uint32_t gpu_index, amdsmi_bdf_t *bdf_info) const { - if (gpu_index + 1 > drm_bdfs_.size()) return AMDSMI_STATUS_NOT_SUPPORTED; - *bdf_info = drm_bdfs_[gpu_index]; std::ostringstream ss; + if (gpu_index + 1 > drm_bdfs_.size()) { + ss << __PRETTY_FUNCTION__ << " | gpu_index = " << gpu_index + << "; \nReturning = AMDSMI_STATUS_NOT_SUPPORTED"; + LOG_INFO(ss); + // std::cout << ss.str() << std::endl; + return AMDSMI_STATUS_NOT_SUPPORTED; + } + *bdf_info = drm_bdfs_[gpu_index]; ss << 
__PRETTY_FUNCTION__ << " | gpu_index = " << gpu_index << "; \nreceived bdf: Domain = " << bdf_info->domain_number << "; \nBus# = " << bdf_info->bus_number diff --git a/src/amd_smi/amd_smi_gpu_device.cc b/src/amd_smi/amd_smi_gpu_device.cc index 7993abc0fb..db7183ff63 100644 --- a/src/amd_smi/amd_smi_gpu_device.cc +++ b/src/amd_smi/amd_smi_gpu_device.cc @@ -20,12 +20,6 @@ * THE SOFTWARE. */ -#include "amd_smi/impl/amd_smi_gpu_device.h" -#include "amd_smi/impl/amd_smi_common.h" -#include "amd_smi/impl/fdinfo.h" -#include "rocm_smi/rocm_smi_kfd.h" -#include "rocm_smi/rocm_smi_utils.h" - #include #include #include @@ -33,6 +27,14 @@ #include #include +#include "amd_smi/impl/amd_smi_gpu_device.h" +#include "amd_smi/impl/amd_smi_common.h" +#include "amd_smi/impl/amd_smi_utils.h" +#include "amd_smi/impl/fdinfo.h" +#include "rocm_smi/rocm_smi_kfd.h" +#include "rocm_smi/rocm_smi_utils.h" +#include "rocm_smi/rocm_smi_logger.h" + namespace amd { namespace smi { @@ -61,11 +63,32 @@ amdsmi_status_t AMDSmiGPUDevice::get_drm_data() { uint32_t fd = 0; std::string path; amdsmi_bdf_t bdf; + std::ostringstream ss; ret = drm_.get_drm_fd_by_index(gpu_id_, &fd); + ss << __PRETTY_FUNCTION__ << " | gpu_id_: " << gpu_id_ + << "; fd: " << fd + << "; drm_.get_drm_fd_by_index(gpu_id_, &fd): " + << smi_amdgpu_get_status_string(ret, false) << std::endl; + // std::cout << ss.str(); + LOG_DEBUG(ss); if (ret != AMDSMI_STATUS_SUCCESS) return AMDSMI_STATUS_NOT_SUPPORTED; ret = drm_.get_drm_path_by_index(gpu_id_, &path); + ss << __PRETTY_FUNCTION__ << " | gpu_id_: " << gpu_id_ + << "; path: " << path + << "; drm_.get_drm_fd_by_index(gpu_id_, &path): " + << smi_amdgpu_get_status_string(ret, false) << std::endl; + // std::cout << ss.str(); + LOG_DEBUG(ss); if (ret != AMDSMI_STATUS_SUCCESS) return AMDSMI_STATUS_NOT_SUPPORTED; ret = drm_.get_bdf_by_index(gpu_id_, &bdf); + ss << __PRETTY_FUNCTION__ << " | gpu_id_: " << gpu_id_ + << "; domain: " << bdf.domain_number + << "; bus: " << bdf.bus_number + << "; 
device: " << bdf.device_number + << "; drm_.get_drm_fd_by_index(gpu_id_, &bdf): " + << smi_amdgpu_get_status_string(ret, false) << std::endl; + // std::cout << ss.str(); + LOG_DEBUG(ss); if (ret != AMDSMI_STATUS_SUCCESS) return AMDSMI_STATUS_NOT_SUPPORTED; bdf_ = bdf, path_ = path, fd_ = fd; diff --git a/src/amd_smi/amd_smi_lib_loader.cc b/src/amd_smi/amd_smi_lib_loader.cc index b45cf5b1bc..314b96acea 100644 --- a/src/amd_smi/amd_smi_lib_loader.cc +++ b/src/amd_smi/amd_smi_lib_loader.cc @@ -46,7 +46,7 @@ amdsmi_status_t AMDSmiLibraryLoader::load(const char* filename) { if (!libHandler_) { char* error = dlerror(); std::cerr << "Fail to open " << filename <<": " << error - << std::endl; + << std::endl; return AMDSMI_STATUS_FAIL_LOAD_MODULE; } } diff --git a/src/amd_smi/amd_smi_system.cc b/src/amd_smi/amd_smi_system.cc index ea5172ec86..d0e1802271 100644 --- a/src/amd_smi/amd_smi_system.cc +++ b/src/amd_smi/amd_smi_system.cc @@ -22,13 +22,13 @@ #include #include +#include #include "amd_smi/impl/amd_smi_system.h" #include "amd_smi/impl/amd_smi_gpu_device.h" #include "amd_smi/impl/amd_smi_common.h" +#include "amd_smi/impl/amd_smi_utils.h" #include "rocm_smi/rocm_smi.h" #include "rocm_smi/rocm_smi_main.h" -#include - namespace amd { namespace smi { @@ -111,7 +111,6 @@ amdsmi_status_t AMDSmiSystem::init(uint64_t flags) { } #endif return AMDSMI_STATUS_SUCCESS; - } #ifdef ENABLE_ESMI_LIB @@ -160,6 +159,7 @@ amdsmi_status_t AMDSmiSystem::populate_amd_cpus() { #endif amdsmi_status_t AMDSmiSystem::populate_amd_gpu_devices() { + AMDSmiSystem::cleanup(); // init rsmi rsmi_driver_state_t state; rsmi_status_t ret = rsmi_init(0); @@ -262,18 +262,15 @@ amdsmi_status_t AMDSmiSystem::cleanup() { } #endif if (init_flag_ & AMDSMI_INIT_AMD_GPUS) { - for (uint32_t i = 0; i < sockets_.size(); i++) { - delete sockets_[i]; - } - processors_.clear(); - sockets_.clear(); + // we do not need to delete the sockets/processors, clear takes care of this + if (!processors_.empty()) 
{processors_.clear();} + if (!sockets_.empty()) {sockets_.clear();} init_flag_ &= ~AMDSMI_INIT_AMD_GPUS; + amd::smi::AMDSmiSystem::getInstance().clean_up_drm(); rsmi_status_t ret = rsmi_shut_down(); if (ret != RSMI_STATUS_SUCCESS) { return amd::smi::rsmi_to_amdsmi_status(ret); } - - drm_.cleanup(); } return AMDSMI_STATUS_SUCCESS; diff --git a/src/amd_smi/amd_smi_utils.cc b/src/amd_smi/amd_smi_utils.cc index be2509b210..fa9f809d95 100644 --- a/src/amd_smi/amd_smi_utils.cc +++ b/src/amd_smi/amd_smi_utils.cc @@ -20,7 +20,9 @@ * THE SOFTWARE. */ -#include +#include +#include +#include #include #include #include @@ -34,6 +36,7 @@ #include #include #include + #include #include #include @@ -42,13 +45,13 @@ #include #include #include -#include #include -#include #include "amd_smi/impl/amd_smi_utils.h" +#include "amd_smi/impl/amd_smi_system.h" #include "shared_mutex.h" // NOLINT #include "rocm_smi/rocm_smi_logger.h" +#include "rocm_smi/rocm_smi_utils.h" std::string leftTrim(const std::string &s) { if (!s.empty()) { @@ -94,15 +97,33 @@ std::string removeString(const std::string origStr, return modifiedStr; } -void openFileAndModifyBuffer(std::string path, char *buff, size_t sizeOfBuff) { +static void clearCharBufferAndReinitialize(char buffer[], uint32_t len, std::string newString) { + char *begin = &buffer[0]; + char *end = &buffer[len]; + std::fill(begin, end, 0); + + // Safer approach - copy directly with length limit + size_t copy_len = std::min(static_cast(len - 1), newString.length()); + if (copy_len > 0) { + std::memcpy(buffer, newString.c_str(), copy_len); + } + buffer[copy_len] = '\0'; + } + +int openFileAndModifyBuffer(std::string path, char *buff, size_t sizeOfBuff, + bool trim_whitespace = true) { bool errorDiscovered = false; std::ifstream file(path, std::ifstream::in); std::string contents = {std::istreambuf_iterator{file}, std::istreambuf_iterator{}}; - memset(buff, 0, sizeof(char) * sizeOfBuff); + clearCharBufferAndReinitialize(buff, sizeOfBuff, contents); if 
(!file.is_open()) { errorDiscovered = true; } else { - contents = trim(contents); + if (trim_whitespace) { + contents = amd::smi::trimAllWhiteSpace(contents); + } + // remove all new lines + contents.erase(std::remove(contents.begin(), contents.end(), '\n'), contents.cend()); } file.close(); @@ -110,6 +131,9 @@ void openFileAndModifyBuffer(std::string path, char *buff, size_t sizeOfBuff) { && !contents.empty()) { std::strncpy(buff, contents.c_str(), sizeOfBuff-1); buff[sizeOfBuff-1] = '\0'; + return 0; + } else { + return -1; } } @@ -143,9 +167,6 @@ static bool isAMDGPU(std::string dev_path) { amdsmi_status_t smi_amdgpu_find_hwmon_dir(amd::smi::AMDSmiGPUDevice *device, std::string* full_path) { - if (!device->check_if_drm_is_supported()) { - return AMDSMI_STATUS_NOT_SUPPORTED; - } if (full_path == nullptr) { return AMDSMI_STATUS_API_FAILED; } @@ -181,9 +202,6 @@ amdsmi_status_t smi_amdgpu_find_hwmon_dir(amd::smi::AMDSmiGPUDevice *device, std amdsmi_status_t smi_amdgpu_get_board_info(amd::smi::AMDSmiGPUDevice* device, amdsmi_board_info_t *info) { - if (!device->check_if_drm_is_supported()) { - return AMDSMI_STATUS_NOT_SUPPORTED; - } SMIGPUDEVICE_MUTEX(device->get_mutex()) std::string model_number_path = "/sys/class/drm/" + device->get_gpu_path() + std::string("/device/product_number"); std::string product_serial_path = "/sys/class/drm/" + device->get_gpu_path() + std::string("/device/serial_number"); @@ -191,25 +209,34 @@ amdsmi_status_t smi_amdgpu_get_board_info(amd::smi::AMDSmiGPUDevice* device, amd std::string manufacturer_name_path = "/sys/class/drm/" + device->get_gpu_path() + std::string("/device/manufacturer"); std::string product_name_path = "/sys/class/drm/" + device->get_gpu_path() + std::string("/device/product_name"); - openFileAndModifyBuffer(model_number_path, info->model_number, AMDSMI_MAX_STRING_LENGTH); - openFileAndModifyBuffer(product_serial_path, info->product_serial, AMDSMI_MAX_STRING_LENGTH); - openFileAndModifyBuffer(fru_id_path, info->fru_id, 
AMDSMI_MAX_STRING_LENGTH); - openFileAndModifyBuffer(manufacturer_name_path, info->manufacturer_name, AMDSMI_MAX_STRING_LENGTH); - openFileAndModifyBuffer(product_name_path, info->product_name, AMDSMI_MAX_STRING_LENGTH); + auto ret_mod = openFileAndModifyBuffer(model_number_path, info->model_number, + AMDSMI_MAX_STRING_LENGTH); + auto ret_ser = openFileAndModifyBuffer(product_serial_path, info->product_serial, + AMDSMI_MAX_STRING_LENGTH); + auto ret_fru = openFileAndModifyBuffer(fru_id_path, info->fru_id, AMDSMI_MAX_STRING_LENGTH); + auto ret_man = openFileAndModifyBuffer(manufacturer_name_path, info->manufacturer_name, + AMDSMI_MAX_STRING_LENGTH); + auto ret_prod = openFileAndModifyBuffer(product_name_path, info->product_name, + AMDSMI_MAX_STRING_LENGTH, false); std::ostringstream ss; ss << __PRETTY_FUNCTION__ << "[Before correction] " << "Returning status = AMDSMI_STATUS_SUCCESS" << " | model_number_path = |" << model_number_path << "|\n" << "; info->model_number: |" << info->model_number << "|\n" + << "; ret_mod = " << ret_mod << "|\n" << "\n product_serial_path = |" << product_serial_path << "|\n" << "; info->product_serial: |" << info->product_serial << "|\n" + << "; ret_ser = " << ret_ser << "|\n" << "\n fru_id_path = |" << fru_id_path << "|\n" << "; info->fru_id: |" << info->fru_id << "|\n" + << "; ret_fru = " << ret_fru << "|\n" << "\n manufacturer_name_path = |" << manufacturer_name_path << "|\n" << "; info->manufacturer_name: |" << info->manufacturer_name << "|\n" + << "; ret_man = " << ret_man << "|\n" << "\n product_name_path = |" << product_name_path << "|\n" - << "; info->product_name: |" << info->product_name << "|"; + << "; info->product_name: |" << info->product_name << "|" + << "; ret_prod = " << ret_prod << "|\n"; LOG_INFO(ss); return AMDSMI_STATUS_SUCCESS; @@ -217,9 +244,6 @@ amdsmi_status_t smi_amdgpu_get_board_info(amd::smi::AMDSmiGPUDevice* device, amd amdsmi_status_t smi_amdgpu_get_power_cap(amd::smi::AMDSmiGPUDevice* device, int *cap) { - 
if (!device->check_if_drm_is_supported()) { - return AMDSMI_STATUS_NOT_SUPPORTED; - } constexpr int DATA_SIZE = 16; char val[DATA_SIZE]; std::string fullpath; @@ -251,9 +275,6 @@ amdsmi_status_t smi_amdgpu_get_power_cap(amd::smi::AMDSmiGPUDevice* device, int amdsmi_status_t smi_amdgpu_get_ranges(amd::smi::AMDSmiGPUDevice* device, amdsmi_clk_type_t domain, int *max_freq, int *min_freq, int *num_dpm, int *sleep_state_freq) { - if (!device->check_if_drm_is_supported()) { - return AMDSMI_STATUS_NOT_SUPPORTED; - } SMIGPUDEVICE_MUTEX(device->get_mutex()) std::string fullpath = "/sys/class/drm/" + device->get_gpu_path() + "/device"; @@ -289,7 +310,7 @@ amdsmi_status_t smi_amdgpu_get_ranges(amd::smi::AMDSmiGPUDevice* device, amdsmi_ std::ifstream ranges(fullpath.c_str()); if (ranges.fail()) { - return AMDSMI_STATUS_API_FAILED; + return AMDSMI_STATUS_NOT_SUPPORTED; } unsigned int max, min, dpm, sleep_freq; @@ -339,16 +360,13 @@ amdsmi_status_t smi_amdgpu_get_ranges(amd::smi::AMDSmiGPUDevice* device, amdsmi_ } amdsmi_status_t smi_amdgpu_get_enabled_blocks(amd::smi::AMDSmiGPUDevice* device, uint64_t *enabled_blocks) { - if (!device->check_if_drm_is_supported()) { - return AMDSMI_STATUS_NOT_SUPPORTED; - } SMIGPUDEVICE_MUTEX(device->get_mutex()) - std::string fullpath = "/sys/class/drm/" + device->get_gpu_path() + "/device/ras/features"; + std::string fullpath = "/sys/class/drm/" + device->get_gpu_path() + "/device/ras/features"; std::ifstream f(fullpath.c_str()); std::string tmp_str; if (f.fail()) { - return AMDSMI_STATUS_API_FAILED; + return AMDSMI_STATUS_NOT_SUPPORTED; } std::string line; @@ -372,9 +390,6 @@ amdsmi_status_t smi_amdgpu_get_enabled_blocks(amd::smi::AMDSmiGPUDevice* device, amdsmi_status_t smi_amdgpu_get_bad_page_info(amd::smi::AMDSmiGPUDevice* device, uint32_t *num_pages, amdsmi_retired_page_record_t *info) { - if (!device->check_if_drm_is_supported()) { - return AMDSMI_STATUS_NOT_SUPPORTED; - } SMIGPUDEVICE_MUTEX(device->get_mutex()) std::string line; 
std::vector badPagesVec; @@ -449,9 +464,6 @@ static uint32_t GetDeviceIndex(const std::string s) { amdsmi_status_t smi_amdgpu_get_bad_page_threshold(amd::smi::AMDSmiGPUDevice* device, uint32_t *threshold) { - if (!device->check_if_drm_is_supported()) { - return AMDSMI_STATUS_NOT_SUPPORTED; - } SMIGPUDEVICE_MUTEX(device->get_mutex()) //TODO: Accessing the node requires root privileges, and its interface may need to be exposed in another path @@ -475,9 +487,6 @@ amdsmi_status_t smi_amdgpu_get_bad_page_threshold(amd::smi::AMDSmiGPUDevice* dev } amdsmi_status_t smi_amdgpu_validate_ras_eeprom(amd::smi::AMDSmiGPUDevice* device) { - if (!device->check_if_drm_is_supported()) { - return AMDSMI_STATUS_NOT_SUPPORTED; - } SMIGPUDEVICE_MUTEX(device->get_mutex()) //uint32_t index = GetDeviceIndex(device->get_gpu_path()); @@ -487,9 +496,6 @@ amdsmi_status_t smi_amdgpu_validate_ras_eeprom(amd::smi::AMDSmiGPUDevice* device } amdsmi_status_t smi_amdgpu_get_ecc_error_count(amd::smi::AMDSmiGPUDevice* device, amdsmi_error_count_t *err_cnt) { - if (!device->check_if_drm_is_supported()) { - return AMDSMI_STATUS_NOT_SUPPORTED; - } SMIGPUDEVICE_MUTEX(device->get_mutex()) char str[10]; @@ -511,81 +517,26 @@ amdsmi_status_t smi_amdgpu_get_ecc_error_count(amd::smi::AMDSmiGPUDevice* device return AMDSMI_STATUS_SUCCESS; } - amdsmi_status_t smi_amdgpu_get_driver_version(amd::smi::AMDSmiGPUDevice* device, int *length, char *version) { - if (!device->check_if_drm_is_supported()) { - return AMDSMI_STATUS_NOT_SUPPORTED; - } SMIGPUDEVICE_MUTEX(device->get_mutex()) - amdsmi_status_t status = AMDSMI_STATUS_SUCCESS; - FILE *fp; - char *tmp, *ptr, *token; - char *ver = NULL; - int i = 0; + amdsmi_status_t status = AMDSMI_STATUS_SUCCESS; size_t len; - - if (length) - len = *length < AMDSMI_MAX_DRIVER_VERSION_LENGTH ? 
*length : AMDSMI_MAX_DRIVER_VERSION_LENGTH; - else - len = AMDSMI_MAX_DRIVER_VERSION_LENGTH; - - std::string path = "/sys/module/amdgpu/version"; - - fp = fopen(path.c_str(), "r"); - if (fp == nullptr){ - fp = fopen("/proc/version", "r"); - if (fp == nullptr) { - status = AMDSMI_STATUS_IO; - return status; - } - - len = 0; - if (getline(&ver, &len, fp) <= 0) { - status = AMDSMI_STATUS_IO; - fclose(fp); - free(ver); - return status; - } - - fclose(fp); - - ptr = ver; - token = strtok_r(ptr, " ", &tmp); - - if (!token) { - free(ver); - status = AMDSMI_STATUS_IO; - return status; - } - for (i = 0; i < 2; i++) { - ptr = strtok_r(NULL, " ", &tmp); - if (!ptr) - break; - } - if (i != 2 || !ptr) { - free(ver); - status = AMDSMI_STATUS_IO; - return status; - } - if (length) - len = *length < AMDSMI_MAX_DRIVER_VERSION_LENGTH ? *length : - AMDSMI_MAX_DRIVER_VERSION_LENGTH; - else - len = AMDSMI_MAX_DRIVER_VERSION_LENGTH; - - strncpy(version, ptr, len); - free(ver); + if (*length <= 0 || version == nullptr) { + return AMDSMI_STATUS_INVAL; } else { - if ((len = getline(&version, &len, fp)) <= 0) - status = AMDSMI_STATUS_IO; - - fclose(fp); - if (length) { - *length = version[len-1] == '\n' ? static_cast(len - 1) : static_cast(len); - } - version[len-1] = version[len-1] == '\n' ? 
'\0' : version[len-1]; + len = static_cast(*length); } + std::string empty = ""; + std::strncpy(version, empty.c_str(), len-1); + openFileAndModifyBuffer("/sys/module/amdgpu/version", + version, static_cast(len)); + if (version[0] == '\0') { + openFileAndModifyBuffer("/proc/version", version, static_cast(len)); + if (version[0] == '\0') { + return AMDSMI_STATUS_IO; + } + } return status; } @@ -621,17 +572,37 @@ amdsmi_status_t smi_amdgpu_get_market_name_from_dev_id(amd::smi::AMDSmiGPUDevice return AMDSMI_STATUS_ARG_PTR_NULL; } + std::ostringstream ss; + // requires libdrm being active if (!device->check_if_drm_is_supported()) { + ss << __PRETTY_FUNCTION__ << " | DRM is not supported"; + LOG_ERROR(ss); return AMDSMI_STATUS_NOT_SUPPORTED; } uint32_t major_version, minor_version; amdgpu_device_handle device_handle = nullptr; + std::string render_name = device->get_gpu_path(); + int fd = -1; + std::string path = "/dev/dri/" + render_name; - uint32_t gpu_fd = device->get_gpu_fd(); + if (render_name != "") { + fd = open(path.c_str(), O_RDWR | O_CLOEXEC); + } else { + market_name[0] = '\0'; + close(fd); + return AMDSMI_STATUS_NOT_SUPPORTED; + } + ss << __PRETTY_FUNCTION__ << " | Render Name: " + << render_name << "; path: " << path << "; fd: " << fd; + LOG_DEBUG(ss); - int ret = amdgpu_device_initialize(gpu_fd, &major_version, &minor_version, &device_handle); + int ret = amdgpu_device_initialize(fd, &major_version, &minor_version, &device_handle); if (ret != 0) { + std::string empty = ""; + std::strncpy(market_name, empty.c_str(), AMDSMI_256_LENGTH - 1); + amdgpu_device_deinitialize(device_handle); + close(fd); return AMDSMI_STATUS_DRM_ERROR; } @@ -641,19 +612,17 @@ amdsmi_status_t smi_amdgpu_get_market_name_from_dev_id(amd::smi::AMDSmiGPUDevice std::strncpy(market_name, name, AMDSMI_256_LENGTH - 1); market_name[AMDSMI_256_LENGTH - 1] = '\0'; amdgpu_device_deinitialize(device_handle); + close(fd); return AMDSMI_STATUS_SUCCESS; } amdgpu_device_deinitialize(device_handle); 
+ close(fd); return AMDSMI_STATUS_DRM_ERROR; } amdsmi_status_t smi_amdgpu_is_gpu_power_management_enabled(amd::smi::AMDSmiGPUDevice* device, bool *enabled) { - if (!device->check_if_drm_is_supported()) { - return AMDSMI_STATUS_NOT_SUPPORTED; - } - if (enabled == nullptr) { return AMDSMI_STATUS_API_FAILED; } @@ -713,3 +682,184 @@ std::string smi_amdgpu_get_status_string(amdsmi_status_t ret, bool fullStatus = return std::string(err_str); } +// TODO(amdsmi_team): Do we want to include these functions in header? +amdsmi_status_t smi_amdgpu_get_device_index(amdsmi_processor_handle processor_handle, + uint32_t *device_index) { + uint32_t socket_count; + std::vector sockets; + std::ostringstream ss; + + if (device_index == nullptr) { + return AMDSMI_STATUS_INVAL; + } + *device_index = std::numeric_limits::max(); // set to max value for invalid readings + + auto ret = amdsmi_get_socket_handles(&socket_count, nullptr); + if (ret != AMDSMI_STATUS_SUCCESS) { + return ret; + } + // allocate memory + sockets.resize(socket_count); + ret = amdsmi_get_socket_handles(&socket_count, &sockets[0]); + if (ret != AMDSMI_STATUS_SUCCESS) { + return ret; + } + + uint32_t current_device_index = 0; + for (uint32_t i = 0; i < socket_count; i++) { + // Get Socket info + char socket_info[128]; + ret = amdsmi_get_socket_info(sockets[i], 128, socket_info); + ss << __PRETTY_FUNCTION__ << " | Socket " << socket_info << "\n"; + LOG_DEBUG(ss); + + // Get the device count available for the socket. 
+ uint32_t device_count = 0; + ret = amdsmi_get_processor_handles(sockets[i], &device_count, nullptr); + + // Allocate the memory for the device handlers on the socket + std::vector processor_handles(device_count); + // Get all devices of the socket + ret = amdsmi_get_processor_handles(sockets[i], &device_count, &processor_handles[0]); + ss << __PRETTY_FUNCTION__ << " | Processor Count: " << device_count << "\n"; + LOG_DEBUG(ss); + + for (uint32_t j = 0; j < device_count; j++) { + if (processor_handles[j] == processor_handle) { + *device_index = current_device_index; + ss << __PRETTY_FUNCTION__ << " | AMDSMI_STATUS_SUCCESS " + << "Returning device_index: " << *device_index << "\nSocket #: " << i + << "; Device #: " << j << "; current_device_index #: " << current_device_index + << "\n"; + // std::cout << ss.str(); + LOG_DEBUG(ss); + return AMDSMI_STATUS_SUCCESS; + } + current_device_index++; + } + } + ss << __PRETTY_FUNCTION__ << " | AMDSMI_STATUS_API_FAILED " + << "Returning device_index: " << *device_index << "\n"; + LOG_DEBUG(ss); + return AMDSMI_STATUS_API_FAILED; +} + +// TODO(amdsmi_team): Do we want to include these functions in header? 
+/**
+ * @brief Count the processors exposed across all sockets.
+ *
+ * Asks every socket returned by amdsmi_get_socket_handles() for its
+ * processor count and sums the results.
+ *
+ * @param[out] total_num_devices  Receives the summed count on success; holds
+ *                                the maximum uint32_t value if the socket
+ *                                list cannot be read.
+ *
+ * @return AMDSMI_STATUS_SUCCESS on success,
+ *         AMDSMI_STATUS_INVAL when total_num_devices is null,
+ *         otherwise the status of the failing amdsmi_get_socket_handles() call.
+ */
+amdsmi_status_t smi_amdgpu_get_device_count(uint32_t *total_num_devices) {
+    if (total_num_devices == nullptr) {
+        return AMDSMI_STATUS_INVAL;
+    }
+    // set to max value for invalid readings
+    *total_num_devices = std::numeric_limits<uint32_t>::max();
+
+    std::ostringstream ss;
+    uint32_t socket_count = 0;
+    auto ret = amdsmi_get_socket_handles(&socket_count, nullptr);
+    if (ret != AMDSMI_STATUS_SUCCESS) {
+        return ret;
+    }
+    // allocate memory; data() (unlike &sockets[0]) is valid for an empty vector
+    std::vector<amdsmi_socket_handle> sockets(socket_count);
+    ret = amdsmi_get_socket_handles(&socket_count, sockets.data());
+    if (ret != AMDSMI_STATUS_SUCCESS) {
+        return ret;
+    }
+
+    uint32_t device_num = 0;
+    for (uint32_t i = 0; i < socket_count; i++) {
+        // Get Socket info. The buffer is zero-filled so that a failed query
+        // logs an empty name instead of reading uninitialized memory; the
+        // status itself is not needed because the name is used for logging only.
+        char socket_info[128] = {};
+        ret = amdsmi_get_socket_info(sockets[i], 128, socket_info);
+        ss << __PRETTY_FUNCTION__ << " | Socket " << socket_info << "\n";
+        LOG_DEBUG(ss);
+
+        // Get the processor count available for the socket. Passing a null
+        // handle buffer asks only for the count, which is all that is needed
+        // here, so the handles themselves are not fetched.
+        uint32_t processor_count = 0;
+        ret = amdsmi_get_processor_handles(sockets[i], &processor_count, nullptr);
+        ss << __PRETTY_FUNCTION__ << " | Processor Count: " << processor_count << "\n";
+        LOG_DEBUG(ss);
+        if (ret != AMDSMI_STATUS_SUCCESS) {
+            // Best effort: a socket that cannot be enumerated contributes no devices.
+            continue;
+        }
+
+        device_num += processor_count;
+    }
+    *total_num_devices = device_num;
+    ss << __PRETTY_FUNCTION__ << " | AMDSMI_STATUS_SUCCESS "
+       << "Returning total_num_devices: " << *total_num_devices << "\n";
+    LOG_DEBUG(ss);
+    return AMDSMI_STATUS_SUCCESS;
+}
+
+// TODO(amdsmi_team): Do we want to include these functions in header?
+/**
+ * @brief Look up the processor handle stored at a flat device index.
+ *
+ * Sockets are visited in the order returned by amdsmi_get_socket_handles()
+ * and, within each socket, processors in the order returned by
+ * amdsmi_get_processor_handles(). device_index counts processors across all
+ * sockets in that order, starting at 0.
+ *
+ * @param[in]  device_index      Flat index of the wanted processor.
+ * @param[out] processor_handle  Receives the handle on success; left
+ *                               untouched otherwise.
+ *
+ * @return AMDSMI_STATUS_SUCCESS when the index is found,
+ *         AMDSMI_STATUS_INVAL when processor_handle is null,
+ *         the status of amdsmi_get_socket_handles() when that call fails,
+ *         AMDSMI_STATUS_API_FAILED when the index is out of range.
+ */
+amdsmi_status_t smi_amdgpu_get_processor_handle_by_index(
+    uint32_t device_index,
+    amdsmi_processor_handle *processor_handle) {
+    if (processor_handle == nullptr) {
+        return AMDSMI_STATUS_INVAL;
+    }
+
+    std::ostringstream ss;
+    uint32_t socket_count = 0;
+    auto ret = amdsmi_get_socket_handles(&socket_count, nullptr);
+    if (ret != AMDSMI_STATUS_SUCCESS) {
+        return ret;
+    }
+    // allocate memory; data() (unlike &sockets[0]) is valid for an empty vector
+    std::vector<amdsmi_socket_handle> sockets(socket_count);
+    ret = amdsmi_get_socket_handles(&socket_count, sockets.data());
+    if (ret != AMDSMI_STATUS_SUCCESS) {
+        return ret;
+    }
+
+    uint32_t current_device_index = 0;
+    for (uint32_t i = 0; i < socket_count; i++) {
+        // Get Socket info. The buffer is zero-filled so that a failed query
+        // logs an empty name instead of reading uninitialized memory; the
+        // status itself is not needed because the name is used for logging only.
+        char socket_info[128] = {};
+        ret = amdsmi_get_socket_info(sockets[i], 128, socket_info);
+        ss << __PRETTY_FUNCTION__ << " | Socket " << socket_info << "\n";
+        LOG_DEBUG(ss);
+
+        // Get the device count available for the socket.
+        uint32_t device_count = 0;
+        ret = amdsmi_get_processor_handles(sockets[i], &device_count, nullptr);
+
+        // Fetch the handles only when there is something to fetch, so the
+        // vector is never indexed while empty.
+        std::vector<amdsmi_processor_handle> processor_handles;
+        if (ret == AMDSMI_STATUS_SUCCESS && device_count > 0) {
+            processor_handles.resize(device_count);
+            ret = amdsmi_get_processor_handles(sockets[i], &device_count,
+                                               processor_handles.data());
+        }
+        ss << __PRETTY_FUNCTION__ << " | Processor Count: " << device_count << "\n";
+        LOG_DEBUG(ss);
+        if (ret != AMDSMI_STATUS_SUCCESS) {
+            // Best effort: a socket that cannot be enumerated contributes no
+            // devices; keep searching the remaining sockets.
+            continue;
+        }
+
+        for (uint32_t j = 0; j < device_count; j++) {
+            if (current_device_index == device_index) {
+                *processor_handle = processor_handles[j];
+                ss << __PRETTY_FUNCTION__ << " | AMDSMI_STATUS_SUCCESS"
+                   << "\nReturning processor_handle for device_index: " << device_index
+                   << "\nSocket #: " << i << "; Device #: " << j
+                   << "; current_device_index #: " << current_device_index
+                   << "; processor_handle: " << *processor_handle
+                   << "; processor_handles[j]: " << processor_handles[j]
+                   << "\n";
+                LOG_DEBUG(ss);
+                return AMDSMI_STATUS_SUCCESS;
+            }
+            current_device_index++;
+        }
+    }
+    ss << __PRETTY_FUNCTION__ << " | AMDSMI_STATUS_API_FAILED "
+       << "Could not find matching processor_handle for device_index: " << device_index << "\n";
+    LOG_DEBUG(ss);
+    return AMDSMI_STATUS_API_FAILED;
+}
diff --git a/tests/amd_smi_test/functional/computepartition_read_write.cc b/tests/amd_smi_test/functional/computepartition_read_write.cc new file mode 100755 index 0000000000..a711d9fac3 --- /dev/null +++ b/tests/amd_smi_test/functional/computepartition_read_write.cc @@ -0,0 +1,1089 @@ +/* + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE.
+ */ + +#include +#include +#include + +#include +#include // NOLINT [build] +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "../test_base.h" +#include "../test_common.h" +#include "amd_smi/amdsmi.h" +#include "rocm_smi/rocm_smi_utils.h" +#include "amd_smi/impl/amd_smi_utils.h" +#include "computepartition_read_write.h" + +TestComputePartitionReadWrite::TestComputePartitionReadWrite() : TestBase() { + set_title("AMDSMI Compute Partition Read/Write Test"); + set_description("The Compute Partition tests verifies that the compute " + "partition can be read and updated properly."); +} + +TestComputePartitionReadWrite::~TestComputePartitionReadWrite(void) { +} + +void TestComputePartitionReadWrite::SetUp(void) { + TestBase::SetUp(); + + return; +} + +void TestComputePartitionReadWrite::DisplayTestInfo(void) { + TestBase::DisplayTestInfo(); +} + +void TestComputePartitionReadWrite::DisplayResults(void) const { + TestBase::DisplayResults(); + return; +} + +void TestComputePartitionReadWrite::Close() { + // This will close handles opened within rsmitst utility calls and call + // amdsmi_shut_down(), so it should be done after other hsa cleanup + TestBase::Close(); +} + +const uint32_t MAX_UNSUPPORTED_PARTITIONS = 0; +const uint32_t MAX_SPX_PARTITIONS = 1; // Single GPU node +const uint32_t MAX_DPX_PARTITIONS = 2; +const uint32_t MAX_TPX_PARTITIONS = 3; +const uint32_t MAX_QPX_PARTITIONS = 4; +// const uint32_t MAX_CPX_PARTITIONS = 8; + +static const std::string +computePartitionString(amdsmi_compute_partition_type_t computeParitionType) { +/** + * typedef enum { + * AMDSMI_COMPUTE_PARTITION_INVALID = 0, + * AMDSMI_COMPUTE_PARTITION_SPX, //!< Single GPU mode (SPX)- All XCCs work + * //!< together with shared memory + * AMDSMI_COMPUTE_PARTITION_DPX, //!< Dual GPU mode (DPX)- Half XCCs work + * //!< together with shared memory + * AMDSMI_COMPUTE_PARTITION_TPX, //!< Triple GPU mode (TPX)- One-third XCCs + * //!< work together with shared memory + * 
AMDSMI_COMPUTE_PARTITION_QPX, //!< Quad GPU mode (QPX)- Quarter XCCs + * //!< work together with shared memory + * AMDSMI_COMPUTE_PARTITION_CPX, //!< Core mode (CPX)- Per-chip XCC with + * //!< shared memory + * } amdsmi_compute_partition_type_t; + * */ + switch (computeParitionType) { + case AMDSMI_COMPUTE_PARTITION_SPX: + return "SPX"; + case AMDSMI_COMPUTE_PARTITION_DPX: + return "DPX"; + case AMDSMI_COMPUTE_PARTITION_TPX: + return "TPX"; + case AMDSMI_COMPUTE_PARTITION_QPX: + return "QPX"; + case AMDSMI_COMPUTE_PARTITION_CPX: + return "CPX"; + default: + return "N/A"; + } +} + +static const std::map +mapStringToSMIComputePartitionTypes { + {"SPX", AMDSMI_COMPUTE_PARTITION_SPX}, + {"DPX", AMDSMI_COMPUTE_PARTITION_DPX}, + {"TPX", AMDSMI_COMPUTE_PARTITION_TPX}, + {"QPX", AMDSMI_COMPUTE_PARTITION_QPX}, + {"CPX", AMDSMI_COMPUTE_PARTITION_CPX}, + {"UNKNOWN", AMDSMI_COMPUTE_PARTITION_INVALID} +}; + +static const std::map resource_types_map = { + { AMDSMI_ACCELERATOR_XCC, "XCC" }, + { AMDSMI_ACCELERATOR_ENCODER, "ENCODER" }, + { AMDSMI_ACCELERATOR_DECODER, "DECODER" }, + { AMDSMI_ACCELERATOR_DMA, "DMA" }, + { AMDSMI_ACCELERATOR_JPEG, "JPEG" }, + { AMDSMI_ACCELERATOR_MAX, "MAX" }, +}; + +static const std::map partition_types_map = { + { AMDSMI_ACCELERATOR_PARTITION_INVALID, "N/A" }, + { AMDSMI_ACCELERATOR_PARTITION_SPX, "SPX" }, + { AMDSMI_ACCELERATOR_PARTITION_DPX, "DPX" }, + { AMDSMI_ACCELERATOR_PARTITION_TPX, "TPX" }, + { AMDSMI_ACCELERATOR_PARTITION_QPX, "QPX" }, + { AMDSMI_ACCELERATOR_PARTITION_CPX, "CPX" }, + { AMDSMI_ACCELERATOR_PARTITION_MAX, "MAX" }, +}; + +static const std::map accelerator_types_map = { + { AMDSMI_ACCELERATOR_PARTITION_INVALID, "AMDSMI_ACCELERATOR_PARTITION_INVALID" }, + { AMDSMI_ACCELERATOR_PARTITION_SPX, "AMDSMI_ACCELERATOR_PARTITION_SPX" }, + { AMDSMI_ACCELERATOR_PARTITION_DPX, "AMDSMI_ACCELERATOR_PARTITION_DPX" }, + { AMDSMI_ACCELERATOR_PARTITION_TPX, "AMDSMI_ACCELERATOR_PARTITION_TPX" }, + { AMDSMI_ACCELERATOR_PARTITION_QPX, 
"AMDSMI_ACCELERATOR_PARTITION_QPX" }, + { AMDSMI_ACCELERATOR_PARTITION_CPX, "AMDSMI_ACCELERATOR_PARTITION_CPX" }, + { AMDSMI_ACCELERATOR_PARTITION_MAX, "AMDSMI_ACCELERATOR_PARTITION_MAX" }, +}; + +static void system_wait(int seconds) { + // Adding a delay - since changing partitions depends on gpus not + // being in an active state, we'll wait a few seconds before starting + // full testing + auto start = std::chrono::high_resolution_clock::now(); + int waitTime = seconds; + std::cout << "** Waiting for " + << std::dec << waitTime + << " seconds, for any GPU" + << " activity to clear up. **" << std::endl; + sleep(waitTime); + auto stop = std::chrono::high_resolution_clock::now(); + auto duration = + std::chrono::duration_cast(stop - start); + std::cout << "** Waiting took " << duration.count() / 1000000 + << " seconds **" << std::endl; +}
+
+// Looks up the processor handle for every flat device index in
+// [0, curr_num_devices), stores each one in processor_handles[i] and prints it.
+//
+// processor_handles: caller-owned array with room for curr_num_devices
+//                    entries. May be null, in which case the handles are only
+//                    printed and nothing is returned to the caller.
+// curr_num_devices:  number of device indices to look up.
+static void getProcessorHandles(amdsmi_processor_handle* processor_handles,
+                                uint32_t curr_num_devices) {
+  // The pointer is received by value, so storage allocated in here can never
+  // reach the caller. Use function-local scratch storage, released on return,
+  // rather than a new[] that nobody can free.
+  std::vector<amdsmi_processor_handle> scratch;
+  if (processor_handles == nullptr) {
+    scratch.resize(curr_num_devices);
+    processor_handles = scratch.data();
+  }
+
+  for (uint32_t i = 0; i < curr_num_devices; i++) {
+    // The lookup writes p_handle only on success, so on failure the empty
+    // handle from the initializer is stored and printed.
+    amdsmi_processor_handle p_handle = {};
+    smi_amdgpu_get_processor_handle_by_index(i, &p_handle);
+    processor_handles[i] = p_handle;
+
+    std::cout << "\t**getProcessorHandles() | processor_handles["
+              << std::setfill('0') << std::setw(2)
+              << i << "]:\t" << processor_handles[i] << std::endl;
+  }
+}
+
+static void checkPartitionIdChanges(amdsmi_processor_handle* const processor_handle, + uint32_t dev_id, + const std::string current_partition, + bool isVerbose, + bool reinitialize) { + uint32_t max_loop = 0; + uint32_t current_num_devices = 0; + uint32_t dev = 0; + uint32_t prev_num_devices = 0; + smi_amdgpu_get_device_count(&prev_num_devices); + + // re-initialize to ensure new device ordering is followed + if (reinitialize) { + if (isVerbose) { + std::cout << "\t**REINITIALIZING device list due to parition changes.\n"; +
} + amdsmi_shut_down(); + amdsmi_init(AMDSMI_INIT_AMD_GPUS); + } + + smi_amdgpu_get_device_count(¤t_num_devices); + + // std::cout << "\t**Checking Partition ID Changes 3\n"; + if (isVerbose) { + std::cout << "\t**Device (dev) #: " << dev + << "; Device dev_id: " << dev_id + << "; max_loop: " << static_cast(max_loop) + << "; current_num_devices: " << current_num_devices << "\n"; + } + // Allocate the memory for the device handlers on the socket + std::vector curr_processor_handles(current_num_devices); + getProcessorHandles(&curr_processor_handles[0], current_num_devices); + + if (current_partition == "SPX" || current_partition == "N/A") { + max_loop = MAX_SPX_PARTITIONS; + } else if (current_partition == "DPX") { + max_loop = MAX_DPX_PARTITIONS; + } else if (current_partition == "TPX") { + max_loop = MAX_TPX_PARTITIONS; + } else if (current_partition == "QPX") { + max_loop = MAX_QPX_PARTITIONS; + } else if (current_partition == "CPX") { + uint16_t num_xcd; + + auto ret = amdsmi_get_gpu_xcd_counter(curr_processor_handles[dev_id], &num_xcd); + if (ret == AMDSMI_STATUS_SUCCESS) { + max_loop = static_cast(num_xcd); + if (isVerbose) { + std::cout << "\t**Expecting num_xcd = " << num_xcd << " to equal " + "total CPX nodes\n"; + } + } + } + + if (dev_id + max_loop > current_num_devices) { + if (isVerbose) { + std::cout + << "\t**[WARNING] Readjusting dev_id (was " << dev_id << ")= " << current_num_devices + << " - " << max_loop << ": " << (current_num_devices - max_loop) << "\n"; + } + dev_id = current_num_devices - max_loop; + } + + for (uint32_t i = dev_id; i < dev_id + max_loop; i++) { + if (isVerbose) { + std::cout << "\t**checkPartitionIdChanges DEVICE INFO ===============\n"; + std::cout << "\t**Device (i): " << static_cast(i) << std::endl; + std::cout << "\t**dev_id: " << static_cast(dev_id) << std::endl; + std::cout << "\t**Device Index: " << static_cast(dev) << std::endl; + + std::cout << "\t**Processor Handle: " << processor_handle[i] << std::endl; + std::cout 
<< "\t**Current Processor Handle: " << curr_processor_handles[i] << std::endl; + std::cout << "\t**Current # of devices: " << static_cast(current_num_devices) + << std::endl; + std::cout << "\t**END checkPartitionIdChanges DEVICE INFO =============\n"; + std::cout << "\t**Device (i) #: " << i + << "; Device dev_id: " << dev_id + << "\n\t\t** max_loop: " << static_cast(max_loop) + << "\n\t\t** current_num_devices: " << current_num_devices << "\n"; + } + if (i >= current_num_devices) { + if (isVerbose) { + std::cout << "\t**[WARNING] Detected max DRM minor limitation " + "(max of 64).\n\tPlease disable any other drivers taking up PCIe space" + "\n\t(ex. ast or other drivers -> " + "\"sudo rmmod amdgpu && sudo rmmod ast && sudo modprobe amdgpu\")." + "\n\tCPX may not enumerate properly.\n"; + } + break; + } + amdsmi_kfd_info_t kfd_info; + amdsmi_status_t ret = amdsmi_get_gpu_kfd_info(curr_processor_handles[i], &kfd_info); + if (isVerbose) { + std::cout << "\t**Checking Partition ID | Device: " << std::to_string(i) + << "\n\t\t**Current Partition: " << current_partition + << "\n\t\t**Max partition IDs to check: " << max_loop + << "\n\t\t**Current Partition ID: " << std::to_string(kfd_info.current_partition_id) + << "\n"; + } + EXPECT_EQ(ret, AMDSMI_STATUS_SUCCESS); + if (ret == AMDSMI_STATUS_SUCCESS && current_partition == "SPX") { + EXPECT_TRUE(kfd_info.current_partition_id <= max_loop); + if (isVerbose) { + std::cout << "\n\t**Confirmed partition_id < " << max_loop + << " for SPX" + << "\n\t**amdsmi_get_gpu_kfd_info(" + std::to_string(i) + + ", &kfd_info); kfd_info.partition_id: " + << static_cast(kfd_info.current_partition_id) << std::endl; + } + } else if (ret == AMDSMI_STATUS_SUCCESS && current_partition == "DPX") { + EXPECT_TRUE(kfd_info.current_partition_id <= max_loop); + if (isVerbose) { + std::cout << "\n\t**Confirmed partition_id < " << max_loop + << " for DPX" + << "\n\t**amdsmi_get_gpu_kfd_info(" + std::to_string(i) + + ", &kfd_info); 
kfd_info.partition_id: " + << static_cast(kfd_info.current_partition_id) << std::endl; + } + } else if (ret == AMDSMI_STATUS_SUCCESS && current_partition == "TPX") { + EXPECT_TRUE(kfd_info.current_partition_id <= max_loop); + if (isVerbose) { + std::cout << "\n\t**Confirmed partition_id < " + << max_loop << " for TPX" + << "\n\t**amdsmi_get_gpu_kfd_info(" + std::to_string(i) + + ", &kfd_info); kfd_info.partition_id: " + << static_cast(kfd_info.current_partition_id) << std::endl; + } + } else if (ret == AMDSMI_STATUS_SUCCESS && current_partition == "QPX") { + EXPECT_TRUE(kfd_info.current_partition_id <= max_loop); + if (isVerbose) { + std::cout << "\n\t**Confirmed partition_id < " + << max_loop << " for QPX" + << "\n\t**amdsmi_get_gpu_kfd_info(" + std::to_string(i) + + ", &kfd_info); kfd_info.partition_id: " + << static_cast(kfd_info.current_partition_id) << std::endl; + } + } else if (ret == AMDSMI_STATUS_SUCCESS && current_partition == "CPX") { + EXPECT_TRUE(kfd_info.current_partition_id <= max_loop); + if (isVerbose) { + std::cout << "\n\t**Confirmed partition_id < " + << max_loop << " for CPX" + << "\n\t**amdsmi_get_gpu_kfd_info(" + std::to_string(i) + + ", &kfd_info); kfd_info.partition_id: " + << static_cast(kfd_info.current_partition_id) << std::endl; + } + } else if (ret == AMDSMI_STATUS_SUCCESS && current_partition == "N/A") { + EXPECT_EQ(kfd_info.current_partition_id, max_loop - 1); + if (isVerbose) { + std::cout << "\n\t**Confirmed partition_id = " + << (max_loop - 1) + << " for current_partition = N/A" + << "\n\t**amdsmi_get_gpu_kfd_info(" + std::to_string(i) + + ", &kfd_info); kfd_info.partition_id: " + << static_cast(kfd_info.current_partition_id) << std::endl; + } + } + } +} + + +std::string getResourceType(amdsmi_accelerator_partition_resource_type_t resource_type) { + std::string resource_type_str = ""; + switch (resource_type) { + case AMDSMI_ACCELERATOR_XCC: + resource_type_str = "XCC"; + break; + case AMDSMI_ACCELERATOR_ENCODER: + 
resource_type_str = "ENCODER"; + break; + case AMDSMI_ACCELERATOR_DECODER: + resource_type_str = "DECODER"; + break; + case AMDSMI_ACCELERATOR_DMA: + resource_type_str = "DMA"; + break; + case AMDSMI_ACCELERATOR_JPEG: + resource_type_str = "JPEG"; + break; + case AMDSMI_ACCELERATOR_MAX: + resource_type_str = "MAX"; + break; + default: + resource_type_str = "N/A"; + break; + } + return resource_type_str; +} + +void TestComputePartitionReadWrite::Run(void) { + amdsmi_status_t ret; + constexpr uint32_t k255Len = 255; + char orig_char_computePartition[k255Len]; + orig_char_computePartition[0] = '\0'; + char current_char_computePartition[k255Len]; + current_char_computePartition[0] = '\0'; + const uint32_t kMAX_UINT32 = std::numeric_limits::max(); + TestBase::Run(); + if (setup_failed_) { + std::cout << "** SetUp Failed for this test. Skipping.**" << std::endl; + return; + } + bool isVerbose = (this->verbosity() && + this->verbosity() >= (this->TestBase::VERBOSE_STANDARD)) ? true: false; + // Confirm system supports compute partition, before executing wait + ret = amdsmi_get_gpu_compute_partition(0, orig_char_computePartition, k255Len); + if (ret == AMDSMI_STATUS_SUCCESS) { + system_wait(15); + } + + IF_VERB(STANDARD) { + std::cout << "\t**======================================================================\n"; + std::cout << "\t**Test #1: Get/Set Compute Partition (old functionality) ===============\n"; + std::cout << "\t**======================================================================\n"; + } + + // // TEST 1: Set/Get Compute Partition (old functionality) + uint32_t initial_num_devices = num_monitor_devs(); + for (uint32_t dv_ind = 0; dv_ind < initial_num_devices; ++dv_ind) { + if (dv_ind != 0) { + std::cout << "\n"; + } + PrintDeviceHeader(processor_handles_[dv_ind]); + + ret = amdsmi_get_gpu_compute_partition(processor_handles_[dv_ind], orig_char_computePartition, + k255Len); + EXPECT_TRUE(ret == AMDSMI_STATUS_SUCCESS + || ret == 
AMDSMI_STATUS_NOT_SUPPORTED); + if (ret == AMDSMI_STATUS_NOT_SUPPORTED) { + IF_VERB(STANDARD) { + std::cout << "\t**" << "amdsmi_get_gpu_compute_partition: " + << "Not supported on this machine" << std::endl; + } + continue; + } + for (int partition = static_cast(AMDSMI_COMPUTE_PARTITION_SPX); + partition <= static_cast(AMDSMI_COMPUTE_PARTITION_CPX); + partition++) { + amdsmi_compute_partition_type_t updatePartition + = static_cast(partition); + + IF_VERB(STANDARD) { + std::cout << "\t**" + << "======== TEST AMDSMI_COMPUTE_PARTITION_" + << computePartitionString(updatePartition) + << " ===============" << std::endl; + } + // waitForUserInput(); // watch for any errors + + auto ret_set = amdsmi_set_gpu_compute_partition(processor_handles_[dv_ind], updatePartition); + IF_VERB(STANDARD) { + std::cout << "\t**" << "amdsmi_set_gpu_compute_partition(processor_handles_[" + << dv_ind << "], " << computePartitionString(updatePartition) << "): " + << smi_amdgpu_get_status_string(ret_set, false) << "\n" + << "\t**Requested Set Partition: " + << computePartitionString(updatePartition) << "\n" + << "\t**Original Partition: " << orig_char_computePartition + << std::endl; + } + EXPECT_TRUE(ret_set == AMDSMI_STATUS_SETTING_UNAVAILABLE + || ret_set== AMDSMI_STATUS_NO_PERM + || ret_set == AMDSMI_STATUS_SUCCESS + || ret_set == AMDSMI_STATUS_BUSY + || ret_set == AMDSMI_STATUS_NOT_SUPPORTED + || ret_set == AMDSMI_STATUS_INVAL); + if (ret_set == AMDSMI_STATUS_NOT_SUPPORTED) { + IF_VERB(STANDARD) { + std::cout << "\t**" << "amdsmi_set_gpu_compute_partition: " + << "Not supported on this machine" << std::endl; + } + continue; + } + if (ret_set == AMDSMI_STATUS_INVAL) { + std::cout << "\t**" + << "1st Test: Due to invalid args, skipping rest of test for this device." + << "\n\t Device might be in a static partition mode. " + << "With inability to change partition modes." 
+ << std::endl; + break; + } + + ret = amdsmi_get_gpu_compute_partition(processor_handles_[dv_ind], + current_char_computePartition, + k255Len); + IF_VERB(STANDARD) { + std::cout << "\t**amdsmi_get_gpu_compute_partition(processor_handles_[" << dv_ind << "], " + << current_char_computePartition << "): " + << smi_amdgpu_get_status_string(ret, false) + << "\n\t**Current Partition (get): " + << current_char_computePartition + << std::endl; + } + if (ret_set == AMDSMI_STATUS_SUCCESS) { + EXPECT_EQ(ret, AMDSMI_STATUS_SUCCESS); + EXPECT_EQ(updatePartition, mapStringToSMIComputePartitionTypes.at( + std::string(current_char_computePartition))); + } else { + EXPECT_EQ(ret, AMDSMI_STATUS_SUCCESS); + EXPECT_NE(updatePartition, mapStringToSMIComputePartitionTypes.at( + std::string(current_char_computePartition))); + } + } + amdsmi_compute_partition_type_t updatePartition = + static_cast( + mapStringToSMIComputePartitionTypes.at( + std::string(orig_char_computePartition))); + // waitForUserInput(); // watch for any errors on going back to original partition + auto ret_set = amdsmi_set_gpu_compute_partition(processor_handles_[dv_ind], updatePartition); + EXPECT_TRUE(ret_set == AMDSMI_STATUS_SETTING_UNAVAILABLE + || ret_set== AMDSMI_STATUS_NO_PERM + || ret_set == AMDSMI_STATUS_SUCCESS + || ret_set == AMDSMI_STATUS_BUSY + || ret_set == AMDSMI_STATUS_NOT_SUPPORTED + || ret_set == AMDSMI_STATUS_INVAL); + } + + IF_VERB(STANDARD) { + std::cout << "\n"; + std::cout << "\t**======================================================================\n"; + std::cout << "\t**Test #2: Get/Set Compute Partition (new functionality) ===============\n"; + std::cout << "\t**======================================================================\n"; + } + + // TEST 2: Set/Get Compute Partition (new functionality) + initial_num_devices = num_monitor_devs(); + for (uint32_t dv_ind = 0; dv_ind < initial_num_devices; ++dv_ind) { + if (dv_ind != 0) { + std::cout << "\n"; + } + IF_VERB(STANDARD) { + std::cout 
<< "\n"; + std::cout << "\t**======================================================================\n"; + std::cout << "\t**Test #2: Get/Set Compute Partition (new functionality) ===============\n"; + std::cout << "\t**DEVICE: #" << std::setw(2) << std::setfill('0') << dv_ind + << " ==========================================================\n"; + std::cout << "\t**======================================================================\n"; + } + // waitForUserInput(); // watch for any errors + PrintDeviceHeader(processor_handles_[dv_ind]); + amdsmi_accelerator_partition_profile_t profile = {}; + uint32_t partition_id[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + ret = amdsmi_get_gpu_accelerator_partition_profile(processor_handles_[dv_ind], + &profile, &partition_id[0]); + std::string nps_caps_str = ""; + if ((profile.memory_caps.nps_flags.nps1_cap == 0 + && profile.memory_caps.nps_flags.nps2_cap == 0 + && profile.memory_caps.nps_flags.nps4_cap == 0 + && profile.memory_caps.nps_flags.nps8_cap == 0)) { + nps_caps_str = "N/A"; + } else { + nps_caps_str.clear(); + if (profile.memory_caps.nps_flags.nps1_cap) { + (nps_caps_str.empty()) ? nps_caps_str += "NPS1" : nps_caps_str += ", NPS1"; + } + if (profile.memory_caps.nps_flags.nps2_cap) { + (nps_caps_str.empty()) ? nps_caps_str += "NPS2" : nps_caps_str += ", NPS2"; + } + if (profile.memory_caps.nps_flags.nps4_cap) { + (nps_caps_str.empty()) ? nps_caps_str += "NPS4" : nps_caps_str += ", NPS4"; + } + if (profile.memory_caps.nps_flags.nps8_cap) { + (nps_caps_str.empty()) ? 
nps_caps_str += "NPS8" : nps_caps_str += ", NPS8"; + } + } + + std::string profile_type_str = "N/A"; + if (profile.profile_type == AMDSMI_ACCELERATOR_PARTITION_SPX) { + profile_type_str = "SPX"; + } else if (profile.profile_type == AMDSMI_ACCELERATOR_PARTITION_DPX) { + profile_type_str = "DPX"; + } else if (profile.profile_type == AMDSMI_ACCELERATOR_PARTITION_TPX) { + profile_type_str = "TPX"; + } else if (profile.profile_type == AMDSMI_ACCELERATOR_PARTITION_QPX) { + profile_type_str = "QPX"; + } else if (profile.profile_type == AMDSMI_ACCELERATOR_PARTITION_CPX) { + profile_type_str = "CPX"; + } + + std::string partition_id_str = ""; + for (int i = 0; i < 8; i++) { + partition_id_str += std::to_string(partition_id[i]); + if (i < 7) { + partition_id_str += ", "; + } + + switch (profile.profile_type) { + case AMDSMI_ACCELERATOR_PARTITION_SPX: + EXPECT_LT(partition_id[i], MAX_SPX_PARTITIONS); + break; + case AMDSMI_ACCELERATOR_PARTITION_DPX: + EXPECT_LT(partition_id[i], MAX_DPX_PARTITIONS); + break; + case AMDSMI_ACCELERATOR_PARTITION_TPX: + EXPECT_LT(partition_id[i], MAX_TPX_PARTITIONS); + break; + case AMDSMI_ACCELERATOR_PARTITION_QPX: + EXPECT_LT(partition_id[i], MAX_QPX_PARTITIONS); + break; + case AMDSMI_ACCELERATOR_PARTITION_CPX: { + uint16_t num_xcd; + uint32_t max_xcps = 0; + ret = amdsmi_get_gpu_xcd_counter(processor_handles_[dv_ind], &num_xcd); + if (ret == AMDSMI_STATUS_SUCCESS) { + max_xcps = static_cast(num_xcd); + } + EXPECT_LT(partition_id[i], max_xcps); + break; + } + case AMDSMI_ACCELERATOR_PARTITION_INVALID: + EXPECT_EQ(partition_id[i], MAX_UNSUPPORTED_PARTITIONS); + break; + default: + EXPECT_EQ(partition_id[i], MAX_UNSUPPORTED_PARTITIONS); + break; + } + } + + IF_VERB(STANDARD) { + std::cout << "\t**amdsmi_get_gpu_accelerator_partition_profile(processor_handles_[" + << dv_ind << "], &profile, &partition_id[0]):\n" + << "\t\t" << smi_amdgpu_get_status_string(ret, false) + << "\n\t**Current profile.profile_type: " + << profile_type_str + << 
"\n\t**profile.num_partitions: " + << (profile.num_partitions == kMAX_UINT32 + ? "N/A" : std::to_string(profile.num_partitions)) + << "\n\t**profile.memory_caps: " + << nps_caps_str + << "\n\t**profile.profile_index: " + << (profile.profile_index == kMAX_UINT32 + ? "N/A" : std::to_string(profile.profile_index)) + << "\n\t**profile.num_resources: " + << profile.num_resources + << "\n\t**partition_id: " + << partition_id_str + << std::endl; + } + EXPECT_TRUE(ret == AMDSMI_STATUS_SUCCESS + || ret == AMDSMI_STATUS_NOT_SUPPORTED); + amdsmi_accelerator_partition_profile_config_t profile_config = {}; + ret = amdsmi_get_gpu_accelerator_partition_profile_config(processor_handles_[dv_ind], + &profile_config); + IF_VERB(STANDARD) { + std::cout << "\t**amdsmi_get_gpu_accelerator_partition_profile_config(processor_handles_[" + << dv_ind << "], &profile_config):\n" + << "\t\t" << smi_amdgpu_get_status_string(ret, false) + << "\n\t**profile_config.num_profiles: " + << profile_config.num_profiles + << "\n\t**profile_config.num_resource_profiles: " + << profile_config.num_resource_profiles + << std::endl; + } + AcceleratorProfileConfig original_profile_config = {}; + original_profile_config + = getAvailableProfileConfigs(dv_ind, profile, profile_config, isVerbose); + // waitForUserInput(); // watch for any errors + + IF_VERB(STANDARD) { + std::cout << "\t**=========================================================\n"; + std::cout << "\t**Checking invalid profile Set ============================\n"; + std::cout << "\t**=========================================================\n"; + } + // Test setting invalid profile index + auto ret_expect_invalid = amdsmi_set_gpu_accelerator_partition_profile( + processor_handles_[dv_ind], + profile_config.num_profiles); + IF_VERB(STANDARD) { + std::cout << "\t**amdsmi_set_gpu_accelerator_partition_profile(processor_handles_[" + << dv_ind << "], " << profile_config.num_profiles << "):" + << "\n\t\t" << 
smi_amdgpu_get_status_string(ret_expect_invalid, false) + << std::endl; + } + EXPECT_TRUE(ret_expect_invalid == AMDSMI_STATUS_INVAL + || ret_expect_invalid == AMDSMI_STATUS_NOT_SUPPORTED); + + IF_VERB(STANDARD) { + std::cout << "\t**=========================================================\n"; + std::cout << "\t**Checking valid profile Sets =============================\n"; + std::cout << "\t**=========================================================\n"; + } + int resource_index = 0; + for (uint32_t i = 0; i < profile_config.num_profiles; i++) { + auto current_profile = profile_config.profiles[i]; + std::string profile_type_str = "N/A"; + if (current_profile.profile_type == AMDSMI_ACCELERATOR_PARTITION_SPX) { + profile_type_str = "SPX"; + } else if (current_profile.profile_type == AMDSMI_ACCELERATOR_PARTITION_DPX) { + profile_type_str = "DPX"; + } else if (current_profile.profile_type == AMDSMI_ACCELERATOR_PARTITION_TPX) { + profile_type_str = "TPX"; + } else if (current_profile.profile_type == AMDSMI_ACCELERATOR_PARTITION_QPX) { + profile_type_str = "QPX"; + } else if (current_profile.profile_type == AMDSMI_ACCELERATOR_PARTITION_CPX) { + profile_type_str = "CPX"; + } + + std::string nps_caps_str = ""; + if ((current_profile.memory_caps.nps_flags.nps1_cap == 0 + && current_profile.memory_caps.nps_flags.nps2_cap == 0 + && current_profile.memory_caps.nps_flags.nps4_cap == 0 + && current_profile.memory_caps.nps_flags.nps8_cap == 0)) { + nps_caps_str = "N/A"; + } else { + nps_caps_str.clear(); + if (current_profile.memory_caps.nps_flags.nps1_cap) { + (nps_caps_str.empty()) ? nps_caps_str += "NPS1" : nps_caps_str += ", NPS1"; + } + if (current_profile.memory_caps.nps_flags.nps2_cap) { + (nps_caps_str.empty()) ? nps_caps_str += "NPS2" : nps_caps_str += ", NPS2"; + } + if (current_profile.memory_caps.nps_flags.nps4_cap) { + (nps_caps_str.empty()) ? 
nps_caps_str += "NPS4" : nps_caps_str += ", NPS4"; + } + if (current_profile.memory_caps.nps_flags.nps8_cap) { + (nps_caps_str.empty()) ? nps_caps_str += "NPS8" : nps_caps_str += ", NPS8"; + } + } + IF_VERB(STANDARD) { + std::cout << "\t**profile_config.profiles[" << i << "]:\n" + << "\t\tprofile_type: " << profile_type_str + << "\n\t\tnum_partitions: " << current_profile.num_partitions + << "\n\t\tmemory_caps: " << nps_caps_str + << "\n\t\tcurrent_profile.num_resources: " << current_profile.num_resources + << std::endl; + } + for (uint32_t j = 0; j < current_profile.num_resources; j++) { + auto rp = profile_config.resource_profiles[resource_index]; + + IF_VERB(STANDARD) { + std::cout << "\n\t\t\tprofile_index: " << current_profile.profile_index + << "\n\t\t\tresource_index: " << resource_index + << "\n\t\t\tprofile_config.resource_profiles[" << resource_index + << "].resource_type: " + << getResourceType(rp.resource_type) + << "\n\t\t\tprofile_config.resource_profiles[" << resource_index + << "].partition_resource: " + << rp.partition_resource + << "\n\t\t\tprofile_config.resource_profiles[" << resource_index + << "].num_partitions_share_resource: " + << rp.num_partitions_share_resource + << std::endl; + } + resource_index++; + } + } + EXPECT_TRUE(ret == AMDSMI_STATUS_SUCCESS + || ret == AMDSMI_STATUS_NOT_SUPPORTED); + if (ret == AMDSMI_STATUS_NOT_SUPPORTED) { + IF_VERB(STANDARD) { + std::cout << "\t**" << "amdsmi_get_gpu_accelerator_partition_profile_config: " + << "Not supported on this machine" << std::endl; + } + continue; + } + + for (uint32_t config = 0; config < profile_config.num_profiles; config++) { + auto new_profile = profile_config.profiles[config]; + std::string new_profile_type_str = "N/A"; + if (new_profile.profile_type == AMDSMI_ACCELERATOR_PARTITION_SPX) { + new_profile_type_str = "SPX"; + } else if (new_profile.profile_type == AMDSMI_ACCELERATOR_PARTITION_DPX) { + new_profile_type_str = "DPX"; + } else if (new_profile.profile_type == 
AMDSMI_ACCELERATOR_PARTITION_TPX) { + new_profile_type_str = "TPX"; + } else if (new_profile.profile_type == AMDSMI_ACCELERATOR_PARTITION_QPX) { + new_profile_type_str = "QPX"; + } else if (new_profile.profile_type == AMDSMI_ACCELERATOR_PARTITION_CPX) { + new_profile_type_str = "CPX"; + } + + IF_VERB(STANDARD) { + std::cout << "\t**" + << "======== TEST AMDSMI_ACCELERATOR_PARTITION_" + << new_profile_type_str << " (profile_index: " + << profile_config.profiles[config].profile_index << ")" + << " ===============" << std::endl; + } + // waitForUserInput(); // watch for any errors + + auto ret_set = amdsmi_set_gpu_accelerator_partition_profile( + processor_handles_[dv_ind], + profile_config.profiles[config].profile_index); + IF_VERB(STANDARD) { + std::cout << "\t**" << "amdsmi_set_gpu_accelerator_partition_profile(processor_handles_[" + << dv_ind << "], " << profile_type_str << "): " + << smi_amdgpu_get_status_string(ret_set, false) << "\n" + << "\t**Requested Set Partition: " + << new_profile_type_str << "\n" + << "\t**Original Partition: " + << original_profile_config.original_profile_type_str + << std::endl; + } + EXPECT_TRUE(ret_set == AMDSMI_STATUS_SETTING_UNAVAILABLE + || ret_set== AMDSMI_STATUS_NO_PERM + || ret_set == AMDSMI_STATUS_SUCCESS + || ret_set == AMDSMI_STATUS_BUSY + || ret_set == AMDSMI_STATUS_NOT_SUPPORTED + || ret_set == AMDSMI_STATUS_INVAL); + if (ret_set == AMDSMI_STATUS_INVAL) { + std::cout << "\t**" + << "2nd Test: Due to invalid args, skipping rest of test for this device." + << "\n\t Device might be in a static partition mode. " + << "With inability to change partition modes." 
+ << std::endl; + break; + } + if (ret_set == AMDSMI_STATUS_NOT_SUPPORTED) { + IF_VERB(STANDARD) { + std::cout << "\t**" << "amdsmi_set_gpu_accelerator_partition_profile: " + << "Not supported on this machine" << std::endl; + } + continue; + } + + auto ret_get = amdsmi_get_gpu_accelerator_partition_profile(processor_handles_[dv_ind], + &profile, &partition_id[0]); + if (ret_get == AMDSMI_STATUS_SUCCESS && ret_set == AMDSMI_STATUS_SUCCESS) { + profile_type_str = partition_types_map.at(profile.profile_type); + IF_VERB(STANDARD) { + std::cout << "\t**amdsmi_set_gpu_accelerator_partition_profile(processor_handles_[" + << dv_ind << "]," + << "\n\t\t" << profile_config.profiles[config].profile_index + << " (AMDSMI_ACCELERATOR_PARTITION_" << new_profile_type_str + << "): " + << "\n\t\t" << smi_amdgpu_get_status_string(ret_set, false) + << "\n\t**amdsmi_get_gpu_accelerator_partition_profile(processor_handles_[" + << dv_ind << "], &profile, &partition_id[0]):\n" + << "\t\t" << smi_amdgpu_get_status_string(ret_get, false) + << "\n\t**Current profile.profile_type: " + << profile_type_str + << "\n\t**profile.num_partitions: " + << (profile.num_partitions == kMAX_UINT32 + ? "N/A" : std::to_string(profile.num_partitions)) + << "\n\t**profile.profile_index: " + << (profile.profile_index == kMAX_UINT32 + ? "N/A" : std::to_string(profile.profile_index)) + << std::endl; + } + EXPECT_STREQ(partition_types_map.at(profile.profile_type).c_str(), + new_profile_type_str.c_str()); + EXPECT_EQ(profile.profile_type, new_profile.profile_type); + EXPECT_EQ(profile.profile_index, new_profile.profile_index); + } + } + IF_VERB(STANDARD) { + std::cout << "\t**Device Index: " << dv_ind << std::endl + << "\t**======== Return to original AMDSMI_ACCELERATOR_PARTITION_" + << original_profile_config.original_profile_type_str << " (profile_index: " + << (original_profile_config.original_profile_index == kMAX_UINT32 ? 
+ "N/A" : std::to_string(original_profile_config.original_profile_index)) << ")" + << " ===============" << std::endl; + } + auto ret_set = amdsmi_set_gpu_accelerator_partition_profile( + processor_handles_[dv_ind], + original_profile_config.original_profile_index); + IF_VERB(STANDARD) { + std::cout << "\t**amdsmi_set_gpu_accelerator_partition_profile(processor_handles_[" + << dv_ind << "]," + << "\n\t\t" << original_profile_config.original_profile_index + << " (AMDSMI_ACCELERATOR_PARTITION_" + << original_profile_config.original_profile_type_str + << "): " + << "\n\t\t" << smi_amdgpu_get_status_string(ret_set, false) + << std::endl; + } + EXPECT_TRUE(ret_set == AMDSMI_STATUS_SETTING_UNAVAILABLE + || ret_set== AMDSMI_STATUS_NO_PERM + || ret_set == AMDSMI_STATUS_SUCCESS + || ret_set == AMDSMI_STATUS_BUSY + || ret_set == AMDSMI_STATUS_NOT_SUPPORTED + || ret_set == AMDSMI_STATUS_INVAL); + auto ret_get = amdsmi_get_gpu_accelerator_partition_profile(processor_handles_[dv_ind], + &profile, &partition_id[0]); + IF_VERB(STANDARD) { + std::cout << "\n\t**amdsmi_get_gpu_accelerator_partition_profile(processor_handles_[" + << dv_ind << "], &profile, &partition_id[0]):\n" + << "\t\t" << smi_amdgpu_get_status_string(ret_get, false) + << std::endl; + } + + // older kernels do not support this feature + if (original_profile_config.original_profile_index == kMAX_UINT32) { + EXPECT_EQ(ret_get, AMDSMI_STATUS_NOT_SUPPORTED); + IF_VERB(STANDARD) { + std::cout << "\t**" << "amdsmi_get_gpu_accelerator_partition_profile: " + << "Not supported on this machine, skipping remaining tests." 
<< std::endl; + } + break; + } + + if (ret_get == AMDSMI_STATUS_SUCCESS && ret_set == AMDSMI_STATUS_SUCCESS) { + profile_type_str = partition_types_map.at(profile.profile_type); + IF_VERB(STANDARD) { + std::cout << "\t**amdsmi_set_gpu_accelerator_partition_profile(processor_handles_[" + << dv_ind << "]," + << "\n\t\t" << original_profile_config.original_profile_index + << " (AMDSMI_ACCELERATOR_PARTITION_" + << original_profile_config.original_profile_type_str + << "): " + << "\n\t\t" << smi_amdgpu_get_status_string(ret_set, false) + << "\n\t**amdsmi_get_gpu_accelerator_partition_profile(processor_handles_[" + << dv_ind << "], &profile, &partition_id[0]):\n" + << "\t\t" << smi_amdgpu_get_status_string(ret_get, false) + << "\n\t**Current profile.profile_type: " + << profile_type_str + << "\n\t**profile.num_partitions: " + << (profile.num_partitions == kMAX_UINT32 + ? "N/A" : std::to_string(profile.num_partitions)) + << "\n\t**profile.profile_index: " + << (profile.profile_index == kMAX_UINT32 + ? 
"N/A" : std::to_string(profile.profile_index)) + << std::endl; + } + EXPECT_STREQ(partition_types_map.at(profile.profile_type).c_str(), + original_profile_config.original_profile_type_str.c_str()); + EXPECT_EQ(profile.profile_type, original_profile_config.original_profile_type); + EXPECT_EQ(profile.profile_index, original_profile_config.original_profile_index); + } + } // END for (uint32_t dv_ind = 0; dv_ind < initial_num_devices; ++dv_ind) + + IF_VERB(STANDARD) { + std::cout << "\n"; + std::cout << "\t**======================================================================\n"; + std::cout << "\t**Test #3: Check fluctuating # of devices & partition IDs ==============\n"; + std::cout << "\t**======================================================================\n"; + } + // waitForUserInput(); // watch for any errors on going back to original partition + + // ---------------------------------------------------------// + // TEST 3: Check fluctuating # of devices & partition IDs // + // ---------------------------------------------------------// + initial_num_devices = num_monitor_devs(); + for (uint32_t dv_ind = 0; dv_ind < initial_num_devices; ++dv_ind) { + if (dv_ind != 0) { + std::cout << "\n"; + } + IF_VERB(STANDARD) { + std::cout << "\n"; + std::cout << "\t**======================================================================\n"; + std::cout << "\t**Test #3: Check fluctuating # of devices & partition IDs ==============\n"; + std::cout << "\t**DEVICE: #" << std::setw(2) << std::setfill('0') << dv_ind + << " ========================================================\n"; + std::cout << "\t**======================================================================\n"; + } + // Leaving for debug purposes + // waitForUserInput(); // watch for any errors on going back to original partition + uint32_t device_index = 0; + amdsmi_processor_handle p_handle = {}; + uint32_t current_num_devices = 0; + smi_amdgpu_get_device_count(¤t_num_devices); + 
smi_amdgpu_get_processor_handle_by_index(dv_ind, &p_handle); + smi_amdgpu_get_device_index(p_handle, &device_index); + IF_VERB(STANDARD) { + std::cout << "\t=========== START INDEX/p_handle DEVICE INFO 1 ===============\n"; + std::cout << "\t**Dv_ind: " << dv_ind << std::endl; + std::cout << "\t**Device Index: " << device_index << std::endl; + std::cout << "\t**Processor Handle (processor_handles_[dv_ind]): " + << processor_handles_[dv_ind] << std::endl; + std::cout << "\t**Processor Handle: " << p_handle << std::endl; + std::cout << "\t**Current # of devices: " << current_num_devices << std::endl; + std::cout << "\t=========== END INDEX/p_handle DEVICE INFO 1 =============\n"; + } + + + PrintDeviceHeader(p_handle); + ret = amdsmi_get_gpu_compute_partition(p_handle, orig_char_computePartition, + k255Len); + EXPECT_TRUE(ret == AMDSMI_STATUS_SUCCESS + || ret == AMDSMI_STATUS_NOT_SUPPORTED); + if (ret == AMDSMI_STATUS_NOT_SUPPORTED) { + IF_VERB(STANDARD) { + std::cout << "\t**" << "amdsmi_get_gpu_compute_partition: " + << "Not supported on this machine" << std::endl; + } + continue; + } + for (int partition = static_cast(mapStringToSMIComputePartitionTypes.at( + std::string(orig_char_computePartition))); + partition <= static_cast(AMDSMI_COMPUTE_PARTITION_CPX); + partition++) { + uint32_t device_index2 = 0; + amdsmi_processor_handle p_handle2 = {}; + smi_amdgpu_get_device_count(¤t_num_devices); + smi_amdgpu_get_processor_handle_by_index(dv_ind, &p_handle2); + smi_amdgpu_get_device_index(p_handle2, &device_index2); + IF_VERB(STANDARD) { + std::cout << "\t=========== INDEX/p_handle DEVICE INFO 2 ===============\n"; + std::cout << "\t**Dv_ind: " << dv_ind << std::endl; + std::cout << "\t**Device Index2: " << device_index2 << std::endl; + std::cout << "\t**Processor Handle (processor_handles_[dv_ind]): " + << processor_handles_[dv_ind] << std::endl; + std::cout << "\t**Processor Handle: " << p_handle << std::endl; + std::cout << "\t**Processor Handle2: " << p_handle2 << 
std::endl; + std::cout << "\t**Current # of devices: " << current_num_devices << std::endl; + std::cout << "\t=========== END INDEX/p_handle DEVICE INFO 2 =============\n"; + } + + amdsmi_compute_partition_type_t updatePartition + = static_cast(partition); + auto ret_set = amdsmi_set_gpu_compute_partition(p_handle2, updatePartition); + IF_VERB(STANDARD) { + std::cout << "\t**" << "amdsmi_set_gpu_compute_partition(processor_handles_[" + << dv_ind << "], " << computePartitionString(updatePartition) << "): " + << smi_amdgpu_get_status_string(ret_set, false) << "\n" + << "\t**Requested Set Partition: " + << computePartitionString(updatePartition) << "\n" + << "\t**Original Partition: " << orig_char_computePartition + << std::endl; + } + EXPECT_TRUE(ret_set == AMDSMI_STATUS_SETTING_UNAVAILABLE + || ret_set== AMDSMI_STATUS_NO_PERM + || ret_set == AMDSMI_STATUS_SUCCESS + || ret_set == AMDSMI_STATUS_BUSY + || ret_set == AMDSMI_STATUS_NOT_SUPPORTED + || ret_set == AMDSMI_STATUS_INVAL); + if (ret_set == AMDSMI_STATUS_NOT_SUPPORTED) { + IF_VERB(STANDARD) { + std::cout << "\t**" << "amdsmi_set_gpu_compute_partition: " + << "Not supported on this machine" << std::endl; + } + continue; + } + if (ret_set == AMDSMI_STATUS_INVAL) { + std::cout << "\t**" + << "3rd Test: Due to invalid args, skipping rest of test for this device." + << "\n\t Device might be in a static partition mode. " + << "With inability to change partition modes." 
+ << std::endl; + break; + } + + ret = amdsmi_get_gpu_compute_partition(p_handle2, + current_char_computePartition, + k255Len); + IF_VERB(STANDARD) { + std::cout << "\t**amdsmi_get_gpu_compute_partition(processor_handles_[" << dv_ind << "], " + << current_char_computePartition << "): " + << smi_amdgpu_get_status_string(ret, false) + << "\n\t**Current Partition (get): " + << current_char_computePartition + << std::endl; + } + if (ret_set == AMDSMI_STATUS_SUCCESS) { + EXPECT_EQ(ret, AMDSMI_STATUS_SUCCESS); + EXPECT_EQ(updatePartition, mapStringToSMIComputePartitionTypes.at( + std::string(current_char_computePartition))); + checkPartitionIdChanges(processor_handles_, dv_ind, + std::string(current_char_computePartition), + isVerbose, true); + } else { + EXPECT_EQ(ret, AMDSMI_STATUS_SUCCESS); + EXPECT_NE(updatePartition, mapStringToSMIComputePartitionTypes.at( + std::string(current_char_computePartition))); + } + // waitForUserInput(); // watch for any errors on going back to original partition + } + + uint32_t device_index3 = 0; + amdsmi_processor_handle p_handle3 = {}; + smi_amdgpu_get_processor_handle_by_index(dv_ind, &p_handle3); + smi_amdgpu_get_device_index(p_handle3, &device_index3); + + amdsmi_compute_partition_type_t updatePartition = + static_cast( + mapStringToSMIComputePartitionTypes.at( + std::string(orig_char_computePartition))); + IF_VERB(STANDARD) { + std::cout << "\t**ABOUT TO GO BACK TO ORIGINAL PARTITION (" + << orig_char_computePartition << ")\n"; + } + // waitForUserInput(); // watch for any errors on going back to original partition + auto ret_set = amdsmi_set_gpu_compute_partition(p_handle3, updatePartition); + checkPartitionIdChanges(processor_handles_, dv_ind, std::string(orig_char_computePartition), + isVerbose, true); + if (ret_set == AMDSMI_STATUS_SUCCESS) { + EXPECT_EQ(ret, AMDSMI_STATUS_SUCCESS); + EXPECT_EQ(updatePartition, mapStringToSMIComputePartitionTypes.at( + std::string(orig_char_computePartition))); + } else { + EXPECT_EQ(ret, 
AMDSMI_STATUS_SUCCESS); + // on guest this means we can't change partitions + // some partitions will match the original partition + if (amd::smi::is_vm_guest()) { + EXPECT_EQ(updatePartition, mapStringToSMIComputePartitionTypes.at( + std::string(orig_char_computePartition))); + } else { + EXPECT_EQ(updatePartition, mapStringToSMIComputePartitionTypes.at( + std::string(orig_char_computePartition))); + } + } + IF_VERB(STANDARD) { + std::cout << "\t**Get/Set Test #3 (dev_ind: " + << dv_ind << "): Check fluctuating # of devices & partition IDs ===============\n"; + } + } + + IF_VERB(STANDARD) { + std::cout << "\n"; + std::cout << "\t**======================================================================\n"; + std::cout << "\t**END Tests ============================================================\n"; + std::cout << "\t**======================================================================\n"; + } +} diff --git a/tests/amd_smi_test/functional/computepartition_read_write.h b/tests/amd_smi_test/functional/computepartition_read_write.h new file mode 100755 index 0000000000..5db5e7bd44 --- /dev/null +++ b/tests/amd_smi_test/functional/computepartition_read_write.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef TESTS_AMD_SMI_TEST_FUNCTIONAL_COMPUTEPARTITION_READ_WRITE_H_ +#define TESTS_AMD_SMI_TEST_FUNCTIONAL_COMPUTEPARTITION_READ_WRITE_H_ + +#include "../test_base.h" + +class TestComputePartitionReadWrite : public TestBase { + public: + TestComputePartitionReadWrite(); + + // @Brief: Destructor for test case of TestComputePartitionReadWrite + virtual ~TestComputePartitionReadWrite(); + + // @Brief: Set up the environment for measurement + virtual void SetUp(); + + // @Brief: Core measurement execution + virtual void Run(); + + // @Brief: Clean up and retrieve the resource + virtual void Close(); + + // @Brief: Display results + virtual void DisplayResults() const; + + // @Brief: Display information about what this test does + virtual void DisplayTestInfo(void); +}; + +#endif // TESTS_AMD_SMI_TEST_FUNCTIONAL_COMPUTEPARTITION_READ_WRITE_H_ diff --git a/tests/amd_smi_test/functional/gpu_metrics_read.cc b/tests/amd_smi_test/functional/gpu_metrics_read.cc index 93f1041032..154b79a554 100644 --- a/tests/amd_smi_test/functional/gpu_metrics_read.cc +++ b/tests/amd_smi_test/functional/gpu_metrics_read.cc @@ -35,6 +35,7 @@ #include "gpu_metrics_read.h" #include "../test_common.h" #include "rocm_smi/rocm_smi_utils.h" +#include "amd_smi/impl/amd_smi_utils.h" TestGpuMetricsRead::TestGpuMetricsRead() : TestBase() { @@ -101,6 +102,15 @@ void TestGpuMetricsRead::Run(void) { } } } else { + auto temp_xcd_counter_value = uint16_t(0); + auto ret_xcd =
amdsmi_get_gpu_xcd_counter(processor_handles_[i], &temp_xcd_counter_value); + IF_VERB(STANDARD) { + std::cout << "\t\t** amdsmi_get_gpu_xcd_counter(): " + << smi_amdgpu_get_status_string(ret_xcd, false) + << "\n\t\t** XCD Counter Value: " + << temp_xcd_counter_value + << "\n"; + } CHK_ERR_ASRT(err); IF_VERB(STANDARD) { std::cout << "METRIC TABLE HEADER:\n"; @@ -380,13 +390,5 @@ void TestGpuMetricsRead::Run(void) { amdsmi_status_code_to_string(err, &status_string); std::cout << "\t\t** amdsmi_get_gpu_metrics_info(nullptr check): " << status_string << "\n"; ASSERT_EQ(err, AMDSMI_STATUS_INVAL); - - - // TODO(AMD_SMI_team): add xcd_counter_get for amd smi - // auto temp_xcd_counter_value = uint16_t(0); - // err = rsmi_dev_metrics_xcd_counter_get(i, &temp_xcd_counter_value); - // if (err != RSMI_STATUS_NOT_SUPPORTED) { - // CHK_ERR_ASRT(err); - // } } } diff --git a/tests/amd_smi_test/functional/id_info_read.cc b/tests/amd_smi_test/functional/id_info_read.cc index 4ede24beed..d41a6b3f33 100644 --- a/tests/amd_smi_test/functional/id_info_read.cc +++ b/tests/amd_smi_test/functional/id_info_read.cc @@ -22,11 +22,11 @@ #include #include +#include #include #include - -#include +#include #include "amd_smi/amdsmi.h" #include "id_info_read.h" #include "../test_common.h" @@ -63,6 +63,15 @@ void TestIdInfoRead::Close() { static const uint32_t kBufferLen = 80; +static const std::map< amdsmi_virtualization_mode_t, std::string> + virtualization_mode_map = { + {AMDSMI_VIRTUALIZATION_MODE_UNKNOWN, "UNKNOWN"}, + {AMDSMI_VIRTUALIZATION_MODE_BAREMETAL, "BAREMETAL"}, + { AMDSMI_VIRTUALIZATION_MODE_HOST, "HOST"}, + { AMDSMI_VIRTUALIZATION_MODE_GUEST, "GUEST"}, + {AMDSMI_VIRTUALIZATION_MODE_PASSTHROUGH, "PASSTHROUGH"} +}; + void TestIdInfoRead::Run(void) { amdsmi_status_t err; uint16_t id; @@ -227,5 +236,20 @@ void TestIdInfoRead::Run(void) { // Verify api support checking functionality is working err = amdsmi_get_gpu_bdf_id(processor_handles_[i], nullptr); ASSERT_EQ(err, 
AMDSMI_STATUS_INVAL); + + // Verify api support checking functionality is working + err = amdsmi_get_gpu_virtualization_mode(processor_handles_[i], nullptr); + ASSERT_EQ(err, AMDSMI_STATUS_INVAL); + amdsmi_virtualization_mode_t vmode; + err = amdsmi_get_gpu_virtualization_mode(processor_handles_[i], &vmode); + ASSERT_EQ(err, AMDSMI_STATUS_SUCCESS); + IF_VERB(STANDARD) { + auto it = virtualization_mode_map.find(vmode); + if (it != virtualization_mode_map.end()) { + std::cout << "\t**Virtualization Mode: " << it->second << std::endl; + } else { + std::cout << "\t**Virtualization Mode: MAP TYPE UNKNOWN?" << std::endl; + } + } } } diff --git a/tests/amd_smi_test/functional/memorypartition_read_write.cc b/tests/amd_smi_test/functional/memorypartition_read_write.cc new file mode 100755 index 0000000000..55dfee2286 --- /dev/null +++ b/tests/amd_smi_test/functional/memorypartition_read_write.cc @@ -0,0 +1,744 @@ +/* + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include +#include + +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "../test_base.h" +#include "../test_common.h" +#include "amd_smi/amdsmi.h" +#include "amd_smi/impl/amd_smi_utils.h" +#include "memorypartition_read_write.h" + +const uint32_t MAX_UNSUPPORTED_PARTITIONS = 0; // partition_id value expected for an invalid/unknown profile type +const uint32_t MAX_SPX_PARTITIONS = 1; // Single GPU node; the SPX/DPX/TPX/QPX limits are exclusive upper bounds on partition_id +const uint32_t MAX_DPX_PARTITIONS = 2; +const uint32_t MAX_TPX_PARTITIONS = 3; +const uint32_t MAX_QPX_PARTITIONS = 4; + +TestMemoryPartitionReadWrite::TestMemoryPartitionReadWrite() : TestBase() { + set_title("AMDSMI Memory Partition Read Test"); + set_description("The memory partition tests verifies that the memory " + "partition settings can be read and updated properly."); +} + +TestMemoryPartitionReadWrite::~TestMemoryPartitionReadWrite(void) { +} + +void TestMemoryPartitionReadWrite::SetUp(void) { + TestBase::SetUp(); + + return; +} + +void TestMemoryPartitionReadWrite::DisplayTestInfo(void) { + TestBase::DisplayTestInfo(); +} + +void TestMemoryPartitionReadWrite::DisplayResults(void) const { + TestBase::DisplayResults(); + return; +} + +void TestMemoryPartitionReadWrite::Close() { + // This will close handles opened within rsmitst utility calls and call + // amdsmi_shut_down(), so it should be done after other hsa cleanup + TestBase::Close(); +} + +static const std::string // printable name of a memory partition mode; "UNKNOWN" for any other value +memoryPartitionString(amdsmi_memory_partition_type_t memoryPartitionType) { + switch (memoryPartitionType) { + case AMDSMI_MEMORY_PARTITION_NPS1: + return "NPS1"; + case AMDSMI_MEMORY_PARTITION_NPS2: + return "NPS2"; + case AMDSMI_MEMORY_PARTITION_NPS4: + return "NPS4"; + case AMDSMI_MEMORY_PARTITION_NPS8: + return "NPS8"; + default: + return
"UNKNOWN"; + } +} + +static const std::map<std::string, amdsmi_memory_partition_type_t> // name -> enum; inverse of memoryPartitionString() for NPS1/2/4/8 +mapStringToRSMIMemoryPartitionTypes { + {"NPS1", AMDSMI_MEMORY_PARTITION_NPS1}, + {"NPS2", AMDSMI_MEMORY_PARTITION_NPS2}, + {"NPS4", AMDSMI_MEMORY_PARTITION_NPS4}, + {"NPS8", AMDSMI_MEMORY_PARTITION_NPS8} +}; + +void TestMemoryPartitionReadWrite::Run(void) { + amdsmi_status_t ret, err, ret_set; + constexpr uint32_t k255Len = 255; + constexpr uint32_t k0Len = 0; + char orig_memory_partition[k255Len]; + char current_memory_partition[k255Len]; + orig_memory_partition[0] = '\0'; + current_memory_partition[0] = '\0'; + amdsmi_memory_partition_config_t current_memory_config; + const uint32_t kMAX_UINT32 = std::numeric_limits::max(); + std::map orig_dev_config; // index, ProfileConfig + + TestBase::Run(); + if (setup_failed_) { + std::cout << "** SetUp Failed for this test. Skipping.**" << std::endl; + return; + } + + bool isVerbose = (this->verbosity() && + this->verbosity() >= (this->TestBase::VERBOSE_STANDARD)) ? true: false; + + // Save original memory partition settings (see orig_dev_config ^) + IF_VERB(STANDARD) { + std::cout << "\t**=========================================================\n"; + std::cout << "\t**Save Original Compute Partition Settings ================\n"; + std::cout << "\t**=========================================================\n"; + } + auto initial_num_devices = num_monitor_devs(); + for (uint32_t dv_ind = 0; dv_ind < initial_num_devices; ++dv_ind) { + if (dv_ind != 0) { + std::cout << "\n"; + } + PrintDeviceHeader(processor_handles_[dv_ind]); + amdsmi_accelerator_partition_profile_t profile = {}; + uint32_t partition_id[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + ret = amdsmi_get_gpu_accelerator_partition_profile(processor_handles_[dv_ind], + &profile, &partition_id[0]); + std::string nps_caps_str = ""; + if ((profile.memory_caps.nps_flags.nps1_cap == 0 + && profile.memory_caps.nps_flags.nps2_cap == 0 + && profile.memory_caps.nps_flags.nps4_cap == 0 + && profile.memory_caps.nps_flags.nps8_cap == 0)) { +
nps_caps_str = "N/A"; + } else { + nps_caps_str.clear(); + if (profile.memory_caps.nps_flags.nps1_cap) { + (nps_caps_str.empty()) ? nps_caps_str += "NPS1" : nps_caps_str += ", NPS1"; + } + if (profile.memory_caps.nps_flags.nps2_cap) { + (nps_caps_str.empty()) ? nps_caps_str += "NPS2" : nps_caps_str += ", NPS2"; + } + if (profile.memory_caps.nps_flags.nps4_cap) { + (nps_caps_str.empty()) ? nps_caps_str += "NPS4" : nps_caps_str += ", NPS4"; + } + if (profile.memory_caps.nps_flags.nps8_cap) { + (nps_caps_str.empty()) ? nps_caps_str += "NPS8" : nps_caps_str += ", NPS8"; + } + } + + std::string profile_type_str = "N/A"; + if (profile.profile_type == AMDSMI_ACCELERATOR_PARTITION_SPX) { + profile_type_str = "SPX"; + } else if (profile.profile_type == AMDSMI_ACCELERATOR_PARTITION_DPX) { + profile_type_str = "DPX"; + } else if (profile.profile_type == AMDSMI_ACCELERATOR_PARTITION_TPX) { + profile_type_str = "TPX"; + } else if (profile.profile_type == AMDSMI_ACCELERATOR_PARTITION_QPX) { + profile_type_str = "QPX"; + } else if (profile.profile_type == AMDSMI_ACCELERATOR_PARTITION_CPX) { + profile_type_str = "CPX"; + } + + std::string partition_id_str = ""; + for (int i = 0; i < 8; i++) { + partition_id_str += std::to_string(partition_id[i]); + if (i < 7) { + partition_id_str += ", "; + } + + switch (profile.profile_type) { + case AMDSMI_ACCELERATOR_PARTITION_SPX: + EXPECT_LT(partition_id[i], MAX_SPX_PARTITIONS); + break; + case AMDSMI_ACCELERATOR_PARTITION_DPX: + EXPECT_LT(partition_id[i], MAX_DPX_PARTITIONS); + break; + case AMDSMI_ACCELERATOR_PARTITION_TPX: + EXPECT_LT(partition_id[i], MAX_TPX_PARTITIONS); + break; + case AMDSMI_ACCELERATOR_PARTITION_QPX: + EXPECT_LT(partition_id[i], MAX_QPX_PARTITIONS); + break; + case AMDSMI_ACCELERATOR_PARTITION_CPX: { + uint16_t num_xcd; + uint32_t max_xcps = 0; + ret = amdsmi_get_gpu_xcd_counter(processor_handles_[dv_ind], &num_xcd); + if (ret == AMDSMI_STATUS_SUCCESS) { + max_xcps = static_cast(num_xcd); + } + 
EXPECT_LT(partition_id[i], max_xcps); + break; + } + case AMDSMI_ACCELERATOR_PARTITION_INVALID: + EXPECT_EQ(partition_id[i], MAX_UNSUPPORTED_PARTITIONS); + break; + default: + EXPECT_EQ(partition_id[i], MAX_UNSUPPORTED_PARTITIONS); + break; + } + } + + IF_VERB(STANDARD) { + std::cout << "\t**amdsmi_get_gpu_accelerator_partition_profile(processor_handles_[" + << dv_ind << "], &profile, &partition_id[0]):\n" + << "\t\t" << smi_amdgpu_get_status_string(ret, false) + << "\n\t**Current profile.profile_type: " + << profile_type_str + << "\n\t**profile.num_partitions: " + << (profile.num_partitions == kMAX_UINT32 + ? "N/A" : std::to_string(profile.num_partitions)) + << "\n\t**profile.memory_caps: " + << nps_caps_str + << "\n\t**profile.profile_index: " + << (profile.profile_index == kMAX_UINT32 + ? "N/A" : std::to_string(profile.profile_index)) + << "\n\t**profile.num_resources: " + << profile.num_resources + << "\n\t**partition_id: " + << partition_id_str + << std::endl; + } + EXPECT_TRUE(ret == AMDSMI_STATUS_SUCCESS + || ret == AMDSMI_STATUS_NOT_SUPPORTED); + amdsmi_accelerator_partition_profile_config_t profile_config = {}; + ret = amdsmi_get_gpu_accelerator_partition_profile_config(processor_handles_[dv_ind], + &profile_config); + IF_VERB(STANDARD) { + std::cout << "\t**amdsmi_get_gpu_accelerator_partition_profile_config(processor_handles_[" + << dv_ind << "], &profile_config):\n" + << "\t\t" << smi_amdgpu_get_status_string(ret, false) + << "\n\t**profile_config.num_profiles: " + << profile_config.num_profiles + << "\n\t**profile_config.num_resource_profiles: " + << profile_config.num_resource_profiles + << std::endl; + } + AcceleratorProfileConfig original_profile_config = + getAvailableProfileConfigs(dv_ind, profile, profile_config, isVerbose); + orig_dev_config[dv_ind] = original_profile_config; + // waitForUserInput(); // watch for any errors + + IF_VERB(STANDARD) { + std::cout << "\t**=========================================================\n"; + std::cout << 
"\t**Checking valid profile Sets =============================\n"; + std::cout << "\t**=========================================================\n"; + } + int resource_index = 0; + for (uint32_t i = 0; i < profile_config.num_profiles; i++) { + auto current_profile = profile_config.profiles[i]; + std::string profile_type_str = "N/A"; + if (current_profile.profile_type == AMDSMI_ACCELERATOR_PARTITION_SPX) { + profile_type_str = "SPX"; + } else if (current_profile.profile_type == AMDSMI_ACCELERATOR_PARTITION_DPX) { + profile_type_str = "DPX"; + } else if (current_profile.profile_type == AMDSMI_ACCELERATOR_PARTITION_TPX) { + profile_type_str = "TPX"; + } else if (current_profile.profile_type == AMDSMI_ACCELERATOR_PARTITION_QPX) { + profile_type_str = "QPX"; + } else if (current_profile.profile_type == AMDSMI_ACCELERATOR_PARTITION_CPX) { + profile_type_str = "CPX"; + } + + std::string nps_caps_str = ""; + if ((current_profile.memory_caps.nps_flags.nps1_cap == 0 + && current_profile.memory_caps.nps_flags.nps2_cap == 0 + && current_profile.memory_caps.nps_flags.nps4_cap == 0 + && current_profile.memory_caps.nps_flags.nps8_cap == 0)) { + nps_caps_str = "N/A"; + } else { + nps_caps_str.clear(); + if (current_profile.memory_caps.nps_flags.nps1_cap) { + (nps_caps_str.empty()) ? nps_caps_str += "NPS1" : nps_caps_str += ", NPS1"; + } + if (current_profile.memory_caps.nps_flags.nps2_cap) { + (nps_caps_str.empty()) ? nps_caps_str += "NPS2" : nps_caps_str += ", NPS2"; + } + if (current_profile.memory_caps.nps_flags.nps4_cap) { + (nps_caps_str.empty()) ? nps_caps_str += "NPS4" : nps_caps_str += ", NPS4"; + } + if (current_profile.memory_caps.nps_flags.nps8_cap) { + (nps_caps_str.empty()) ? 
nps_caps_str += "NPS8" : nps_caps_str += ", NPS8"; + } + } + IF_VERB(STANDARD) { + std::cout << "\t**profile_config.profiles[" << i << "]:\n" + << "\t\tprofile_type: " << profile_type_str + << "\n\t\tnum_partitions: " << current_profile.num_partitions + << "\n\t\tmemory_caps: " << nps_caps_str + << "\n\t\tcurrent_profile.num_resources: " << current_profile.num_resources + << std::endl; + } + for (auto j = 0; j < current_profile.num_resources; j++) { + auto rp = profile_config.resource_profiles[resource_index]; + + IF_VERB(STANDARD) { + std::cout << "\n\t\t\tprofile_index: " << current_profile.profile_index + << "\n\t\t\tresource_index: " << resource_index + << "\n\t\t\tprofile_config.resource_profiles[" << resource_index + << "].resource_type: " + << getResourceType(rp.resource_type) + << "\n\t\t\tprofile_config.resource_profiles[" << resource_index + << "].partition_resource: " + << rp.partition_resource + << "\n\t\t\tprofile_config.resource_profiles[" << resource_index + << "].num_partitions_share_resource: " + << rp.num_partitions_share_resource + << std::endl; + } + resource_index++; + } + } + EXPECT_TRUE(ret == AMDSMI_STATUS_SUCCESS + || ret == AMDSMI_STATUS_NOT_SUPPORTED); + if (ret == AMDSMI_STATUS_NOT_SUPPORTED) { + IF_VERB(STANDARD) { + std::cout << "\t**" << "amdsmi_get_gpu_accelerator_partition_profile_config: " + << "Not supported on this machine" << std::endl; + } + continue; + } + } + + // Run memory partition tests + uint32_t current_num_devices = 0; + smi_amdgpu_get_device_count(&current_num_devices); + + IF_VERB(STANDARD) { + std::cout << "\t**Total Num Devices: " << current_num_devices << std::endl; + } + // Leaving for debug purposes - uncomment to test a specific number of devices + // uint32_t num_devices_to_test = promptNumDevicesToTest(current_num_devices); + uint32_t num_devices_to_test = current_num_devices; + for (uint32_t dv_ind = 0; dv_ind < num_devices_to_test; ++dv_ind) { + bool wasSetSuccess = false; + if (dv_ind != 0) { + IF_VERB(STANDARD)
{ + std::cout << std::endl; + } + } + PrintDeviceHeader(processor_handles_[dv_ind]); + + // Standard checks to see if API is supported, before running full tests + ret = amdsmi_get_gpu_memory_partition( + processor_handles_[dv_ind], orig_memory_partition, k255Len); + if (ret == AMDSMI_STATUS_NOT_SUPPORTED) { + IF_VERB(STANDARD) { + std::cout << "\t**" << ": " + << "Not supported on this machine" << std::endl; + } + continue; + } else { + CHK_ERR_ASRT(ret) + } + IF_VERB(STANDARD) { + std::cout << std::endl << "\t**Current Memory Partition: " + << orig_memory_partition << std::endl; + } + + if ((orig_memory_partition == nullptr) || + (orig_memory_partition[0] == '\0')) { + std::cout << "***System memory partition value is not defined or received" + " unexpected data. Skip memory partition test." << std::endl; + continue; + } + ASSERT_TRUE(ret == AMDSMI_STATUS_SUCCESS); + + // Verify api support checking functionality is working + constexpr uint32_t k2Len = 2; + char smallBuffer[k2Len]; + err = amdsmi_get_gpu_memory_partition(processor_handles_[dv_ind], smallBuffer, k2Len); + uint32_t size = static_cast(sizeof(smallBuffer)/sizeof(*smallBuffer)); + ASSERT_EQ(err, AMDSMI_STATUS_INSUFFICIENT_SIZE); + ASSERT_EQ(k2Len, size); + if (err == AMDSMI_STATUS_INSUFFICIENT_SIZE) { + IF_VERB(STANDARD) { + std::cout << "\t**" + << "Confirmed AMDSMI_STATUS_INSUFFICIENT_SIZE was returned " + << "and size is 2, as requested." << std::endl; + } + } + + // Verify api support checking functionality is working + err = amdsmi_get_gpu_memory_partition(processor_handles_[dv_ind], nullptr, k255Len); + ASSERT_EQ(err, AMDSMI_STATUS_INVAL); + + if (err == AMDSMI_STATUS_INVAL) { + IF_VERB(STANDARD) { + std::cout << "\t**amdsmi_get_gpu_memory_partition(processor_handles_[" << dv_ind << "], " + << "nullptr, 255): " + << "Confirmed AMDSMI_STATUS_INVAL was returned." 
+ << std::endl; + } + } + + err = amdsmi_get_gpu_memory_partition_config(processor_handles_[dv_ind], nullptr); + ASSERT_EQ(err, AMDSMI_STATUS_INVAL); + + if (err == AMDSMI_STATUS_INVAL) { + IF_VERB(STANDARD) { + std::cout << "\t**amdsmi_get_gpu_memory_partition(processor_handles_[" << dv_ind + << "], nullptr): Confirmed AMDSMI_STATUS_INVAL was returned." + << std::endl; + } + } + + // Verify api support checking functionality is working + err = amdsmi_get_gpu_memory_partition(processor_handles_[dv_ind], orig_memory_partition, k0Len); + ASSERT_TRUE(err == AMDSMI_STATUS_INVAL); + if (err == AMDSMI_STATUS_INVAL) { + IF_VERB(STANDARD) { + std::cout << "\t**amdsmi_get_gpu_memory_partition(processor_handles_[" << dv_ind << "], " + << "orig_memory_partition, 0): " + << "Confirmed AMDSMI_STATUS_INVAL was returned." + << std::endl; + } + } + + amdsmi_memory_partition_config_t* null_memory_partition_config = nullptr; + err = amdsmi_get_gpu_memory_partition_config(processor_handles_[dv_ind], + null_memory_partition_config); + ASSERT_TRUE((err == AMDSMI_STATUS_INVAL) || + (err == AMDSMI_STATUS_NOT_SUPPORTED)); + if (err == AMDSMI_STATUS_INVAL) { + IF_VERB(STANDARD) { + std::cout << "\t**" + << "amdsmi_get_gpu_memory_partition_config(processor_handles_[" << dv_ind << "], " + << "nullptr): " + << "Confirmed AMDSMI_STATUS_INVAL was returned." + << std::endl; + } + } + + /****************************************/ + /* amdsmi_set_gpu_memory_partition(...) 
*/ + /****************************************/ + // Verify api support checking functionality is working + amdsmi_memory_partition_type_t null_memory_partition = {}; + err = amdsmi_set_gpu_memory_partition_mode(processor_handles_[dv_ind], null_memory_partition); + std::cout << "\t**amdsmi_set_gpu_memory_partition(amdsmi_set_gpu_memory_partition_mode" + << "(processor_handles_[" << dv_ind << "], nullptr): " + << smi_amdgpu_get_status_string(err, false) << "\n"; + // Note: new_memory_partition is not set + ASSERT_TRUE(err == AMDSMI_STATUS_INVAL); + if (err == AMDSMI_STATUS_INVAL) { + IF_VERB(STANDARD) { + std::cout << "\t**" + << "Confirmed AMDSMI_STATUS_INVAL was returned." + << std::endl; + } + } else if (err == AMDSMI_STATUS_NOT_SUPPORTED) { + IF_VERB(STANDARD) { + std::cout << "\t**" << ": " + << "amdsmi_set_gpu_memory_partition_mode not supported on this " + << "device\n\t (if amdsmi_get_gpu_memory_partition works, " + << "then likely need to set in bios)" + << std::endl; + } + continue; + } else { + DISPLAY_AMDSMI_ERR(err) + } + ASSERT_FALSE(err == AMDSMI_STATUS_NO_PERM); + + // Verify api support checking functionality is working + amdsmi_memory_partition_type_t new_memory_partition = AMDSMI_MEMORY_PARTITION_UNKNOWN; + err = amdsmi_set_gpu_memory_partition_mode(processor_handles_[dv_ind], new_memory_partition); + ASSERT_TRUE((err == AMDSMI_STATUS_INVAL) || + (err == AMDSMI_STATUS_NOT_SUPPORTED) || + (err == AMDSMI_STATUS_NO_PERM)); + if (err == AMDSMI_STATUS_INVAL) { + IF_VERB(STANDARD) { + std::cout << "\t**" + << "Confirmed AMDSMI_STATUS_INVAL was returned." 
+ << std::endl; + } else if (err == AMDSMI_STATUS_NO_PERM) { + DISPLAY_AMDSMI_ERR(err) + // tests should not continue if err is a permission issue + ASSERT_FALSE(err == AMDSMI_STATUS_NO_PERM); + } else { + DISPLAY_AMDSMI_ERR(err) + } + } + + // Re-run original get, so we can reset to later + ret = amdsmi_get_gpu_memory_partition(processor_handles_[dv_ind], + orig_memory_partition, k255Len); + ASSERT_EQ(AMDSMI_STATUS_SUCCESS, ret); + + for (int partition = static_cast<int>(AMDSMI_MEMORY_PARTITION_NPS1); + partition <= static_cast<int>(AMDSMI_MEMORY_PARTITION_NPS8); + partition++) { + ret_set = AMDSMI_STATUS_NOT_SUPPORTED; + wasSetSuccess = false; + new_memory_partition = static_cast<amdsmi_memory_partition_type_t>(partition); + if (new_memory_partition != AMDSMI_MEMORY_PARTITION_NPS1 + && new_memory_partition != AMDSMI_MEMORY_PARTITION_NPS2 + && new_memory_partition != AMDSMI_MEMORY_PARTITION_NPS4 + && new_memory_partition != AMDSMI_MEMORY_PARTITION_NPS8) { + continue; // skip unknown partition, this is already tested above ^ + } + IF_VERB(STANDARD) { + std::cout << std::endl; + std::cout << "\t**" + << "======== TEST AMDSMI_MEMORY_PARTITION_" + << memoryPartitionString(new_memory_partition) + << " ===============" << std::endl; + } + IF_VERB(STANDARD) { + std::cout << "\t**" + << "Attempting to set memory partition to: " + << memoryPartitionString(new_memory_partition) << std::endl; + } + + auto ret_caps = amdsmi_get_gpu_memory_partition_config(processor_handles_[dv_ind], + &current_memory_config); + std::string memory_caps_str = "N/A"; + if (ret_caps == AMDSMI_STATUS_SUCCESS) { + memory_caps_str.clear(); + if (current_memory_config.partition_caps.nps_flags.nps1_cap) { + memory_caps_str += (memory_caps_str.empty() ? "NPS1" : ", NPS1"); + } + if (current_memory_config.partition_caps.nps_flags.nps2_cap) { + memory_caps_str += (memory_caps_str.empty() ? "NPS2" : ", NPS2"); + } + if (current_memory_config.partition_caps.nps_flags.nps4_cap) { + memory_caps_str += (memory_caps_str.empty() ?
"NPS4" : ", NPS4"); + } + if (current_memory_config.partition_caps.nps_flags.nps8_cap) { + memory_caps_str += (memory_caps_str.empty() ? "NPS8" : ", NPS8"); + } + } + + IF_VERB(STANDARD) { + std::cout << "\t**" + << "amdsmi_get_gpu_memory_partition_config(processor_handles_[" << dv_ind + << "], current_memory_config): " + << smi_amdgpu_get_status_string(ret_caps, false) << std::endl; + std::cout << "\t**" << "Available Memory Partition Capabilities: " + << memory_caps_str << "\n" + << "\t**" << "current_memory_partition_mode: " + << memoryPartitionString(current_memory_config.mp_mode) << "\n" + << "\t**" << "num_numa_ranges: " + << current_memory_config.num_numa_ranges + << std::endl; + } + ASSERT_TRUE((ret_caps == AMDSMI_STATUS_NOT_SUPPORTED) || + (ret_caps == AMDSMI_STATUS_SUCCESS)); + + ret_set = amdsmi_set_gpu_memory_partition_mode(processor_handles_[dv_ind], + new_memory_partition); + IF_VERB(STANDARD) { + std::cout << "\t**" << "amdsmi_set_gpu_memory_partition_mode(processor_handles_[" + << dv_ind << "], " << memoryPartitionString(new_memory_partition) << "): " + << smi_amdgpu_get_status_string(ret_set, false) << "\n"; + } + if (ret_set == AMDSMI_STATUS_NOT_SUPPORTED) { + IF_VERB(STANDARD) { + std::cout << "\t**" << ": " + << "Not supported on this machine" << std::endl; + } + break; + } else { + ASSERT_TRUE((ret_set == AMDSMI_STATUS_SUCCESS) + || (ret_set == AMDSMI_STATUS_BUSY) + || (ret_set == AMDSMI_STATUS_AMDGPU_RESTART_ERR) + || (ret_set == AMDSMI_STATUS_INVAL) + || (ret_set == AMDSMI_STATUS_NOT_SUPPORTED)); + } + + if (ret_set == AMDSMI_STATUS_SUCCESS) { // do not continue trying to reset + wasSetSuccess = true; + } + + ret = amdsmi_get_gpu_memory_partition_config(processor_handles_[dv_ind], + &current_memory_config); + if (ret == AMDSMI_STATUS_NOT_SUPPORTED) { + IF_VERB(STANDARD) { + std::cout << "\t**" << "amdsmi_get_gpu_memory_partition_config(): " + << "Not supported on this machine" << std::endl; + } + continue; + } + CHK_ERR_ASRT(ret) +
IF_VERB(STANDARD) { + std::cout << "\t**" + << "Current memory partition: " + << memoryPartitionString(current_memory_config.mp_mode) + << std::endl; + } + if (wasSetSuccess) { + ASSERT_EQ(AMDSMI_STATUS_SUCCESS, ret_set); + ASSERT_STREQ(memoryPartitionString(new_memory_partition).c_str(), + memoryPartitionString(current_memory_config.mp_mode).c_str()); + CHK_ERR_ASRT(ret_set) + } else { + ASSERT_NE(AMDSMI_STATUS_SUCCESS, ret_set); + ASSERT_STRNE(memoryPartitionString(new_memory_partition).c_str(), + memoryPartitionString(current_memory_config.mp_mode).c_str()); + } + } // END MEMORY PARTITION FOR LOOP + + /* TEST RETURN TO ORIGINAL MEMORY PARTITION SETTING */ + IF_VERB(STANDARD) { + std::cout << std::endl; + std::cout << "\t**" + << "=========== TEST RETURN TO ORIGINAL MEMORY PARTITION " + << "SETTING (" << orig_memory_partition + << ") ========" << std::endl; + } + + ret = amdsmi_get_gpu_memory_partition_config(processor_handles_[dv_ind], + &current_memory_config); + CHK_ERR_ASRT(ret) + IF_VERB(STANDARD) { + std::cout << "\t**" + << "amdsmi_get_gpu_memory_partition_config(processor_handles_[" << dv_ind + << "], current_memory_config): " + << smi_amdgpu_get_status_string(ret, false) << std::endl; + std::cout << "\t**" + << "Current memory partition: " + << memoryPartitionString(current_memory_config.mp_mode) + << std::endl; + } + + new_memory_partition + = mapStringToRSMIMemoryPartitionTypes.at(orig_memory_partition); + IF_VERB(STANDARD) { + std::cout << "\t**" << "Returning memory partition to: " + << memoryPartitionString(new_memory_partition) << std::endl; + } + ret = amdsmi_set_gpu_memory_partition(processor_handles_[dv_ind], new_memory_partition); + IF_VERB(STANDARD) { + std::cout << "\t**" + << "amdsmi_set_gpu_memory_partition(processor_handles_[" << dv_ind + << "], " << orig_memory_partition << "): " + << smi_amdgpu_get_status_string(ret, false) << std::endl; + } + CHK_ERR_ASRT(ret) + ret = amdsmi_get_gpu_memory_partition(processor_handles_[dv_ind], +
current_memory_partition, k255Len); + CHK_ERR_ASRT(ret) + IF_VERB(STANDARD) { + std::cout << "\t**" << "Attempted to set memory partition: " + << memoryPartitionString(new_memory_partition) << std::endl + << "\t**" << "Current memory partition: " + << current_memory_partition + << std::endl; + } + ASSERT_EQ(AMDSMI_STATUS_SUCCESS, ret); + ASSERT_STREQ(orig_memory_partition, current_memory_partition); + IF_VERB(STANDARD) { + std::cout << "\t**" + << "Confirmed prior memory partition (" << orig_memory_partition + << ") is equal to current memory partition (" + << current_memory_partition << ")" << std::endl; + } + } // END DEVICE FOR LOOP + + // Restore original compute partition settings (see orig_dev_config ^) + IF_VERB(STANDARD) { + std::cout << "\t**=========================================================\n"; + std::cout << "\t**Restore Original Compute Partition Settings =============\n"; + std::cout << "\t**=========================================================\n"; + } + initial_num_devices = num_monitor_devs(); + for (uint32_t dv_ind = 0; dv_ind < initial_num_devices; ++dv_ind) { + if (dv_ind != 0) { + std::cout << "\n"; + } + PrintDeviceHeader(processor_handles_[dv_ind]); + + AcceleratorProfileConfig original_profile_config = orig_dev_config[dv_ind]; + + // Return to original profile + IF_VERB(STANDARD) { + std::cout << "\t**Device Index: " << dv_ind << std::endl + << "\t**======== Return to original AMDSMI_ACCELERATOR_PARTITION_" + << original_profile_config.original_profile_type_str + << " (profile_index: " + << (original_profile_config.original_profile_index == kMAX_UINT32 + ? 
"N/A" : std::to_string(original_profile_config.original_profile_index)) + << ")" + << " ===============" << std::endl; + } + auto ret_set = amdsmi_set_gpu_accelerator_partition_profile( + processor_handles_[dv_ind], + original_profile_config.original_profile_index); + EXPECT_TRUE((ret_set == AMDSMI_STATUS_SETTING_UNAVAILABLE) + || (ret_set== AMDSMI_STATUS_NO_PERM) + || (ret_set == AMDSMI_STATUS_SUCCESS) + || ret_set == AMDSMI_STATUS_BUSY + || ret_set == AMDSMI_STATUS_NOT_SUPPORTED); + amdsmi_accelerator_partition_profile_t profile = {}; + uint32_t partition_id[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + auto ret_get = amdsmi_get_gpu_accelerator_partition_profile(processor_handles_[dv_ind], + &profile, &partition_id[0]); + if (ret_get == AMDSMI_STATUS_SUCCESS && ret_set == AMDSMI_STATUS_SUCCESS) { + std::string profile_type_str = partition_types_map.at(profile.profile_type); + IF_VERB(STANDARD) { + std::cout << "\t**amdsmi_set_gpu_accelerator_partition_profile(processor_handles_[" + << dv_ind << "]," + << "\n\t\t" << original_profile_config.original_profile_index + << " (AMDSMI_ACCELERATOR_PARTITION_" + << original_profile_config.original_profile_type_str + << "): " + << "\n\t\t" << smi_amdgpu_get_status_string(ret_set, false) + << "\n\t**amdsmi_get_gpu_accelerator_partition_profile(processor_handles_[" + << dv_ind << "], &profile, &partition_id[0]):\n" + << "\t\t" << smi_amdgpu_get_status_string(ret_get, false) + << "\n\t**Current profile.profile_type: " + << profile_type_str + << "\n\t**profile.num_partitions: " + << (profile.num_partitions == kMAX_UINT32 + ? "N/A" : std::to_string(profile.num_partitions)) + << "\n\t**profile.profile_index: " + << (profile.profile_index == kMAX_UINT32 + ? 
"N/A" : std::to_string(profile.profile_index)) + << std::endl; + } + EXPECT_STREQ(partition_types_map.at(profile.profile_type).c_str(), + original_profile_config.original_profile_type_str.c_str()); + EXPECT_EQ(profile.profile_type, original_profile_config.original_profile_type); + EXPECT_EQ(profile.profile_index, original_profile_config.original_profile_index); + } else { + IF_VERB(STANDARD) { + std::cout << "\t**Could not change or read profiles. " + << "Skipping return to original profile on this device." + << "\n\t**amdsmi_set_gpu_accelerator_partition_profile(): " + << smi_amdgpu_get_status_string(ret_set, false) + << "\n\t**amdsmi_get_gpu_accelerator_partition_profile(): " + << smi_amdgpu_get_status_string(ret_get, false) + << std::endl; + } + } + } +} diff --git a/tests/amd_smi_test/functional/memorypartition_read_write.h b/tests/amd_smi_test/functional/memorypartition_read_write.h new file mode 100755 index 0000000000..230d03572c --- /dev/null +++ b/tests/amd_smi_test/functional/memorypartition_read_write.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef TESTS_AMD_SMI_TEST_FUNCTIONAL_MEMORYPARTITION_READ_WRITE_H_ +#define TESTS_AMD_SMI_TEST_FUNCTIONAL_MEMORYPARTITION_READ_WRITE_H_ + +#include "../test_base.h" + +class TestMemoryPartitionReadWrite : public TestBase { + public: + TestMemoryPartitionReadWrite(); + + // @Brief: Destructor for test case of TestMemoryPartitionReadWrite + virtual ~TestMemoryPartitionReadWrite(); + + // @Brief: Setup the environment for measurement + virtual void SetUp(); + + // @Brief: Core measurement execution + virtual void Run(); + + // @Brief: Clean up and retrieve the resource + virtual void Close(); + + // @Brief: Display results + virtual void DisplayResults() const; + + // @Brief: Display information about what this test does + virtual void DisplayTestInfo(void); +}; + +#endif // TESTS_AMD_SMI_TEST_FUNCTIONAL_MEMORYPARTITION_READ_WRITE_H_ diff --git a/tests/amd_smi_test/main.cc b/tests/amd_smi_test/main.cc index da26fcbab5..dac5f94ecf 100644 --- a/tests/amd_smi_test/main.cc +++ b/tests/amd_smi_test/main.cc @@ -64,6 +64,8 @@ #include "functional/version_read.h" #include "functional/mutual_exclusion.h" #include "functional/init_shutdown_refcount.h" +#include "functional/memorypartition_read_write.h" +#include "functional/computepartition_read_write.h" static AMDSMITstGlobals *sRSMIGlvalues = nullptr; @@ -250,8 +252,17 @@ TEST(amdsmitstReadOnly, TestMutualExclusion) { RunCustomTestEpilog(&tst); } */ -// TODO: add TestComputePartitionReadWrite -// TODO: add TestMemoryPartitionReadWrite + +TEST(amdsmitstReadWrite, TestComputePartitionReadWrite) { + TestComputePartitionReadWrite tst; + RunGenericTest(&tst); +} + +TEST(amdsmitstReadWrite, TestMemoryPartitionReadWrite) { + TestMemoryPartitionReadWrite
tst; + RunGenericTest(&tst); +} + TEST(amdsmitstReadWrite, TestEvtNotifReadWrite) { TestEvtNotifReadWrite tst; RunGenericTest(&tst); diff --git a/tests/amd_smi_test/test_base.cc b/tests/amd_smi_test/test_base.cc index c4c7fe96b3..564ea5dea6 100644 --- a/tests/amd_smi_test/test_base.cc +++ b/tests/amd_smi_test/test_base.cc @@ -20,12 +20,14 @@ * THE SOFTWARE. */ +#include #include +#include #include "amd_smi/amdsmi.h" +#include "amd_smi/impl/amd_smi_utils.h" #include "test_base.h" #include "test_common.h" -#include static const int kOutputLineLength = 80; static const char kLabelDelimiter[] = "####"; @@ -136,8 +138,21 @@ void TestBase::SetUp(uint64_t init_flags) { void TestBase::PrintDeviceHeader(amdsmi_processor_handle dv_ind) { amdsmi_status_t err; uint16_t val_ui16; + uint32_t val_ui32; amdsmi_asic_info_t info; + err = smi_amdgpu_get_device_count(&val_ui32); + CHK_ERR_ASRT(err) + IF_VERB(STANDARD) { + std::cout << "\t**Total Devices: " << val_ui32 << std::endl; + } + + err = smi_amdgpu_get_device_index(dv_ind, &val_ui32); + CHK_ERR_ASRT(err) + IF_VERB(STANDARD) { + std::cout << "\t**AMD SMI Device index: " << val_ui32 << std::endl; + } + IF_VERB(STANDARD) { std::cout << "\t**Device handle: " << dv_ind << std::endl; } @@ -168,6 +183,15 @@ void TestBase::PrintDeviceHeader(amdsmi_processor_handle dv_ind) { } } + amdsmi_asic_info_t asic_info; + err = amdsmi_get_gpu_asic_info(dv_ind, &asic_info); + CHK_ERR_ASRT(err) + IF_VERB(STANDARD) { + std::cout << "\t**Market name: " << asic_info.market_name << std::endl; + std::cout << "\t**ASIC serial: 0x" << std::hex << asic_info.asic_serial << std::endl; + std::cout << "\t**Target GFX Version: gfx" << asic_info.target_graphics_version << std::endl; + } + err = amdsmi_get_gpu_subsystem_id(dv_ind, &val_ui16); CHK_ERR_ASRT(err) IF_VERB(STANDARD) { @@ -234,3 +258,137 @@ void TestBase::set_description(std::string d) { } } +TestBase::AcceleratorProfileConfig TestBase::getAvailableProfileConfigs( + uint32_t device_index, + 
amdsmi_accelerator_partition_profile_t current_profile, + amdsmi_accelerator_partition_profile_config_t config, + bool isVerbose) { + AcceleratorProfileConfig profile_config = {}; + profile_config.number_of_profiles = config.num_profiles; + profile_config.original_profile_type = current_profile.profile_type; + profile_config.original_profile_index = current_profile.profile_index; + profile_config.original_profile_type_str = + partition_types_map.at(current_profile.profile_type); + profile_config.available_profiles = std::vector( + config.num_profiles); + profile_config.available_profile_str = std::vector(config.num_profiles); + profile_config.available_profile_indices = std::vector(config.num_profiles); + for (uint32_t i = 0; i < config.num_profiles; i++) { + std::string profile_type_str = "N/A"; + profile_config.available_profiles[i] = config.profiles[i].profile_type; + profile_config.available_profile_str[i].clear(); + profile_config.available_profile_str[i] = + partition_types_map.at(config.profiles[i].profile_type); + profile_config.available_profile_indices[i] = config.profiles[i].profile_index; + } + + if (isVerbose) { + const uint32_t kMAX_UINT32 = std::numeric_limits::max(); + std::cout << "\t**[Device #" << device_index << "] Profile Configs: "; + std::cout << "\n\t\t**Original Profile Index: " + << (profile_config.original_profile_index == kMAX_UINT32 ? 
+ "N/A" : std::to_string(profile_config.original_profile_index)) + << "\n\t\t**Original Profile Type: " + << profile_config.original_profile_type_str + << "\n\t\t**Original profile: " << profile_config.original_profile_type + << " (" << accelerator_types_map.at(profile_config.original_profile_type) << ")" + << "\n\t\t**Number of Profiles: " << profile_config.number_of_profiles + << "\n\t\t**Available_profiles: "; + } + std::string available_profiles_str = "N/A\n"; + for (uint32_t j = 0; j < profile_config.number_of_profiles; j++) { + if (available_profiles_str == "N/A\n") { + available_profiles_str.clear(); + } + + if (j + 1 >= profile_config.number_of_profiles) { + available_profiles_str += ("\n\t\t\tProfile[profile_index: " + + std::to_string(profile_config.available_profile_indices[j]) + + "]: " + profile_config.available_profile_str[j] + "\n"); + } else { + available_profiles_str += ("\n\t\t\tProfile[profile_index: " + + std::to_string(profile_config.available_profile_indices[j]) + + "]: " + profile_config.available_profile_str[j] + ", "); + } + } + if (isVerbose) { + std::cout << available_profiles_str; + } + return profile_config; +} + +void TestBase::waitForUserInput() { + for (;;) { + std::cout << "\n\t**Press any key to continue**" << std::endl; + int input = std::cin.get(); + if (input == EOF) { + std::cout << "EOF detected. Exiting." << std::endl; + return; + } + char input_char = static_cast(input); + std::cout << "User entered: " << input_char << std::endl; + if (input_char == '\n') { + return; + } + } +} + +uint32_t TestBase::promptNumDevicesToTest(uint32_t current_num_devices) { + uint32_t return_value = 0; + std::cout << "**How many devices would you like to test? (0 to skip): "; + std::string devices_to_test = ""; + do { + int input = std::cin.get(); + if (input == EOF) { + std::cout << "EOF detected. Exiting." 
<< std::endl; + return 0; + } + char input_char = static_cast(input); + if (input_char == '\n') { + break; + } + if (input_char >= '0' && input_char <= '9') { + devices_to_test += input_char; + } else { + std::cout << "Invalid input. Please enter a number between 0 and " + << current_num_devices << std::endl; + } + } while (true); + + return_value = std::stoi(devices_to_test); + if (return_value > current_num_devices) { + std::cout << "Invalid input. Please enter a number between 0 and " + << current_num_devices << std::endl; + return 0; + } + return return_value; +} + +std::string TestBase::getResourceType(amdsmi_accelerator_partition_resource_type_t resource_type) { + std::string resource_type_str = ""; + switch (resource_type) { + case AMDSMI_ACCELERATOR_XCC: + resource_type_str = "XCC"; + break; + case AMDSMI_ACCELERATOR_ENCODER: + resource_type_str = "ENCODER"; + break; + case AMDSMI_ACCELERATOR_DECODER: + resource_type_str = "DECODER"; + break; + case AMDSMI_ACCELERATOR_DMA: + resource_type_str = "DMA"; + break; + case AMDSMI_ACCELERATOR_JPEG: + resource_type_str = "JPEG"; + break; + case AMDSMI_ACCELERATOR_MAX: + resource_type_str = "MAX"; + break; + default: + resource_type_str = "N/A"; + break; + } + return resource_type_str; +} + diff --git a/tests/amd_smi_test/test_base.h b/tests/amd_smi_test/test_base.h index 9e49bb2e0b..a1b186d4b6 100644 --- a/tests/amd_smi_test/test_base.h +++ b/tests/amd_smi_test/test_base.h @@ -26,6 +26,7 @@ #include #include #include +#include #include "amd_smi/amdsmi.h" // The max devices can be monitored @@ -98,6 +99,46 @@ class TestBase { return num_iterations_; } + const std::map partition_types_map = { + { AMDSMI_ACCELERATOR_PARTITION_INVALID, "N/A" }, + { AMDSMI_ACCELERATOR_PARTITION_SPX, "SPX" }, + { AMDSMI_ACCELERATOR_PARTITION_DPX, "DPX" }, + { AMDSMI_ACCELERATOR_PARTITION_TPX, "TPX" }, + { AMDSMI_ACCELERATOR_PARTITION_QPX, "QPX" }, + { AMDSMI_ACCELERATOR_PARTITION_CPX, "CPX" }, + { AMDSMI_ACCELERATOR_PARTITION_MAX, "MAX" 
}, + }; + + const std::map accelerator_types_map = { + { AMDSMI_ACCELERATOR_PARTITION_INVALID, "AMDSMI_ACCELERATOR_PARTITION_INVALID" }, + { AMDSMI_ACCELERATOR_PARTITION_SPX, "AMDSMI_ACCELERATOR_PARTITION_SPX" }, + { AMDSMI_ACCELERATOR_PARTITION_DPX, "AMDSMI_ACCELERATOR_PARTITION_DPX" }, + { AMDSMI_ACCELERATOR_PARTITION_TPX, "AMDSMI_ACCELERATOR_PARTITION_TPX" }, + { AMDSMI_ACCELERATOR_PARTITION_QPX, "AMDSMI_ACCELERATOR_PARTITION_QPX" }, + { AMDSMI_ACCELERATOR_PARTITION_CPX, "AMDSMI_ACCELERATOR_PARTITION_CPX" }, + { AMDSMI_ACCELERATOR_PARTITION_MAX, "AMDSMI_ACCELERATOR_PARTITION_MAX" }, + }; + + struct AcceleratorProfileConfig { + amdsmi_accelerator_partition_type_t original_profile_type; + std::string original_profile_type_str; + uint32_t original_profile_index; + uint32_t number_of_profiles; + std::vector available_profiles; + std::vector available_profile_str; + std::vector available_profile_indices; + }; + + AcceleratorProfileConfig getAvailableProfileConfigs(uint32_t device_index, + amdsmi_accelerator_partition_profile_t current_profile, + amdsmi_accelerator_partition_profile_config_t config, + bool isVerbose); + void waitForUserInput(); + + uint32_t promptNumDevicesToTest(uint32_t current_num_devices); + + std::string getResourceType(amdsmi_accelerator_partition_resource_type_t resource_type); + protected: void MakeHeaderStr(const char *inStr, std::string *outStr) const; void PrintDeviceHeader(amdsmi_processor_handle dv_ind); diff --git a/tests/python_unittest/integration_test.py b/tests/python_unittest/integration_test.py index 6c4f8c8169..6f764e1575 100755 --- a/tests/python_unittest/integration_test.py +++ b/tests/python_unittest/integration_test.py @@ -121,6 +121,83 @@ class TestAmdSmiPythonInterface(unittest.TestCase): print() self.tearDown() + # amdsmi_get_vram_info should be supported on all ASICs + @handle_exceptions + def test_get_vram_info(self): + self.setUp() + processors = amdsmi.amdsmi_get_processor_handles() + 
self.assertGreaterEqual(len(processors), 1) + self.assertLessEqual(len(processors), 32) + for i in range(0, len(processors)): + bdf = amdsmi.amdsmi_get_gpu_device_bdf(processors[i]) + print("\n\n###Test Processor {}, bdf: {}".format(i, bdf)) + print("\n###Test amdsmi_get_gpu_vram_info \n") + + vram_types = { + amdsmi.AmdSmiVramType.UNKNOWN: "UNKNOWN", + amdsmi.AmdSmiVramType.HBM: "HBM", + amdsmi.AmdSmiVramType.HBM2: "HBM2", + amdsmi.AmdSmiVramType.HBM2E: "HBM2E", + amdsmi.AmdSmiVramType.HBM3: "HBM3", + amdsmi.AmdSmiVramType.DDR2: "DDR2", + amdsmi.AmdSmiVramType.DDR3: "DDR3", + amdsmi.AmdSmiVramType.DDR4: "DDR4", + amdsmi.AmdSmiVramType.GDDR1: "GDDR1", + amdsmi.AmdSmiVramType.GDDR2: "GDDR2", + amdsmi.AmdSmiVramType.GDDR3: "GDDR3", + amdsmi.AmdSmiVramType.GDDR4: "GDDR4", + amdsmi.AmdSmiVramType.GDDR5: "GDDR5", + amdsmi.AmdSmiVramType.GDDR6: "GDDR6", + amdsmi.AmdSmiVramType.GDDR7: "GDDR7", + amdsmi.AmdSmiVramType.MAX: "MAX" + } + + vram_vendors = { + amdsmi.AmdSmiVramVendor.SAMSUNG: "SAMSUNG", + amdsmi.AmdSmiVramVendor.INFINEON: "INFINEON", + amdsmi.AmdSmiVramVendor.ELPIDA: "ELPIDA", + amdsmi.AmdSmiVramVendor.ETRON: "ETRON", + amdsmi.AmdSmiVramVendor.NANYA: "NANYA", + amdsmi.AmdSmiVramVendor.HYNIX: "HYNIX", + amdsmi.AmdSmiVramVendor.MOSEL: "MOSEL", + amdsmi.AmdSmiVramVendor.WINBOND: "WINBOND", + amdsmi.AmdSmiVramVendor.ESMT: "ESMT", + amdsmi.AmdSmiVramVendor.MICRON: "MICRON", + amdsmi.AmdSmiVramVendor.UNKNOWN: "UNKNOWN" + } + + vram_info = amdsmi.amdsmi_get_gpu_vram_info(processors[i]) + print(" vram_info['vram_type'] is: {}".format( + vram_types[vram_info['vram_type']])) + print(" vram_info['vram_vendor'] is: {}".format( + vram_vendors[vram_info['vram_vendor']])) + print(" vram_info['vram_size'] is: {} MB".format( + vram_info['vram_size'])) + print(" vram_info['vram_bit_width'] is: {}".format( + vram_info['vram_bit_width'])) + print(" vram_info['vram_max_bandwidth'] is: {} GB/s".format( + vram_info['vram_max_bandwidth'])) + print() + self.tearDown() + + # 
amdsmi_get_gpu_xcd_counter should be supported on all ASICs + @handle_exceptions + def test_get_xcd_counter(self): + self.setUp() + processors = amdsmi.amdsmi_get_processor_handles() + self.assertGreaterEqual(len(processors), 1) + self.assertLessEqual(len(processors), 32) + for i in range(0, len(processors)): + bdf = amdsmi.amdsmi_get_gpu_device_bdf(processors[i]) + print("\n\n###Test Processor {}, bdf: {}".format(i, bdf)) + print("\n###Test amdsmi_get_gpu_xcd_counter \n") + + xcd_count = amdsmi.amdsmi_get_gpu_xcd_counter(processors[i]) + print(" xcd_counter['counter'] is: {}".format( + xcd_count)) + print() + self.tearDown() + # amdsmi_get_gpu_bad_page_info is not supported in Navi2x, Navi3x @handle_exceptions def test_bad_page_info(self): @@ -863,6 +940,44 @@ class TestAmdSmiPythonInterface(unittest.TestCase): accelerator_partition = amdsmi.amdsmi_get_gpu_accelerator_partition_profile(processors[i]) print(" Current partition id: {}".format( accelerator_partition['partition_id'])) + print(" Profile_type: {}".format( + accelerator_partition['partition_profile']['profile_type'])) + print(" profile_index: {}".format( + accelerator_partition['partition_profile']['profile_index'])) + print(" memory_caps: {}".format( + accelerator_partition['partition_profile']['memory_caps'])) + print(" num_resources: {}".format( + accelerator_partition['partition_profile']['num_resources'])) + print() + self.tearDown() + + # Requires sudo (to see full resource/config detail). 
+ # Should only be supported on MI300+ ASICs + @handle_exceptions + def test_accelerator_partition_profile_config(self): + self.setUp() + processors = amdsmi.amdsmi_get_processor_handles() + self.assertGreaterEqual(len(processors), 1) + self.assertLessEqual(len(processors), 32) + for i in range(0, len(processors)): + bdf = amdsmi.amdsmi_get_gpu_device_bdf(processors[i]) + print("\n\n###Test Processor {}, bdf: {}".format(i, bdf)) + print("\n###Test amdsmi_get_gpu_accelerator_partition_profile_config \n") + profile_config = amdsmi.amdsmi_get_gpu_accelerator_partition_profile_config(processors[i]) + print(" num_profiles: {}".format(profile_config['num_profiles'])) + print(" num_resource_profiles: {}".format(profile_config['num_resource_profiles'])) + print(" default_profile_index: {}".format(profile_config['default_profile_index'])) + for p in profile_config['profiles']: + print("\t\t profile_type: {}".format(p['profile_type'])) + print("\t\t num_partitions: {}".format(p['num_partitions'])) + print("\t\t profile_index: {}".format(p['profile_index'])) + print("\t\t num_resources: {}".format(p['num_resources'])) + for r in range(0, p['num_resources']): + print("\t\t\t profile_index: {}".format(p['resources'][r]['profile_index'])) + print("\t\t\t resource_type: {}".format(p['resources'][r]['resource_type'])) + print("\t\t\t partition_resource: {}".format(p['resources'][r]['partition_resource'])) + print("\t\t\t num_partitions_share_resource: {}".format( + p['resources'][r]['num_partitions_share_resource'])) print() self.tearDown()