From df6de25624a299ca01dccd52c2dbbc3672230821 Mon Sep 17 00:00:00 2001 From: Charis Poag Date: Tue, 27 May 2025 19:19:43 -0500 Subject: [PATCH] [SWDEV-529030/SWDEV-531217] Fix tests & output for partitioned configurations (CPX, DPX, QPX, etc.) Changes: - Updated AMD SMI firmware to display "N/A" for unavailable firmware in partitioned environments, improving clarity. Example (in DPX): $ amd-smi firmware GPU: 0 FW_LIST: ... FW 12: FW_ID: PM FW_VERSION: 00.86.39.00 GPU: 1 FW_LIST: N/A - Fixed amd-smi partition not showing current partition information on ASICs that lack the ability to set memory or accelerator partitions. $ amd-smi partition -c -m CURRENT_PARTITION: GPU_ID MEMORY ACCELERATOR_TYPE ACCELERATOR_PROFILE_INDEX PARTITION_ID 0 NPS1 CPX 2 0 1 N/A N/A N/A 1 2 N/A N/A N/A 2 3 N/A N/A N/A 3 4 N/A N/A N/A 4 5 N/A N/A N/A 5 6 NPS1 SPX 0 0 7 NPS1 SPX 0 0 8 NPS1 SPX 0 0 MEMORY_PARTITION: GPU_ID MEMORY_PARTITION_CAPS CURRENT_MEMORY_PARTITION 0 N/A NPS1 1 N/A N/A 2 N/A N/A 3 N/A N/A 4 N/A N/A 5 N/A N/A 6 N/A NPS1 7 N/A NPS1 8 N/A NPS1 - Refactored amd_smi_drm_example.cc: - Grouped partition changes; the example now restores the original partition settings. - Now handles partitioned environments, allowing the example to continue even if some APIs are not supported in partitioned configurations. - Modified amdsmi_asic_info_t (see amdsmi_get_gpu_asic_info()) to report OAM ID as N/A if 0xFFFFFFFF (was 0xFFFF). Allows for better handling of OAM IDs in partitioned environments (the OAM ID does not exist for non-primary nodes, since it's a physical identifier). Easier to handle in tests and example code (i.e., now consistent with the max size of the structure's value). - Introduced amdsmi_RAII_open_FD() (internal API) to manage file descriptors using RAII, ensuring proper closure and preventing resource leaks. 
Updated the following APIs to use this function: - amdsmi_get_gpu_asic_info(), amdsmi_get_gpu_vram_usage(), amdsmi_get_gpu_vram_info(), amdsmi_get_gpu_vbios_info(), amdsmi_get_gpu_driver_info(), amdsmi_get_gpu_virtualization_mode() - Updated AMD SMI test_base.cc/.h: - Improved output and handling for partitioned environments. - Added detailed ASIC information logging to align with structure changes. - Enhanced error messages for better context before ASSERT checks. - Resolved test failures in partitioned environments by updating logic and handling for partition-specific configurations. Fixed tests include: - computepartition_read_write.cc, frequencies_read_write.cc, gpu_metrics_read.cc, mem_util_read.cc, memorypartition_read_write.cc, perf_level_read.cc, perf_level_read_write.cc, power_cap_read_write.cc, power_read.cc, sys_info_read.cc, gpu_busy_read.cc Change-Id: I36e903f8fddd714c74c719459c71aba8bbb77e6f Signed-off-by: Charis Poag Resetting head + adding fixes for tests ran in partitions Change-Id: I0c1e9ac07488b50c95f3bc6d8a724e67d2c715dc Signed-off-by: Charis Poag [ROCm/amdsmi commit: 391451752bcc6b9e8d6cd48a2f64d6aacaaf246c] --- projects/amdsmi/amdsmi_cli/amdsmi_logger.py | 5 +- .../amdsmi/example/amd_smi_drm_example.cc | 1760 ++++++++++------- projects/amdsmi/include/amd_smi/amdsmi.h | 2 +- .../include/amd_smi/impl/amd_smi_utils.h | 19 + .../amdsmi/py-interface/amdsmi_interface.py | 45 +- projects/amdsmi/rocm_smi/src/rocm_smi.cc | 91 +- .../amdsmi/rocm_smi/src/rocm_smi_device.cc | 13 +- .../rocm_smi/src/rocm_smi_gpu_metrics.cc | 19 +- projects/amdsmi/src/amd_smi/amd_smi.cc | 358 ++-- projects/amdsmi/src/amd_smi/amd_smi_drm.cc | 29 +- projects/amdsmi/src/amd_smi/amd_smi_system.cc | 1 + projects/amdsmi/src/amd_smi/amd_smi_utils.cc | 145 +- .../functional/computepartition_read_write.cc | 43 +- .../amd_smi_test/functional/err_cnt_read.cc | 7 +- .../functional/frequencies_read_write.cc | 29 +- .../amd_smi_test/functional/gpu_busy_read.cc | 29 +- 
.../functional/gpu_metrics_read.cc | 2 +- .../amd_smi_test/functional/mem_util_read.cc | 34 +- .../functional/memorypartition_read_write.cc | 35 +- .../functional/perf_level_read.cc | 15 +- .../functional/perf_level_read_write.cc | 14 +- .../functional/power_cap_read_write.cc | 55 +- .../amd_smi_test/functional/power_read.cc | 9 +- .../amd_smi_test/functional/sys_info_read.cc | 18 +- projects/amdsmi/tests/amd_smi_test/main.cc | 4 +- .../amdsmi/tests/amd_smi_test/test_base.cc | 117 +- .../amdsmi/tests/amd_smi_test/test_base.h | 23 +- 27 files changed, 1858 insertions(+), 1063 deletions(-) diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py index 8174d717da..fc4d88f4c4 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py @@ -323,7 +323,10 @@ class AMDSMILogger(): if isinstance(value, dict): yaml_string += " " * indent + f"{key}:\n" + self.custom_dump(value, indent + 1) elif isinstance(value, list): - yaml_string += " " * indent + f"{key}:\n" + if not value: + yaml_string += " " * indent + f"{key}: N/A\n" + elif isinstance(value, dict): + yaml_string += " " * indent + f"{key}:\n" for item in value: if isinstance(item, dict): yaml_string += self.custom_dump(item, indent + 1) diff --git a/projects/amdsmi/example/amd_smi_drm_example.cc b/projects/amdsmi/example/amd_smi_drm_example.cc index 6b0f95f4f9..93a0038948 100644 --- a/projects/amdsmi/example/amd_smi_drm_example.cc +++ b/projects/amdsmi/example/amd_smi_drm_example.cc @@ -49,6 +49,17 @@ } \ } +#define PRINT_AMDSMI_RET(RET) \ + { \ + if (RET != AMDSMI_STATUS_SUCCESS) { \ + const char *err_str; \ + std::cout << "AMDSMI call returned " << RET << " at line " \ + << __LINE__ << std::endl; \ + amdsmi_status_code_to_string(RET, &err_str); \ + std::cout << err_str << std::endl; \ + } \ + } + void getFWNameFromId(int id, char *name) { @@ -260,11 +271,11 @@ mapStringToSMIMemoryPartitionTypes { static const std::map 
virtualization_mode_map = { - {AMDSMI_VIRTUALIZATION_MODE_UNKNOWN, "UNKNOWN"}, - {AMDSMI_VIRTUALIZATION_MODE_BAREMETAL, "BAREMETAL"}, - { AMDSMI_VIRTUALIZATION_MODE_HOST, "HOST"}, - { AMDSMI_VIRTUALIZATION_MODE_GUEST, "GUEST"}, - {AMDSMI_VIRTUALIZATION_MODE_PASSTHROUGH, "PASSTHROUGH"} + {AMDSMI_VIRTUALIZATION_MODE_UNKNOWN, "UNKNOWN"}, + {AMDSMI_VIRTUALIZATION_MODE_BAREMETAL, "BAREMETAL"}, + {AMDSMI_VIRTUALIZATION_MODE_HOST, "HOST"}, + {AMDSMI_VIRTUALIZATION_MODE_GUEST, "GUEST"}, + {AMDSMI_VIRTUALIZATION_MODE_PASSTHROUGH, "PASSTHROUGH"} }; static const std::map @@ -278,9 +289,20 @@ static const std::map {AMDSMI_PROCESSOR_TYPE_AMD_APU, "AMD_APU"} }; +static const std::map + link_type_map = { + {AMDSMI_LINK_TYPE_INTERNAL, "INTERNAL"}, + {AMDSMI_LINK_TYPE_XGMI, "XGMI"}, + {AMDSMI_LINK_TYPE_PCIE, "PCIE"}, + {AMDSMI_LINK_TYPE_NOT_APPLICABLE, "NOT_APPLICABLE"}, + {AMDSMI_LINK_TYPE_UNKNOWN, "UNKNOWN"} +}; + int main() { - amdsmi_status_t ret, ret_set; - const char *err_str; + amdsmi_status_t ret; + std::vector orig_accelerator_partitions; + std::vector orig_memory_partitions; + uint32_t gpu_number = 0; // Init amdsmi for sockets and devices. // Here we are only interested in AMD_GPUS. @@ -302,13 +324,496 @@ int main() { std::cout << "Total Socket: " << socket_count << std::endl; + // WARNING: Do not put any other settings before/inside/or between these lambda functions + // Required to save/change/reset the compute/accelerator & memory partition settings + // Reason: Modifies total number of gpu count, which will affect other API calls. + // Requires amdsmi_shut_down()/amdsmi_init(AMDSMI_INIT_AMD_GPUS) to re-enumerate + // total number of GPUs (AKA "processors per socket"). + // Changing back to original settings (compute/accelerator & memory partition) + // will not modify the GPU count. 
+ // Save all original partition settings for later + auto save_original_partitions = [socket_count, &ret, sockets]( + std::vector& orig_partitions, + std::vector& orig_memory_partitions, + uint32_t& gpu_number) -> void { + std::cout << " **Saving Original Compute/Accelerator & Memory Partition Settings**\n"; + + // For each socket, get identifier and devices + for (uint32_t i = 0; i < socket_count; i++) { + // Get Socket info + char socket_info[128]; + ret = amdsmi_get_socket_info(sockets[i], 128, socket_info); + PRINT_AMDSMI_RET(ret) + std::cout << "\t**Socket Info: " << socket_info << std::endl; + + // Get the device count available for the socket. + uint32_t device_count = 0; + ret = amdsmi_get_processor_handles(sockets[i], &device_count, nullptr); + PRINT_AMDSMI_RET(ret) + + // Allocate the memory for the device handlers on the socket + std::vector processor_handles(device_count); + // Get all devices of the socket + ret = amdsmi_get_processor_handles(sockets[i], + &device_count, &processor_handles[0]); + PRINT_AMDSMI_RET(ret) + + std::cout << "\t**Processor Count: " << device_count << std::endl; + + // For each device of the socket, get name and temperature. 
+ for (uint32_t device_index = 0; device_index < device_count; device_index++) { + std::cout << "\t**Device Index: " << device_index << std::endl; + std::cout << "\t**Device Handle: " << processor_handles[device_index] << std::endl; + std::cout << "\t**GPU Number: " << gpu_number << std::endl; + + // Get the original compute partition + char original_compute_partition[AMDSMI_MAX_STRING_LENGTH]; + ret = amdsmi_get_gpu_compute_partition(processor_handles[device_index], + original_compute_partition, + static_cast(AMDSMI_MAX_STRING_LENGTH)); + + const char* err_str; + amdsmi_status_code_to_string(ret, &err_str); + if (ret == AMDSMI_STATUS_SUCCESS) { + PRINT_AMDSMI_RET(ret) + std::cout << " Output of amdsmi_get_gpu_compute_partition:\n"; + std::cout << "\tamdsmi_get_gpu_compute_partition(" << gpu_number << ", " + << mapStringToSMIComputePartitionTypes.at(original_compute_partition) + << "): " << err_str << "\n\n"; + std::cout << "\tCompute Partition (original): " + << original_compute_partition << "\n\n"; + } else { + std::cout << "\tamdsmi_get_gpu_compute_partition(" << gpu_number << ", " + << computePartitionString(AMDSMI_COMPUTE_PARTITION_INVALID) << "): " + << err_str << "\n\n"; + } + + // Save the original compute/accelerator partition + if (ret == AMDSMI_STATUS_SUCCESS) { + orig_partitions.push_back( + mapStringToSMIComputePartitionTypes.at(original_compute_partition)); + } else { + orig_partitions.push_back(AMDSMI_COMPUTE_PARTITION_INVALID); + } + + // Get the original memory partition + char original_memory_partition[AMDSMI_MAX_STRING_LENGTH]; + ret = amdsmi_get_gpu_memory_partition(processor_handles[device_index], + original_memory_partition, + static_cast(AMDSMI_MAX_STRING_LENGTH)); + amdsmi_status_code_to_string(ret, &err_str); + if (ret == AMDSMI_STATUS_SUCCESS) { + PRINT_AMDSMI_RET(ret) + std::cout << " Output of amdsmi_get_gpu_memory_partition:\n"; + std::cout << "\tamdsmi_get_gpu_memory_partition(" << gpu_number << ", " + << 
mapStringToSMIMemoryPartitionTypes.at(original_memory_partition) + << "): " << err_str << "\n\n"; + std::cout << "\tMemory Partition (original): " + << original_memory_partition << "\n\n"; + } else { + std::cout << "\tamdsmi_get_gpu_memory_partition(" << gpu_number << ", " + << memoryPartitionString(AMDSMI_MEMORY_PARTITION_UNKNOWN) << "): " + << err_str << "\n\n"; + } + + // Save the original memory partition + if (ret == AMDSMI_STATUS_SUCCESS) { + orig_memory_partitions.push_back( + mapStringToSMIMemoryPartitionTypes.at(original_memory_partition)); + } else { + orig_memory_partitions.push_back(AMDSMI_MEMORY_PARTITION_UNKNOWN); + } + gpu_number++; + } + } + // Reset GPU number for the next loop + gpu_number = 0; + }; + // Save the original compute/accelerator & memory partition settings + save_original_partitions(orig_accelerator_partitions, orig_memory_partitions, gpu_number); + + std::cout << " **Version 1: Accelerator/Compute Partition & memory API Examples**\n"; + auto process_accelerator_partitions = [socket_count, &ret, sockets]( + uint32_t& gpu_number) -> void { + std::cout << " **Process Compute/Accelerator & Memory Partition Settings**\n"; + + // For each socket, get identifier and devices + for (uint32_t i = 0; i < socket_count; i++) { + // Get Socket info + char socket_info[128]; + ret = amdsmi_get_socket_info(sockets[i], 128, socket_info); + PRINT_AMDSMI_RET(ret) + std::cout << "\t**Socket Info: " << socket_info << std::endl; + + // Get the device count available for the socket. 
+ uint32_t device_count = 0; + ret = amdsmi_get_processor_handles(sockets[i], &device_count, nullptr); + PRINT_AMDSMI_RET(ret) + + // Allocate the memory for the device handlers on the socket + std::vector processor_handles(device_count); + // Get all devices of the socket + ret = amdsmi_get_processor_handles(sockets[i], + &device_count, &processor_handles[0]); + PRINT_AMDSMI_RET(ret) + + std::cout << "\t**Processor Count: " << device_count << std::endl; + + // For each device of the socket, get name and temperature. + for (uint32_t device_index = 0; device_index < device_count; device_index++) { + std::cout << "\t**Device Index: " << device_index << std::endl; + std::cout << "\t**Device Handle: " << processor_handles[device_index] << std::endl; + std::cout << "\t**GPU Number: " << gpu_number << std::endl; + + // Get the original compute partition + char original_compute_partition[AMDSMI_MAX_STRING_LENGTH]; + ret = amdsmi_get_gpu_compute_partition(processor_handles[device_index], + original_compute_partition, + static_cast(AMDSMI_MAX_STRING_LENGTH)); + + const char* err_str; + amdsmi_status_code_to_string(ret, &err_str); + if (ret == AMDSMI_STATUS_SUCCESS) { + PRINT_AMDSMI_RET(ret) + std::cout << " Output of amdsmi_get_gpu_compute_partition:\n"; + std::cout << "\tamdsmi_get_gpu_compute_partition(" << gpu_number << ", " + << mapStringToSMIComputePartitionTypes.at(original_compute_partition) + << "): " << err_str << "\n\n"; + std::cout << "\tCompute Partition (original): " + << original_compute_partition << "\n\n"; + } else { + std::cout << "\tamdsmi_get_gpu_compute_partition(" << gpu_number << ", " + << computePartitionString(AMDSMI_COMPUTE_PARTITION_INVALID) << "): " + << err_str << "\n\n"; + } + + // Iterate through all compute partitions + for (int partition = static_cast(AMDSMI_COMPUTE_PARTITION_SPX); + partition <= static_cast(AMDSMI_COMPUTE_PARTITION_CPX); + partition++) { + amdsmi_compute_partition_type_t updatePartition = + static_cast(partition); + 
amdsmi_status_t ret_set = amdsmi_set_gpu_compute_partition( + processor_handles[device_index], + updatePartition); + amdsmi_status_code_to_string(ret_set, &err_str); + if (ret_set == AMDSMI_STATUS_SUCCESS) { + PRINT_AMDSMI_RET(ret_set) + } + std::cout << "\tamdsmi_set_gpu_compute_partition(" << gpu_number << ", " + << computePartitionString(updatePartition) << "): " + << err_str << "\n\n"; + + // Get the current compute partition + char current_compute_partition[AMDSMI_MAX_STRING_LENGTH]; + ret = amdsmi_get_gpu_compute_partition(processor_handles[device_index], + current_compute_partition, + static_cast(AMDSMI_MAX_STRING_LENGTH)); + amdsmi_status_code_to_string(ret, &err_str); + if (ret == AMDSMI_STATUS_SUCCESS) { + PRINT_AMDSMI_RET(ret) + std::cout << " Output of amdsmi_get_gpu_compute_partition:\n"; + std::cout << "\tamdsmi_get_gpu_compute_partition(" << gpu_number << ", " + << computePartitionString(updatePartition) << "): " + << err_str << "\n\n"; + std::cout << "\tCompute Partition (current): " + << current_compute_partition << "\n\n"; + } else { + std::cout << "\tamdsmi_get_gpu_compute_partition(" << gpu_number << ", " + << computePartitionString(AMDSMI_COMPUTE_PARTITION_INVALID) << "): " + << err_str << "\n\n"; + } + } + gpu_number++; + } + } + // Reset GPU number for the next loop + gpu_number = 0; + }; + process_accelerator_partitions(gpu_number); + + auto process_memory_partitions = [socket_count, &ret, sockets]( + uint32_t& gpu_number) -> void { + std::cout << " **Process Memory Partition Settings**\n"; + + // For each socket, get identifier and devices + for (uint32_t i = 0; i < socket_count; i++) { + // Get Socket info + char socket_info[128]; + ret = amdsmi_get_socket_info(sockets[i], 128, socket_info); + PRINT_AMDSMI_RET(ret) + std::cout << "\t**Socket Info: " << socket_info << std::endl; + + // Get the device count available for the socket. 
+ uint32_t device_count = 0; + ret = amdsmi_get_processor_handles(sockets[i], &device_count, nullptr); + PRINT_AMDSMI_RET(ret) + + // Allocate the memory for the device handlers on the socket + std::vector processor_handles(device_count); + // Get all devices of the socket + ret = amdsmi_get_processor_handles(sockets[i], + &device_count, &processor_handles[0]); + PRINT_AMDSMI_RET(ret) + + std::cout << "\t**Processor Count: " << device_count << std::endl; + + // For each device of the socket, get name and temperature. + for (uint32_t device_index = 0; device_index < device_count; device_index++) { + std::cout << "\t**Device Index: " << device_index << std::endl; + std::cout << "\t**Device Handle: " << processor_handles[device_index] << std::endl; + std::cout << "\t**GPU Number: " << gpu_number << std::endl; + + // Get the original memory partition + char original_memory_partition[AMDSMI_MAX_STRING_LENGTH]; + ret = amdsmi_get_gpu_memory_partition(processor_handles[device_index], + original_memory_partition, + static_cast(AMDSMI_MAX_STRING_LENGTH)); + const char* err_str; + amdsmi_status_code_to_string(ret, &err_str); + if (ret == AMDSMI_STATUS_SUCCESS) { + PRINT_AMDSMI_RET(ret) + std::cout << " Output of amdsmi_get_gpu_memory_partition:\n"; + std::cout << "\tamdsmi_get_gpu_memory_partition(" << gpu_number << ", " + << mapStringToSMIMemoryPartitionTypes.at(original_memory_partition) + << "): " << err_str << "\n\n"; + std::cout << "\tMemory Partition (original): " << original_memory_partition + << "\n\n"; + } else { + std::cout << "\tamdsmi_get_gpu_memory_partition(" << gpu_number << ", " + << memoryPartitionString(AMDSMI_MEMORY_PARTITION_UNKNOWN) << "): " + << err_str << "\n\n"; + } + + // Since memory partition effects entire GPU hive (and modifies current + // compute/accelerator partition), we'll default to only changing the + // first device for the first socket (GPU #0) + if (gpu_number == 0) { + std::cout << " **Changing memory partition for GPU #" + << 
gpu_number << "...**\n"; + for (int partition = static_cast(AMDSMI_MEMORY_PARTITION_NPS1); + partition <= static_cast(AMDSMI_MEMORY_PARTITION_NPS8); + partition++) { + if (partition != static_cast(AMDSMI_MEMORY_PARTITION_NPS1) && + partition != static_cast(AMDSMI_MEMORY_PARTITION_NPS2) && + partition != static_cast(AMDSMI_MEMORY_PARTITION_NPS4) && + partition != static_cast(AMDSMI_MEMORY_PARTITION_NPS8)) { + continue; + } + amdsmi_memory_partition_type_t updatePartition = + static_cast(partition); + auto ret_set = amdsmi_set_gpu_memory_partition( + processor_handles[device_index], updatePartition); + amdsmi_status_code_to_string(ret_set, &err_str); + if (ret_set == AMDSMI_STATUS_SUCCESS) { + PRINT_AMDSMI_RET(ret_set) + std::cout << " Output of amdsmi_set_gpu_memory_partition:\n"; + } + std::cout << "\tamdsmi_set_gpu_memory_partition(" << gpu_number << ", " + << memoryPartitionString(updatePartition) << "): " + << err_str << "\n\n"; + + // Get the current memory partition + char current_memory_partition[AMDSMI_MAX_STRING_LENGTH]; + ret = amdsmi_get_gpu_memory_partition(processor_handles[device_index], + current_memory_partition, + static_cast(AMDSMI_MAX_STRING_LENGTH)); + amdsmi_status_code_to_string(ret, &err_str); + if (ret == AMDSMI_STATUS_SUCCESS) { + PRINT_AMDSMI_RET(ret) + std::cout << "\tamdsmi_get_gpu_memory_partition(" << gpu_number + << ", " << memoryPartitionString(updatePartition) << "): " + << err_str << "\n\n"; + std::cout << "\tMemory Partition (current): " + << current_memory_partition << "\n\n"; + } else { + std::cout << "\tamdsmi_get_gpu_memory_partition(" + << gpu_number << ", " + << memoryPartitionString(AMDSMI_MEMORY_PARTITION_UNKNOWN) + << "): " << err_str << "\n\n"; + } + } + } else { + std::cout << " **Skipping memory partition change for GPU #" << gpu_number + << "...**\n"; + } + gpu_number++; + } + } + // Reset GPU number for the next loop + gpu_number = 0; + }; + process_memory_partitions(gpu_number); + + auto reset_memory_partitions = 
[socket_count, &ret, sockets]( + const std::vector& orig_partitions, + uint32_t& gpu_number) -> void { + std::cout << " **Version 1: Memory Partition API Examples**\n"; + std::cout << " **Resetting Memory Partition Settings**\n"; + + // For each socket, get identifier and devices + for (uint32_t i = 0; i < socket_count; i++) { + // Get Socket info + char socket_info[128]; + ret = amdsmi_get_socket_info(sockets[i], 128, socket_info); + PRINT_AMDSMI_RET(ret) + std::cout << "\t**Socket Info: " << socket_info << std::endl; + + // Get the device count available for the socket. + uint32_t device_count = 0; + ret = amdsmi_get_processor_handles(sockets[i], &device_count, nullptr); + PRINT_AMDSMI_RET(ret) + + // Allocate the memory for the device handlers on the socket + std::vector processor_handles(device_count); + // Get all devices of the socket + ret = amdsmi_get_processor_handles(sockets[i], + &device_count, &processor_handles[0]); + PRINT_AMDSMI_RET(ret) + + std::cout << "\t**Processor Count: " << device_count << std::endl; + + // For each device of the socket, get name and temperature. 
+ for (uint32_t device_index = 0; device_index < device_count; device_index++) { + std::cout << "\t**Device Index: " << device_index << std::endl; + std::cout << "\t**Device Handle: " << processor_handles[device_index] << std::endl; + std::cout << "\t**GPU Number: " << gpu_number << std::endl; + + // Reset to original memory partition settings + amdsmi_memory_partition_type_t orig_partition = + orig_partitions[gpu_number]; + amdsmi_status_t ret_set = amdsmi_set_gpu_memory_partition( + processor_handles[device_index], orig_partition); + const char* err_str; + amdsmi_status_code_to_string(ret_set, &err_str); + if (ret_set == AMDSMI_STATUS_SUCCESS) { + PRINT_AMDSMI_RET(ret_set) + std::cout << " Output of amdsmi_set_gpu_memory_partition:\n"; + } + std::cout << "\tamdsmi_set_gpu_memory_partition(" << gpu_number << ", " + << memoryPartitionString(orig_partition) << "): " + << err_str << "\n\n"; + // Get the current memory partition + char current_memory_partition[AMDSMI_MAX_STRING_LENGTH]; + ret = amdsmi_get_gpu_memory_partition(processor_handles[device_index], + current_memory_partition, + static_cast(AMDSMI_MAX_STRING_LENGTH)); + amdsmi_status_code_to_string(ret, &err_str); + if (ret == AMDSMI_STATUS_SUCCESS) { + PRINT_AMDSMI_RET(ret) + std::cout << " Output of amdsmi_get_gpu_memory_partition:\n"; + std::cout << "\tamdsmi_get_gpu_memory_partition(" << gpu_number << ", " + << memoryPartitionString(orig_partition) << "): " + << err_str << "\n\n"; + std::cout << "\tMemory Partition (current): " + << current_memory_partition << "\n\n"; + } else { + std::cout << "\tamdsmi_get_gpu_memory_partition(" << gpu_number << ", " + << memoryPartitionString(AMDSMI_MEMORY_PARTITION_UNKNOWN) << "): " + << err_str << "\n\n"; + } + gpu_number++; + } + } + // Reset GPU number for the next loop + gpu_number = 0; + }; + // Reset to original memory partition settings + reset_memory_partitions(orig_memory_partitions, gpu_number); + + auto reset_accelerator_partitions = [socket_count, &ret, 
sockets]( + const std::vector& orig_partitions, + uint32_t& gpu_number) -> void { + std::cout << " **Version 1: Memory Partition API Examples**\n"; + std::cout << " **Resetting Compute/Accelerator Partition Settings**\n"; + + // For each socket, get identifier and devices + for (uint32_t i = 0; i < socket_count; i++) { + // Get Socket info + char socket_info[128]; + ret = amdsmi_get_socket_info(sockets[i], 128, socket_info); + PRINT_AMDSMI_RET(ret) + std::cout << "\t**Socket Info: " << socket_info << std::endl; + + // Get the device count available for the socket. + uint32_t device_count = 0; + ret = amdsmi_get_processor_handles(sockets[i], &device_count, nullptr); + PRINT_AMDSMI_RET(ret) + + // Allocate the memory for the device handlers on the socket + std::vector processor_handles(device_count); + // Get all devices of the socket + ret = amdsmi_get_processor_handles(sockets[i], + &device_count, &processor_handles[0]); + PRINT_AMDSMI_RET(ret) + + std::cout << "\t**Processor Count: " << device_count << std::endl; + + // For each device of the socket, get name and temperature. 
+ for (uint32_t device_index = 0; device_index < device_count; device_index++) { + std::cout << "\t**Device Index: " << device_index << std::endl; + std::cout << "\t**Device Handle: " << processor_handles[device_index] << std::endl; + std::cout << "\t**GPU Number: " << gpu_number << std::endl; + + // Reset to original compute/accelerator partition settings + amdsmi_compute_partition_type_t orig_partition = + orig_partitions[gpu_number]; + amdsmi_status_t ret_set = amdsmi_set_gpu_compute_partition( + processor_handles[device_index], orig_partition); + const char* err_str; + amdsmi_status_code_to_string(ret_set, &err_str); + if (ret_set == AMDSMI_STATUS_SUCCESS) { + PRINT_AMDSMI_RET(ret_set) + std::cout << " Output of amdsmi_set_gpu_compute_partition:\n"; + } + std::cout << "\tamdsmi_set_gpu_compute_partition(" << gpu_number << ", " + << computePartitionString(orig_partition) << "): " + << err_str << "\n\n"; + + // Get the current compute/accelerator partition + char current_compute_partition[AMDSMI_MAX_STRING_LENGTH]; + ret = amdsmi_get_gpu_compute_partition(processor_handles[device_index], + current_compute_partition, + static_cast(AMDSMI_MAX_STRING_LENGTH)); + amdsmi_status_code_to_string(ret, &err_str); + if (ret == AMDSMI_STATUS_SUCCESS) { + PRINT_AMDSMI_RET(ret) + std::cout << " Output of amdsmi_get_gpu_compute_partition:\n"; + std::cout << "\tamdsmi_get_gpu_compute_partition(" << gpu_number << ", " + << computePartitionString(orig_partition) << "): " + << err_str << "\n\n"; + std::cout << "\tCompute Partition (current): " + << current_compute_partition << "\n\n"; + } else { + std::cout << "\tamdsmi_get_gpu_compute_partition(" << gpu_number << ", " + << computePartitionString(AMDSMI_COMPUTE_PARTITION_INVALID) << "): " + << err_str << "\n\n"; + } + gpu_number++; + } + } + // Reset GPU number for the next loop + gpu_number = 0; + }; + // Reset to original compute/accelerator partition settings + reset_accelerator_partitions(orig_accelerator_partitions, 
gpu_number); + + // WARNING: Do not put any other settings before/inside/or between these lambda functions + // Required to save/change/reset the compute/accelerator & memory partition settings + // Reason: Modifies total number of gpu count, which will affect other API calls. + // Requires amdsmi_shut_down()/amdsmi_init(AMDSMI_INIT_AMD_GPUS) to re-enumerate + // total number of GPUs (AKA "processors per socket"). + // Changing back to original settings (compute/accelerator & memory partition) + // will not modify the GPU count. + // Add new functionality below this line! + // For each socket, get identifier and devices for (uint32_t i = 0; i < socket_count; i++) { // Get Socket info char socket_info[128]; ret = amdsmi_get_socket_info(sockets[i], 128, socket_info); CHK_AMDSMI_RET(ret) - std::cout << "Socket " << socket_info << std::endl; + std::cout << "Socket Info: " << socket_info << std::endl; // Get the device count available for the socket. uint32_t device_count = 0; @@ -377,20 +882,36 @@ int main() { CHK_AMDSMI_RET(ret) printf(" Output of amdsmi_get_gpu_asic_info:\n"); printf("\tMarket Name: %s\n", asic_info.market_name); - printf("\tDeviceID: 0x%lx\n", asic_info.device_id); - printf("\tVendorID: 0x%x\n", asic_info.vendor_id); - printf("\tRevisionID: 0x%x\n", asic_info.rev_id); - printf("\tSubSystemID: 0x%x\n", asic_info.subsystem_id); + printf("\tDeviceID: 0x%04lx\n", asic_info.device_id); + printf("\tVendorID: 0x%04x\n", asic_info.vendor_id); + printf("\tVendor Name: %s\n", asic_info.vendor_name); + printf("\tSubVendorID: 0x%04x\n", asic_info.subvendor_id); + printf("\tRevisionID: 0x%02x\n", asic_info.rev_id); + printf("\tSubSystemID: 0x%04x\n", asic_info.subsystem_id); printf("\tAsic serial: 0x%s\n", asic_info.asic_serial); - printf("\tNum of Computes: %d\n\n", asic_info.num_of_compute_units); + if (asic_info.oam_id != UINT32_MAX) { + // OAM ID is not supported on all devices + printf("\tOAM ID: %"PRIu32"\n", asic_info.oam_id); + } else { + // OAM ID is 
not supported on this device + printf("\tOAM ID: N/A\n"); + } + printf("\tNum of Computes: %d\n", asic_info.num_of_compute_units); + printf("\tTarget Graphics Version: gfx%lx\n\n", asic_info.target_graphics_version); bool is_power_management_enabled = false; ret = amdsmi_is_gpu_power_management_enabled(processor_handles[device_index], &is_power_management_enabled); - CHK_AMDSMI_RET(ret) printf(" Output of amdsmi_is_gpu_power_management_enabled:\n"); - printf("\tPower Management Enabled: %s\n\n", - (is_power_management_enabled ? "TRUE" : "FALSE")); + if (ret != AMDSMI_STATUS_NOT_SUPPORTED) { + CHK_AMDSMI_RET(ret) + } + if (is_power_management_enabled && ret != AMDSMI_STATUS_NOT_SUPPORTED) { + std::cout << "\tPower Management is enabled" << std::endl; + } else { + std::cout << "\tPower Management is disabled" << std::endl; + } + amdsmi_virtualization_mode_t vmode; ret = amdsmi_get_gpu_virtualization_mode(processor_handles[device_index], &vmode); @@ -405,169 +926,10 @@ int main() { std::cout << "\t**Virtualization Mode: MAP TYPE UNKNOWN?" 
<< std::endl; } - std::cout << " **Version 1: Accelerator/Compute Partition API Examples**\n"; - char original_compute_partition[AMDSMI_MAX_STRING_LENGTH]; - ret = amdsmi_get_gpu_compute_partition(processor_handles[device_index], original_compute_partition, - static_cast(AMDSMI_MAX_STRING_LENGTH)); - - amdsmi_status_code_to_string(ret, &err_str); - if (ret == AMDSMI_STATUS_SUCCESS) { - CHK_AMDSMI_RET(ret) - std::cout << " Output of amdsmi_get_gpu_compute_partition:\n"; - std::cout << "\tamdsmi_get_gpu_compute_partition(" << device_index << ", " - << mapStringToSMIComputePartitionTypes.at(original_compute_partition) << "): " - << err_str << "\n\n"; - std::cout << "\tCompute Partition (original): " - << original_compute_partition << "\n\n"; - } else { - std::cout << "\tamdsmi_get_gpu_compute_partition(" << device_index << ", " - << computePartitionString(AMDSMI_COMPUTE_PARTITION_INVALID) << "): " - << err_str << "\n\n"; - } - - for (int partition = static_cast(AMDSMI_COMPUTE_PARTITION_SPX); - partition <= static_cast(AMDSMI_COMPUTE_PARTITION_CPX); - partition++) { - amdsmi_compute_partition_type_t updatePartition - = static_cast(partition); - ret_set = amdsmi_set_gpu_compute_partition(processor_handles[device_index], - updatePartition); - amdsmi_status_code_to_string(ret_set, &err_str); - if (ret_set == AMDSMI_STATUS_SUCCESS) { - CHK_AMDSMI_RET(ret_set) - } - std::cout << "\tamdsmi_set_gpu_compute_partition(" << device_index << ", " - << computePartitionString(updatePartition) << "): " - << err_str << "\n\n"; - - // Get the current compute partition - char current_compute_partition[AMDSMI_MAX_STRING_LENGTH]; - ret = amdsmi_get_gpu_compute_partition(processor_handles[device_index], - current_compute_partition, - static_cast(AMDSMI_MAX_STRING_LENGTH)); - amdsmi_status_code_to_string(ret, &err_str); - if (ret == AMDSMI_STATUS_SUCCESS) { - CHK_AMDSMI_RET(ret) - std::cout << " Output of amdsmi_get_gpu_compute_partition:\n"; - std::cout << 
"\tamdsmi_get_gpu_compute_partition(" << device_index << ", " - << computePartitionString(updatePartition) << "): " - << err_str << "\n\n"; - std::cout << "\tCompute Partition (current): " - << current_compute_partition << "\n\n"; - } else { - std::cout << "\tamdsmi_get_gpu_compute_partition(" << device_index << ", " - << computePartitionString(AMDSMI_COMPUTE_PARTITION_INVALID) << "): " - << err_str << "\n\n"; - } - } - // return to original compute partition - amdsmi_compute_partition_type_t original_compute_partition_type; - if (ret == AMDSMI_STATUS_SUCCESS) { - original_compute_partition_type - = mapStringToSMIComputePartitionTypes.at(original_compute_partition); - } else { - original_compute_partition_type = AMDSMI_COMPUTE_PARTITION_INVALID; - } - std::cout << " Returning to original compute partition (" - << computePartitionString(original_compute_partition_type) << ")\n"; - auto ret_set = amdsmi_set_gpu_compute_partition(processor_handles[device_index], - original_compute_partition_type); - amdsmi_status_code_to_string(ret_set, &err_str); - if (ret_set == AMDSMI_STATUS_SUCCESS) { - CHK_AMDSMI_RET(ret_set) - } - std::cout << "\tamdsmi_set_gpu_compute_partition(" << device_index << ", " - << computePartitionString(original_compute_partition_type) << "): " - << err_str << "\n\n"; - - std::cout << " **Version 1: Memory Partition API Examples**\n"; - char original_memory_partition[AMDSMI_MAX_STRING_LENGTH]; - ret = amdsmi_get_gpu_memory_partition(processor_handles[device_index], original_memory_partition, - static_cast(AMDSMI_MAX_STRING_LENGTH)); - amdsmi_status_code_to_string(ret, &err_str); - if (ret == AMDSMI_STATUS_SUCCESS) { - CHK_AMDSMI_RET(ret) - std::cout << " Output of amdsmi_get_gpu_memory_partition:\n"; - std::cout << "\tamdsmi_get_gpu_memory_partition(" << device_index << ", " - << mapStringToSMIMemoryPartitionTypes.at(original_memory_partition) << "): " - << err_str << "\n\n"; - std::cout << "\tMemory Partition (original): " - << 
original_memory_partition << "\n\n"; - } else { - std::cout << "\tamdsmi_get_gpu_memory_partition(" << device_index << ", " - << memoryPartitionString(AMDSMI_MEMORY_PARTITION_UNKNOWN) << "): " - << err_str << "\n\n"; - } - - for (int partition = static_cast(AMDSMI_MEMORY_PARTITION_NPS1); - partition <= static_cast(AMDSMI_MEMORY_PARTITION_NPS8); - partition++) { - if (partition != static_cast(AMDSMI_MEMORY_PARTITION_NPS1) - && partition != static_cast(AMDSMI_MEMORY_PARTITION_NPS2) - && partition != static_cast(AMDSMI_MEMORY_PARTITION_NPS4) - && partition != static_cast(AMDSMI_MEMORY_PARTITION_NPS8)) { - continue; - } - amdsmi_memory_partition_type_t updatePartition - = static_cast(partition); - auto ret_set = amdsmi_set_gpu_memory_partition(processor_handles[device_index], - updatePartition); - amdsmi_status_code_to_string(ret_set, &err_str); - if (ret_set == AMDSMI_STATUS_SUCCESS) { - CHK_AMDSMI_RET(ret_set) - std::cout << " Output of amdsmi_set_gpu_memory_partition:\n"; - } - std::cout << "\tamdsmi_set_gpu_memory_partition(" << device_index << ", " - << memoryPartitionString(updatePartition) << "): " - << err_str << "\n\n"; - - // Get the current memory partition - char current_memory_partition[AMDSMI_MAX_STRING_LENGTH]; - ret = amdsmi_get_gpu_memory_partition(processor_handles[device_index], - current_memory_partition, - static_cast(AMDSMI_MAX_STRING_LENGTH)); - - amdsmi_status_code_to_string(ret, &err_str); - if (ret == AMDSMI_STATUS_SUCCESS) { - CHK_AMDSMI_RET(ret) - std::cout << "\tamdsmi_get_gpu_memory_partition(" << device_index << ", " - << memoryPartitionString(updatePartition) << "): " - << err_str << "\n\n"; - std::cout << "\tMemory Partition (current): " - << current_memory_partition << "\n\n"; - } else { - std::cout << "\tamdsmi_get_gpu_memory_partition(" << device_index << ", " - << memoryPartitionString(AMDSMI_MEMORY_PARTITION_UNKNOWN) << "): " - << err_str << "\n\n"; - } - } - // return to original compute partition - amdsmi_memory_partition_type_t 
original_memory_partition_type; - if (ret == AMDSMI_STATUS_SUCCESS) { - original_memory_partition_type - = mapStringToSMIMemoryPartitionTypes.at(original_memory_partition); - } else { - original_memory_partition_type = AMDSMI_MEMORY_PARTITION_UNKNOWN; - } - std::cout << " Returning to original memory partition (" - << memoryPartitionString(original_memory_partition_type) - << ")\n"; - ret_set = amdsmi_set_gpu_memory_partition(processor_handles[device_index], - original_memory_partition_type); - amdsmi_status_code_to_string(ret_set, &err_str); - if (ret_set == AMDSMI_STATUS_SUCCESS) { - CHK_AMDSMI_RET(ret_set) - } - std::cout << "\tamdsmi_set_gpu_compute_partition(" << device_index << ", " - << memoryPartitionString(original_memory_partition_type) << "): " - << err_str << "\n\n"; - - // TODO(amdsmi_team): Add V2 partiton APIs - // Get VRAM info amdsmi_vram_info_t vram_info = {}; ret = amdsmi_get_gpu_vram_info(processor_handles[device_index], &vram_info); - if (ret != amdsmi_status_t::AMDSMI_STATUS_NOT_SUPPORTED) { + if (ret != AMDSMI_STATUS_NOT_SUPPORTED) { CHK_AMDSMI_RET(ret) printf(" Output of amdsmi_get_gpu_vram_info:\n"); printf("\tVRAM Size: 0x%lx (%ld) \n", vram_info.vram_size, vram_info.vram_size); @@ -576,9 +938,34 @@ int main() { printf("\tVRAM max bandwidth: 0x%lx (%lu) \n\n", vram_info.vram_max_bandwidth, vram_info.vram_max_bandwidth); } else { - printf("\t**amdsmi_get_gpu_vram_info() not supported on this system.\n"); + printf("\t**amdsmi_get_gpu_vram_info(): not supported on this device.\n"); } + uint32_t mem_type = AMDSMI_MEM_TYPE_VRAM; + uint64_t total = 0; + ret = amdsmi_get_gpu_memory_total(processor_handles[device_index], + static_cast(mem_type), &total); + if (ret != AMDSMI_STATUS_SUCCESS) { + PRINT_AMDSMI_RET(ret) + std::cout << "\t**amdsmi_get_gpu_memory_total(): not supported on this device." 
+ << std::endl; + } else { + CHK_AMDSMI_RET(ret) + std::cout << "\tGPU: " << gpu_number + << "; VRAM TOTAL: " << total / (1024 * 1024) << "MB\n"; + } + uint64_t usage = 0; + ret = amdsmi_get_gpu_memory_usage(processor_handles[device_index], + static_cast(mem_type), &usage); + if (ret != AMDSMI_STATUS_SUCCESS) { + PRINT_AMDSMI_RET(ret) + std::cout << "\t**amdsmi_get_gpu_memory_usage(): not supported on this device." + << std::endl; + } else { + CHK_AMDSMI_RET(ret) + std::cout << "\tGPU: " << gpu_number + << "; VRAM USED: " << usage / (1024 * 1024) << "MB\n"; + } // Get VBIOS info amdsmi_vbios_info_t vbios_info = {}; @@ -609,13 +996,15 @@ int main() { // Get power measure amdsmi_power_info_t power_measure = {}; ret = amdsmi_get_power_info(processor_handles[device_index], &power_measure); - CHK_AMDSMI_RET(ret) printf(" Output of amdsmi_get_power_info:\n"); - printf("\tCurrent GFX Voltage: %d\n", - power_measure.gfx_voltage); - printf("\tAverage socket power: %d\n", - power_measure.average_socket_power); - printf("\tGPU Power limit: %d\n\n", power_measure.power_limit); + if (ret != AMDSMI_STATUS_NOT_SUPPORTED) { + CHK_AMDSMI_RET(ret) + printf("\tCurrent GFX Voltage: %d\n", power_measure.gfx_voltage); + printf("\tAverage socket power: %d\n", power_measure.average_socket_power); + printf("\tGPU Power limit: %d\n\n", power_measure.power_limit); + } else { + printf("\tamdsmi_get_power_info(): not supported on this device.\n"); + } // Get driver version amdsmi_driver_info_t driver_info; @@ -637,14 +1026,15 @@ int main() { // Get engine usage info amdsmi_engine_usage_t engine_usage = {}; ret = amdsmi_get_gpu_activity(processor_handles[device_index], &engine_usage); - CHK_AMDSMI_RET(ret) printf(" Output of amdsmi_get_gpu_activity:\n"); - printf("\tAverage GFX Activity: %d\n", - engine_usage.gfx_activity); - printf("\tAverage MM Activity: %d\n", - engine_usage.mm_activity); - printf("\tAverage UMC Activity: %d\n\n", - engine_usage.umc_activity); + if (ret != 
AMDSMI_STATUS_NOT_SUPPORTED) { + CHK_AMDSMI_RET(ret) + printf("\tAverage GFX Activity: %d\n", engine_usage.gfx_activity); + printf("\tAverage MM Activity: %d\n", engine_usage.mm_activity); + printf("\tAverage UMC Activity: %d\n\n", engine_usage.umc_activity); + } else { + printf("\tamdsmi_get_gpu_activity(): not supported on this device.\n"); + } // Get firmware info amdsmi_fw_info_t fw_information = {}; @@ -662,57 +1052,82 @@ int main() { amdsmi_clk_info_t gfx_clk_values = {}; ret = amdsmi_get_clock_info(processor_handles[device_index], AMDSMI_CLK_TYPE_GFX, &gfx_clk_values); - CHK_AMDSMI_RET(ret) printf(" Output of amdsmi_get_clock_info:\n"); - printf("\tGPU GFX Max Clock: %d\n", gfx_clk_values.max_clk); - printf("\tGPU GFX Current Clock: %d\n", gfx_clk_values.clk); + if (ret != AMDSMI_STATUS_NOT_SUPPORTED) { + CHK_AMDSMI_RET(ret) + printf("\tGPU GFX Max Clock: %d\n", gfx_clk_values.max_clk); + printf("\tGPU GFX Current Clock: %d\n", gfx_clk_values.clk); + } else { + printf("\tamdsmi_get_clock_info(AMDSMI_CLK_TYPE_GFX): " + "not supported on this device.\n"); + } // Get MEM clock measurements amdsmi_clk_info_t mem_clk_values = {}; ret = amdsmi_get_clock_info(processor_handles[device_index], AMDSMI_CLK_TYPE_MEM, &mem_clk_values); - CHK_AMDSMI_RET(ret) - printf("\tGPU MEM Max Clock: %d\n", mem_clk_values.max_clk); - printf("\tGPU MEM Current Clock: %d\n\n", mem_clk_values.clk); + if (ret != AMDSMI_STATUS_NOT_SUPPORTED) { + CHK_AMDSMI_RET(ret) + printf("\tGPU MEM Max Clock: %d\n", mem_clk_values.max_clk); + printf("\tGPU MEM Current Clock: %d\n\n", mem_clk_values.clk); + } else { + printf("\tamdsmi_get_clock_info(AMDSMI_CLK_TYPE_MEM): " + "not supported on this device.\n"); + } // Get PCIe status amdsmi_pcie_info_t pcie_info = {}; ret = amdsmi_get_pcie_info(processor_handles[device_index], &pcie_info); - CHK_AMDSMI_RET(ret) printf(" Output of amdsmi_get_pcie_info:\n"); - printf("\tCurrent PCIe lanes: %d\n", pcie_info.pcie_metric.pcie_width); - printf("\tCurrent PCIe 
speed: %d\n", pcie_info.pcie_metric.pcie_speed); - printf("\tCurrent PCIe Interface Version: %d\n", - pcie_info.pcie_static.pcie_interface_version); - printf("\tPCIe slot type: %d\n", pcie_info.pcie_static.slot_type); - printf("\tPCIe max lanes: %d\n", pcie_info.pcie_static.max_pcie_width); - printf("\tPCIe max speed: %d\n", pcie_info.pcie_static.max_pcie_speed); + if (ret != AMDSMI_STATUS_NOT_SUPPORTED) { + CHK_AMDSMI_RET(ret) + printf("\tCurrent PCIe lanes: %d\n", pcie_info.pcie_metric.pcie_width); + printf("\tCurrent PCIe speed: %d\n", pcie_info.pcie_metric.pcie_speed); + printf("\tCurrent PCIe Interface Version: %d\n", + pcie_info.pcie_static.pcie_interface_version); + printf("\tPCIe slot type: %d\n", pcie_info.pcie_static.slot_type); + printf("\tPCIe max lanes: %d\n", pcie_info.pcie_static.max_pcie_width); + printf("\tPCIe max speed: %d\n", pcie_info.pcie_static.max_pcie_speed); - // additional pcie related metrics - printf("\tPCIe bandwidth: %u\n", pcie_info.pcie_metric.pcie_bandwidth); - printf("\tPCIe replay count: %" PRIu64 "\n", pcie_info.pcie_metric.pcie_replay_count); - printf("\tPCIe L0 recovery count: %" PRIu64 "\n", pcie_info.pcie_metric.pcie_l0_to_recovery_count); - printf("\tPCIe rollover count: %" PRIu64 "\n", pcie_info.pcie_metric.pcie_replay_roll_over_count); - printf("\tPCIe nak received count: %" PRIu64 "\n", pcie_info.pcie_metric.pcie_nak_received_count); - printf("\tPCIe nak sent count: %" PRIu64 "\n", pcie_info.pcie_metric.pcie_nak_sent_count); + // additional pcie related metrics + printf("\tPCIe bandwidth: %u\n", pcie_info.pcie_metric.pcie_bandwidth); + printf("\tPCIe replay count: %" PRIu64 "\n", + pcie_info.pcie_metric.pcie_replay_count); + printf("\tPCIe L0 recovery count: %" PRIu64 "\n", + pcie_info.pcie_metric.pcie_l0_to_recovery_count); + printf("\tPCIe rollover count: %" PRIu64 "\n", + pcie_info.pcie_metric.pcie_replay_roll_over_count); + printf("\tPCIe nak received count: %" PRIu64 "\n", + 
pcie_info.pcie_metric.pcie_nak_received_count); + printf("\tPCIe nak sent count: %" PRIu64 "\n", + pcie_info.pcie_metric.pcie_nak_sent_count); + } // Get VRAM temperature limit int64_t temperature = 0; ret = amdsmi_get_temp_metric( processor_handles[device_index], AMDSMI_TEMPERATURE_TYPE_VRAM, AMDSMI_TEMP_CRITICAL, &temperature); - CHK_AMDSMI_RET(ret) - printf(" Output of amdsmi_get_temp_metric:\n"); - printf("\tGPU VRAM temp limit: %ld\n", temperature); + if (ret != AMDSMI_STATUS_NOT_SUPPORTED) { + CHK_AMDSMI_RET(ret) + printf(" Output of amdsmi_get_temp_metric:\n"); + printf("\tGPU VRAM temp limit: %ld\n", temperature); + } else { + printf("\tamdsmi_get_temp_metric(AMDSMI_TEMPERATURE_TYPE_VRAM): " + "not supported on this device.\n"); + } // Get GFX temperature limit ret = amdsmi_get_temp_metric( processor_handles[device_index], AMDSMI_TEMPERATURE_TYPE_EDGE, AMDSMI_TEMP_CRITICAL, &temperature); - if (ret != amdsmi_status_t::AMDSMI_STATUS_NOT_SUPPORTED) { + if (ret != AMDSMI_STATUS_NOT_SUPPORTED) { CHK_AMDSMI_RET(ret) + printf("\tGPU GFX temp limit: %ld\n\n", temperature); + } else { + printf("\tamdsmi_get_temp_metric(AMDSMI_TEMPERATURE_TYPE_EDGE): " + "not supported on this device.\n"); } - printf("\tGPU GFX temp limit: %ld\n\n", temperature); // Get temperature measurements // amdsmi_temperature_t edge_temp, hotspot_temp, vram_temp, @@ -725,8 +1140,8 @@ int main() { ret = amdsmi_get_temp_metric( processor_handles[device_index], temp_type, AMDSMI_TEMP_CURRENT, - &temp_measurements[(int)(temp_type)]); - if (ret != amdsmi_status_t::AMDSMI_STATUS_NOT_SUPPORTED) { + &temp_measurements[static_cast(temp_type)]); + if (ret != AMDSMI_STATUS_NOT_SUPPORTED) { CHK_AMDSMI_RET(ret) } } @@ -919,11 +1334,11 @@ int main() { int64_t val_i64 = 0; ret = amdsmi_get_temp_metric(processor_handles[device_index], AMDSMI_TEMPERATURE_TYPE_EDGE, AMDSMI_TEMP_CURRENT, &val_i64); - if (ret != amdsmi_status_t::AMDSMI_STATUS_NOT_SUPPORTED) { + if (ret != AMDSMI_STATUS_NOT_SUPPORTED) { 
CHK_AMDSMI_RET(ret) } printf(" Output of amdsmi_get_temp_metric:\n"); - std::cout << "\t\tTemperature: " << val_i64 << "C" + std::cout << "\t\tTemperature: " << std::dec << val_i64 << "C" << "\n\n"; // Get frame buffer @@ -936,465 +1351,481 @@ int main() { amdsmi_power_cap_info_t cap_info = {}; ret = amdsmi_get_power_cap_info(processor_handles[device_index], 0, &cap_info); - CHK_AMDSMI_RET(ret) - printf(" Output of amdsmi_get_power_cap_info:\n"); - std::cout << "\t\t Power Cap: " << cap_info.power_cap - << " uW\n"; - std::cout << "\t\t Default Power Cap: " << cap_info.default_power_cap - << " uW\n\n"; - std::cout << "\t\t Dpm Cap: " << cap_info.dpm_cap - << " MHz\n\n"; - std::cout << "\t\t Min Power Cap: " << cap_info.min_power_cap - << " uW\n\n"; - std::cout << "\t\t Max Power Cap: " << cap_info.max_power_cap - << " uW\n\n"; + if (ret != AMDSMI_STATUS_NOT_SUPPORTED) { + CHK_AMDSMI_RET(ret) + printf(" Output of amdsmi_get_power_cap_info:\n"); + std::cout << "\t\t Power Cap: " << cap_info.power_cap + << " uW\n"; + std::cout << "\t\t Default Power Cap: " << cap_info.default_power_cap + << " uW\n\n"; + std::cout << "\t\t Dpm Cap: " << cap_info.dpm_cap + << " MHz\n\n"; + std::cout << "\t\t Min Power Cap: " << cap_info.min_power_cap + << " uW\n\n"; + std::cout << "\t\t Max Power Cap: " << cap_info.max_power_cap + << " uW\n\n"; + } else { + std::cout << "\tamdsmi_get_power_cap_info(): not supported on this device.\n"; + } /// Get GPU Metrics info std::cout << "\n\n"; amdsmi_gpu_metrics_t smu; ret = amdsmi_get_gpu_metrics_info(processor_handles[device_index], &smu); - CHK_AMDSMI_RET(ret) - printf(" Output of amdsmi_get_gpu_metrics_info:\n"); - printf("\tDevice[%d] BDF %04" PRIx64 ":%02" PRIx32 ":%02" PRIx32 ".%" PRIu32 "\n\n", i, - static_cast(bdf.domain_number), - static_cast(bdf.bus_number), - static_cast(bdf.device_number), - static_cast(bdf.function_number)); + if (ret == AMDSMI_STATUS_NOT_SUPPORTED) { + std::cout << "\tamdsmi_get_gpu_metrics_info(): not supported on 
this device.\n"; + } else { // START GPU METRICS OUTPUTS + CHK_AMDSMI_RET(ret) + printf(" Output of amdsmi_get_gpu_metrics_info:\n"); + printf("\tDevice[%d] BDF %04" PRIx64 ":%02" PRIx32 ":%02" PRIx32 ".%" PRIu32 "\n\n", + i, static_cast(bdf.domain_number), + static_cast(bdf.bus_number), + static_cast(bdf.device_number), + static_cast(bdf.function_number)); - std::cout << "METRIC TABLE HEADER:\n"; - std::cout << "structure_size=" << std::dec - << static_cast(smu.common_header.structure_size) << "\n"; - std::cout << "\tformat_revision=" << std::dec - << static_cast(smu.common_header.format_revision) << "\n"; - std::cout << "\tcontent_revision=" << std::dec - << static_cast(smu.common_header.content_revision) << "\n"; + std::cout << "METRIC TABLE HEADER:\n"; + std::cout << "structure_size=" << std::dec + << static_cast(smu.common_header.structure_size) << "\n"; + std::cout << "\tformat_revision=" << std::dec + << static_cast(smu.common_header.format_revision) << "\n"; + std::cout << "\tcontent_revision=" << std::dec + << static_cast(smu.common_header.content_revision) << "\n"; - std::cout << "\n"; - std::cout << "TIME STAMPS (ns):\n"; - std::cout << std::dec << "\tsystem_clock_counter=" << smu.system_clock_counter << "\n"; - std::cout << "\tfirmware_timestamp (10ns resolution)=" << std::dec << smu.firmware_timestamp - << "\n"; + std::cout << "\n"; + std::cout << "TIME STAMPS (ns):\n"; + std::cout << std::dec << "\tsystem_clock_counter=" << smu.system_clock_counter << "\n"; + std::cout << "\tfirmware_timestamp (10ns resolution)=" << std::dec << smu.firmware_timestamp + << "\n"; - std::cout << "\n"; - std::cout << "TEMPERATURES (C):\n"; - std::cout << std::dec << "\ttemperature_edge= " << smu.temperature_edge << "\n"; - std::cout << std::dec << "\ttemperature_hotspot= " << smu.temperature_hotspot << "\n"; - std::cout << std::dec << "\ttemperature_mem= " << smu.temperature_mem << "\n"; - std::cout << std::dec << "\ttemperature_vrgfx= " << smu.temperature_vrgfx << "\n"; 
- std::cout << std::dec << "\ttemperature_vrsoc= " << smu.temperature_vrsoc << "\n"; - std::cout << std::dec << "\ttemperature_vrmem= " << smu.temperature_vrmem << "\n"; - std::cout << "\ttemperature_hbm = ["; - auto idx = 0; - for (const auto& temp : smu.temperature_hbm) { - std::cout << temp; - if ((idx + 1) != static_cast(std::size(smu.temperature_hbm))) { - std::cout << ", "; - } else { - std::cout << "]\n"; - } - ++idx; - } - - std::cout << "\n"; - std::cout << "UTILIZATION (%):\n"; - std::cout << std::dec << "\taverage_gfx_activity=" << smu.average_gfx_activity << "\n"; - std::cout << std::dec << "\taverage_umc_activity=" << smu.average_umc_activity << "\n"; - std::cout << std::dec << "\taverage_mm_activity=" << smu.average_mm_activity << "\n"; - std::cout << std::dec << "\tvcn_activity= ["; - idx = 0; - for (const auto& temp : smu.vcn_activity) { - std::cout << temp; - if ((idx + 1) != static_cast(std::size(smu.vcn_activity))) { - std::cout << ", "; - } else { - std::cout << "]\n"; - } - ++idx; - } - - std::cout << "\n"; - std::cout << std::dec << "\tjpeg_activity= ["; - idx = 0; - for (const auto& temp : smu.jpeg_activity) { - std::cout << temp; - if ((idx + 1) != static_cast(std::size(smu.jpeg_activity))) { - std::cout << ", "; - } else { - std::cout << "]\n"; - } - ++idx; - } - - std::cout << "\n"; - std::cout << "POWER (W)/ENERGY (15.259uJ per 1ns):\n"; - std::cout << std::dec << "\taverage_socket_power=" << smu.average_socket_power << "\n"; - std::cout << std::dec << "\tcurrent_socket_power=" << smu.current_socket_power << "\n"; - std::cout << std::dec << "\tenergy_accumulator=" << smu.energy_accumulator << "\n"; - - std::cout << "\n"; - std::cout << "AVG CLOCKS (MHz):\n"; - std::cout << std::dec << "\taverage_gfxclk_frequency=" << smu.average_gfxclk_frequency - << "\n"; - std::cout << std::dec << "\taverage_gfxclk_frequency=" << smu.average_gfxclk_frequency - << "\n"; - std::cout << std::dec << "\taverage_uclk_frequency=" << smu.average_uclk_frequency 
<< "\n"; - std::cout << std::dec << "\taverage_vclk0_frequency=" << smu.average_vclk0_frequency - << "\n"; - std::cout << std::dec << "\taverage_dclk0_frequency=" << smu.average_dclk0_frequency - << "\n"; - std::cout << std::dec << "\taverage_vclk1_frequency=" << smu.average_vclk1_frequency - << "\n"; - std::cout << std::dec << "\taverage_dclk1_frequency=" << smu.average_dclk1_frequency - << "\n"; - - std::cout << "\n"; - std::cout << "CURRENT CLOCKS (MHz):\n"; - std::cout << std::dec << "\tcurrent_gfxclk=" << smu.current_gfxclk << "\n"; - std::cout << std::dec << "\tcurrent_gfxclks= ["; - idx = 0; - for (const auto& temp : smu.current_gfxclks) { - std::cout << temp; - if ((idx + 1) != static_cast(std::size(smu.current_gfxclks))) { - std::cout << ", "; - } else { - std::cout << "]\n"; - } - ++idx; - } - - std::cout << std::dec << "\tcurrent_socclk=" << smu.current_socclk << "\n"; - std::cout << std::dec << "\tcurrent_socclks= ["; - idx = 0; - for (const auto& temp : smu.current_socclks) { - std::cout << temp; - if ((idx + 1) != static_cast(std::size(smu.current_socclks))) { - std::cout << ", "; - } else { - std::cout << "]\n"; - } - ++idx; - } - - std::cout << std::dec << "\tcurrent_uclk=" << smu.current_uclk << "\n"; - std::cout << std::dec << "\tcurrent_vclk0=" << smu.current_vclk0 << "\n"; - std::cout << std::dec << "\tcurrent_vclk0s= ["; - idx = 0; - for (const auto& temp : smu.current_vclk0s) { - std::cout << temp; - if ((idx + 1) != static_cast(std::size(smu.current_vclk0s))) { - std::cout << ", "; - } else { - std::cout << "]\n"; - } - ++idx; - } - - std::cout << std::dec << "\tcurrent_dclk0=" << smu.current_dclk0 << "\n"; - std::cout << std::dec << "\tcurrent_dclk0s= ["; - idx = 0; - for (const auto& temp : smu.current_dclk0s) { - std::cout << temp; - if ((idx + 1) != static_cast(std::size(smu.current_dclk0s))) { - std::cout << ", "; - } else { - std::cout << "]\n"; - } - ++idx; - } - - std::cout << std::dec << "\tcurrent_vclk1=" << smu.current_vclk1 << 
"\n"; - std::cout << std::dec << "\tcurrent_dclk1=" << smu.current_dclk1 << "\n"; - - std::cout << "\n"; - std::cout << "TROTTLE STATUS:\n"; - std::cout << std::dec << "\tthrottle_status=" << smu.throttle_status << "\n"; - - std::cout << "\n"; - std::cout << "FAN SPEED:\n"; - std::cout << std::dec << "\tcurrent_fan_speed=" << smu.current_fan_speed << "\n"; - - std::cout << "\n"; - std::cout << "LINK WIDTH (number of lanes) /SPEED (0.1 GT/s):\n"; - std::cout << "\tpcie_link_width=" << smu.pcie_link_width << "\n"; - std::cout << "\tpcie_link_speed=" << smu.pcie_link_speed << "\n"; - std::cout << "\txgmi_link_width=" << smu.xgmi_link_width << "\n"; - std::cout << "\txgmi_link_speed=" << smu.xgmi_link_speed << "\n"; - - std::cout << "\n"; - std::cout << "Utilization Accumulated(%):\n"; - std::cout << "\tgfx_activity_acc=" << std::dec << smu.gfx_activity_acc << "\n"; - std::cout << "\tmem_activity_acc=" << std::dec << smu.mem_activity_acc << "\n"; - - std::cout << "\n"; - std::cout << "XGMI ACCUMULATED DATA TRANSFER SIZE (KB):\n"; - std::cout << std::dec << "\txgmi_read_data_acc= ["; - idx = 0; - for (const auto& temp : smu.xgmi_read_data_acc) { - std::cout << temp; - if ((idx + 1) != static_cast(std::size(smu.xgmi_read_data_acc))) { - std::cout << ", "; - } else { - std::cout << "]\n"; - } - ++idx; - } - - std::cout << std::dec << "\txgmi_write_data_acc= ["; - idx = 0; - for (const auto& temp : smu.xgmi_write_data_acc) { - std::cout << temp; - if ((idx + 1) != static_cast(std::size(smu.xgmi_write_data_acc))) { - std::cout << ", "; - } else { - std::cout << "]\n"; - } - ++idx; - } - - std::cout << std::dec << "\txgmi_link_status= ["; - idx = 0; - for (const auto& temp : smu.xgmi_link_status) { - std::cout << temp; - if ((idx + 1) != static_cast(std::size(smu.xgmi_link_status))) { - std::cout << ", "; - } else { - std::cout << "]\n"; - } - ++idx; - } - - // Voltage (mV) - std::cout << "\tvoltage_soc = " << std::dec << smu.voltage_soc << "\n"; - std::cout << 
"\tvoltage_gfx = " << std::dec << smu.voltage_gfx << "\n"; - std::cout << "\tvoltage_mem = " << std::dec << smu.voltage_mem << "\n"; - - std::cout << "\tindep_throttle_status = " << std::dec << smu.indep_throttle_status << "\n"; - - // Clock Lock Status. Each bit corresponds to clock instance - std::cout << "\tgfxclk_lock_status (in hex) = " << std::hex - << smu.gfxclk_lock_status << std::dec <<"\n"; - - // Bandwidth (GB/sec) - std::cout << "\tpcie_bandwidth_acc=" << std::dec << smu.pcie_bandwidth_acc << "\n"; - std::cout << "\tpcie_bandwidth_inst=" << std::dec << smu.pcie_bandwidth_inst << "\n"; - - // VRAM max bandwidth at max memory clock - std::cout << "\tvram_max_bandwidth=" << std::dec << smu.vram_max_bandwidth << "\n"; - - // Counts - std::cout << "\tpcie_l0_to_recov_count_acc= " << std::dec << smu.pcie_l0_to_recov_count_acc - << "\n"; - std::cout << "\tpcie_replay_count_acc= " << std::dec << smu.pcie_replay_count_acc << "\n"; - std::cout << "\tpcie_replay_rover_count_acc= " << std::dec - << smu.pcie_replay_rover_count_acc << "\n"; - std::cout << "\tpcie_nak_sent_count_acc= " << std::dec << smu.pcie_nak_sent_count_acc - << "\n"; - std::cout << "\tpcie_nak_rcvd_count_acc= " << std::dec << smu.pcie_nak_rcvd_count_acc - << "\n"; - - // Accumulation cycle counter - // Accumulated throttler residencies - std::cout << "\n"; - std::cout << "RESIDENCY ACCUMULATION / COUNTER:\n"; - std::cout << "\taccumulation_counter = " << std::dec << smu.accumulation_counter << "\n"; - std::cout << "\tprochot_residency_acc = " << std::dec << smu.prochot_residency_acc << "\n"; - std::cout << "\tppt_residency_acc = " << std::dec << smu.ppt_residency_acc << "\n"; - std::cout << "\tsocket_thm_residency_acc = " << std::dec << smu.socket_thm_residency_acc - << "\n"; - std::cout << "\tvr_thm_residency_acc = " << std::dec << smu.vr_thm_residency_acc - << "\n"; - std::cout << "\thbm_thm_residency_acc = " << std::dec << smu.hbm_thm_residency_acc << "\n"; - - // Number of current partitions 
- std::cout << "\tnum_partition = " << std::dec << smu.num_partition << "\n"; - - // PCIE other end recovery counter - std::cout << "\tpcie_lc_perf_other_end_recovery = " - << std::dec << smu.pcie_lc_perf_other_end_recovery << "\n"; - - idx = 0; - auto idy = 0; - std::cout << "\txcp_stats.gfx_busy_inst: " << "\n"; - for (auto& row : smu.xcp_stats) { - std::cout << "\t XCP [" << idx << "] : ["; - for (auto& col : row.gfx_busy_inst) { - if ((idy + 1) != static_cast(std::size(row.gfx_busy_inst))) { - std::cout << col << ", "; - } else { - std::cout << col; + std::cout << "\n"; + std::cout << "TEMPERATURES (C):\n"; + std::cout << std::dec << "\ttemperature_edge= " << smu.temperature_edge << "\n"; + std::cout << std::dec << "\ttemperature_hotspot= " << smu.temperature_hotspot << "\n"; + std::cout << std::dec << "\ttemperature_mem= " << smu.temperature_mem << "\n"; + std::cout << std::dec << "\ttemperature_vrgfx= " << smu.temperature_vrgfx << "\n"; + std::cout << std::dec << "\ttemperature_vrsoc= " << smu.temperature_vrsoc << "\n"; + std::cout << std::dec << "\ttemperature_vrmem= " << smu.temperature_vrmem << "\n"; + std::cout << "\ttemperature_hbm = ["; + auto idx = 0; + for (const auto& temp : smu.temperature_hbm) { + std::cout << temp; + if ((idx + 1) != static_cast(std::size(smu.temperature_hbm))) { + std::cout << ", "; + } else { + std::cout << "]\n"; + } + ++idx; } - idy++; - } - std::cout << "]\n"; - idy = 0; - idx++; - } - idx = 0; - idy = 0; - std::cout << "\txcp_stats.vcn_busy: " << "\n"; - for (auto& row : smu.xcp_stats) { - std::cout << "\t XCP [" << idx << "] : ["; - for (auto& col : row.vcn_busy) { - if ((idy + 1) != static_cast(std::size(row.vcn_busy))) { - std::cout << col << ", "; - } else { - std::cout << col; + std::cout << "\n"; + std::cout << "UTILIZATION (%):\n"; + std::cout << std::dec << "\taverage_gfx_activity=" << smu.average_gfx_activity << "\n"; + std::cout << std::dec << "\taverage_umc_activity=" << smu.average_umc_activity << "\n"; + 
std::cout << std::dec << "\taverage_mm_activity=" << smu.average_mm_activity << "\n"; + std::cout << std::dec << "\tvcn_activity= ["; + idx = 0; + for (const auto& temp : smu.vcn_activity) { + std::cout << temp; + if ((idx + 1) != static_cast(std::size(smu.vcn_activity))) { + std::cout << ", "; + } else { + std::cout << "]\n"; + } + ++idx; } - idy++; - } - std::cout << "]\n"; - idy = 0; - idx++; - } - idx = 0; - idy = 0; - std::cout << "\txcp_stats.jpeg_busy: " << "\n"; - for (auto& row : smu.xcp_stats) { - std::cout << "\t XCP [" << idx << "] : ["; - for (auto& col : row.jpeg_busy) { - if ((idy + 1) != static_cast(std::size(row.jpeg_busy))) { - std::cout << col << ", "; - } else { - std::cout << col; + std::cout << "\n"; + std::cout << std::dec << "\tjpeg_activity= ["; + idx = 0; + for (const auto& temp : smu.jpeg_activity) { + std::cout << temp; + if ((idx + 1) != static_cast(std::size(smu.jpeg_activity))) { + std::cout << ", "; + } else { + std::cout << "]\n"; + } + ++idx; } - idy++; - } - std::cout << "]\n"; - idy = 0; - idx++; - } - idx = 0; - idy = 0; - std::cout << "\txcp_stats.gfx_busy_acc: " << "\n"; - for (auto& row : smu.xcp_stats) { - std::cout << "\t XCP [" << idx << "] : ["; - for (auto& col : row.gfx_busy_acc) { - if ((idy + 1) != static_cast(std::size(row.gfx_busy_acc))) { - std::cout << col << ", "; - } else { - std::cout << col; + std::cout << "\n"; + std::cout << "POWER (W)/ENERGY (15.259uJ per 1ns):\n"; + std::cout << std::dec << "\taverage_socket_power=" << smu.average_socket_power << "\n"; + std::cout << std::dec << "\tcurrent_socket_power=" << smu.current_socket_power << "\n"; + std::cout << std::dec << "\tenergy_accumulator=" << smu.energy_accumulator << "\n"; + + std::cout << "\n"; + std::cout << "AVG CLOCKS (MHz):\n"; + std::cout << std::dec << "\taverage_gfxclk_frequency=" << smu.average_gfxclk_frequency + << "\n"; + std::cout << std::dec << "\taverage_gfxclk_frequency=" << smu.average_gfxclk_frequency + << "\n"; + std::cout << std::dec 
<< "\taverage_uclk_frequency=" << smu.average_uclk_frequency << "\n"; + std::cout << std::dec << "\taverage_vclk0_frequency=" << smu.average_vclk0_frequency + << "\n"; + std::cout << std::dec << "\taverage_dclk0_frequency=" << smu.average_dclk0_frequency + << "\n"; + std::cout << std::dec << "\taverage_vclk1_frequency=" << smu.average_vclk1_frequency + << "\n"; + std::cout << std::dec << "\taverage_dclk1_frequency=" << smu.average_dclk1_frequency + << "\n"; + + std::cout << "\n"; + std::cout << "CURRENT CLOCKS (MHz):\n"; + std::cout << std::dec << "\tcurrent_gfxclk=" << smu.current_gfxclk << "\n"; + std::cout << std::dec << "\tcurrent_gfxclks= ["; + idx = 0; + for (const auto& temp : smu.current_gfxclks) { + std::cout << temp; + if ((idx + 1) != static_cast(std::size(smu.current_gfxclks))) { + std::cout << ", "; + } else { + std::cout << "]\n"; + } + ++idx; } - idy++; - } - std::cout << "]\n"; - idy = 0; - idx++; - } - idx = 0; - idy = 0; - std::cout << "\txcp_stats.gfx_below_host_limit_acc: " << "\n"; - for (auto& row : smu.xcp_stats) { - std::cout << "\t XCP [" << idx << "] : ["; - for (auto& col : row.gfx_below_host_limit_acc) { - if ((idy + 1) != static_cast(std::size(row.gfx_below_host_limit_acc))) { - std::cout << col << ", "; - } else { - std::cout << col; + std::cout << std::dec << "\tcurrent_socclk=" << smu.current_socclk << "\n"; + std::cout << std::dec << "\tcurrent_socclks= ["; + idx = 0; + for (const auto& temp : smu.current_socclks) { + std::cout << temp; + if ((idx + 1) != static_cast(std::size(smu.current_socclks))) { + std::cout << ", "; + } else { + std::cout << "]\n"; + } + ++idx; } - idy++; - } - std::cout << "]\n"; - idy = 0; - idx++; - } - /*New scp stats v1.8*/ - idx = 0; - idy = 0; - std::cout << "\txcp_stats.gfx_below_host_limit_ppt_acc: " << "\n"; - for (auto& row : smu.xcp_stats) { - std::cout << "\t XCP [" << idx << "] : ["; - for (auto& col : row.gfx_below_host_limit_ppt_acc) { - if ((idy + 1) != 
static_cast(std::size(row.gfx_below_host_limit_ppt_acc))) { - std::cout << col << ", "; - } else { - std::cout << col; + std::cout << std::dec << "\tcurrent_uclk=" << smu.current_uclk << "\n"; + std::cout << std::dec << "\tcurrent_vclk0=" << smu.current_vclk0 << "\n"; + std::cout << std::dec << "\tcurrent_vclk0s= ["; + idx = 0; + for (const auto& temp : smu.current_vclk0s) { + std::cout << temp; + if ((idx + 1) != static_cast(std::size(smu.current_vclk0s))) { + std::cout << ", "; + } else { + std::cout << "]\n"; + } + ++idx; } - idy++; - } - std::cout << "]\n"; - idy = 0; - idx++; - } - idx = 0; - idy = 0; - std::cout << "\txcp_stats.gfx_below_host_limit_thm_acc: " << "\n"; - for (auto& row : smu.xcp_stats) { - std::cout << "\t XCP [" << idx << "] : ["; - for (auto& col : row.gfx_below_host_limit_thm_acc) { - if ((idy + 1) != static_cast(std::size(row.gfx_below_host_limit_thm_acc))) { - std::cout << col << ", "; - } else { - std::cout << col; + std::cout << std::dec << "\tcurrent_dclk0=" << smu.current_dclk0 << "\n"; + std::cout << std::dec << "\tcurrent_dclk0s= ["; + idx = 0; + for (const auto& temp : smu.current_dclk0s) { + std::cout << temp; + if ((idx + 1) != static_cast(std::size(smu.current_dclk0s))) { + std::cout << ", "; + } else { + std::cout << "]\n"; + } + ++idx; } - idy++; - } - std::cout << "]\n"; - idy = 0; - idx++; - } - idx = 0; - idy = 0; - std::cout << "\txcp_stats.gfx_low_utilization_acc: " << "\n"; - for (auto& row : smu.xcp_stats) { - std::cout << "\t XCP [" << idx << "] : ["; - for (auto& col : row.gfx_low_utilization_acc) { - if ((idy + 1) != static_cast(std::size(row.gfx_low_utilization_acc))) { - std::cout << col << ", "; - } else { - std::cout << col; + std::cout << std::dec << "\tcurrent_vclk1=" << smu.current_vclk1 << "\n"; + std::cout << std::dec << "\tcurrent_dclk1=" << smu.current_dclk1 << "\n"; + + std::cout << "\n"; + std::cout << "TROTTLE STATUS:\n"; + std::cout << std::dec << "\tthrottle_status=" << smu.throttle_status << "\n"; + 
+ std::cout << "\n"; + std::cout << "FAN SPEED:\n"; + std::cout << std::dec << "\tcurrent_fan_speed=" << smu.current_fan_speed << "\n"; + + std::cout << "\n"; + std::cout << "LINK WIDTH (number of lanes) /SPEED (0.1 GT/s):\n"; + std::cout << "\tpcie_link_width=" << smu.pcie_link_width << "\n"; + std::cout << "\tpcie_link_speed=" << smu.pcie_link_speed << "\n"; + std::cout << "\txgmi_link_width=" << smu.xgmi_link_width << "\n"; + std::cout << "\txgmi_link_speed=" << smu.xgmi_link_speed << "\n"; + + std::cout << "\n"; + std::cout << "Utilization Accumulated(%):\n"; + std::cout << "\tgfx_activity_acc=" << std::dec << smu.gfx_activity_acc << "\n"; + std::cout << "\tmem_activity_acc=" << std::dec << smu.mem_activity_acc << "\n"; + + std::cout << "\n"; + std::cout << "XGMI ACCUMULATED DATA TRANSFER SIZE (KB):\n"; + std::cout << std::dec << "\txgmi_read_data_acc= ["; + idx = 0; + for (const auto& temp : smu.xgmi_read_data_acc) { + std::cout << temp; + if ((idx + 1) != static_cast(std::size(smu.xgmi_read_data_acc))) { + std::cout << ", "; + } else { + std::cout << "]\n"; + } + ++idx; } - idy++; - } - std::cout << "]\n"; - idy = 0; - idx++; - } - idx = 0; - idy = 0; - std::cout << "\txcp_stats.gfx_below_host_limit_total_acc: " << "\n"; - for (auto& row : smu.xcp_stats) { - std::cout << "\t XCP [" << idx << "] : ["; - for (auto& col : row.gfx_below_host_limit_total_acc) { - if ((idy + 1) != static_cast(std::size(row.gfx_below_host_limit_total_acc))) { - std::cout << col << ", "; - } else { - std::cout << col; + std::cout << std::dec << "\txgmi_write_data_acc= ["; + idx = 0; + for (const auto& temp : smu.xgmi_write_data_acc) { + std::cout << temp; + if ((idx + 1) != static_cast(std::size(smu.xgmi_write_data_acc))) { + std::cout << ", "; + } else { + std::cout << "]\n"; + } + ++idx; } - idy++; - } - std::cout << "]\n"; - idy = 0; - idx++; - } - std::cout << "\n\n"; - std::cout << "\t ** -> Checking metrics with constant changes ** " << "\n"; - constexpr uint16_t kMAX_ITER_TEST 
= 10; - amdsmi_gpu_metrics_t gpu_metrics_check = {}; - for (auto idx = uint16_t(1); idx <= kMAX_ITER_TEST; ++idx) { - amdsmi_get_gpu_metrics_info(processor_handles[device_index], &gpu_metrics_check); - std::cout << "\t\t -> firmware_timestamp [" << idx << "/" << kMAX_ITER_TEST << "]: " - << gpu_metrics_check.firmware_timestamp << "\n"; - } + std::cout << std::dec << "\txgmi_link_status= ["; + idx = 0; + for (const auto& temp : smu.xgmi_link_status) { + std::cout << temp; + if ((idx + 1) != static_cast(std::size(smu.xgmi_link_status))) { + std::cout << ", "; + } else { + std::cout << "]\n"; + } + ++idx; + } - std::cout << "\n"; - for (auto idx = uint16_t(1); idx <= kMAX_ITER_TEST; ++idx) { - amdsmi_get_gpu_metrics_info(processor_handles[device_index], &gpu_metrics_check); - std::cout << "\t\t -> system_clock_counter [" << idx << "/" << kMAX_ITER_TEST << "]: " - << gpu_metrics_check.system_clock_counter << "\n"; - } + // Voltage (mV) + std::cout << "\tvoltage_soc = " << std::dec << smu.voltage_soc << "\n"; + std::cout << "\tvoltage_gfx = " << std::dec << smu.voltage_gfx << "\n"; + std::cout << "\tvoltage_mem = " << std::dec << smu.voltage_mem << "\n"; - std::cout << "\n"; - std::cout << " ** Note: Values MAX'ed out " - << "(UINTX MAX are unsupported for the version in question) ** " << "\n\n"; + std::cout << "\tindep_throttle_status = " << std::dec << smu.indep_throttle_status << "\n"; + + // Clock Lock Status. 
Each bit corresponds to clock instance + std::cout << "\tgfxclk_lock_status (in hex) = " << std::hex + << smu.gfxclk_lock_status << std::dec <<"\n"; + + // Bandwidth (GB/sec) + std::cout << "\tpcie_bandwidth_acc=" << std::dec << smu.pcie_bandwidth_acc << "\n"; + std::cout << "\tpcie_bandwidth_inst=" << std::dec << smu.pcie_bandwidth_inst << "\n"; + + // VRAM max bandwidth at max memory clock + std::cout << "\tvram_max_bandwidth=" << std::dec << smu.vram_max_bandwidth << "\n"; + + // Counts + std::cout << "\tpcie_l0_to_recov_count_acc= " << std::dec << smu.pcie_l0_to_recov_count_acc + << "\n"; + std::cout << "\tpcie_replay_count_acc= " << std::dec << smu.pcie_replay_count_acc << "\n"; + std::cout << "\tpcie_replay_rover_count_acc= " << std::dec + << smu.pcie_replay_rover_count_acc << "\n"; + std::cout << "\tpcie_nak_sent_count_acc= " << std::dec << smu.pcie_nak_sent_count_acc + << "\n"; + std::cout << "\tpcie_nak_rcvd_count_acc= " << std::dec << smu.pcie_nak_rcvd_count_acc + << "\n"; + + // Accumulation cycle counter + // Accumulated throttler residencies + std::cout << "\n"; + std::cout << "RESIDENCY ACCUMULATION / COUNTER:\n"; + std::cout << "\taccumulation_counter = " << std::dec << smu.accumulation_counter << "\n"; + std::cout << "\tprochot_residency_acc = " << std::dec << smu.prochot_residency_acc << "\n"; + std::cout << "\tppt_residency_acc = " << std::dec << smu.ppt_residency_acc << "\n"; + std::cout << "\tsocket_thm_residency_acc = " << std::dec << smu.socket_thm_residency_acc + << "\n"; + std::cout << "\tvr_thm_residency_acc = " << std::dec << smu.vr_thm_residency_acc + << "\n"; + std::cout << "\thbm_thm_residency_acc = " << std::dec << smu.hbm_thm_residency_acc << "\n"; + + // Number of current partitions + std::cout << "\tnum_partition = " << std::dec << smu.num_partition << "\n"; + + // PCIE other end recovery counter + std::cout << "\tpcie_lc_perf_other_end_recovery = " + << std::dec << smu.pcie_lc_perf_other_end_recovery << "\n"; + + idx = 0; + auto 
idy = 0; + std::cout << "\txcp_stats.gfx_busy_inst: " << "\n"; + for (auto& row : smu.xcp_stats) { + std::cout << "\t XCP [" << idx << "] : ["; + for (auto& col : row.gfx_busy_inst) { + if ((idy + 1) != static_cast(std::size(row.gfx_busy_inst))) { + std::cout << col << ", "; + } else { + std::cout << col; + } + idy++; + } + std::cout << "]\n"; + idy = 0; + idx++; + } + + idx = 0; + idy = 0; + std::cout << "\txcp_stats.vcn_busy: " << "\n"; + for (auto& row : smu.xcp_stats) { + std::cout << "\t XCP [" << idx << "] : ["; + for (auto& col : row.vcn_busy) { + if ((idy + 1) != static_cast(std::size(row.vcn_busy))) { + std::cout << col << ", "; + } else { + std::cout << col; + } + idy++; + } + std::cout << "]\n"; + idy = 0; + idx++; + } + + idx = 0; + idy = 0; + std::cout << "\txcp_stats.jpeg_busy: " << "\n"; + for (auto& row : smu.xcp_stats) { + std::cout << "\t XCP [" << idx << "] : ["; + for (auto& col : row.jpeg_busy) { + if ((idy + 1) != static_cast(std::size(row.jpeg_busy))) { + std::cout << col << ", "; + } else { + std::cout << col; + } + idy++; + } + std::cout << "]\n"; + idy = 0; + idx++; + } + + idx = 0; + idy = 0; + std::cout << "\txcp_stats.gfx_busy_acc: " << "\n"; + for (auto& row : smu.xcp_stats) { + std::cout << "\t XCP [" << idx << "] : ["; + for (auto& col : row.gfx_busy_acc) { + if ((idy + 1) != static_cast(std::size(row.gfx_busy_acc))) { + std::cout << col << ", "; + } else { + std::cout << col; + } + idy++; + } + std::cout << "]\n"; + idy = 0; + idx++; + } + + idx = 0; + idy = 0; + std::cout << "\txcp_stats.gfx_below_host_limit_acc: " << "\n"; + for (auto& row : smu.xcp_stats) { + std::cout << "\t XCP [" << idx << "] : ["; + for (auto& col : row.gfx_below_host_limit_acc) { + if ((idy + 1) != static_cast(std::size(row.gfx_below_host_limit_acc))) { + std::cout << col << ", "; + } else { + std::cout << col; + } + idy++; + } + std::cout << "]\n"; + idy = 0; + idx++; + } + + /*New scp stats v1.8*/ + idx = 0; + idy = 0; + std::cout << 
"\txcp_stats.gfx_below_host_limit_ppt_acc: " << "\n"; + for (auto& row : smu.xcp_stats) { + std::cout << "\t XCP [" << idx << "] : ["; + for (auto& col : row.gfx_below_host_limit_ppt_acc) { + if ((idy + 1) != static_cast( + std::size(row.gfx_below_host_limit_ppt_acc))) { + std::cout << col << ", "; + } else { + std::cout << col; + } + idy++; + } + std::cout << "]\n"; + idy = 0; + idx++; + } + + idx = 0; + idy = 0; + std::cout << "\txcp_stats.gfx_below_host_limit_thm_acc: " << "\n"; + for (auto& row : smu.xcp_stats) { + std::cout << "\t XCP [" << idx << "] : ["; + for (auto& col : row.gfx_below_host_limit_thm_acc) { + if ((idy + 1) != static_cast( + std::size(row.gfx_below_host_limit_thm_acc))) { + std::cout << col << ", "; + } else { + std::cout << col; + } + idy++; + } + std::cout << "]\n"; + idy = 0; + idx++; + } + + idx = 0; + idy = 0; + std::cout << "\txcp_stats.gfx_low_utilization_acc: " << "\n"; + for (auto& row : smu.xcp_stats) { + std::cout << "\t XCP [" << idx << "] : ["; + for (auto& col : row.gfx_low_utilization_acc) { + if ((idy + 1) != static_cast(std::size(row.gfx_low_utilization_acc))) { + std::cout << col << ", "; + } else { + std::cout << col; + } + idy++; + } + std::cout << "]\n"; + idy = 0; + idx++; + } + + idx = 0; + idy = 0; + std::cout << "\txcp_stats.gfx_below_host_limit_total_acc: " << "\n"; + for (auto& row : smu.xcp_stats) { + std::cout << "\t XCP [" << idx << "] : ["; + for (auto& col : row.gfx_below_host_limit_total_acc) { + if ((idy + 1) != static_cast( + std::size(row.gfx_below_host_limit_total_acc))) { + std::cout << col << ", "; + } else { + std::cout << col; + } + idy++; + } + std::cout << "]\n"; + idy = 0; + idx++; + } + + std::cout << "\n\n"; + std::cout << "\t ** -> Checking metrics with constant changes ** " << "\n"; + constexpr uint16_t kMAX_ITER_TEST = 10; + amdsmi_gpu_metrics_t gpu_metrics_check = {}; + for (auto idx = uint16_t(1); idx <= kMAX_ITER_TEST; ++idx) { + amdsmi_get_gpu_metrics_info(processor_handles[device_index], 
+ &gpu_metrics_check); + std::cout << "\t\t -> firmware_timestamp [" << idx << "/" + << kMAX_ITER_TEST << "]: " + << gpu_metrics_check.firmware_timestamp << "\n"; + } + + std::cout << "\n"; + for (auto idx = uint16_t(1); idx <= kMAX_ITER_TEST; ++idx) { + amdsmi_get_gpu_metrics_info(processor_handles[device_index], + &gpu_metrics_check); + std::cout << "\t\t -> system_clock_counter [" << idx << "/" + << kMAX_ITER_TEST << "]: " + << gpu_metrics_check.system_clock_counter << "\n"; + } + + std::cout << "\n"; + std::cout << " ** Note: Values MAX'ed out " + << "(UINTX MAX are unsupported for the version in question) ** " + << "\n\n"; + } // END GPU METRICS OUTPUTS // Get nearest GPUs const char *topology_link_type_str[] = { @@ -1405,18 +1836,18 @@ int main() { "AMDSMI_LINK_TYPE_UNKNOWN", }; printf("\tOutput of amdsmi_get_link_topology_nearest:\n"); - for (uint32_t topo_link_type = AMDSMI_LINK_TYPE_INTERNAL; topo_link_type <= AMDSMI_LINK_TYPE_UNKNOWN; topo_link_type++) { + for (uint32_t topo_link_type = AMDSMI_LINK_TYPE_INTERNAL; + topo_link_type <= AMDSMI_LINK_TYPE_UNKNOWN; topo_link_type++) { auto topology_nearest_info = amdsmi_topology_nearest_t(); ret = amdsmi_get_link_topology_nearest(processor_handles[device_index], - static_cast(topo_link_type), - nullptr); + static_cast(topo_link_type), nullptr); if (ret != AMDSMI_STATUS_INVAL) { CHK_AMDSMI_RET(ret); } ret = amdsmi_get_link_topology_nearest(processor_handles[device_index], - static_cast(topo_link_type), - &topology_nearest_info); + static_cast(topo_link_type), + &topology_nearest_info); if (ret != AMDSMI_STATUS_INVAL) { CHK_AMDSMI_RET(ret); } @@ -1426,16 +1857,19 @@ int main() { for (uint32_t k = 0; k < topology_nearest_info.count; k++) { amdsmi_bdf_t bdf = {}; ret = amdsmi_get_gpu_device_bdf(topology_nearest_info.processor_list[k], &bdf); - if (ret != AMDSMI_STATUS_INVAL) { - CHK_AMDSMI_RET(ret); + PRINT_AMDSMI_RET(ret) + if (ret == AMDSMI_STATUS_SUCCESS) { + printf("\t\tGPU BDF %04" PRIx64 ":%02" PRIx32 ":%02" 
PRIx32 ".%" PRIu32 "\n", + static_cast(bdf.domain_number), + static_cast(bdf.bus_number), + static_cast(bdf.device_number), + static_cast(bdf.function_number)); + } else { + printf("\t\tGPU BDF not available\n"); } - printf("\t\tGPU BDF %04" PRIx64 ":%02" PRIx32 ":%02" PRIx32 ".%" PRIu32 "\n", - static_cast(bdf.domain_number), - static_cast(bdf.bus_number), - static_cast(bdf.device_number), - static_cast(bdf.function_number)); } } + gpu_number++; } } diff --git a/projects/amdsmi/include/amd_smi/amdsmi.h b/projects/amdsmi/include/amd_smi/amdsmi.h index 6a232afbff..963a979955 100644 --- a/projects/amdsmi/include/amd_smi/amdsmi.h +++ b/projects/amdsmi/include/amd_smi/amdsmi.h @@ -845,7 +845,7 @@ typedef struct { uint64_t device_id; //!< The device ID of a GPU uint32_t rev_id; //!< The revision ID of a GPU char asic_serial[AMDSMI_MAX_STRING_LENGTH]; - uint32_t oam_id; //!< 0xFFFF if not supported + uint32_t oam_id; //!< 0xFFFFFFFF if not supported uint32_t num_of_compute_units; //!< 0xFFFFFFFF if not supported uint64_t target_graphics_version; //!< 0xFFFFFFFFFFFFFFFF if not supported uint32_t subsystem_id; //!> The subsystem ID diff --git a/projects/amdsmi/include/amd_smi/impl/amd_smi_utils.h b/projects/amdsmi/include/amd_smi/impl/amd_smi_utils.h index b9443ea50f..1014237a44 100644 --- a/projects/amdsmi/include/amd_smi/impl/amd_smi_utils.h +++ b/projects/amdsmi/include/amd_smi/impl/amd_smi_utils.h @@ -29,6 +29,7 @@ #include #include #include +#include #include "amd_smi/amdsmi.h" #include "amd_smi/impl/amd_smi_gpu_device.h" @@ -58,6 +59,24 @@ std::string smi_split_string(std::string str, char delim); std::string smi_amdgpu_get_status_string(amdsmi_status_t ret, bool fullStatus); amdsmi_status_t smi_clear_char_and_reinitialize(char buffer[], uint32_t len, std::string newString); + +/** + * @brief Opens a file descriptor for the specified path with RAII semantics and caching. + * + * This function attempts to open a file descriptor (FD) for the given file path and flags. 
+ * It maintains a cache of weak pointers to previously opened FDs, allowing for reuse of + * file descriptors if they are still valid. If a valid FD for the path exists in the cache, + * it is reused; otherwise, a new FD is opened. The returned FD is managed by a std::shared_ptr + * with a custom deleter that ensures the FD is properly closed when no longer in use. + * + * Thread safety is ensured via a static mutex. + * + * @param path The file system path to open. + * @param flags Flags to use when opening the file (as per open(2)). + * @return std::shared_ptr Shared pointer managing the file descriptor, or nullptr on failure. + */ +std::shared_ptr amdsmi_RAII_FD_handler(const std::string& path, int flags); + /** * @brief Wait for user input, a debugging function to pause the program * diff --git a/projects/amdsmi/py-interface/amdsmi_interface.py b/projects/amdsmi/py-interface/amdsmi_interface.py index d4787f46ff..4db768897e 100644 --- a/projects/amdsmi/py-interface/amdsmi_interface.py +++ b/projects/amdsmi/py-interface/amdsmi_interface.py @@ -1879,18 +1879,24 @@ def amdsmi_get_gpu_asic_info( market_name = _pad_hex_value(asic_info_struct.market_name.decode("utf-8"), 4) target_graphics_version = hex(asic_info_struct.target_graphics_version)[2:] + subsystem_id = _validate_if_max_uint(asic_info_struct.subsystem_id, MaxUIntegerTypes.UINT32_T) + subvendor_id = _validate_if_max_uint(asic_info_struct.subvendor_id, MaxUIntegerTypes.UINT32_T) + if subsystem_id != "N/A": # compare by value; identity ('is not') against a str literal is unreliable + subsystem_id = _pad_hex_value(hex(subsystem_id), 4) + if subvendor_id != "N/A": + subvendor_id = _pad_hex_value(hex(subvendor_id), 4) asic_info = { "market_name": market_name, "vendor_id": asic_info_struct.vendor_id, "vendor_name": asic_info_struct.vendor_name.decode("utf-8"), - "subvendor_id": asic_info_struct.subvendor_id, + "subvendor_id": subvendor_id, "device_id": asic_info_struct.device_id, "rev_id": _pad_hex_value(hex(asic_info_struct.rev_id), 2), "asic_serial":
asic_info_struct.asic_serial.decode("utf-8"), - "oam_id": asic_info_struct.oam_id, - "num_compute_units": asic_info_struct.num_of_compute_units, + "oam_id": _validate_if_max_uint(asic_info_struct.oam_id, MaxUIntegerTypes.UINT32_T), + "num_compute_units": _validate_if_max_uint(asic_info_struct.num_of_compute_units, MaxUIntegerTypes.UINT32_T), "target_graphics_version": "gfx" + target_graphics_version, - "subsystem_id": asic_info_struct.subsystem_id + "subsystem_id": subsystem_id } string_values = ["market_name", "vendor_name"] @@ -1898,7 +1904,7 @@ def amdsmi_get_gpu_asic_info( if not asic_info[value]: asic_info[value] = "N/A" - hex_values = ["vendor_id", "subvendor_id", "device_id", "subsystem_id"] + hex_values = ["vendor_id", "device_id"] for value in hex_values: if asic_info[value]: asic_info[value] = hex(asic_info[value]) @@ -1913,14 +1919,6 @@ def amdsmi_get_gpu_asic_info( else: asic_info["asic_serial"] = "N/A" - # Check for max value as a sign for not applicable - if asic_info["oam_id"] == 0xFFFF: # uint 16 max - asic_info["oam_id"] = "N/A" - - # Check for max value as a sign for not applicable - if asic_info["num_compute_units"] == 0xFFFFFFFF: # uint 32 max - asic_info["num_compute_units"] = "N/A" - # Remove commas from vendor name for clean output asic_info["vendor_name"] = asic_info["vendor_name"].replace(',', '') @@ -2834,9 +2832,9 @@ def amdsmi_get_fw_info( 'fw_name': fw_name, 'fw_version': fw_version_string.upper(), }) - return { - 'fw_list': firmwares - } + return_dict = {'fw_list': firmwares} + # logging.debug("amdsmi_interface.py | amdsmi_get_fw_info | return_dictionary = \n" + str(json.dumps(return_dict, indent=4))) + return return_dict def amdsmi_get_gpu_vram_usage( @@ -3314,6 +3312,11 @@ def amdsmi_get_gpu_memory_partition_config(processor_handle: amdsmi_wrapper.amds mem_caps_list.append("NPS4") if config.partition_caps.nps_flags.nps8_cap == 1: mem_caps_list.append("NPS8") + if (config.partition_caps.nps_flags.nps1_cap == 0 and + 
config.partition_caps.nps_flags.nps2_cap == 0 and + config.partition_caps.nps_flags.nps4_cap == 0 and + config.partition_caps.nps_flags.nps8_cap == 0): + mem_caps_list.append("N/A") return_dict = { "partition_caps": mem_caps_list, @@ -3421,6 +3424,11 @@ def amdsmi_get_gpu_accelerator_partition_profile( mem_caps_list.append("NPS4") if profile.memory_caps.nps_flags.nps8_cap == 1: mem_caps_list.append("NPS8") + if (profile.memory_caps.nps_flags.nps1_cap == 0 and + profile.memory_caps.nps_flags.nps2_cap == 0 and + profile.memory_caps.nps_flags.nps4_cap == 0 and + profile.memory_caps.nps_flags.nps8_cap == 0): + mem_caps_list.append("N/A") partition_profile_dict = { "profile_type" : profile_type_ret, "num_partitions" : profile.num_partitions, @@ -3473,6 +3481,11 @@ def amdsmi_get_gpu_accelerator_partition_profile_config(processor_handle: amdsmi mem_caps_list.append("NPS4") if profile.memory_caps.nps_flags.nps8_cap == 1: mem_caps_list.append("NPS8") + if (profile.memory_caps.nps_flags.nps1_cap == 0 and + profile.memory_caps.nps_flags.nps2_cap == 0 and + profile.memory_caps.nps_flags.nps4_cap == 0 and + profile.memory_caps.nps_flags.nps8_cap == 0): + mem_caps_list.append("N/A") for r in range(config.num_resource_profiles): # logging.debug("\namdsmi_interface.py | amdsmi_get_gpu_accelerator_partition_profile_config | i = " + str(i) + "; r = " + str(r) + "; resource_idx = " + str(resource_idx)) diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi.cc b/projects/amdsmi/rocm_smi/src/rocm_smi.cc index 8eceb66da9..ff60439c05 100644 --- a/projects/amdsmi/rocm_smi/src/rocm_smi.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi.cc @@ -344,10 +344,31 @@ static rsmi_status_t get_dev_mon_value(amd::smi::MonitorTypes type, return amd::smi::ErrnoToRsmiStatus(ret); } + if (val_str.empty()) { + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " << monitorTypesToString.at(type) + << " | Cause: SYSFS 
read was empty" + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_UNEXPECTED_DATA) << " |"; + LOG_INFO(ss); + return RSMI_STATUS_UNEXPECTED_DATA; + } + if (!amd::smi::IsInteger(val_str)) { std::ostringstream ss; - ss << "Expected integer value from monitor, but got \"" << val_str << "\""; - LOG_ERROR(ss); + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " << monitorTypesToString.at(type) + << " | Cause: Expected integer value from monitor, but got "<< val_str + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_UNEXPECTED_DATA) << " |"; + LOG_INFO(ss); return RSMI_STATUS_UNEXPECTED_DATA; } @@ -374,10 +395,31 @@ static rsmi_status_t get_dev_mon_value(amd::smi::MonitorTypes type, return amd::smi::ErrnoToRsmiStatus(ret); } + if (val_str.empty()) { + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " << monitorTypesToString.at(type) + << " | Cause: SYSFS read was empty" + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_UNEXPECTED_DATA) << " |"; + LOG_INFO(ss); + return RSMI_STATUS_UNEXPECTED_DATA; + } + if (!amd::smi::IsInteger(val_str)) { std::ostringstream ss; - ss << "Expected integer value from monitor, but got \"" << val_str << "\""; - LOG_ERROR(ss); + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " << monitorTypesToString.at(type) + << " | Cause: Expected integer value from monitor, but got "<< val_str + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_UNEXPECTED_DATA) << " |"; + LOG_INFO(ss); return RSMI_STATUS_UNEXPECTED_DATA; } @@ -806,12 +848,13 @@ rsmi_topo_numa_affinity_get(uint32_t dv_ind, int32_t *numa_node) { TRY rsmi_status_t ret; - CHK_SUPPORT_NAME_ONLY(numa_node) - DEVICE_MUTEX + if (!numa_node) { + return RSMI_STATUS_INVALID_ARGS; + } std::string str_val; ret = 
get_dev_value_str(amd::smi::kDevNumaNode, dv_ind, &str_val); - if (ret != RSMI_STATUS_SUCCESS){ + if (ret != RSMI_STATUS_SUCCESS) { return ret; } *numa_node = std::stoi(str_val, nullptr); @@ -1060,7 +1103,11 @@ rsmi_dev_subsystem_id_get(uint32_t dv_ind, uint16_t *id) { ss << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ss); CHK_SUPPORT_NAME_ONLY(id) - return get_id(dv_ind, amd::smi::kDevSubSysDevID, id); + auto ret = get_id(dv_ind, amd::smi::kDevSubSysDevID, id); + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", reporting " << amd::smi::getRSMIStatusString(ret, false); + LOG_INFO(ss); + return ret; } rsmi_status_t @@ -1069,6 +1116,9 @@ rsmi_dev_vendor_id_get(uint32_t dv_ind, uint16_t *id) { std::ostringstream ss; ss << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ss); + if (!id) { + return RSMI_STATUS_INVALID_ARGS; + } CHK_SUPPORT_NAME_ONLY(id) int ret_kfd = 0; uint32_t node_id; @@ -1143,8 +1193,11 @@ rsmi_dev_perf_level_get(uint32_t dv_ind, rsmi_dev_perf_level_t *perf) { ss << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ss); - CHK_SUPPORT_NAME_ONLY(perf) DEVICE_MUTEX + if (!perf) { + return RSMI_STATUS_INVALID_ARGS; + } + CHK_SUPPORT_NAME_ONLY(perf) rsmi_status_t ret = get_dev_value_str(amd::smi::kDevPerfLevel, dv_ind, &val_str); @@ -2811,17 +2864,17 @@ rsmi_dev_name_get(uint32_t dv_ind, char *name, size_t len) { std::ostringstream ss; ss << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ss); - CHK_SUPPORT_NAME_ONLY(name) - if (len == 0) { + if (len == 0 || !name) { return RSMI_STATUS_INVALID_ARGS; } + CHK_SUPPORT_NAME_ONLY(name) DEVICE_MUTEX ret = get_dev_name_from_file(dv_ind, name, len); - if (ret || name[0] == '\0' || !isprint(name[0]) ) { + if (ret || name[0] == '\0' || !isprint(name[0])) { ret = get_dev_name_from_id(dv_ind, name, len, NAME_STR_DEVICE); } @@ -3850,6 +3903,9 @@ rsmi_dev_power_cap_get(uint32_t dv_ind, uint32_t sensor_ind, uint64_t *cap) { LOG_TRACE(ss); ++sensor_ind; // 
power sysfs files have 1-based indices + if (!cap) { + return RSMI_STATUS_INVALID_ARGS; + } CHK_SUPPORT_SUBVAR_ONLY(cap, sensor_ind) rsmi_status_t ret; @@ -3870,6 +3926,9 @@ rsmi_dev_power_cap_range_get(uint32_t dv_ind, uint32_t sensor_ind, LOG_TRACE(ss); ++sensor_ind; // power sysfs files have 1-based indices + if (max == nullptr || min == nullptr) { + return RSMI_STATUS_INVALID_ARGS; + } CHK_SUPPORT_SUBVAR_ONLY((min == nullptr || max == nullptr ?nullptr : min), sensor_ind) rsmi_status_t ret; @@ -3993,6 +4052,8 @@ rsmi_dev_memory_total_get(uint32_t dv_ind, rsmi_memory_type_t mem_type, } DEVICE_MUTEX + *total = 0; // Initialize total to 0 + // This is needed to avoid returning garbage value in case of failure ret = get_dev_value_int(mem_type_file, dv_ind, total); // Fallback to KFD reported memory if VRAM total is 0 or sysfs read fails @@ -4070,6 +4131,8 @@ rsmi_dev_memory_usage_get(uint32_t dv_ind, rsmi_memory_type_t mem_type, } DEVICE_MUTEX + *used = 0; // Initialize used to 0 + // This is needed to avoid returning garbage value in case of failure ret = get_dev_value_int(mem_type_file, dv_ind, used); // Fallback to KFD reported memory if no VRAM or sysfs read fails @@ -4652,10 +4715,8 @@ rsmi_dev_unique_id_get(uint32_t dv_ind, uint64_t *unique_id) { ss << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ss); - CHK_SUPPORT_NAME_ONLY(unique_id) - DEVICE_MUTEX - if (unique_id == nullptr) { + if (!unique_id) { return RSMI_STATUS_INVALID_ARGS; } *unique_id = std::numeric_limits::max(); diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc index d9b88f5324..9fa357c694 100644 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc @@ -806,7 +806,7 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) { if (ret != 0 || !reg_file) { ss << __PRETTY_FUNCTION__ << " | Adjusted file path also does not exist - SYSFS file (" - << 
sysfs_path + << sysfs_path << ") for DevInfoInfoType (" << get_type_string(type) << "), returning " << std::to_string(ret); LOG_ERROR(ss); @@ -865,8 +865,8 @@ int Device::readDebugInfoStr(DevInfoTypes type, std::string *retStr) { ret = openDebugFileStream(type, &fs); if (ret != 0) { ss << "Could not read debugInfoStr for DevInfoType (" - << get_type_string(type)<< "), returning " - << std::to_string(ret); + << get_type_string(type) << "), returning " + << std::to_string(ret); LOG_ERROR(ss); return ret; } @@ -879,7 +879,7 @@ int Device::readDebugInfoStr(DevInfoTypes type, std::string *retStr) { fs.close(); ss << "Successfully read debugInfoStr for DevInfoType (" - << get_type_string(type)<< "), retString= " << *retStr; + << get_type_string(type) << "), retString= " << *retStr; LOG_INFO(ss); return 0; @@ -904,8 +904,8 @@ int Device::readDevInfoStr(DevInfoTypes type, std::string *retStr) { fs >> *retStr; fs.close(); ss << __PRETTY_FUNCTION__ - << "Successfully read device info string for DevInfoType (" << - get_type_string(type) << "): " + *retStr + << "Successfully read device info string for DevInfoType (" + << get_type_string(type) << "): " + *retStr << " | " << (fs.is_open() ? " File stream is opened" : " File stream is closed") << " | " << (fs.bad() ? 
"[ERROR] Bad read operation" : @@ -1078,7 +1078,6 @@ const char* Device::get_type_string(DevInfoTypes type) { } return "Unknown"; - } int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size, diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_gpu_metrics.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_gpu_metrics.cc index 997aaf868e..0722f898df 100644 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_gpu_metrics.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_gpu_metrics.cc @@ -31,6 +31,7 @@ #include #include +#include #include #include @@ -4537,8 +4538,24 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t* smu) { dev->set_smi_partition_id(0); } - dev->dev_log_gpu_metrics(ostrstream); + // check if file exists, report not supported if it does not exist + std::string file_name = "/sys/class/drm/card" + + std::to_string(dev->index()) + + "/device/gpu_metrics"; + if (access(file_name.c_str(), F_OK | R_OK) != 0) { + status_code = RSMI_STATUS_NOT_SUPPORTED; + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Returning = " + << getRSMIStatusString(status_code, false) + << " |"; + LOG_ERROR(ss); + return status_code; + } + dev->dev_log_gpu_metrics(ostrstream); const auto [error_code, external_metrics] = dev->dev_copy_internal_to_external_metrics(); if (error_code != rsmi_status_t::RSMI_STATUS_SUCCESS) { ss << __PRETTY_FUNCTION__ diff --git a/projects/amdsmi/src/amd_smi/amd_smi.cc b/projects/amdsmi/src/amd_smi/amd_smi.cc index a49c66b242..9e3acda49a 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi.cc @@ -898,24 +898,28 @@ amdsmi_status_t amdsmi_get_gpu_vram_usage(amdsmi_processor_handle processor_hand SMIGPUDEVICE_MUTEX(gpu_device->get_mutex()); std::string render_name = gpu_device->get_gpu_path(); - int drm_fd = -1; - std::string path = "/dev/dri/" + render_name; - if (render_name != "") { - drm_fd = open(path.c_str(), O_RDWR | O_CLOEXEC); - } else 
{ - close(drm_fd); + if (render_name.empty()) { return AMDSMI_STATUS_NOT_SUPPORTED; } + + std::string path = "/dev/dri/" + render_name; + auto drm_fd = amdsmi_RAII_FD_handler(path.c_str(), O_RDWR | O_CLOEXEC); ss << __PRETTY_FUNCTION__ << " | open(" << path << ") returned: " << strerror(errno) << "\n" - << " | drm_fd: " << std::dec << drm_fd << "\n" + << " | drm_fd: " << (drm_fd == nullptr ? "nullptr" : std::to_string(*drm_fd)) << "\n" << " | render_name: " << render_name << "\n"; LOG_INFO(ss); + if (!drm_fd) { + ss << __PRETTY_FUNCTION__ + << " | Failed to open " << path << ": " << strerror(errno) + << "; Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_FILE_ERROR, false); + LOG_ERROR(ss); + return AMDSMI_STATUS_FILE_ERROR; + } amd::smi::AMDSmiLibraryLoader libdrm; amdsmi_status_t status = libdrm.load("libdrm.so.2"); if (status != AMDSMI_STATUS_SUCCESS) { - close(drm_fd); libdrm.unload(); ss << __PRETTY_FUNCTION__ << " | Failed to load libdrm.so.2: " << strerror(errno) @@ -938,7 +942,6 @@ amdsmi_status_t amdsmi_get_gpu_vram_usage(amdsmi_processor_handle processor_hand status = libdrm.load_symbol(reinterpret_cast(&drmCommandWrite), "drmCommandWrite"); if (status != AMDSMI_STATUS_SUCCESS) { - close(drm_fd); libdrm.unload(); ss << __PRETTY_FUNCTION__ << " | Failed to load drmCommandWrite symbol" @@ -950,7 +953,6 @@ amdsmi_status_t amdsmi_get_gpu_vram_usage(amdsmi_processor_handle processor_hand << " | drmCommandWrite symbol loaded successfully"; LOG_INFO(ss); - uint64_t total = 0; r = rsmi_wrapper(rsmi_dev_memory_total_get, processor_handle, 0, RSMI_MEM_TYPE_VRAM, &total); @@ -964,10 +966,9 @@ amdsmi_status_t amdsmi_get_gpu_vram_usage(amdsmi_processor_handle processor_hand request.return_pointer = reinterpret_cast(&vram_used); request.return_size = sizeof(vram_used); request.query = AMDGPU_INFO_VRAM_USAGE; - auto drm_write = drmCommandWrite(drm_fd, DRM_AMDGPU_INFO, &request, - sizeof(struct drm_amdgpu_info)); + auto drm_write = drmCommandWrite(*drm_fd, 
DRM_AMDGPU_INFO, &request, + sizeof(struct drm_amdgpu_info)); if (drm_write != 0) { - close(drm_fd); libdrm.unload(); ss << __PRETTY_FUNCTION__ << " | Issue - drm_write failed, drm_write (AMDGPU_INFO_VRAM_USAGE): " @@ -978,7 +979,6 @@ amdsmi_status_t amdsmi_get_gpu_vram_usage(amdsmi_processor_handle processor_hand } vram_info->vram_used = static_cast(vram_used / (1024 * 1024)); - close(drm_fd); libdrm.unload(); ss << __PRETTY_FUNCTION__ << " | vram_info->vram_total (MB): " << std::dec << vram_info->vram_total << "\n" @@ -1531,6 +1531,19 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i uint16_t subvendor_id = 0; uint16_t device_id = 0; uint16_t subsystem_id = 0; + char temp_market_name[AMDSMI_MAX_STRING_LENGTH] = {0}; + smi_clear_char_and_reinitialize(info->market_name, AMDSMI_MAX_STRING_LENGTH, temp_market_name); + info->market_name[0] = '\0'; + info->vendor_id = std::numeric_limits::max(); + info->vendor_name[0] = '\0'; + info->subvendor_id = std::numeric_limits::max(); + info->device_id = std::numeric_limits::max(); + info->rev_id = std::numeric_limits::max(); + info->asic_serial[0] = '\0'; + info->oam_id = std::numeric_limits::max(); + info->num_of_compute_units = std::numeric_limits::max(); + info->target_graphics_version = std::numeric_limits::max(); + info->subsystem_id = std::numeric_limits::max(); std::ostringstream ss; amd::smi::AMDSmiGPUDevice* gpu_device = nullptr; @@ -1539,80 +1552,6 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i return r; } SMIGPUDEVICE_MUTEX(gpu_device->get_mutex()) - amdsmi_status_t status = smi_amdgpu_get_market_name_from_dev_id(gpu_device, info->market_name); - if (status != AMDSMI_STATUS_SUCCESS) { - rsmi_wrapper(rsmi_dev_brand_get, processor_handle, 0, - info->market_name, AMDSMI_MAX_STRING_LENGTH); - } - - std::string render_name = gpu_device->get_gpu_path(); - int drm_fd = -1; - std::string path = "/dev/dri/" + render_name; - if (render_name != "") { - 
drm_fd = open(path.c_str(), O_RDWR | O_CLOEXEC); - } else { - close(drm_fd); - return AMDSMI_STATUS_NOT_SUPPORTED; - } - ss << __PRETTY_FUNCTION__ - << " | open(" << path << ") returned: " << strerror(errno) << "\n" - << " | drm_fd: " << std::dec << drm_fd << "\n" - << " | render_name: " << render_name << "\n"; - LOG_INFO(ss); - - amd::smi::AMDSmiLibraryLoader libdrm; - status = libdrm.load("libdrm.so.2"); - if (status != AMDSMI_STATUS_SUCCESS) { - close(drm_fd); - libdrm.unload(); - ss << __PRETTY_FUNCTION__ - << " | Failed to load libdrm.so.2: " << strerror(errno) - << "; Returning: " << smi_amdgpu_get_status_string(status, false); - LOG_ERROR(ss); - return status; - } - - // extern int drmCommandWrite(int fd, unsigned long drmCommandIndex, - // void *data, unsigned long size); - typedef int (*drmCommandWrite_t)(int fd, unsigned long drmCommandIndex, - void *data, unsigned long size); - drmCommandWrite_t drmCommandWrite = nullptr; - - // load symbol from libdrm - status = libdrm.load_symbol(reinterpret_cast(&drmCommandWrite), - "drmCommandWrite"); - if (status != AMDSMI_STATUS_SUCCESS) { - close(drm_fd); - libdrm.unload(); - ss << __PRETTY_FUNCTION__ - << " | Failed to load drmCommandWrite symbol" - << " | Returning: " << smi_amdgpu_get_status_string(status, false); - LOG_ERROR(ss); - return status; - } - - // Get the device info - memset(&dev_info, 0, sizeof(struct drm_amdgpu_info_device)); - struct drm_amdgpu_info request = {}; - memset(&request, 0, sizeof(request)); - request.return_pointer = reinterpret_cast(&dev_info); - request.return_size = sizeof(struct drm_amdgpu_info_device); - request.query = AMDGPU_INFO_DEV_INFO; - auto drm_write = drmCommandWrite(drm_fd, DRM_AMDGPU_INFO, &request, - sizeof(struct drm_amdgpu_info)); - if (drm_write != 0) { - libdrm.unload(); - close(drm_fd); - ss << __PRETTY_FUNCTION__ - << " | Issue - drm_write failed, drm_write: " << std::dec << drm_write << "\n" - << "; Returning: " << 
smi_amdgpu_get_status_string(AMDSMI_STATUS_DRM_ERROR, false); - LOG_ERROR(ss); - return AMDSMI_STATUS_DRM_ERROR; - } - // TODO(cpoag): check if this is correct, might be able to go through KGD/KFD - info->rev_id = static_cast(dev_info.pci_rev); - libdrm.unload(); - close(drm_fd); /** * For other sysfs related information, get from rocm-smi @@ -1622,7 +1561,8 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i std::string max_uint64_str = "ffffffffffffffff"; smi_clear_char_and_reinitialize(info->asic_serial, AMDSMI_MAX_STRING_LENGTH, max_uint64_str); uint64_t device_uuid = 0; - status = rsmi_wrapper(rsmi_dev_unique_id_get, processor_handle, 0, &device_uuid); + amdsmi_status_t status = rsmi_wrapper(rsmi_dev_unique_id_get, processor_handle, 0, + &device_uuid); if (status == AMDSMI_STATUS_SUCCESS) { ss.clear(); ss << std::hex << std::setw(16) << std::setfill('0') << device_uuid; @@ -1647,31 +1587,32 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i &subsystem_id); if (status == AMDSMI_STATUS_SUCCESS) info->subsystem_id = subsystem_id; + char temp_vendor_name[AMDSMI_MAX_STRING_LENGTH] = {0}; status = rsmi_wrapper(rsmi_dev_pcie_vendor_name_get, processor_handle, 0, - info->vendor_name, AMDSMI_MAX_STRING_LENGTH); + temp_vendor_name, AMDSMI_MAX_STRING_LENGTH); + if (status == AMDSMI_STATUS_SUCCESS) { + smi_clear_char_and_reinitialize(info->vendor_name, AMDSMI_MAX_STRING_LENGTH, + temp_vendor_name); + } - // default to 0xffff as not supported - info->oam_id = std::numeric_limits::max(); uint16_t tmp_oam_id = 0; status = rsmi_wrapper(rsmi_dev_xgmi_physical_id_get, processor_handle, 0, &(tmp_oam_id)); - info->oam_id = tmp_oam_id; + if (status == AMDSMI_STATUS_SUCCESS) { + info->oam_id = tmp_oam_id; + } - // default to 0xffffffff as not supported - info->num_of_compute_units = std::numeric_limits::max(); auto tmp_num_of_compute_units = uint32_t(0); status = rsmi_wrapper(amd::smi::rsmi_dev_number_of_computes_get, 
processor_handle, 0, &(tmp_num_of_compute_units)); - if (status == amdsmi_status_t::AMDSMI_STATUS_SUCCESS) { + if (status == AMDSMI_STATUS_SUCCESS) { info->num_of_compute_units = tmp_num_of_compute_units; } - // default to 0xffffffffffffffff as not supported - info->target_graphics_version = std::numeric_limits::max(); auto tmp_target_gfx_version = uint64_t(0); status = rsmi_wrapper(rsmi_dev_target_graphics_version_get, processor_handle, 0, &(tmp_target_gfx_version)); - if (status == amdsmi_status_t::AMDSMI_STATUS_SUCCESS) { + if (status == AMDSMI_STATUS_SUCCESS) { info->target_graphics_version = tmp_target_gfx_version; } @@ -1685,16 +1626,12 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i LOG_INFO(ss); if (status == AMDSMI_STATUS_SUCCESS) { info->device_id = static_cast(device_id); - } else { - info->device_id = std::numeric_limits::max(); } info->rev_id = dev_info.pci_rev; status = rsmi_wrapper(rsmi_dev_vendor_id_get, processor_handle, 0, &vendor_id); if (status == AMDSMI_STATUS_SUCCESS) { info->vendor_id = vendor_id; - } else { - info->vendor_id = std::numeric_limits::max(); } // If vendor name is empty and the vendor id is 0x1002, set vendor name to AMD vendor string @@ -1703,6 +1640,95 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i smi_clear_char_and_reinitialize(info->vendor_name, AMDSMI_MAX_STRING_LENGTH, amd_name); } + status = smi_amdgpu_get_market_name_from_dev_id(gpu_device, info->market_name); + if (status != AMDSMI_STATUS_SUCCESS) { + status = rsmi_wrapper(rsmi_dev_brand_get, processor_handle, 0, + temp_market_name, AMDSMI_MAX_STRING_LENGTH); + if (status == AMDSMI_STATUS_SUCCESS) { + ss << __PRETTY_FUNCTION__ + << " | rsmi_dev_brand_get() returned: " + << smi_amdgpu_get_status_string(status, false) << "\n" + << " ; temp_market_name: " << temp_market_name << "\n"; + LOG_INFO(ss); + smi_clear_char_and_reinitialize(info->market_name, AMDSMI_MAX_STRING_LENGTH, + temp_market_name); + 
} else { + ss << __PRETTY_FUNCTION__ + << " | rsmi_dev_brand_get() failed: " + << smi_amdgpu_get_status_string(status, false) << "\n"; + LOG_INFO(ss); + } + } + + std::string render_name = gpu_device->get_gpu_path(); + if (render_name.empty()) { + return AMDSMI_STATUS_NOT_SUPPORTED; + } + std::string path = "/dev/dri/" + render_name; + auto drm_fd = amdsmi_RAII_FD_handler(path.c_str(), O_RDWR | O_CLOEXEC); + ss << __PRETTY_FUNCTION__ + << " | open(" << path << ") returned: " << strerror(errno) << "\n" + << " | drm_fd: " << (drm_fd == nullptr ? "nullptr" : std::to_string(*drm_fd)) << "\n" + << " | render_name: " << render_name << "\n"; + LOG_INFO(ss); + if (!drm_fd) { + ss << __PRETTY_FUNCTION__ + << " | Failed to open " << path << ": " << strerror(errno) + << "; Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_FILE_ERROR, false); + LOG_ERROR(ss); + return AMDSMI_STATUS_FILE_ERROR; + } + + amd::smi::AMDSmiLibraryLoader libdrm; + status = libdrm.load("libdrm.so.2"); + if (status != AMDSMI_STATUS_SUCCESS) { + libdrm.unload(); + ss << __PRETTY_FUNCTION__ + << " | Failed to load libdrm.so.2: " << strerror(errno) + << "; Returning: " << smi_amdgpu_get_status_string(status, false); + LOG_ERROR(ss); + return status; + } + + // extern int drmCommandWrite(int fd, unsigned long drmCommandIndex, + // void *data, unsigned long size); + typedef int (*drmCommandWrite_t)(int fd, unsigned long drmCommandIndex, + void *data, unsigned long size); + drmCommandWrite_t drmCommandWrite = nullptr; + + // load symbol from libdrm + status = libdrm.load_symbol(reinterpret_cast(&drmCommandWrite), + "drmCommandWrite"); + if (status != AMDSMI_STATUS_SUCCESS) { + libdrm.unload(); + ss << __PRETTY_FUNCTION__ + << " | Failed to load drmCommandWrite symbol" + << " | Returning: " << smi_amdgpu_get_status_string(status, false); + LOG_ERROR(ss); + return status; + } + + // Get the device info + memset(&dev_info, 0, sizeof(struct drm_amdgpu_info_device)); + struct drm_amdgpu_info request = 
{}; + memset(&request, 0, sizeof(request)); + request.return_pointer = reinterpret_cast(&dev_info); + request.return_size = sizeof(struct drm_amdgpu_info_device); + request.query = AMDGPU_INFO_DEV_INFO; + auto drm_write = drmCommandWrite(*drm_fd, DRM_AMDGPU_INFO, &request, + sizeof(struct drm_amdgpu_info)); + if (drm_write != 0) { + libdrm.unload(); + ss << __PRETTY_FUNCTION__ + << " | Issue - drm_write failed, drm_write: " << std::dec << drm_write << "\n" + << "; Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_DRM_ERROR, false); + LOG_ERROR(ss); + return AMDSMI_STATUS_DRM_ERROR; + } + // TODO(cpoag): check if this is correct, might be able to go through KGD/KFD + info->rev_id = static_cast(dev_info.pci_rev); + libdrm.unload(); + ss << __PRETTY_FUNCTION__ << " | info->market_name: " << info->market_name << "\n" << " | info->vendor_id (dec): " << std::dec << info->vendor_id << "\n" @@ -1862,24 +1888,28 @@ amdsmi_status_t amdsmi_get_gpu_vram_info( SMIGPUDEVICE_MUTEX(gpu_device->get_mutex()); std::string render_name = gpu_device->get_gpu_path(); - int drm_fd = -1; std::string path = "/dev/dri/" + render_name; - if (render_name != "") { - drm_fd = open(path.c_str(), O_RDWR | O_CLOEXEC); - } else { - close(drm_fd); + if (render_name.empty()) { return AMDSMI_STATUS_NOT_SUPPORTED; } + + auto drm_fd = amdsmi_RAII_FD_handler(path.c_str(), O_RDWR | O_CLOEXEC); ss << __PRETTY_FUNCTION__ << " | open(" << path << ") returned: " << strerror(errno) << "\n" - << " | drm_fd: " << std::dec << drm_fd << "\n" + << " | drm_fd: " << (drm_fd == nullptr ? 
"nullptr" : std::to_string(*drm_fd)) << "\n" << " | render_name: " << render_name << "\n"; LOG_INFO(ss); + if (!drm_fd) { + ss << __PRETTY_FUNCTION__ + << " | Failed to open " << path << ": " << strerror(errno) + << "; Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_FILE_ERROR, false); + LOG_ERROR(ss); + return AMDSMI_STATUS_FILE_ERROR; + } amd::smi::AMDSmiLibraryLoader libdrm; amdsmi_status_t status = libdrm.load("libdrm.so.2"); if (status != AMDSMI_STATUS_SUCCESS) { - close(drm_fd); libdrm.unload(); ss << __PRETTY_FUNCTION__ << " | Failed to load libdrm.so.2: " << strerror(errno) @@ -1902,7 +1932,6 @@ amdsmi_status_t amdsmi_get_gpu_vram_info( status = libdrm.load_symbol(reinterpret_cast(&drmCommandWrite), "drmCommandWrite"); if (status != AMDSMI_STATUS_SUCCESS) { - close(drm_fd); libdrm.unload(); ss << __PRETTY_FUNCTION__ << " | Failed to load drmCommandWrite symbol" @@ -1921,10 +1950,9 @@ amdsmi_status_t amdsmi_get_gpu_vram_info( request.return_pointer = reinterpret_cast(&dev_info); request.return_size = sizeof(struct drm_amdgpu_info_device); request.query = AMDGPU_INFO_DEV_INFO; - auto drm_write = drmCommandWrite(drm_fd, DRM_AMDGPU_INFO, &request, + auto drm_write = drmCommandWrite(*drm_fd, DRM_AMDGPU_INFO, &request, sizeof(struct drm_amdgpu_info)); if (drm_write != 0) { - close(drm_fd); libdrm.unload(); ss << __PRETTY_FUNCTION__ << " | Issue - drm_write failed, drm_write: " << std::dec << drm_write << "\n" @@ -1935,8 +1963,9 @@ amdsmi_status_t amdsmi_get_gpu_vram_info( info->vram_type = amd::smi::vram_type_value(dev_info.vram_type); info->vram_bit_width = dev_info.vram_bit_width; - close(drm_fd); libdrm.unload(); + // if vram type is greater than the max enum set it to unknown + if (info->vram_type > AMDSMI_VRAM_TYPE__MAX) info->vram_type = AMDSMI_VRAM_TYPE_UNKNOWN; // set info->vram_max_bandwidth to gpu_metrics vram_max_bandwidth if it is not set amdsmi_gpu_metrics_t metric_info = {}; @@ -1945,10 +1974,6 @@ amdsmi_status_t amdsmi_get_gpu_vram_info( 
info->vram_max_bandwidth = metric_info.vram_max_bandwidth; } - // if vram type is greater than the max enum set it to unknown - if (info->vram_type > AMDSMI_VRAM_TYPE__MAX) - info->vram_type = AMDSMI_VRAM_TYPE_UNKNOWN; - // map the vendor name to enum char brand[256] = {'\0'}; r = rsmi_wrapper(rsmi_dev_vram_vendor_get, processor_handle, 0, brand, 255); @@ -2949,8 +2974,8 @@ amdsmi_get_gpu_accelerator_partition_profile(amdsmi_processor_handle processor_h constexpr uint32_t kCurrentPartitionSize = 5; char current_partition[kCurrentPartitionSize]; std::string current_partition_str = "N/A"; - status = amdsmi_get_gpu_compute_partition(processor_handle, current_partition, - kCurrentPartitionSize); + amdsmi_status_t compute_status = amdsmi_get_gpu_compute_partition(processor_handle, + current_partition, kCurrentPartitionSize); ss << __PRETTY_FUNCTION__ << " | amdsmi_get_gpu_compute_partition() current_partition = |" << current_partition << "|"; LOG_DEBUG(ss); @@ -2965,7 +2990,8 @@ amdsmi_get_gpu_accelerator_partition_profile(amdsmi_processor_handle processor_h if (accelerator_capabilities.find(current_partition_str) != std::string::npos) { auto it = std::find(tokens.begin(), tokens.end(), current_partition_str); if (it != tokens.end()) { - profile->profile_index = static_cast(std::distance(tokens.begin(), it)); + profile->profile_index = static_cast(std::distance( + tokens.begin(), it)); } } } @@ -3056,7 +3082,7 @@ amdsmi_get_gpu_accelerator_partition_profile(amdsmi_processor_handle processor_h profile->memory_caps = flags; ss << __PRETTY_FUNCTION__ - << " | END returning " << smi_amdgpu_get_status_string(status, false) << "\n" + << " | END returning " << smi_amdgpu_get_status_string(compute_status, false) << "\n" << " | accelerator_capabilities: " << accelerator_capabilities << "\n" << " | current_partition_str: " << current_partition_str << "\n" << " | std::vector tokens: " << ss_1.str() << "\n" @@ -3072,7 +3098,9 @@ 
amdsmi_get_gpu_accelerator_partition_profile(amdsmi_processor_handle processor_h << " | partition_id: " << ss_2.str(); LOG_INFO(ss); - return status; + return compute_status; // only return status from amdsmi_get_gpu_compute_partition + // as this is the only function that can fail + // if the device does not support partitions } amdsmi_status_t @@ -3373,7 +3401,9 @@ amdsmi_status_t amdsmi_status_t amdsmi_get_gpu_perf_level(amdsmi_processor_handle processor_handle, amdsmi_dev_perf_level_t *perf) { AMDSMI_CHECK_INIT(); - // nullptr api supported + if (!perf) { + return AMDSMI_STATUS_INVAL; + } return rsmi_wrapper(rsmi_dev_perf_level_get, processor_handle, 0, reinterpret_cast(perf)); @@ -3681,6 +3711,9 @@ amdsmi_status_t amdsmi_get_gpu_bdf_id( amdsmi_status_t amdsmi_get_gpu_topo_numa_affinity( amdsmi_processor_handle processor_handle, int32_t *numa_node) { + if (!numa_node) { + return AMDSMI_STATUS_INVAL; + } return rsmi_wrapper(rsmi_topo_numa_affinity_get, processor_handle, 0, numa_node); } @@ -3716,24 +3749,28 @@ amdsmi_get_gpu_vbios_info(amdsmi_processor_handle processor_handle, amdsmi_vbios SMIGPUDEVICE_MUTEX(gpu_device->get_mutex()); std::string render_name = gpu_device->get_gpu_path(); - int drm_fd = -1; std::string path = "/dev/dri/" + render_name; - if (render_name != "") { - drm_fd = open(path.c_str(), O_RDWR | O_CLOEXEC); - } else { - close(drm_fd); + if (render_name.empty()) { return AMDSMI_STATUS_NOT_SUPPORTED; } + + auto drm_fd = amdsmi_RAII_FD_handler(path.c_str(), O_RDWR | O_CLOEXEC); ss << __PRETTY_FUNCTION__ << " | open(" << path << ") returned: " << strerror(errno) << "\n" - << " | drm_fd: " << std::dec << drm_fd << "\n" + << " | drm_fd: " << (drm_fd == nullptr ? 
"nullptr" : std::to_string(*drm_fd)) << "\n" << " | render_name: " << render_name << "\n"; LOG_INFO(ss); + if (!drm_fd) { + ss << __PRETTY_FUNCTION__ + << " | Failed to open " << path << ": " << strerror(errno) + << "; Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_FILE_ERROR, false); + LOG_ERROR(ss); + return AMDSMI_STATUS_FILE_ERROR; + } amd::smi::AMDSmiLibraryLoader libdrm; status = libdrm.load("libdrm.so.2"); if (status != AMDSMI_STATUS_SUCCESS) { - close(drm_fd); libdrm.unload(); ss << __PRETTY_FUNCTION__ << " | Failed to load libdrm.so.2: " << strerror(errno) @@ -3757,7 +3794,6 @@ amdsmi_get_gpu_vbios_info(amdsmi_processor_handle processor_handle, amdsmi_vbios "drmCommandWrite"); if (status != AMDSMI_STATUS_SUCCESS) { libdrm.unload(); - close(drm_fd); ss << __PRETTY_FUNCTION__ << " | Failed to load drmCommandWrite symbol" << " | Returning: " << smi_amdgpu_get_status_string(status, false); @@ -3775,7 +3811,7 @@ amdsmi_get_gpu_vbios_info(amdsmi_processor_handle processor_handle, amdsmi_vbios request.return_size = sizeof(drm_amdgpu_info_vbios); request.query = AMDGPU_INFO_VBIOS; request.vbios_info.type = AMDGPU_INFO_VBIOS_INFO; - auto drm_write = drmCommandWrite(drm_fd, DRM_AMDGPU_INFO, &request, + auto drm_write = drmCommandWrite(*drm_fd, DRM_AMDGPU_INFO, &request, sizeof(struct drm_amdgpu_info)); if (drm_write == 0) { @@ -3799,7 +3835,6 @@ amdsmi_get_gpu_vbios_info(amdsmi_processor_handle processor_handle, amdsmi_vbios vbios_version, AMDSMI_MAX_STRING_LENGTH); } } - close(drm_fd); libdrm.unload(); ss << __PRETTY_FUNCTION__ << " | drmCommandWrite returned: " << strerror(errno) << "\n" @@ -4283,19 +4318,22 @@ amdsmi_status_t amdsmi_get_gpu_driver_info(amdsmi_processor_handle processor_han SMIGPUDEVICE_MUTEX(gpu_device->get_mutex()) std::string render_name = gpu_device->get_gpu_path(); - int drm_fd = -1; std::string path = "/dev/dri/" + render_name; - if (render_name != "") { - drm_fd = open(path.c_str(), O_RDWR | O_CLOEXEC); - } else { - 
close(drm_fd); + if (render_name.empty()) { return AMDSMI_STATUS_NOT_SUPPORTED; } + auto drm_fd = amdsmi_RAII_FD_handler(path.c_str(), O_RDWR | O_CLOEXEC); + if (!drm_fd) { + ss << __PRETTY_FUNCTION__ + << " | Failed to open " << path << ": " << strerror(errno) + << "; Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_FILE_ERROR, false); + LOG_ERROR(ss); + return AMDSMI_STATUS_FILE_ERROR; + } amd::smi::AMDSmiLibraryLoader libdrm; status = libdrm.load("libdrm.so.2"); if (status != AMDSMI_STATUS_SUCCESS) { - close(drm_fd); libdrm.unload(); ss << __PRETTY_FUNCTION__ << " | Failed to load libdrm.so.2" @@ -4313,7 +4351,6 @@ amdsmi_status_t amdsmi_get_gpu_driver_info(amdsmi_processor_handle processor_han status = libdrm.load_symbol( reinterpret_cast(&drm_get_version), "drmGetVersion"); if (status != AMDSMI_STATUS_SUCCESS) { - close(drm_fd); libdrm.unload(); ss << __PRETTY_FUNCTION__ << " | Failed to load drmGetVersion symbol" @@ -4324,7 +4361,6 @@ amdsmi_status_t amdsmi_get_gpu_driver_info(amdsmi_processor_handle processor_han status = libdrm.load_symbol( reinterpret_cast(&drm_free_version), "drmFreeVersion"); if (status != AMDSMI_STATUS_SUCCESS) { - close(drm_fd); libdrm.unload(); ss << __PRETTY_FUNCTION__ << " | Failed to load drmFreeVersion symbol" @@ -4335,9 +4371,8 @@ amdsmi_status_t amdsmi_get_gpu_driver_info(amdsmi_processor_handle processor_han // Get the driver date std::string driver_date; - auto version = drm_get_version(drm_fd); + auto version = drm_get_version(*drm_fd); if (version == nullptr) { - close(drm_fd); libdrm.unload(); ss << __PRETTY_FUNCTION__ << " | Failed to get driver version" @@ -4358,7 +4393,6 @@ amdsmi_status_t amdsmi_get_gpu_driver_info(amdsmi_processor_handle processor_han std::string driver_name = version->name; strncpy(info->driver_name, driver_name.c_str(), AMDSMI_MAX_STRING_LENGTH-1); drm_free_version(version); - close(drm_fd); libdrm.unload(); ss << __PRETTY_FUNCTION__ << " | Driver version: " << info->driver_version << "\n" 
@@ -4402,9 +4436,9 @@ amdsmi_status_t amdsmi_get_pcie_info(amdsmi_processor_handle processor_handle, a } else { ss << __PRETTY_FUNCTION__ << " | Failed to open file: " << path_max_link_width - << " | returning AMDSMI_STATUS_API_FAILED"; + << " | returning AMDSMI_STATUS_NOT_SUPPORTED"; LOG_ERROR(ss); - return AMDSMI_STATUS_API_FAILED; + return AMDSMI_STATUS_NOT_SUPPORTED; } info->pcie_static.max_pcie_width = (uint16_t)pcie_width; @@ -4760,19 +4794,23 @@ amdsmi_get_gpu_virtualization_mode(amdsmi_processor_handle processor_handle, SMIGPUDEVICE_MUTEX(gpu_device->get_mutex()) std::string render_name = gpu_device->get_gpu_path(); - int drm_fd = -1; std::string path = "/dev/dri/" + render_name; - if (render_name != "") { - drm_fd = open(path.c_str(), O_RDWR | O_CLOEXEC); - } else { - close(drm_fd); + if (render_name.empty()) { return AMDSMI_STATUS_NOT_SUPPORTED; } + auto drm_fd = amdsmi_RAII_FD_handler(path.c_str(), O_RDWR | O_CLOEXEC); + if (!drm_fd) { + ss << __PRETTY_FUNCTION__ + << " | Failed to open " << path << ": " << strerror(errno) + << "; Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_FILE_ERROR, false); + LOG_ERROR(ss); + return AMDSMI_STATUS_FILE_ERROR; + } + amd::smi::AMDSmiLibraryLoader libdrm; status = libdrm.load("libdrm.so.2"); if (status != AMDSMI_STATUS_SUCCESS) { - close(drm_fd); libdrm.unload(); ss << __PRETTY_FUNCTION__ << " | Failed to load libdrm.so.2" @@ -4790,9 +4828,7 @@ amdsmi_get_gpu_virtualization_mode(amdsmi_processor_handle processor_handle, status = libdrm.load_symbol(reinterpret_cast(&drm_get_version), "drmGetVersion"); if (status != AMDSMI_STATUS_SUCCESS) { - drm_get_version = nullptr; libdrm.unload(); - close(drm_fd); ss << __PRETTY_FUNCTION__ << " | Failed to load drmGetVersion symbol" << "; Returning: " << smi_amdgpu_get_status_string(status, false); @@ -4806,7 +4842,6 @@ amdsmi_get_gpu_virtualization_mode(amdsmi_processor_handle processor_handle, if (status != AMDSMI_STATUS_SUCCESS) { drm_free_version = nullptr; 
libdrm.unload(); - close(drm_fd); ss << __PRETTY_FUNCTION__ << " | Failed to load drmFreeVersion symbol" << "; Returning: " << smi_amdgpu_get_status_string(status, false); @@ -4815,7 +4850,7 @@ amdsmi_get_gpu_virtualization_mode(amdsmi_processor_handle processor_handle, } // get drm version. If it's older than 3.62.0, then say not supported and exit. - auto drm_version = drm_get_version(drm_fd); + auto drm_version = drm_get_version(*drm_fd); // minimum version that supports getting of virtualization mode int major_version = 3; int minor_version = 62; @@ -4840,7 +4875,6 @@ amdsmi_get_gpu_virtualization_mode(amdsmi_processor_handle processor_handle, // If not, then return not supported if (isDRMVersionSupported == false) { drm_free_version(drm_version); - close(drm_fd); libdrm.unload(); return AMDSMI_STATUS_NOT_SUPPORTED; } @@ -4855,11 +4889,10 @@ amdsmi_get_gpu_virtualization_mode(amdsmi_processor_handle processor_handle, "drmCommandWrite"); if (status != AMDSMI_STATUS_SUCCESS) { drm_free_version(drm_version); - close(drm_fd); libdrm.unload(); ss << __PRETTY_FUNCTION__ << " | Failed to load drmCommandWrite symbol: " << strerror(errno) - << " | returning AMDSMI_STATUS_DRM_ERROR"; + << "; Returning: " << smi_amdgpu_get_status_string(status, false); LOG_ERROR(ss); return status; } @@ -4871,10 +4904,10 @@ amdsmi_get_gpu_virtualization_mode(amdsmi_processor_handle processor_handle, request.return_pointer = reinterpret_cast(&dev_info); request.return_size = sizeof(struct drm_amdgpu_info_device); request.query = AMDGPU_INFO_DEV_INFO; - auto drm_write = drmCommandWrite(drm_fd, DRM_AMDGPU_INFO, &request, + auto drm_write = drmCommandWrite(*drm_fd, DRM_AMDGPU_INFO, &request, sizeof(struct drm_amdgpu_info)); ss << __PRETTY_FUNCTION__ - << " | drm_fd: " << std::dec << drm_fd << "\n" + << " | drm_fd: " << std::dec << *drm_fd << "\n" << " | path: " << path << "\n" << " | drmCommandWrite: " << drm_write << "\n" << " | drmCommandWrite returned: " << strerror(errno) << "\n" @@ 
-4917,7 +4950,6 @@ amdsmi_get_gpu_virtualization_mode(amdsmi_processor_handle processor_handle, status = AMDSMI_STATUS_DRM_ERROR; } drm_free_version(drm_version); - close(drm_fd); libdrm.unload(); return status; } diff --git a/projects/amdsmi/src/amd_smi/amd_smi_drm.cc b/projects/amdsmi/src/amd_smi/amd_smi_drm.cc index 668c409022..00add9d4e1 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi_drm.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi_drm.cc @@ -27,6 +27,7 @@ #include #include #include +#include "amd_smi/impl/amd_smi_utils.h" #include "amd_smi/impl/amd_smi_drm.h" #include "amd_smi/impl/amd_smi_common.h" #include "rocm_smi/rocm_smi.h" @@ -59,8 +60,6 @@ std::string AMDSmiDrm::find_file_in_folder(const std::string& folder, amdsmi_status_t AMDSmiDrm::init() { std::ostringstream ss; - int fd = -1; - amdsmi_status_t status = lib_loader_.load("libdrm.so.2"); if (status != AMDSMI_STATUS_SUCCESS) { @@ -131,27 +130,18 @@ amdsmi_status_t AMDSmiDrm::init() { // looking for /sys/class/drm/card0/../renderD* std::string render_name = find_file_in_folder(renderD_folder, regex); - fd = -1; std::string name = "/dev/dri/" + render_name; - if (render_name != "") { - fd = open(name.c_str(), O_RDWR | O_CLOEXEC); - } + auto fd = amdsmi_RAII_FD_handler(name.c_str(), O_RDWR | O_CLOEXEC); amdsmi_bdf_t bdf; - if (fd >= 0) { - auto version = drm_get_version(fd); - if (strcmp("amdgpu", version->name)) { // only amdgpu - close(fd); - fd = -1; - } - if (fd >= 0 && drm_get_device(fd, &device) != 0) { + if (*fd >= 0) { + auto version = drm_get_version(*fd); + if (*fd >= 0 && drm_get_device(*fd, &device) != 0) { drm_free_device(&device); - close(fd); - fd = -1; } ss << __PRETTY_FUNCTION__ << " | " << " render file name: " << name << "\n" - << "; fd: " << std::dec << fd << "\n" + << "; fd: " << std::dec << *fd << "\n" << "; drm version->name: " << version->name << "\n" << "; drm version->date: " << version->date << "\n" << "; drm version_major.version_minor.version_patchlevel: " @@ -174,11 
+164,12 @@ amdsmi_status_t AMDSmiDrm::init() { drm_free_version(version); } - drm_fds_.push_back(fd); + drm_fds_.push_back(*fd); drm_paths_.push_back(render_name); // even if fail, still add to prevent mismatch the index - if (fd < 0) { + if (*fd < 0) { drm_bdfs_.push_back(bdf); + drm_free_device(&device); continue; } @@ -215,7 +206,6 @@ amdsmi_status_t AMDSmiDrm::init() { drm_bdfs_.push_back(bdf); drm_free_device(&device); - close(fd); } // cannot find any valid fds. @@ -223,7 +213,6 @@ amdsmi_status_t AMDSmiDrm::init() { drm_bdfs_.clear(); return AMDSMI_STATUS_INIT_ERROR; } - return AMDSMI_STATUS_SUCCESS; } diff --git a/projects/amdsmi/src/amd_smi/amd_smi_system.cc b/projects/amdsmi/src/amd_smi/amd_smi_system.cc index 6847f86360..abc1930d66 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi_system.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi_system.cc @@ -404,6 +404,7 @@ amdsmi_status_t AMDSmiSystem::cleanup() { // we do not need to delete the sockets/processors, clear takes care of this if (!processors_.empty()) {processors_.clear();} if (!sockets_.empty()) {sockets_.clear();} + drm_.cleanup(); init_flag_ &= ~AMDSMI_INIT_AMD_GPUS; rsmi_status_t ret = rsmi_shut_down(); if (ret != RSMI_STATUS_SUCCESS) { diff --git a/projects/amdsmi/src/amd_smi/amd_smi_utils.cc b/projects/amdsmi/src/amd_smi/amd_smi_utils.cc index 95b9083466..1d66c187ff 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi_utils.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi_utils.cc @@ -583,31 +583,40 @@ amdsmi_status_t smi_amdgpu_get_pcie_speed_from_pcie_type(uint16_t pcie_type, uin amdsmi_status_t smi_amdgpu_get_market_name_from_dev_id(amd::smi::AMDSmiGPUDevice* device, char *market_name) { + SMIGPUDEVICE_MUTEX(device->get_mutex()) if (market_name == nullptr || device == nullptr) { return AMDSMI_STATUS_ARG_PTR_NULL; } + // initialize the market_name to empty string + std::string empty = ""; + std::strncpy(market_name, empty.c_str(), AMDSMI_MAX_STRING_LENGTH - 1); std::ostringstream ss; std::string 
render_name = device->get_gpu_path(); - int fd = -1; std::string path = "/dev/dri/" + render_name; - - if (render_name != "") { - fd = open(path.c_str(), O_RDWR | O_CLOEXEC); - } else { - market_name[0] = '\0'; - close(fd); + if (render_name.empty()) { return AMDSMI_STATUS_NOT_SUPPORTED; } + + auto fd = amdsmi_RAII_FD_handler(path.c_str(), O_RDWR | O_CLOEXEC); ss << __PRETTY_FUNCTION__ << " | Render Name: " - << render_name << "; path: " << path << "; fd: " << fd; + << render_name << "; path: " << path << "; fd: " + << (fd == nullptr ? "nullptr" : std::to_string(*fd)) << "\n"; LOG_DEBUG(ss); + if (!fd) { + ss << __PRETTY_FUNCTION__ << " | Render Name: " + << render_name << "; path: " << path << "; fd: " + << (fd == nullptr ? "nullptr" : std::to_string(*fd)) << "\n" + << "; Returning: " + << smi_amdgpu_get_status_string(AMDSMI_STATUS_FILE_ERROR, false) << "\n"; + LOG_INFO(ss); + return AMDSMI_STATUS_FILE_ERROR; + } amd::smi::AMDSmiLibraryLoader libdrm_amdgpu_; amdsmi_status_t status = libdrm_amdgpu_.load("libdrm_amdgpu.so"); if (status != AMDSMI_STATUS_SUCCESS) { - close(fd); - libdrm_amdgpu_.AMDSmiLibraryLoader::unload(); + libdrm_amdgpu_.unload(); return status; } @@ -621,60 +630,46 @@ amdsmi_status_t smi_amdgpu_get_market_name_from_dev_id(amd::smi::AMDSmiGPUDevice amdgpu_device_deinitialize_t amdgpu_device_deinitialize = nullptr; amdgpu_get_marketing_name_t amdgpu_get_marketing_name = nullptr; - status = libdrm_amdgpu_.load_symbol( - reinterpret_cast(&amdgpu_device_initialize), - "amdgpu_device_initialize"); + status = libdrm_amdgpu_.load_symbol(reinterpret_cast(&amdgpu_device_deinitialize), + "amdgpu_device_deinitialize"); if (status != AMDSMI_STATUS_SUCCESS) { - close(fd); - libdrm_amdgpu_.AMDSmiLibraryLoader::unload(); + libdrm_amdgpu_.unload(); + return status; + } + + status = libdrm_amdgpu_.load_symbol(reinterpret_cast(&amdgpu_device_initialize), + "amdgpu_device_initialize"); + if (status != AMDSMI_STATUS_SUCCESS) { + libdrm_amdgpu_.unload(); return 
status; } amdgpu_device_handle device_handle = nullptr; uint32_t major_version, minor_version; - int ret = amdgpu_device_initialize(fd, &major_version, &minor_version, &device_handle); + int ret = amdgpu_device_initialize(*fd, &major_version, &minor_version, &device_handle); if (ret != 0) { - close(fd); - libdrm_amdgpu_.AMDSmiLibraryLoader::unload(); + amdgpu_device_deinitialize(device_handle); + libdrm_amdgpu_.unload(); return AMDSMI_STATUS_DRM_ERROR; } - status = libdrm_amdgpu_.load_symbol( - reinterpret_cast( - &amdgpu_get_marketing_name), "amdgpu_get_marketing_name"); + status = libdrm_amdgpu_.load_symbol(reinterpret_cast(&amdgpu_get_marketing_name), + "amdgpu_get_marketing_name"); if (status != AMDSMI_STATUS_SUCCESS) { - close(fd); - libdrm_amdgpu_.AMDSmiLibraryLoader::unload(); + amdgpu_device_deinitialize(device_handle); + libdrm_amdgpu_.unload(); return status; } - status = libdrm_amdgpu_.load_symbol(reinterpret_cast( - &amdgpu_device_deinitialize), "amdgpu_device_deinitialize"); - if (status != AMDSMI_STATUS_SUCCESS) { - close(fd); - libdrm_amdgpu_.AMDSmiLibraryLoader::unload(); - return status; - } - - ret = amdgpu_device_initialize(fd, &major_version, &minor_version, &device_handle); - if (ret != 0) { - std::string empty = ""; - std::strncpy(market_name, empty.c_str(), AMDSMI_MAX_STRING_LENGTH - 1); - amdgpu_device_deinitialize(device_handle); - close(fd); - return AMDSMI_STATUS_DRM_ERROR; - } - // Get the marketing name using libdrm's API const char *name = amdgpu_get_marketing_name(device_handle); if (name != nullptr) { std::strncpy(market_name, name, AMDSMI_MAX_STRING_LENGTH - 1); market_name[AMDSMI_MAX_STRING_LENGTH - 1] = '\0'; amdgpu_device_deinitialize(device_handle); - close(fd); - libdrm_amdgpu_.AMDSmiLibraryLoader::unload(); + libdrm_amdgpu_.unload(); ss << __PRETTY_FUNCTION__ << " | path: " << path << "\n" - << " | fd: "<< std::dec << fd << "\n" + << " | fd: "<< std::dec << *fd << "\n" << " | Marketing Name: " << market_name << "\n" << " | 
Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_SUCCESS, false) << "\n"; @@ -683,10 +678,9 @@ amdsmi_status_t smi_amdgpu_get_market_name_from_dev_id(amd::smi::AMDSmiGPUDevice } amdgpu_device_deinitialize(device_handle); - close(fd); - libdrm_amdgpu_.AMDSmiLibraryLoader::unload(); + libdrm_amdgpu_.unload(); ss << __PRETTY_FUNCTION__ << " | path: " << path << "\n" - << " | fd: "<< std::dec << fd << "\n" + << " | fd: "<< std::dec << *fd << "\n" << " | Marketing Name: " << market_name << "\n" << " | Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_DRM_ERROR, false) << "\n"; @@ -804,7 +798,6 @@ amdsmi_status_t smi_amdgpu_get_device_index(amdsmi_processor_handle processor_ha << "Returning device_index: " << *device_index << "\nSocket #: " << i << "; Device #: " << j << "; current_device_index #: " << current_device_index << "\n"; - // std::cout << ss.str(); LOG_DEBUG(ss); return AMDSMI_STATUS_SUCCESS; } @@ -913,8 +906,6 @@ amdsmi_status_t smi_amdgpu_get_processor_handle_by_index( LOG_DEBUG(ss); for (uint32_t j = 0; j < device_count; j++) { - // std::cout << "current_device_index: " << current_device_index - // << " device_index: " << device_index << std::endl; if (current_device_index == device_index) { *processor_handle = processor_handles[j]; ss << __PRETTY_FUNCTION__ << " | AMDSMI_STATUS_SUCCESS" @@ -924,7 +915,6 @@ amdsmi_status_t smi_amdgpu_get_processor_handle_by_index( << "; processor_handle: " << *processor_handle << "; processor_handles[j]: " << processor_handles[j] << "\n"; - // std::cout << ss.str(); LOG_DEBUG(ss); return AMDSMI_STATUS_SUCCESS; } @@ -959,3 +949,58 @@ void amdsmi_wait_for_user_input(void) { } } } + +std::shared_ptr amdsmi_RAII_FD_handler(const std::string& path, int flags) { + static std::mutex fd_mutex; + static std::map> open_files; + static std::ostringstream ss; + + std::lock_guard lock(fd_mutex); + + // Clean up expired entries from the cache + for (auto it = open_files.begin(); it != open_files.end();) { + if 
(it->second.expired()) { + it = open_files.erase(it); + } else { + ++it; + } + } + + // Try to reuse an existing open FD + auto it = open_files.find(path); + if (it != open_files.end()) { + if (auto existing_fd = it->second.lock()) { + ss <<__PRETTY_FUNCTION__ << " | Reusing FD for path: " << path; + LOG_INFO(ss); + return existing_fd; + } + } + + // Open a new file descriptor + int fd = open(path.c_str(), flags); + if (fd < 0) { + ss << __PRETTY_FUNCTION__ << " | Failed to open file: " << path + << " | Error: " << strerror(errno); + LOG_INFO(ss); + return nullptr; + } + + ss << __PRETTY_FUNCTION__ << " | Opened FD: " << std::to_string(fd) + << " for path: " << path; + LOG_INFO(ss); + + // Create a shared_ptr with a custom deleter to close the FD + auto fd_ptr = std::shared_ptr(new int(fd), [path](int* fd) { + if (fd && *fd >= 0) { + ss << __PRETTY_FUNCTION__ << " | Closing FD: " << std::to_string(*fd) + << " | Path: " << path << std::endl; + LOG_INFO(ss); + close(*fd); + delete fd; + } + }); + + // Store weak_ptr in cache for reuse + open_files[path] = fd_ptr; + return fd_ptr; +} diff --git a/projects/amdsmi/tests/amd_smi_test/functional/computepartition_read_write.cc b/projects/amdsmi/tests/amd_smi_test/functional/computepartition_read_write.cc index 573ec49772..28dceaf538 100755 --- a/projects/amdsmi/tests/amd_smi_test/functional/computepartition_read_write.cc +++ b/projects/amdsmi/tests/amd_smi_test/functional/computepartition_read_write.cc @@ -273,6 +273,7 @@ static void checkPartitionIdChanges(amdsmi_processor_handle* const processor_han "\"sudo rmmod amdgpu && sudo rmmod ast && sudo modprobe amdgpu\")." 
"\n\tCPX may not enumerate properly.\n"; } + // amdsmi_wait_for_user_input(); // watch for any errors break; } amdsmi_kfd_info_t kfd_info; @@ -432,7 +433,7 @@ void TestComputePartitionReadWrite::Run(void) { << computePartitionString(updatePartition) << " ===============" << std::endl; } - // waitForUserInput(); // watch for any errors + // amdsmi_wait_for_user_input(); // watch for any errors auto ret_set = amdsmi_set_gpu_compute_partition(processor_handles_[dv_ind], updatePartition); IF_VERB(STANDARD) { @@ -463,6 +464,7 @@ void TestComputePartitionReadWrite::Run(void) { << "\n\t Device might be in a static partition mode. " << "With inability to change partition modes." << std::endl; + // amdsmi_wait_for_user_input(); // watch for any errors break; } @@ -491,7 +493,7 @@ void TestComputePartitionReadWrite::Run(void) { static_cast( mapStringToSMIComputePartitionTypes.at( std::string(orig_char_computePartition))); - // waitForUserInput(); // watch for any errors on going back to original partition + // amdsmi_wait_for_user_input(); // watch for any errors on going back to original partition auto ret_set = amdsmi_set_gpu_compute_partition(processor_handles_[dv_ind], updatePartition); EXPECT_TRUE(ret_set == AMDSMI_STATUS_SETTING_UNAVAILABLE || ret_set== AMDSMI_STATUS_NO_PERM @@ -510,6 +512,8 @@ void TestComputePartitionReadWrite::Run(void) { // TEST 2: Set/Get Compute Partition (new functionality) initial_num_devices = num_monitor_devs(); + amdsmi_accelerator_partition_type_t primary_partition_type = AMDSMI_ACCELERATOR_PARTITION_INVALID; + uint32_t primary_index = 0; for (uint32_t dv_ind = 0; dv_ind < initial_num_devices; ++dv_ind) { if (dv_ind != 0) { std::cout << "\n"; @@ -518,11 +522,11 @@ void TestComputePartitionReadWrite::Run(void) { std::cout << "\n"; std::cout << "\t**======================================================================\n"; std::cout << "\t**Test #2: Get/Set Compute Partition (new functionality) ===============\n"; - std::cout << "\t**DEVICE: 
#" << std::setw(2) << std::setfill('0') << dv_ind + std::cout << "\t**DEVICE: #" << std::dec << std::setw(2) << std::setfill('0') << dv_ind << " ==========================================================\n"; std::cout << "\t**======================================================================\n"; } - // waitForUserInput(); // watch for any errors + // amdsmi_wait_for_user_input(); // watch for any errors PrintDeviceHeader(processor_handles_[dv_ind]); amdsmi_accelerator_partition_profile_t profile = {}; uint32_t partition_id[8] = {0, 0, 0, 0, 0, 0, 0, 0}; @@ -563,6 +567,12 @@ void TestComputePartitionReadWrite::Run(void) { profile_type_str = "CPX"; } + // save the primary partition type + if (profile.profile_type != AMDSMI_ACCELERATOR_PARTITION_INVALID) { + primary_partition_type = profile.profile_type; + primary_index = dv_ind; + } + std::string partition_id_str = ""; for (int i = 0; i < 8; i++) { partition_id_str += std::to_string(partition_id[i]); @@ -570,7 +580,7 @@ void TestComputePartitionReadWrite::Run(void) { partition_id_str += ", "; } - switch (profile.profile_type) { + switch (primary_partition_type) { case AMDSMI_ACCELERATOR_PARTITION_SPX: EXPECT_LT(partition_id[i], MAX_SPX_PARTITIONS); break; @@ -586,7 +596,7 @@ void TestComputePartitionReadWrite::Run(void) { case AMDSMI_ACCELERATOR_PARTITION_CPX: { uint16_t num_xcd; uint32_t max_xcps = 0; - ret = amdsmi_get_gpu_xcd_counter(processor_handles_[dv_ind], &num_xcd); + ret = amdsmi_get_gpu_xcd_counter(processor_handles_[primary_index], &num_xcd); if (ret == AMDSMI_STATUS_SUCCESS) { max_xcps = static_cast(num_xcd); } @@ -640,7 +650,7 @@ void TestComputePartitionReadWrite::Run(void) { AcceleratorProfileConfig original_profile_config = {}; original_profile_config = getAvailableProfileConfigs(dv_ind, profile, profile_config, isVerbose); - // waitForUserInput(); // watch for any errors + // amdsmi_wait_for_user_input(); // watch for any errors IF_VERB(STANDARD) { std::cout << 
"\t**=========================================================\n"; @@ -762,7 +772,7 @@ void TestComputePartitionReadWrite::Run(void) { << profile_config.profiles[config].profile_index << ")" << " ===============" << std::endl; } - // waitForUserInput(); // watch for any errors + // amdsmi_wait_for_user_input(); // watch for any errors auto ret_set = amdsmi_set_gpu_accelerator_partition_profile( processor_handles_[dv_ind], @@ -789,6 +799,7 @@ void TestComputePartitionReadWrite::Run(void) { << "\n\t Device might be in a static partition mode. " << "With inability to change partition modes." << std::endl; + // amdsmi_wait_for_user_input(); // watch for any errors break; } if (ret_set == AMDSMI_STATUS_NOT_SUPPORTED) { @@ -872,6 +883,7 @@ void TestComputePartitionReadWrite::Run(void) { std::cout << "\t**" << "amdsmi_get_gpu_accelerator_partition_profile: " << "Not supported on this machine, skipping remaining tests." << std::endl; } + // amdsmi_wait_for_user_input(); // watch for any errors break; } @@ -911,7 +923,7 @@ void TestComputePartitionReadWrite::Run(void) { std::cout << "\t**Test #3: Check fluctuating # of devices & partition IDs ==============\n"; std::cout << "\t**======================================================================\n"; } - // waitForUserInput(); // watch for any errors on going back to original partition + // amdsmi_wait_for_user_input(); // watch for any errors on going back to original partition // ---------------------------------------------------------// // TEST 3: Check fluctuating # of devices & partition IDs // @@ -925,12 +937,12 @@ void TestComputePartitionReadWrite::Run(void) { std::cout << "\n"; std::cout << "\t**======================================================================\n"; std::cout << "\t**Test #3: Check fluctuating # of devices & partition IDs ==============\n"; - std::cout << "\t**DEVICE: #" << std::setw(2) << std::setfill('0') << dv_ind + std::cout << "\t**DEVICE: #" << std::dec << std::setw(2) << 
std::setfill('0') << dv_ind << " ========================================================\n"; std::cout << "\t**======================================================================\n"; } // Leaving for debug purposes - // waitForUserInput(); // watch for any errors on going back to original partition + // amdsmi_wait_for_user_input(); // watch for any errors on going back to original partition uint32_t device_index = 0; amdsmi_processor_handle p_handle = {}; uint32_t current_num_devices = 0; @@ -1013,6 +1025,7 @@ void TestComputePartitionReadWrite::Run(void) { << "\n\t Device might be in a static partition mode. " << "With inability to change partition modes." << std::endl; + // amdsmi_wait_for_user_input(); // watch for any errors break; } @@ -1039,7 +1052,7 @@ void TestComputePartitionReadWrite::Run(void) { EXPECT_NE(updatePartition, mapStringToSMIComputePartitionTypes.at( std::string(current_char_computePartition))); } - // waitForUserInput(); // watch for any errors on going back to original partition + // amdsmi_wait_for_user_input(); // watch for any errors on going back to original partition } uint32_t device_index3 = 0; @@ -1055,7 +1068,7 @@ void TestComputePartitionReadWrite::Run(void) { std::cout << "\t**ABOUT TO GO BACK TO ORIGINAL PARTITION (" << orig_char_computePartition << ")\n"; } - // waitForUserInput(); // watch for any errors on going back to original partition + // amdsmi_wait_for_user_input(); // watch for any errors on going back to original partition auto ret_set = amdsmi_set_gpu_compute_partition(p_handle3, updatePartition); checkPartitionIdChanges(processor_handles_, dv_ind, std::string(orig_char_computePartition), isVerbose, true); @@ -1076,8 +1089,8 @@ void TestComputePartitionReadWrite::Run(void) { } } IF_VERB(STANDARD) { - std::cout << "\t**Get/Set Test #3 (dev_ind: " - << dv_ind << "): Check fluctuating # of devices & partition IDs ===============\n"; + std::cout << "\t**Get/Set Test #3 (dev_ind: " << std::dec + << dv_ind << "): 
Check fluctuating # of devices & partition IDs ===============\n"; } } diff --git a/projects/amdsmi/tests/amd_smi_test/functional/err_cnt_read.cc b/projects/amdsmi/tests/amd_smi_test/functional/err_cnt_read.cc index 415cc89960..d409ae279f 100644 --- a/projects/amdsmi/tests/amd_smi_test/functional/err_cnt_read.cc +++ b/projects/amdsmi/tests/amd_smi_test/functional/err_cnt_read.cc @@ -120,16 +120,17 @@ void TestErrCntRead::Run(void) { err = amdsmi_get_gpu_ecc_count(processor_handles_[i], static_cast(b), &ec); - if (err == AMDSMI_STATUS_NOT_SUPPORTED) { + if (err == AMDSMI_STATUS_NOT_SUPPORTED || err == AMDSMI_STATUS_FILE_ERROR) { IF_VERB(STANDARD) { std::cout << "\t**Error Count for " << GetBlockNameStr(static_cast(b)) << - ": Not supported for this device" << std::endl; + ": Not supported for this device or error accessing file" << std::endl; } // Verify api support checking functionality is working err = amdsmi_get_gpu_ecc_count(processor_handles_[i], static_cast(b), nullptr); - ASSERT_EQ(err, AMDSMI_STATUS_NOT_SUPPORTED); + ASSERT_TRUE(err == AMDSMI_STATUS_NOT_SUPPORTED + || err == AMDSMI_STATUS_FILE_ERROR); } else { CHK_ERR_ASRT(err) diff --git a/projects/amdsmi/tests/amd_smi_test/functional/frequencies_read_write.cc b/projects/amdsmi/tests/amd_smi_test/functional/frequencies_read_write.cc index f47e63309b..1bd702c497 100644 --- a/projects/amdsmi/tests/amd_smi_test/functional/frequencies_read_write.cc +++ b/projects/amdsmi/tests/amd_smi_test/functional/frequencies_read_write.cc @@ -31,6 +31,7 @@ #include #include "amd_smi/amdsmi.h" +#include "amd_smi/impl/amd_smi_utils.h" #include "frequencies_read_write.h" #include "../test_common.h" @@ -71,6 +72,19 @@ void TestFrequenciesReadWrite::Run(void) { amdsmi_frequencies_t f; uint32_t freq_bitmask; amdsmi_clk_type_t amdsmi_clk; + const std::map clk_type_map = { + {AMDSMI_CLK_TYPE_SYS, "SYS"}, + {AMDSMI_CLK_TYPE_GFX, "GFX"}, + {AMDSMI_CLK_TYPE_DF, "DF"}, + {AMDSMI_CLK_TYPE_DCEF, "DCEF"}, + {AMDSMI_CLK_TYPE_SOC, "SOC"}, + 
{AMDSMI_CLK_TYPE_MEM, "MEM"}, + {AMDSMI_CLK_TYPE_PCIE, "PCIE"}, + {AMDSMI_CLK_TYPE_VCLK0, "VCLK0"}, + {AMDSMI_CLK_TYPE_VCLK1, "VCLK1"}, + {AMDSMI_CLK_TYPE_DCLK0, "DCLK0"}, + {AMDSMI_CLK_TYPE_DCLK1, "DCLK1"}, + }; TestBase::Run(); if (setup_failed_) { @@ -86,11 +100,18 @@ void TestFrequenciesReadWrite::Run(void) { auto freq_read = [&]() -> bool { // Skip AMDSMI_CLK_TYPE_PCIE, which does not supported in rocm-smi. - std::cout << amdsmi_clk << std::endl; - if (amdsmi_clk == AMDSMI_CLK_TYPE_PCIE) - return false; + if (auto it = clk_type_map.find(amdsmi_clk); it != clk_type_map.end()) { + if (amdsmi_clk == AMDSMI_CLK_TYPE_PCIE) { + return false; // Quietly skip PCIE clock + // Cannot read/write to PCIE clock in driver + } + std::cout << "amdsmi_get_clk_freq(" << it->second << ", f)"; + } + ret = amdsmi_get_clk_freq(processor_handles_[dv_ind], amdsmi_clk, &f); - std::cout << ret << std::endl; + if (auto it = clk_type_map.find(amdsmi_clk); it != clk_type_map.end()) { + std::cout << ": " << smi_amdgpu_get_status_string(ret, false) << std::endl; + } if (ret == AMDSMI_STATUS_NOT_SUPPORTED || ret == AMDSMI_STATUS_NOT_YET_IMPLEMENTED) { diff --git a/projects/amdsmi/tests/amd_smi_test/functional/gpu_busy_read.cc b/projects/amdsmi/tests/amd_smi_test/functional/gpu_busy_read.cc index 56f92a28c6..73837e08e0 100644 --- a/projects/amdsmi/tests/amd_smi_test/functional/gpu_busy_read.cc +++ b/projects/amdsmi/tests/amd_smi_test/functional/gpu_busy_read.cc @@ -22,11 +22,11 @@ #include #include +#include #include #include -#include #include "amd_smi/amdsmi.h" #include "gpu_busy_read.h" #include "../test_common.h" @@ -63,9 +63,36 @@ void TestGPUBusyRead::Close() { void TestGPUBusyRead::Run(void) { + amdsmi_status_t err; + uint32_t val_ui32; + TestBase::Run(); if (setup_failed_) { std::cout << "** SetUp Failed for this test. 
Skipping.**" << std::endl; return; } + + for (uint32_t x = 0; x < num_iterations(); ++x) { + for (uint32_t i = 0; i < num_monitor_devs(); ++i) { + PrintDeviceHeader(processor_handles_[i]); + + err = amdsmi_get_gpu_busy_percent(processor_handles_[i], &val_ui32); + if (err != AMDSMI_STATUS_SUCCESS) { + if (err == AMDSMI_STATUS_FILE_ERROR || err == AMDSMI_STATUS_NOT_SUPPORTED) { + IF_VERB(STANDARD) { + std::cout << "\t**GPU Busy Percent: Not supported on this machine" + << std::endl; + } + ASSERT_TRUE(err == AMDSMI_STATUS_FILE_ERROR || err == AMDSMI_STATUS_NOT_SUPPORTED); + } else { + CHK_ERR_ASRT(err) + } + } else { + IF_VERB(STANDARD) { + std::cout << "\t**GPU Busy Percent (Percent Idle):" << std::dec << + val_ui32 << " (" << 100 - val_ui32 << ")" << std::endl; + } + } + } + } } diff --git a/projects/amdsmi/tests/amd_smi_test/functional/gpu_metrics_read.cc b/projects/amdsmi/tests/amd_smi_test/functional/gpu_metrics_read.cc index 157c216fcc..31c85f6d15 100644 --- a/projects/amdsmi/tests/amd_smi_test/functional/gpu_metrics_read.cc +++ b/projects/amdsmi/tests/amd_smi_test/functional/gpu_metrics_read.cc @@ -98,7 +98,7 @@ void TestGpuMetricsRead::Run(void) { IF_VERB(STANDARD) { std::cout << "\t**" << "Not supported on this machine" << std::endl; - return; + continue; } } } else { diff --git a/projects/amdsmi/tests/amd_smi_test/functional/mem_util_read.cc b/projects/amdsmi/tests/amd_smi_test/functional/mem_util_read.cc index ba10947138..8956340e10 100644 --- a/projects/amdsmi/tests/amd_smi_test/functional/mem_util_read.cc +++ b/projects/amdsmi/tests/amd_smi_test/functional/mem_util_read.cc @@ -22,15 +22,16 @@ #include #include +#include #include #include #include -#include #include "amd_smi/amdsmi.h" #include "mem_util_read.h" #include "../test_common.h" +#include "amd_smi/impl/amd_smi_utils.h" TestMemUtilRead::TestMemUtilRead() : TestBase() { set_title("Memory Utilization Read Test"); @@ -81,12 +82,14 @@ void TestMemUtilRead::Run(void) { } auto err_chk = [&](const char 
*str) { + IF_VERB(STANDARD) { + std::cout << "\t** " << str << std::endl; + } if (err != AMDSMI_STATUS_SUCCESS) { - if (err == AMDSMI_STATUS_FILE_ERROR) { - IF_VERB(STANDARD) { - std::cout << "\t** " << str << ": Not supported on this machine" - << std::endl; - } + if (err == AMDSMI_STATUS_FILE_ERROR || + err == AMDSMI_STATUS_NOT_SUPPORTED) { + ASSERT_TRUE(err == AMDSMI_STATUS_NOT_SUPPORTED + || err == AMDSMI_STATUS_FILE_ERROR); } else { CHK_ERR_ASRT(err) } @@ -101,23 +104,32 @@ void TestMemUtilRead::Run(void) { mem_type <= AMDSMI_MEM_TYPE_LAST; ++mem_type) { err = amdsmi_get_gpu_memory_total(processor_handles_[i], static_cast(mem_type), &total); - err_chk("amdsmi_get_gpu_memory_total()"); + smi_amdgpu_get_status_string(err, false); + std::string mem_type_str = + kDevMemoryTypeNameMap.at(static_cast(mem_type)); + std::string input_str = + "amdsmi_get_gpu_memory_total(" + mem_type_str + "): " + + smi_amdgpu_get_status_string(err, false); + err_chk(input_str.c_str()); if (err != AMDSMI_STATUS_SUCCESS) { - return; + continue; } err = amdsmi_get_gpu_memory_usage(processor_handles_[i], static_cast(mem_type), &usage); - err_chk("amdsmi_get_gpu_memory_usage()"); + input_str = + "amdsmi_get_gpu_memory_usage(" + mem_type_str + "): " + + smi_amdgpu_get_status_string(err, false); + err_chk(input_str.c_str()); if (err != AMDSMI_STATUS_SUCCESS) { - return; + continue; } IF_VERB(STANDARD) { std::cout << "\t**" << kDevMemoryTypeNameMap.at(static_cast(mem_type)) << " Calculated Utilization: " << - (static_cast(usage)*100)/static_cast(total) << "% ("<< usage << + (static_cast(usage)*100)/static_cast(total) << "% (" << usage << "/" << total << ")" << std::endl; } } diff --git a/projects/amdsmi/tests/amd_smi_test/functional/memorypartition_read_write.cc b/projects/amdsmi/tests/amd_smi_test/functional/memorypartition_read_write.cc index 8d148186c2..fbd76f3e2c 100755 --- a/projects/amdsmi/tests/amd_smi_test/functional/memorypartition_read_write.cc +++ 
b/projects/amdsmi/tests/amd_smi_test/functional/memorypartition_read_write.cc @@ -124,6 +124,8 @@ void TestMemoryPartitionReadWrite::Run(void) { std::cout << "\t**=========================================================\n"; } auto initial_num_devices = num_monitor_devs(); + amdsmi_accelerator_partition_type_t primary_partition_type = AMDSMI_ACCELERATOR_PARTITION_INVALID; + uint32_t primary_index = 0; for (uint32_t dv_ind = 0; dv_ind < initial_num_devices; ++dv_ind) { if (dv_ind != 0) { std::cout << "\n"; @@ -168,6 +170,12 @@ void TestMemoryPartitionReadWrite::Run(void) { profile_type_str = "CPX"; } + // save the primary partition type + if (profile.profile_type != AMDSMI_ACCELERATOR_PARTITION_INVALID) { + primary_partition_type = profile.profile_type; + primary_index = dv_ind; + } + std::string partition_id_str = ""; for (int i = 0; i < 8; i++) { partition_id_str += std::to_string(partition_id[i]); @@ -175,7 +183,7 @@ void TestMemoryPartitionReadWrite::Run(void) { partition_id_str += ", "; } - switch (profile.profile_type) { + switch (primary_partition_type) { case AMDSMI_ACCELERATOR_PARTITION_SPX: EXPECT_LT(partition_id[i], MAX_SPX_PARTITIONS); break; @@ -191,7 +199,7 @@ void TestMemoryPartitionReadWrite::Run(void) { case AMDSMI_ACCELERATOR_PARTITION_CPX: { uint16_t num_xcd; uint32_t max_xcps = 0; - ret = amdsmi_get_gpu_xcd_counter(processor_handles_[dv_ind], &num_xcd); + ret = amdsmi_get_gpu_xcd_counter(processor_handles_[primary_index], &num_xcd); if (ret == AMDSMI_STATUS_SUCCESS) { max_xcps = static_cast(num_xcd); } @@ -245,7 +253,7 @@ void TestMemoryPartitionReadWrite::Run(void) { AcceleratorProfileConfig original_profile_config = getAvailableProfileConfigs(dv_ind, profile, profile_config, isVerbose); orig_dev_config[dv_ind] = original_profile_config; - // waitForUserInput(); // watch for any errors + // amdsmi_wait_for_user_input(); // watch for any errors IF_VERB(STANDARD) { std::cout << "\t**=========================================================\n"; @@ 
-321,7 +329,7 @@ void TestMemoryPartitionReadWrite::Run(void) { || ret == AMDSMI_STATUS_NOT_SUPPORTED); if (ret == AMDSMI_STATUS_NOT_SUPPORTED) { IF_VERB(STANDARD) { - std::cout << "\t**" << "amdsmi_get_gpu_accelerator_partition_profile_config: " + std::cout << "\t**" << "amdsmi_get_gpu_accelerator_partition_profile_config(): " << "Not supported on this machine" << std::endl; } continue; @@ -329,6 +337,11 @@ void TestMemoryPartitionReadWrite::Run(void) { } // Run memory partition tests + IF_VERB(STANDARD) { + std::cout << "\t**=========================================================\n"; + std::cout << "\t**Test: Memory Partition Sets =============================\n"; + std::cout << "\t**=========================================================\n"; + } uint32_t current_num_devices = 0; smi_amdgpu_get_device_count(¤t_num_devices); @@ -352,7 +365,7 @@ void TestMemoryPartitionReadWrite::Run(void) { processor_handles_[dv_ind], orig_memory_partition, k255Len); if (ret == AMDSMI_STATUS_NOT_SUPPORTED) { IF_VERB(STANDARD) { - std::cout << "\t**" << ": " + std::cout << "\t**" << "amdsmi_get_gpu_memory_partition(): " << "Not supported on this machine" << std::endl; } continue; @@ -563,7 +576,7 @@ void TestMemoryPartitionReadWrite::Run(void) { } if (ret_set == AMDSMI_STATUS_NOT_SUPPORTED) { IF_VERB(STANDARD) { - std::cout << "\t**" << ": " + std::cout << "\t**" << "amdsmi_set_gpu_memory_partition_mode(): " << "Not supported on this machine" << std::endl; } break; @@ -618,7 +631,8 @@ void TestMemoryPartitionReadWrite::Run(void) { ret = amdsmi_get_gpu_memory_partition_config(processor_handles_[dv_ind], ¤t_memory_config); - CHK_ERR_ASRT(ret) + ASSERT_TRUE((ret == AMDSMI_STATUS_NOT_SUPPORTED) || + (ret == AMDSMI_STATUS_SUCCESS)); IF_VERB(STANDARD) { std::cout << "\t**" << "amdsmi_get_gpu_memory_partition_config(processor_handles_[" << dv_ind @@ -629,6 +643,13 @@ void TestMemoryPartitionReadWrite::Run(void) { << memoryPartitionString(current_memory_config.mp_mode) << std::endl; } 
+ if (ret == AMDSMI_STATUS_NOT_SUPPORTED) { + IF_VERB(STANDARD) { + std::cout << "\t**" << "amdsmi_get_gpu_memory_partition_config(): " + << "Not supported on this machine... trying on other devices" << std::endl; + } + continue; + } new_memory_partition = mapStringToRSMIMemoryPartitionTypes.at(orig_memory_partition); diff --git a/projects/amdsmi/tests/amd_smi_test/functional/perf_level_read.cc b/projects/amdsmi/tests/amd_smi_test/functional/perf_level_read.cc index 3a0c0e5878..06145429a9 100644 --- a/projects/amdsmi/tests/amd_smi_test/functional/perf_level_read.cc +++ b/projects/amdsmi/tests/amd_smi_test/functional/perf_level_read.cc @@ -22,11 +22,11 @@ #include #include +#include #include #include -#include #include "amd_smi/amdsmi.h" #include "perf_level_read.h" #include "../test_common.h" @@ -76,10 +76,15 @@ void TestPerfLevelRead::Run(void) { PrintDeviceHeader(processor_handles_[i]); err = amdsmi_get_gpu_perf_level(processor_handles_[i], &pfl); - CHK_ERR_ASRT(err) - IF_VERB(STANDARD) { - std::cout << "\t**Performance Level:" << std::dec << (uint32_t)pfl << - std::endl; + if (err == AMDSMI_STATUS_NOT_SUPPORTED) { + std::cout << "\t**Performance Level: Not Supported" << std::endl; + ASSERT_EQ(err, AMDSMI_STATUS_NOT_SUPPORTED); + } else { + CHK_ERR_ASRT(err) + IF_VERB(STANDARD) { + std::cout << "\t**Performance Level:" << std::dec << (uint32_t)pfl + << std::endl; + } } // Verify api support checking functionality is working err = amdsmi_get_gpu_perf_level(processor_handles_[i], nullptr); diff --git a/projects/amdsmi/tests/amd_smi_test/functional/perf_level_read_write.cc b/projects/amdsmi/tests/amd_smi_test/functional/perf_level_read_write.cc index e334bc1d6d..dccc1141df 100644 --- a/projects/amdsmi/tests/amd_smi_test/functional/perf_level_read_write.cc +++ b/projects/amdsmi/tests/amd_smi_test/functional/perf_level_read_write.cc @@ -22,11 +22,11 @@ #include #include +#include #include #include -#include #include "amd_smi/amdsmi.h" #include 
"perf_level_read_write.h" #include "../test_common.h" @@ -79,11 +79,17 @@ void TestPerfLevelReadWrite::Run(void) { PrintDeviceHeader(processor_handles_[dv_ind]); ret = amdsmi_get_gpu_perf_level(processor_handles_[dv_ind], &orig_pfl); - CHK_ERR_ASRT(ret) + if (ret == AMDSMI_STATUS_NOT_SUPPORTED) { + IF_VERB(STANDARD) { + std::cout << "\t**amdsmi_get_gpu_perf_level(): Not supported on this machine" << std::endl; + } + ASSERT_EQ(ret, AMDSMI_STATUS_NOT_SUPPORTED); + continue; + } IF_VERB(STANDARD) { - std::cout << "\t**Original Perf Level:" << - GetPerfLevelStr(orig_pfl) << std::endl; + std::cout << "\t**Original Perf Level:" + << GetPerfLevelStr(orig_pfl) << std::endl; } uint32_t pfl_i = static_cast(AMDSMI_DEV_PERF_LEVEL_FIRST); diff --git a/projects/amdsmi/tests/amd_smi_test/functional/power_cap_read_write.cc b/projects/amdsmi/tests/amd_smi_test/functional/power_cap_read_write.cc index 4059c78551..419598a7a0 100644 --- a/projects/amdsmi/tests/amd_smi_test/functional/power_cap_read_write.cc +++ b/projects/amdsmi/tests/amd_smi_test/functional/power_cap_read_write.cc @@ -22,13 +22,13 @@ #include #include +#include #include #include #include #include -#include #include "amd_smi/amdsmi.h" #include "power_cap_read_write.h" #include "../test_common.h" @@ -83,18 +83,18 @@ void TestPowerCapReadWrite::SetCheckPowerCap(std::string msg, uint32_t dv_ind, u start = clock(); ret = amdsmi_set_power_cap(processor_handles_[dv_ind], 0, new_cap); end = clock(); - cpu_time_used = ((double) (end - start)) * 1000000UL / CLOCKS_PER_SEC; + cpu_time_used = (static_cast(end - start)) * 1000000UL / CLOCKS_PER_SEC; if (ret == AMDSMI_STATUS_NOT_SUPPORTED) { IF_VERB(STANDARD) { - std::cout << "\t** Not supported on this machine" << std::endl; + std::cout << "\t**amdsmi_set_power_cap(): Not supported on this machine" << std::endl; } return; } ASSERT_EQ(ret, ret_expected); if (ret == AMDSMI_STATUS_INVAL) { new_cap = curr_cap; - std::cout << "\t** Expected invalid result" << std::endl; + std::cout << 
"\t**amdsmi_set_power_cap(): Expected invalid result" << std::endl; return; } @@ -134,11 +134,16 @@ void TestPowerCapReadWrite::Run(void) { PrintDeviceHeader(processor_handles_[dv_ind]); amdsmi_power_cap_info_t info; - ret = amdsmi_get_power_cap_info(processor_handles_[dv_ind], 0, &info); - CHK_ERR_ASRT(ret) // Verify api support checking functionality is working ret = amdsmi_get_power_cap_info(processor_handles_[dv_ind], 0, nullptr); ASSERT_EQ(ret, AMDSMI_STATUS_INVAL); + + ret = amdsmi_get_power_cap_info(processor_handles_[dv_ind], 0, &info); + if (ret == AMDSMI_STATUS_NOT_SUPPORTED) { + std::cout << "\t**amdsmi_get_power_cap_info(): Not supported on this machine" << std::endl; + ASSERT_EQ(ret, AMDSMI_STATUS_NOT_SUPPORTED); + continue; + } min_cap = info.min_power_cap; max_cap = info.max_power_cap; default_cap = info.default_power_cap; @@ -148,15 +153,16 @@ void TestPowerCapReadWrite::Run(void) { IF_VERB(STANDARD) { std::cout << "[Before Set] Default Power Cap: " << default_cap << " uW" << std::endl; std::cout << "[Before Set] Current Power Cap: " << curr_cap << " uW" << std::endl; - std::cout << "[Before Set] Power Cap Range [max to min]: " << max_cap << " uW to " << min_cap << - " uW" << std::endl; + std::cout << "[Before Set] Power Cap Range [max to min]: " + << max_cap << " uW to " << min_cap << " uW" << std::endl; std::cout << "[Before Set] Setting new cap to " << new_cap << "..." 
<< std::endl; } // Check if power cap is within the range // skip the test otherwise if (new_cap < min_cap || new_cap > max_cap) { - std::cout << "\t** Power cap requested (" << new_cap << " uW) is failed to set for " << dv_ind << std::endl; + std::cout << "\t** Power cap requested (" << new_cap + << " uW) is failed to set for " << dv_ind << std::endl; continue; } ret = AMDSMI_STATUS_SUCCESS; @@ -166,17 +172,18 @@ void TestPowerCapReadWrite::Run(void) { } IF_VERB(STANDARD) { if (!new_cap) - std::cout << "\t** Power cap requested (" << new_cap << " uW) is failed to set for " << dv_ind << std::endl; + std::cout << "\t** Power cap requested (" << new_cap + << " uW) is failed to set for " << dv_ind << std::endl; } - if (min_cap > 0) - { + if (min_cap > 0) { new_cap = min_cap; ret = AMDSMI_STATUS_SUCCESS; SetCheckPowerCap("Setting to Min Power Cap", dv_ind, curr_cap, new_cap, ret); IF_VERB(STANDARD) { if (!new_cap) - std::cout << "\t** Power cap requested (" << new_cap << " uW) is failed to set for " << dv_ind << std::endl; + std::cout << "\t** Power cap requested (" << new_cap + << " uW) is failed to set for " << dv_ind << std::endl; } new_cap = uint64_t(min_cap - 1); @@ -185,7 +192,8 @@ void TestPowerCapReadWrite::Run(void) { if (ret != AMDSMI_STATUS_INVAL) { IF_VERB(STANDARD) { if (!new_cap) - std::cout << "\t** Power cap requested (" << new_cap << " uW) is failed to set for " << dv_ind << std::endl; + std::cout << "\t** Power cap requested (" << new_cap + << " uW) is failed to set for " << dv_ind << std::endl; } } @@ -195,13 +203,13 @@ void TestPowerCapReadWrite::Run(void) { if (ret != AMDSMI_STATUS_INVAL) { IF_VERB(STANDARD) { if (!new_cap) - std::cout << "\t** Power cap requested (" << new_cap << " uW) is failed to set for " << dv_ind << std::endl; + std::cout << "\t** Power cap requested (" << new_cap << " uW) is failed to set for " + << dv_ind << std::endl; } } - } - else - { - std::cout << "\tPower cap requested is less than or equal to 0, skipping test for " 
<< dv_ind << std::endl; + } else { + std::cout << "\tPower cap requested is less than or equal to 0, skipping test for device #" + << dv_ind << std::endl; } new_cap = max_cap; @@ -209,7 +217,8 @@ void TestPowerCapReadWrite::Run(void) { SetCheckPowerCap("Setting to Max Power Cap", dv_ind, curr_cap, new_cap, ret); IF_VERB(STANDARD) { if (!new_cap) - std::cout << "\t** Power cap requested (" << new_cap << " uW) is failed to set for " << dv_ind << std::endl; + std::cout << "\t** Power cap requested (" << new_cap + << " uW) is failed to set for " << dv_ind << std::endl; } new_cap = uint64_t(max_cap + 1); @@ -218,7 +227,8 @@ void TestPowerCapReadWrite::Run(void) { if (ret != AMDSMI_STATUS_INVAL) { IF_VERB(STANDARD) { if (!new_cap) - std::cout << "\t** Power cap requested (" << new_cap << " uW) failed to set for " << dv_ind << std::endl; + std::cout << "\t** Power cap requested (" << new_cap + << " uW) failed to set for " << dv_ind << std::endl; } } @@ -228,7 +238,8 @@ void TestPowerCapReadWrite::Run(void) { if (ret != AMDSMI_STATUS_INVAL) { IF_VERB(STANDARD) { if (!new_cap) - std::cout << "\t** Power cap requested (" << new_cap << " uW) is failed to set for " << dv_ind << std::endl; + std::cout << "\t** Power cap requested (" << new_cap + << " uW) is failed to set for " << dv_ind << std::endl; } } diff --git a/projects/amdsmi/tests/amd_smi_test/functional/power_read.cc b/projects/amdsmi/tests/amd_smi_test/functional/power_read.cc index e898aca165..0b8704e99e 100644 --- a/projects/amdsmi/tests/amd_smi_test/functional/power_read.cc +++ b/projects/amdsmi/tests/amd_smi_test/functional/power_read.cc @@ -22,11 +22,11 @@ #include #include +#include #include #include -#include #include "amd_smi/amdsmi.h" #include "power_read.h" #include "../test_common.h" @@ -77,6 +77,11 @@ void TestPowerRead::Run(void) { amdsmi_power_cap_info_t info; err = amdsmi_get_power_cap_info(processor_handles_[i], 0, &info); + if (err == AMDSMI_STATUS_NOT_SUPPORTED) { + std::cout << "\t**Power Cap not 
supported on this device." << std::endl; + ASSERT_EQ(err, AMDSMI_STATUS_NOT_SUPPORTED); + continue; + } CHK_ERR_ASRT(err) IF_VERB(STANDARD) { std::cout << "\t**Current Power Cap: " << info.power_cap << "uW" < #include +#include #include #include #include -#include #include "amd_smi/amdsmi.h" #include "sys_info_read.h" #include "../test_common.h" @@ -118,16 +118,22 @@ void TestSysInfoRead::Run(void) { ASSERT_EQ(err, AMDSMI_STATUS_INVAL); err = amdsmi_get_gpu_topo_numa_affinity(processor_handles_[i], &val_i32); - CHK_ERR_ASRT(err) - IF_VERB(STANDARD) { - std::cout << "\t**NUMA NODE: 0x" << std::hex << val_i32; - std::cout << " (" << std::dec << val_i32 << ")" << std::endl; + if (err == AMDSMI_STATUS_NOT_SUPPORTED) { + std::cout << "\t**amdsmi_get_gpu_topo_numa_affinity(): Not supported on this machine" + << std::endl; + ASSERT_EQ(err, AMDSMI_STATUS_NOT_SUPPORTED); + } else { + CHK_ERR_ASRT(err) + IF_VERB(STANDARD) { + std::cout << "\t**NUMA NODE: 0x" << std::hex << val_i32; + std::cout << " (" << std::dec << val_i32 << ")" << std::endl; + } } + // Verify api support checking functionality is working err = amdsmi_get_gpu_topo_numa_affinity(processor_handles_[i], nullptr); ASSERT_EQ(err, AMDSMI_STATUS_INVAL); - // vendor_id, unique_id, target_gfx_version amdsmi_asic_info_t asic_info = {}; err = amdsmi_get_gpu_asic_info(processor_handles_[i], &asic_info); diff --git a/projects/amdsmi/tests/amd_smi_test/main.cc b/projects/amdsmi/tests/amd_smi_test/main.cc index 5531991bac..92eb224230 100644 --- a/projects/amdsmi/tests/amd_smi_test/main.cc +++ b/projects/amdsmi/tests/amd_smi_test/main.cc @@ -19,6 +19,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. 
*/ +#include #include #include @@ -27,7 +28,7 @@ #include "amd_smi/amdsmi.h" #include "rocm_smi/rocm_smi_utils.h" -#include +#include "amd_smi/impl/amd_smi_utils.h" #include "test_common.h" #include "test_base.h" @@ -114,6 +115,7 @@ static void RunGenericTest(TestBase *test) { // RunGenericTest(&); // } TEST(amdsmitstReadOnly, TestVersionRead) { + // amdsmi_wait_for_user_input(); TestVersionRead tst; RunGenericTest(&tst); } diff --git a/projects/amdsmi/tests/amd_smi_test/test_base.cc b/projects/amdsmi/tests/amd_smi_test/test_base.cc index 758c900cd4..424b300d1d 100644 --- a/projects/amdsmi/tests/amd_smi_test/test_base.cc +++ b/projects/amdsmi/tests/amd_smi_test/test_base.cc @@ -21,8 +21,8 @@ */ #include + #include -#include #include "amd_smi/amdsmi.h" #include "amd_smi/impl/amd_smi_utils.h" @@ -171,7 +171,6 @@ void TestBase::PrintDeviceHeader(amdsmi_processor_handle dv_ind) { amdsmi_status_t err; uint16_t val_ui16; uint32_t val_ui32; - amdsmi_asic_info_t info; err = smi_amdgpu_get_device_count(&val_ui32); CHK_ERR_ASRT(err) @@ -189,16 +188,16 @@ void TestBase::PrintDeviceHeader(amdsmi_processor_handle dv_ind) { std::cout << "\t**Device handle: " << dv_ind << std::endl; } err = amdsmi_get_gpu_id(dv_ind, &val_ui16); - CHK_ERR_ASRT(err) - IF_VERB(STANDARD) { - std::cout << "\t**Device ID: 0x" << std::hex << val_ui16 << std::endl; - } - - err = amdsmi_get_gpu_revision(dv_ind, &val_ui16); - CHK_ERR_ASRT(err) - IF_VERB(STANDARD) { - std::cout << "\t**Device Revision ID: 0x" << std::hex << - val_ui16 << std::endl; + if (err == AMDSMI_STATUS_NOT_SUPPORTED) { + IF_VERB(STANDARD) { + std::cout << "\t**Device ID: N/A" << std::endl; + } + ASSERT_EQ(err, AMDSMI_STATUS_NOT_SUPPORTED); + } else { + CHK_ERR_ASRT(err) + IF_VERB(STANDARD) { + std::cout << "\t**Device ID: 0x" << std::hex << val_ui16 << std::endl; + } } amdsmi_board_info_t board_info; @@ -206,30 +205,82 @@ void TestBase::PrintDeviceHeader(amdsmi_processor_handle dv_ind) { CHK_ERR_ASRT(err) IF_VERB(STANDARD) { std::cout 
<< "\t**Device name: " << board_info.product_name << std::endl; - - err = amdsmi_get_gpu_asic_info(dv_ind, &info); - CHK_ERR_ASRT(err) - IF_VERB(STANDARD) { - std::cout << "\t**Device Vendor ID: 0x" << std::hex << - info.vendor_id << std::endl; - } } amdsmi_asic_info_t asic_info; err = amdsmi_get_gpu_asic_info(dv_ind, &asic_info); - CHK_ERR_ASRT(err) + if (err == AMDSMI_STATUS_NOT_SUPPORTED) { + IF_VERB(STANDARD) { + std::cout << "\t**ASIC info: " << smi_amdgpu_get_status_string(err, false) << std::endl; + } + ASSERT_EQ(err, AMDSMI_STATUS_NOT_SUPPORTED); + } else if (err == AMDSMI_STATUS_FILE_ERROR) { // File error can happen for partition switches, + // if SMI is not re-initialized + IF_VERB(STANDARD) { + std::cout << "\t**ASIC info: " << smi_amdgpu_get_status_string(err, false) << std::endl; + } + ASSERT_EQ(err, AMDSMI_STATUS_FILE_ERROR); + } else { + CHK_ERR_ASRT(err) + } + + // Print everything we can get from the ASIC info IF_VERB(STANDARD) { std::cout << "\t**Market name: " << asic_info.market_name << std::endl; std::cout << "\t**ASIC serial: 0x" << std::hex << asic_info.asic_serial << std::endl; std::cout << "\t**Target GFX Version: gfx" << asic_info.target_graphics_version << std::endl; + std::cout << "\t**Device ID: 0x" << std::hex << std::setfill('0') << std::setw(4) + << asic_info.device_id << std::endl; + if (checkIfMaxValue(asic_info.num_of_compute_units)) { + std::cout << "\t**Num of Compute Units: N/A" << std::endl; + } else { + std::cout << "\t**Num of Compute Units: " << std::dec << asic_info.num_of_compute_units + << std::endl; + } + if (checkIfMaxValue(asic_info.oam_id)) { + std::cout << "\t**OAM ID: N/A" << std::endl; + } else { + std::cout << "\t**OAM ID: " << std::dec << asic_info.oam_id << std::endl; + } + std::cout << "\t**Revision ID: 0x" << std::hex << std::setfill('0') << std::setw(2) + << asic_info.rev_id << std::endl; + if (checkIfMaxValue(asic_info.subvendor_id)) { + std::cout << "\t**Subvendor ID: N/A" << std::endl; + } else { + 
std::cout << "\t**Subvendor ID: 0x" << std::hex << std::setfill('0') << std::setw(4) + << asic_info.subvendor_id << std::endl; + } + std::cout << "\t**Vendor ID: 0x" << std::hex << std::setfill('0') << std::setw(4) + << asic_info.vendor_id << std::endl; + std::cout << "\t**Vendor name: " << asic_info.vendor_name + << std::endl; + } + + err = amdsmi_get_gpu_revision(dv_ind, &val_ui16); + if (err == AMDSMI_STATUS_NOT_SUPPORTED) { + IF_VERB(STANDARD) { + std::cout << "\t**Device Revision ID: N/A" << std::endl; + } + ASSERT_EQ(err, AMDSMI_STATUS_NOT_SUPPORTED); + } else { + CHK_ERR_ASRT(err) + IF_VERB(STANDARD) { + std::cout << "\t**Device Revision ID: 0x" << std::hex << std::setfill('0') << std::setw(2) + << val_ui16 << std::endl; + } } err = amdsmi_get_gpu_subsystem_id(dv_ind, &val_ui16); - CHK_ERR_ASRT(err) - IF_VERB(STANDARD) { - std::cout << "\t**Subsystem ID: 0x" << std::hex << val_ui16 << std::endl; - std::cout << "\t**Subsystem Vendor ID: 0x" << std::hex - << info.subvendor_id << std::endl; + if (err == AMDSMI_STATUS_NOT_SUPPORTED) { + IF_VERB(STANDARD) { + std::cout << "\t**Subsystem ID: N/A" << std::endl; + } + } else { + CHK_ERR_ASRT(err) + IF_VERB(STANDARD) { + std::cout << "\t**Subsystem ID: 0x" << std::hex << std::setfill('0') << std::setw(4) + << val_ui16 << std::endl; + } } std::cout << std::setbase(10); @@ -349,22 +400,6 @@ TestBase::AcceleratorProfileConfig TestBase::getAvailableProfileConfigs( return profile_config; } -void TestBase::waitForUserInput() { - for (;;) { - std::cout << "\n\t**Press any key to continue**" << std::endl; - int input = std::cin.get(); - if (input == EOF) { - std::cout << "EOF detected. Exiting." 
<< std::endl; - return; - } - char input_char = static_cast(input); - std::cout << "User entered: " << input_char << std::endl; - if (input_char == '\n') { - return; - } - } -} - uint32_t TestBase::promptNumDevicesToTest(uint32_t current_num_devices) { uint32_t return_value = 0; std::cout << "**How many devices would you like to test? (0 to skip): "; diff --git a/projects/amdsmi/tests/amd_smi_test/test_base.h b/projects/amdsmi/tests/amd_smi_test/test_base.h index a1b186d4b6..4cf5f727d5 100644 --- a/projects/amdsmi/tests/amd_smi_test/test_base.h +++ b/projects/amdsmi/tests/amd_smi_test/test_base.h @@ -27,6 +27,7 @@ #include #include #include +#include #include "amd_smi/amdsmi.h" // The max devices can be monitored @@ -133,12 +134,21 @@ class TestBase { amdsmi_accelerator_partition_profile_t current_profile, amdsmi_accelerator_partition_profile_config_t config, bool isVerbose); - void waitForUserInput(); uint32_t promptNumDevicesToTest(uint32_t current_num_devices); std::string getResourceType(amdsmi_accelerator_partition_resource_type_t resource_type); + template + bool checkIfMaxValue(T value) { + T max_value = std::numeric_limits::max(); + if (value == max_value) { + return true; + } else { + return false; + } + } + protected: void MakeHeaderStr(const char *inStr, std::string *outStr) const; void PrintDeviceHeader(amdsmi_processor_handle dv_ind); @@ -163,9 +173,16 @@ class TestBase { // Macros to be used within TestBase classes #define CHK_ERR_ASRT(RET) { \ - if (dont_fail() && ((RET) != AMDSMI_STATUS_SUCCESS)) { \ + if ((RET) != AMDSMI_STATUS_SUCCESS) { \ std::cout << std::endl << "\t===> TEST FAILURE." 
<< std::endl; \ - DISPLAY_AMDSMI_ERR(RET); \ + const char *err_str; \ + std::cout << "\t===> ERROR: AMDSMI call returned " << (RET) << std::endl; \ + amdsmi_status_code_to_string((RET), &err_str); \ + std::cout << "\t===> (" << err_str << ")" << std::endl; \ + std::cout << "\t===> at " << __FILE__ << ":" << std::dec << __LINE__ << \ + std::endl; \ + } \ + if (dont_fail() && ((RET) != AMDSMI_STATUS_SUCCESS)) { \ std::cout << \ "\t===> Abort is over-ridden due to dont_fail command line option." \ << std::endl; \