diff --git a/CHANGELOG.md b/CHANGELOG.md index 1a0ff6d473..f09568d169 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,28 +8,359 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr ### Changes +- **Moved python tests directory path install location**. + - `/opt//share/amd_smi/pytest/..` to `/opt//share/amd_smi/tests/python_unittest/..` + - On amd-smi-lib-tests uninstall, the amd_smi tests folder is removed. + - Removed pytest dependency, our python testing now only depends on the unittest framework. + +- **Added more supported utilization count types to `amdsmi_get_utilization_count()`**. + +- **Added `amd-smi set -L/--clk-limit ...` command**. + - Equivalent to rocm-smi's '--extremum' command which sets sclk's or mclk's soft minimum or soft maximum clock frequency. + - **Added Pytest functionality to test amdsmi API calls in Python**. - **Changed the `power` parameter in `amdsmi_get_energy_count()` to `energy_accumulator`**. -Changes propagate forwards into the python interface as well, however we are maintaing backwards compatibility and keeping the `power` field in the python API until ROCm 6.4. + - Changes propagate forwards into the python interface as well; however, we are maintaining backwards compatibility and keeping the `power` field in the python API until ROCm 6.4. -- **Added GPU memory overdrive percentage to `amd-smi metric -o`**. -Added `amdsmi_get_gpu_mem_overdrive_level()` function to amd-smi C and Python Libraries. +- **Added GPU memory overdrive percentage to `amd-smi metric -o`**. + - Added `amdsmi_get_gpu_mem_overdrive_level()` function to amd-smi C and Python Libraries. -- **Added Subsystem Device ID to `amd-smi static --asic`**. -No underlying changes to amdsmi_get_gpu_asic_info +- **Added retrieving connection type and P2P capabilities between two GPUs**. + - Added `amdsmi_topo_get_p2p_status` function to amd-smi C and Python Libraries. + - Added retrieving P2P link capabilities to CLI `amd-smi topology`. 
+ +```shell +$ amd-smi topology -h +usage: amd-smi topology [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL] + [-g GPU [GPU ...]] [-a] [-w] [-o] [-t] [-b] + +If no GPU is specified, returns information for all GPUs on the system. +If no topology argument is provided all topology information will be displayed. + +Topology arguments: + -h, --help show this help message and exit + -g, --gpu GPU [GPU ...] Select a GPU ID, BDF, or UUID from the possible choices: + ID: 0 | BDF: 0000:0c:00.0 | UUID: + ID: 1 | BDF: 0000:22:00.0 | UUID: + ID: 2 | BDF: 0000:38:00.0 | UUID: + ID: 3 | BDF: 0000:5c:00.0 | UUID: + ID: 4 | BDF: 0000:9f:00.0 | UUID: + ID: 5 | BDF: 0000:af:00.0 | UUID: + ID: 6 | BDF: 0000:bf:00.0 | UUID: + ID: 7 | BDF: 0000:df:00.0 | UUID: + all | Selects all devices + + + -a, --access Displays link accessibility between GPUs + -w, --weight Displays relative weight between GPUs + -o, --hops Displays the number of hops between GPUs + -t, --link-type Displays the link type between GPUs + -b, --numa-bw Display max and min bandwidth between nodes + -c, --coherent Display cache coherant (or non-coherant) link capability between nodes + -n, --atomics Display 32 and 64-bit atomic io link capability between nodes + -d, --dma Display P2P direct memory access (DMA) link capability between nodes + -z, --bi-dir Display P2P bi-directional link capability between nodes + + +Command Modifiers: + --json Displays output in JSON format (human readable by default). + --csv Displays output in CSV format (human readable by default). + --file FILE Saves output into a file on the provided path (stdout by default). 
+ --loglevel LEVEL Set the logging level from the possible choices: + DEBUG, INFO, WARNING, ERROR, CRITICAL +``` + +```shell +$ amd-smi topology -cndz +CACHE COHERANCY TABLE: + 0000:0c:00.0 0000:22:00.0 0000:38:00.0 0000:5c:00.0 0000:9f:00.0 0000:af:00.0 0000:bf:00.0 0000:df:00.0 +0000:0c:00.0 SELF C NC NC C C C NC +0000:22:00.0 C SELF NC C C C NC C +0000:38:00.0 NC NC SELF C C NC C NC +0000:5c:00.0 NC C C SELF NC C NC NC +0000:9f:00.0 C C C NC SELF NC NC C +0000:af:00.0 C C NC C NC SELF C C +0000:bf:00.0 C NC C NC NC C SELF NC +0000:df:00.0 NC C NC NC C C NC SELF + +ATOMICS TABLE: + 0000:0c:00.0 0000:22:00.0 0000:38:00.0 0000:5c:00.0 0000:9f:00.0 0000:af:00.0 0000:bf:00.0 0000:df:00.0 +0000:0c:00.0 SELF 64,32 64,32 64 32 32 N/A 64,32 +0000:22:00.0 64,32 SELF 64 32 32 N/A 64,32 64,32 +0000:38:00.0 64,32 64 SELF 32 N/A 64,32 64,32 64,32 +0000:5c:00.0 64 32 32 SELF 64,32 64,32 64,32 32 +0000:9f:00.0 32 32 N/A 64,32 SELF 64,32 32 32 +0000:af:00.0 32 N/A 64,32 64,32 64,32 SELF 32 N/A +0000:bf:00.0 N/A 64,32 64,32 64,32 32 32 SELF 64,32 +0000:df:00.0 64,32 64,32 64,32 32 32 N/A 64,32 SELF + +DMA TABLE: + 0000:0c:00.0 0000:22:00.0 0000:38:00.0 0000:5c:00.0 0000:9f:00.0 0000:af:00.0 0000:bf:00.0 0000:df:00.0 +0000:0c:00.0 SELF T T F F T F T +0000:22:00.0 T SELF F F T F T T +0000:38:00.0 T F SELF T F T T T +0000:5c:00.0 F F T SELF T T T F +0000:9f:00.0 F T F T SELF T F F +0000:af:00.0 T F T T T SELF F T +0000:bf:00.0 F T T T F F SELF F +0000:df:00.0 T T T F F T F SELF + +BI-DIRECTIONAL TABLE: + 0000:0c:00.0 0000:22:00.0 0000:38:00.0 0000:5c:00.0 0000:9f:00.0 0000:af:00.0 0000:bf:00.0 0000:df:00.0 +0000:0c:00.0 SELF T T F F T F T +0000:22:00.0 T SELF F F T F T T +0000:38:00.0 T F SELF T F T T T +0000:5c:00.0 F F T SELF T T T F +0000:9f:00.0 F T F T SELF T F F +0000:af:00.0 T F T T T SELF F T +0000:bf:00.0 F T T T F F SELF F +0000:df:00.0 T T T F F T F SELF + + +Legend: + SELF = Current GPU + ENABLED / DISABLED = Link is enabled or disabled + N/A = Not supported + T/F = True 
/ False + C/NC = Coherant / Non-Coherant io links + 64,32 = 64 bit and 32 bit atomic support + - +``` + +- **Created new amdsmi_kfd_info_t and added information under `amd-smi list`**. + - Due to fixes needed to properly enumerate all logical GPUs in CPX, new device identifiers were added in to a new `amdsmi_kfd_info_t` which gets populated via the API `amdsmi_get_gpu_kfd_info`. + - This info has been added to the `amd-smi list`. + - These new fields are only available for BM/Guest Linux devices at this time. + +```C +typedef struct { + uint64_t kfd_id; //< 0xFFFFFFFFFFFFFFFF if not supported + uint32_t node_id; //< 0xFFFFFFFF if not supported + uint32_t reserved[13]; +} amdsmi_kfd_info_t; +``` + +```shell +$ amd-smi list +GPU: 0 + BDF: 0000:23:00.0 + UUID: + KFD_ID: 45412 + NODE_ID: 1 + PARTITION_ID: 0 + +GPU: 1 + BDF: 0000:26:00.0 + UUID: + KFD_ID: 59881 + NODE_ID: 2 + PARTITION_ID: 0 +``` + +- **Added Subsystem Device ID to `amd-smi static --asic`**. + - No underlying changes to amdsmi_get_gpu_asic_info + +```shell +$ amd-smi static --asic +GPU: 0 + ASIC: + MARKET_NAME: MI308X + VENDOR_ID: 0x1002 + VENDOR_NAME: Advanced Micro Devices Inc. [AMD/ATI] + SUBVENDOR_ID: 0x1002 + DEVICE_ID: 0x74a2 + SUBSYSTEM_ID: 0x74a2 + REV_ID: 0x00 + ASIC_SERIAL: + OAM_ID: 5 + NUM_COMPUTE_UNITS: 20 + TARGET_GRAPHICS_VERSION: gfx942 +``` + +- **Added Target_Graphics_Version to `amd-smi static --asic` and `amdsmi_get_gpu_asic_info()`**. + +```C +typedef struct { + char market_name[AMDSMI_256_LENGTH]; + uint32_t vendor_id; //< Use 32 bit to be compatible with other platform. 
+ char vendor_name[AMDSMI_MAX_STRING_LENGTH]; + uint32_t subvendor_id; //< The subsystem vendor id + uint64_t device_id; //< The device id of a GPU + uint32_t rev_id; + char asic_serial[AMDSMI_NORMAL_STRING_LENGTH]; + uint32_t oam_id; //< 0xFFFF if not supported + uint32_t num_of_compute_units; //< 0xFFFFFFFF if not supported + uint64_t target_graphics_version; //< 0xFFFFFFFFFFFFFFFF if not supported + uint32_t reserved[15]; +} amdsmi_asic_info_t; +``` + +```shell +$ amd-smi static --asic +GPU: 0 + ASIC: + MARKET_NAME: MI308X + VENDOR_ID: 0x1002 + VENDOR_NAME: Advanced Micro Devices Inc. [AMD/ATI] + SUBVENDOR_ID: 0x1002 + DEVICE_ID: 0x74a2 + SUBSYSTEM_ID: 0x74a2 + REV_ID: 0x00 + ASIC_SERIAL: + OAM_ID: 5 + NUM_COMPUTE_UNITS: 20 + TARGET_GRAPHICS_VERSION: gfx942 +``` + +- **Updated Partition APIs and struct information and added partition_id to `amd-smi static --partition` & `amd-smi list`**. + - As part of an overhaul to partition information, some partition information will be made available in the `amdsmi_accelerator_partition_profile_t`. + - This struct will be filled out by a new API, `amdsmi_get_gpu_accelerator_partition_profile()`. + - Future data from these APIs will eventually get added to `static --partition`. + +```C +#define AMDSMI_MAX_ACCELERATOR_PROFILE 32 +#define AMDSMI_MAX_CP_PROFILE_RESOURCES 32 +#define AMDSMI_MAX_ACCELERATOR_PARTITIONS 8 + +/** + * @brief Accelerator Partition. This enum is used to identify + * various accelerator partitioning settings. 
+ */ +typedef enum { + AMDSMI_ACCELERATOR_PARTITION_INVALID = 0, + AMDSMI_ACCELERATOR_PARTITION_SPX, //!< Single GPU mode (SPX)- All XCCs work + //!< together with shared memory + AMDSMI_ACCELERATOR_PARTITION_DPX, //!< Dual GPU mode (DPX)- Half XCCs work + //!< together with shared memory + AMDSMI_ACCELERATOR_PARTITION_TPX, //!< Triple GPU mode (TPX)- One-third XCCs + //!< work together with shared memory + AMDSMI_ACCELERATOR_PARTITION_QPX, //!< Quad GPU mode (QPX)- Quarter XCCs + //!< work together with shared memory + AMDSMI_ACCELERATOR_PARTITION_CPX, //!< Core mode (CPX)- Per-chip XCC with + //!< shared memory +} amdsmi_accelerator_partition_type_t; + +/** + * @brief Possible Memory Partition Modes. + * This union is used to identify various memory partitioning settings. + */ +typedef union { + struct { + uint32_t nps1_cap :1; // bool 1 = true; 0 = false; Max uint32 means unsupported + uint32_t nps2_cap :1; // bool 1 = true; 0 = false; Max uint32 means unsupported + uint32_t nps4_cap :1; // bool 1 = true; 0 = false; Max uint32 means unsupported + uint32_t nps8_cap :1; // bool 1 = true; 0 = false; Max uint32 means unsupported + uint32_t reserved :28; + } amdsmi_nps_flags_t; + + uint32_t nps_cap_mask; +} amdsmi_nps_caps_t; + + +typedef struct { + amdsmi_accelerator_partition_type_t profile_type; // SPX, DPX, QPX, CPX and so on + uint32_t num_partitions; // On MI300X, SPX: 1, DPX: 2, QPX: 4, CPX: 8, length of resources array + uint32_t profile_index; + amdsmi_nps_caps_t memory_caps; // Possible memory partition capabilities + uint32_t num_resources; // length of index_of_resources_profile + uint32_t resources[AMDSMI_MAX_ACCELERATOR_PARTITIONS][AMDSMI_MAX_CP_PROFILE_RESOURCES]; + uint64_t reserved[6]; +} amdsmi_accelerator_partition_profile_t; +``` + +```shell +$ amd-smi static --partition +GPU: 0 + PARTITION: + COMPUTE_PARTITION: CPX + MEMORY_PARTITION: NPS4 + PARTITION_ID: 0 + +$ amd-smi list +GPU: 0 + BDF: 0000:23:00.0 + UUID: + KFD_ID: 45412 + NODE_ID: 1 + 
+ PARTITION_ID: 0 + +GPU: 1 + BDF: 0000:26:00.0 + UUID: + KFD_ID: 59881 + NODE_ID: 2 + PARTITION_ID: 0 +``` ### Removals -- N/A +- **Removed usage of _validate_positive in Parser and replaced with _positive_int and _not_negative_int as appropriate**. + - This will allow 0 to be a valid input for several options in setting CPUs where appropriate (for example, as a mode or NBIOID) ### Optimizations -- N/A +- **Adjusted ordering of gpu_metrics calls to ensure that pcie_bw values remain stable in `amd-smi metric` & `amd-smi monitor`**. + - With this change additional padding was added to PCIE_BW `amd-smi monitor --pcie` ### Resolved issues -- N/A +- **Improved Offline install process & lowered dependency for PyYAML**. + +- **Fixed CPX not showing total number of logical GPUs**. + - Updates were made to `amdsmi_init()` and `amdsmi_get_gpu_bdf_id(..)`. In order to display all logical devices, we needed a way to provide an ordering for the enumerated GPUs. This was done by adding a partition_id within the BDF optional pci_id bits. + - Due to driver changes in KFD, some devices may report bits [31:28] or [2:0]. With the newly added `amdsmi_get_gpu_bdf_id(..)`, we provided this fallback to properly retrieve partition ID. We +plan to eventually remove partition ID from the function portion of the BDF (Bus Device Function). See below for PCI ID description. + + - bits [63:32] = domain + - bits [31:28] or bits [2:0] = partition id + - bits [27:16] = reserved + - bits [15:8] = Bus + - bits [7:3] = Device + - bits [2:0] = Function (partition id may be in bits [2:0]) <-- Fallback for non SPX modes + +Previously in non-SPX modes (ex. CPX/TPX/DPX/etc) some MI3x ASICs would not report all logical GPU devices within AMD SMI. 
+ +```shell +$ amd-smi monitor -p -t -v +GPU POWER GPU_TEMP MEM_TEMP VRAM_USED VRAM_TOTAL + 0 248 W 55 °C 48 °C 283 MB 196300 MB + 1 247 W 55 °C 48 °C 283 MB 196300 MB + 2 247 W 55 °C 48 °C 283 MB 196300 MB + 3 247 W 55 °C 48 °C 283 MB 196300 MB + 4 221 W 50 °C 42 °C 283 MB 196300 MB + 5 221 W 50 °C 42 °C 283 MB 196300 MB + 6 222 W 50 °C 42 °C 283 MB 196300 MB + 7 221 W 50 °C 42 °C 283 MB 196300 MB + 8 239 W 53 °C 46 °C 283 MB 196300 MB + 9 239 W 53 °C 46 °C 283 MB 196300 MB + 10 239 W 53 °C 46 °C 283 MB 196300 MB + 11 239 W 53 °C 46 °C 283 MB 196300 MB + 12 219 W 51 °C 48 °C 283 MB 196300 MB + 13 219 W 51 °C 48 °C 283 MB 196300 MB + 14 219 W 51 °C 48 °C 283 MB 196300 MB + 15 219 W 51 °C 48 °C 283 MB 196300 MB + 16 222 W 51 °C 47 °C 283 MB 196300 MB + 17 222 W 51 °C 47 °C 283 MB 196300 MB + 18 222 W 51 °C 47 °C 283 MB 196300 MB + 19 222 W 51 °C 48 °C 283 MB 196300 MB + 20 241 W 55 °C 48 °C 283 MB 196300 MB + 21 241 W 55 °C 48 °C 283 MB 196300 MB + 22 241 W 55 °C 48 °C 283 MB 196300 MB + 23 240 W 55 °C 48 °C 283 MB 196300 MB + 24 211 W 51 °C 45 °C 283 MB 196300 MB + 25 211 W 51 °C 45 °C 283 MB 196300 MB + 26 211 W 51 °C 45 °C 283 MB 196300 MB + 27 211 W 51 °C 45 °C 283 MB 196300 MB + 28 227 W 51 °C 49 °C 283 MB 196300 MB + 29 227 W 51 °C 49 °C 283 MB 196300 MB + 30 227 W 51 °C 49 °C 283 MB 196300 MB + 31 227 W 51 °C 49 °C 283 MB 196300 MB +``` + +- **Fixed incorrect implementation of the Python API `amdsmi_get_gpu_metrics_header_info()`**. + +- **`amd-smi static --partition` will have updates with additional partition information from `amdsmi_get_gpu_accelerator_partition_profile()`**. ### Known issues @@ -672,7 +1003,7 @@ $ /opt/rocm/bin/amd-smi topology -a -t --json Previously our reset could attempting to reset non-amd GPUS- resuting in "Unable to reset non-amd GPU" error. Fix updates CLI to target only AMD ASICs. -- **Fix for `amd-smi metric --pcie` and `amdsmi_get_pcie_info()`Navi32/31 cards**. 
+- **Fix for `amd-smi static --pcie` and `amdsmi_get_pcie_info()`Navi32/31 cards**. Updated API to include `amdsmi_card_form_factor_t.AMDSMI_CARD_FORM_FACTOR_CEM`. Prevously, this would report "UNKNOWN". This fix provides the correct board `SLOT_TYPE` associated with these ASICs (and other Navi cards). @@ -707,7 +1038,7 @@ Use the watch arguments to run continuously Monitor Arguments: -h, --help show this help message and exit -g, --gpu GPU [GPU ...] Select a GPU ID, BDF, or UUID from the possible choices: - ID: 0 | BDF: 0000:01:00.0 | UUID: 4eff74a0-0000-1000-802d-1d762a397f73 + ID: 0 | BDF: 0000:01:00.0 | UUID: all | Selects all devices -U, --cpu CPU [CPU ...] Select a CPU ID from the possible choices: ID: 0 diff --git a/CMakeLists.txt b/CMakeLists.txt old mode 100755 new mode 100644 index 78098d2298..b5c692cc7b --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,7 +28,7 @@ find_program(GIT NAMES git) ## Setup the package version based on git tags. set(PKG_VERSION_GIT_TAG_PREFIX "amdsmi_pkg_ver") -get_package_version_number("24.6.3" ${PKG_VERSION_GIT_TAG_PREFIX} GIT) +get_package_version_number("24.6.5" ${PKG_VERSION_GIT_TAG_PREFIX} GIT) message("Package version: ${PKG_VERSION_STR}") set(${AMD_SMI_LIBS_TARGET}_VERSION_MAJOR "${CPACK_PACKAGE_VERSION_MAJOR}") set(${AMD_SMI_LIBS_TARGET}_VERSION_MINOR "${CPACK_PACKAGE_VERSION_MINOR}") @@ -185,12 +185,12 @@ if(BUILD_TESTS) set(TESTS_COMPONENT "tests") #add_subdirectory("tests/rocm_smi_test") add_subdirectory("tests/amd_smi_test") + add_subdirectory("tests/python_unittest") endif() # python interface, CLI, and py-test depend on shared libraries if(BUILD_SHARED_LIBS) add_subdirectory("py-interface") - add_subdirectory("pytest") if(BUILD_CLI) add_subdirectory("amdsmi_cli") endif() @@ -250,9 +250,12 @@ install( DESTINATION share/doc/${AMD_SMI} COMPONENT dev) +# Make for goamdsmi_shim library +add_subdirectory(goamdsmi_shim) + #Debian package specific variables set(CPACK_DEBIAN_PACKAGE_PROVIDES "amd-smi") 
-set(CPACK_DEBIAN_PACKAGE_RECOMMENDS "python3-argcomplete, libdrm-dev, python3-yaml") +set(CPACK_DEBIAN_PACKAGE_RECOMMENDS "python3-argcomplete, libdrm-dev, python3-PyYAML") set(CPACK_DEBIAN_ASAN_PACKAGE_RECOMMENDS ${CPACK_DEBIAN_PACKAGE_RECOMMENDS}) set(CPACK_DEBIAN_DEV_PACKAGE_RECOMMENDS ${CPACK_DEBIAN_PACKAGE_RECOMMENDS}) set(CPACK_DEBIAN_ASAN_PACKAGE_PROVIDES "${AMD_SMI_PACKAGE}-asan") diff --git a/DEBIAN/postinst.in b/DEBIAN/postinst.in index 474cebc1b8..8312a77830 100755 --- a/DEBIAN/postinst.in +++ b/DEBIAN/postinst.in @@ -133,6 +133,7 @@ do_install_amdsmi_python_lib() { local PREVIOUS_PIP_BREAK_SYSTEM_PACKAGES="$PIP_BREAK_SYSTEM_PACKAGES" export PIP_BREAK_SYSTEM_PACKAGES=1 + # Remove old python library local pip_list_output pip_list_output=$(python3 -m pip list --format=columns --disable-pip-version-check) @@ -148,7 +149,7 @@ do_install_amdsmi_python_lib() { return fi - # upgrade pip if it's an ancient version + # upgrade pip if it's an ancient version, typically should not execute # otherwise the amdsmi install will fail local pip_version pip_version=$(python3 -m pip --version | grep -Eo '^[^\ ]+ ([0-9]+)' | grep -Eo '[0-9]+$') @@ -158,10 +159,9 @@ do_install_amdsmi_python_lib() { fi unset pip_version - # Check PyYAML dependency - local pyyaml_version - pyyaml_version=$(pip show pyyaml | grep -Po '(?<=Version: )[0-9]') - if [[ "$pyyaml_version" -lt 5 ]]; then + # Check PyYAML dependency, typically should not execute + pyyaml_version=$(python3 -m pip show pyyaml | grep -Po '(?<=Version: )[0-9]') + if [[ "$pyyaml_version" -lt 3 ]]; then echo "Detected ancient pyyaml version ($pyyaml_version)... Upgrading..." 
python3 -m pip install 'PyYAML>=5.1' --quiet --disable-pip-version-check --ignore-installed fi @@ -169,7 +169,7 @@ do_install_amdsmi_python_lib() { # install python library at @CPACK_PACKAGING_INSTALL_PREFIX@/@SHARE_INSTALL_PREFIX@/amdsmi local python_lib_path=@CPACK_PACKAGING_INSTALL_PREFIX@/@SHARE_INSTALL_PREFIX@ - python3 -m pip install "$python_lib_path" --quiet --disable-pip-version-check + python3 -m pip install "$python_lib_path" --quiet --disable-pip-version-check --no-build-isolation export PIP_ROOT_USER_ACTION="$PREVIOUS_PIP_ROOT_USER_ACTION" export PIP_BREAK_SYSTEM_PACKAGES="$PREVIOUS_PIP_BREAK_SYSTEM_PACKAGES" @@ -188,22 +188,10 @@ do_install_amdsmi_python_lib() { fi } -do_install_amdsmi_pytest() { - echo -n "Installing pytest... " - pip install -U pytest >/dev/null 2>&1 - if [ $? -ne 0 ]; then - echo "[WARNING] Detected pytest could not be installed. Running pytest may not work as documented." - else - echo -n "[SUCCESS]" - echo "" - fi - return -} case "$1" in ( configure ) do_install_amdsmi_python_lib - do_install_amdsmi_pytest do_ldconfig do_updatepciids do_configureLogrotate || exit 0 diff --git a/DEBIAN/prerm.in b/DEBIAN/prerm.in index 28af94f143..3decc72093 100755 --- a/DEBIAN/prerm.in +++ b/DEBIAN/prerm.in @@ -1,5 +1,7 @@ #!/bin/bash + +# Other prerm actions rm_ldconfig() { # left-hand term originates from ENABLE_LDCONFIG = ON/OFF at package build if [ "@ENABLE_LDCONFIG@" == "ON" ]; then @@ -8,6 +10,7 @@ rm_ldconfig() { fi } + rm_leftovers() { # remove pyc files generated by python rm -rf "@CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBEXECDIR@/amdsmi_cli/__pycache__" @@ -27,6 +30,15 @@ rm_logFolder() { rm -rf /var/log/amd_smi_lib } + +rm_rocm_tests_dir(){ + if [ -d "@CPACK_PACKAGING_INSTALL_PREFIX@/share/amd_smi/tests/" ]; then + rm -rf "@CPACK_PACKAGING_INSTALL_PREFIX@/share/amd_smi/tests/" + echo "Removed ROCm tests directory." 
+ fi +} + + return_logrotateToOrigConfig() { local logrotateConfFile=/etc/logrotate.d/amd_smi.conf if [ -f $logrotateConfFile ]; then @@ -86,6 +98,7 @@ case "$1" in rm_ldconfig rm_leftovers rm_logFolder + rm_rocm_tests_dir return_logrotateToOrigConfig ;; ( purge ) diff --git a/README.md b/README.md old mode 100755 new mode 100644 diff --git a/RPM/post.in b/RPM/post.in index 78c71864ed..fd182ef414 100755 --- a/RPM/post.in +++ b/RPM/post.in @@ -132,6 +132,7 @@ do_install_amdsmi_python_lib() { local PREVIOUS_PIP_BREAK_SYSTEM_PACKAGES="$PIP_BREAK_SYSTEM_PACKAGES" export PIP_BREAK_SYSTEM_PACKAGES=1 + # Remove old python library local pip_list_output pip_list_output=$(python3 -m pip list --format=columns --disable-pip-version-check) @@ -148,7 +149,7 @@ do_install_amdsmi_python_lib() { fi # upgrade pip if it's an ancient version - # otherwise the amdsmi install will fail + # otherwise amdsmi install will fail local pip_version pip_version=$(python3 -m pip --version | grep -Eo '^[^\ ]+ ([0-9]+)' | grep -Eo '[0-9]+$') if [[ "$pip_version" -lt 19 ]]; then @@ -157,10 +158,10 @@ do_install_amdsmi_python_lib() { fi unset pip_version - # Check PyYAML dependency + # Check PyYAML dependency, typically should not execute local pyyaml_version - pyyaml_version=$(pip show pyyaml | grep -Po '(?<=Version: )[0-9]') - if [[ "$pyyaml_version" -lt 5 ]]; then + pyyaml_version=$(python3 -m pip show pyyaml | grep -Po '(?<=Version: )[0-9]') + if [[ "$pyyaml_version" -lt 3 ]]; then echo "Detected ancient pyyaml version ($pyyaml_version)... Upgrading..." 
python3 -m pip install 'PyYAML>=5.1' --quiet --disable-pip-version-check --ignore-installed fi @@ -168,7 +169,7 @@ do_install_amdsmi_python_lib() { # install python library at @CPACK_PACKAGING_INSTALL_PREFIX@/@SHARE_INSTALL_PREFIX@/amdsmi local python_lib_path=@CPACK_PACKAGING_INSTALL_PREFIX@/@SHARE_INSTALL_PREFIX@ - python3 -m pip install "$python_lib_path" --quiet --disable-pip-version-check + python3 -m pip install "$python_lib_path" --quiet --disable-pip-version-check --no-build-isolation export PIP_ROOT_USER_ACTION="$PREVIOUS_PIP_ROOT_USER_ACTION" export PIP_BREAK_SYSTEM_PACKAGES="$PREVIOUS_PIP_BREAK_SYSTEM_PACKAGES" @@ -187,22 +188,10 @@ do_install_amdsmi_python_lib() { fi } -do_install_amdsmi_pytest() { - echo -n "Installing pytest... " - pip install -U pytest >/dev/null 2>&1 - if [ $? -ne 0 ]; then - echo "[WARNING] Detected pytest could not be installed. Running pytest may not work as documented." - else - echo -n "[SUCCESS]" - echo "" - fi - return -} # post install or upgrade, $i is 1 or 2 -> do these actions if [ "$1" -ge 1 ]; then do_install_amdsmi_python_lib - do_install_amdsmi_pytest do_ldconfig do_updatepciids do_configureLogrotate || exit 0 diff --git a/RPM/preun.in b/RPM/preun.in index bc2376161c..43e58c93f3 100755 --- a/RPM/preun.in +++ b/RPM/preun.in @@ -15,10 +15,20 @@ rm_leftovers() { fi } + rm_logFolder() { rm -rf /var/log/amd_smi_lib } + +rm_rocm_tests_dir(){ + if [ -d "@CPACK_PACKAGING_INSTALL_PREFIX@/share/amd_smi/tests/" ]; then + rm -rf "@CPACK_PACKAGING_INSTALL_PREFIX@/share/amd_smi/tests/" + echo "Removed ROCm tests directory." 
+ fi +} + + return_logrotateToOrigConfig() { local logrotateConfFile=/etc/logrotate.d/amd_smi.conf if [ -f $logrotateConfFile ]; then @@ -77,5 +87,6 @@ if [ "$1" -le 1 ]; then rm_python_lib rm_leftovers rm_logFolder + rm_rocm_tests_dir return_logrotateToOrigConfig fi diff --git a/amdsmi_cli/BDF.py b/amdsmi_cli/BDF.py index f9ffe338e5..7f7ce37cab 100644 --- a/amdsmi_cli/BDF.py +++ b/amdsmi_cli/BDF.py @@ -1,5 +1,5 @@ # -# Copyright (C) 2023 Advanced Micro Devices. All rights reserved. +# Copyright (C) 2024 Advanced Micro Devices. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy of # this software and associated documentation files (the "Software"), to deal in @@ -123,7 +123,7 @@ class BDF(): """Overrided the 'in' comparator in python""" passed_bdf = str(BDF(passed_bdf)) - bdf_regex = "(?:[0-6]?[0-9a-fA-F]{1,4}:)?[0-2]?[0-9a-fA-F]{1,2}:[0-9a-fA-F]{1,2}\.[0-7]" + bdf_regex = "(?:[0-6]?[0-9a-fA-F]{1,4}:)?[0-2]?[0-9a-fA-F]{1,2}:[0-9a-fA-F]{1,2}\\.[0-7]" for match in re.findall(bdf_regex, passed_bdf): if self == match: return True diff --git a/amdsmi_cli/README.md b/amdsmi_cli/README.md index 378755bae7..ab95062473 100644 --- a/amdsmi_cli/README.md +++ b/amdsmi_cli/README.md @@ -81,7 +81,7 @@ AMD-SMI reports the version and current platform detected when running the comma ~$ amd-smi usage: amd-smi [-h] ... -AMD System Management Interface | Version: 24.6.3.0 | ROCm version: 6.2.1 | Platform: Linux Baremetal +AMD System Management Interface | Version: 24.6.5.0 | ROCm version: 6.2.2 | Platform: Linux Baremetal options: -h, --help show this help message and exit @@ -1225,4 +1225,4 @@ The information contained herein is for informational purposes only, and is subj AMD, the AMD Arrow logo, and combinations thereof are trademarks of Advanced Micro Devices, Inc. Other product names used in this publication are for identification purposes only and may be trademarks of their respective companies. 
-Copyright (c) 2014-2023 Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2014-2024 Advanced Micro Devices, Inc. All rights reserved. diff --git a/amdsmi_cli/amdsmi_cli.py b/amdsmi_cli/amdsmi_cli.py index 0cf7aaa80b..1e61fa44f1 100755 --- a/amdsmi_cli/amdsmi_cli.py +++ b/amdsmi_cli/amdsmi_cli.py @@ -2,7 +2,7 @@ # PYTHON_ARGCOMPLETE_OK # -# Copyright (C) 2023 Advanced Micro Devices. All rights reserved. +# Copyright (C) 2024 Advanced Micro Devices. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy of # this software and associated documentation files (the "Software"), to deal in diff --git a/amdsmi_cli/amdsmi_cli_exceptions.py b/amdsmi_cli/amdsmi_cli_exceptions.py index fe6cc79ff9..1fcf1c5677 100644 --- a/amdsmi_cli/amdsmi_cli_exceptions.py +++ b/amdsmi_cli/amdsmi_cli_exceptions.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -# Copyright (C) 2023 Advanced Micro Devices. All rights reserved. +# Copyright (C) 2024 Advanced Micro Devices. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy of # this software and associated documentation files (the "Software"), to deal in diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index b955f8b4ab..6bd534413f 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -# Copyright (C) 2023 Advanced Micro Devices. All rights reserved. +# Copyright (C) 2024 Advanced Micro Devices. All rights reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy of # this software and associated documentation files (the "Software"), to deal in @@ -157,6 +157,9 @@ class AMDSMICommands(): args.gpu = device_handle + # Get gpu_id for logging + gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu) + try: bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(args.gpu) except amdsmi_exception.AmdSmiLibraryException as e: @@ -167,13 +170,34 @@ class AMDSMICommands(): except amdsmi_exception.AmdSmiLibraryException as e: uuid = e.get_error_info() + try: + kfd_info = amdsmi_interface.amdsmi_get_gpu_kfd_info(args.gpu) + kfd_id = kfd_info['kfd_id'] + node_id = kfd_info['node_id'] + except amdsmi_exception.AmdSmiLibraryException as e: + kfd_id = node_id = "N/A" + logging.debug("Failed to get kfd info for gpu %s | %s", gpu_id, e.get_error_info()) + + try: + partition_info = amdsmi_interface.amdsmi_get_gpu_accelerator_partition_profile(args.gpu) + partition_id = partition_info['partition_id'] + except amdsmi_exception.AmdSmiLibraryException as e: + partition_id = "N/A" + logging.debug("Failed to get partition ID for gpu %s | %s", gpu_id, e.get_error_info()) + # CSV format is intentionally aligned with Host if self.logger.is_csv_format(): self.logger.store_output(args.gpu, 'gpu_bdf', bdf) self.logger.store_output(args.gpu, 'gpu_uuid', uuid) + self.logger.store_output(args.gpu, 'kfd_id', kfd_id) + self.logger.store_output(args.gpu, 'node_id', node_id) + self.logger.store_output(args.gpu, 'partition_id', partition_id) else: self.logger.store_output(args.gpu, 'bdf', bdf) self.logger.store_output(args.gpu, 'uuid', uuid) + self.logger.store_output(args.gpu, 'kfd_id', kfd_id) + self.logger.store_output(args.gpu, 'node_id', node_id) + self.logger.store_output(args.gpu, 'partition_id', partition_id) if multiple_devices: self.logger.store_multiple_device_output() @@ -354,29 +378,34 @@ class AMDSMICommands(): # Populate static dictionary for each enabled argument 
static_dict = {} if args.asic: + asic_dict = { + "market_name" : "N/A", + "vendor_id" : "N/A", + "vendor_name" : "N/A", + "subvendor_id" : "N/A", + "device_id" : "N/A", + "subsystem_id" : "N/A", + "rev_id" : "N/A", + "asic_serial" : "N/A", + "oam_id" : "N/A", + "num_compute_units" : "N/A", + "target_graphics_version" : "N/A" + } + try: asic_info = amdsmi_interface.amdsmi_get_gpu_asic_info(args.gpu) - static_dict["asic"] = asic_info + for key, value in asic_info.items(): + asic_dict[key] = value except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["asic"] = "N/A" logging.debug("Failed to get asic info for gpu %s | %s", gpu_id, e.get_error_info()) -# static["asic"] = "N/A" try: subsystem_id = amdsmi_interface.amdsmi_get_gpu_subsystem_id(args.gpu) - if static_dict["asic"] != "N/A": - # Reorder asic to include subsystem_id after device_id - static_dict["asic"]["subsystem_id"] = subsystem_id - static_dict["asic"]["rev_id"] = static_dict["asic"].pop("rev_id") - static_dict["asic"]["asic_serial"] = static_dict["asic"].pop("asic_serial") - static_dict["asic"]["oam_id"] = static_dict["asic"].pop("oam_id") - static_dict["asic"]["num_compute_units"] = static_dict["asic"].pop("num_compute_units") - else: - static_dict["asic"]["subsystem_id"] = subsystem_id + asic_dict["subsystem_id"] = subsystem_id except amdsmi_exception.AmdSmiLibraryException as e: - if static_dict["asic"] != "N/A": - static_dict["asic"]["subsystem_id"] = "N/A" logging.debug("Failed to get asic info for gpu %s | %s", gpu_id, e.get_error_info()) + + static_dict['asic'] = asic_dict if args.bus: bus_info = { 'bdf': "N/A", @@ -658,8 +687,16 @@ class AMDSMICommands(): memory_partition = "N/A" logging.debug("Failed to get memory partition info for gpu %s | %s", gpu_id, e.get_error_info()) + try: + partition_info = amdsmi_interface.amdsmi_get_gpu_accelerator_partition_profile(args.gpu) + partition_id = partition_info['partition_id'] + except amdsmi_exception.AmdSmiLibraryException as e: + partition_id 
= "N/A" + logging.debug("Failed to get partition ID for gpu %s | %s", gpu_id, e.get_error_info()) + static_dict['partition'] = {"compute_partition": compute_partition, - "memory_partition": memory_partition} + "memory_partition": memory_partition, + "partition_id": partition_id} if 'soc_pstate' in current_platform_args: if args.soc_pstate: try: @@ -1299,13 +1336,19 @@ class AMDSMICommands(): # Get gpu_id for logging gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu) - # Put the metrics table in the debug logs - try: - gpu_metric_debug_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu) - gpu_metric_str = json.dumps(gpu_metric_debug_info, indent=4) - logging.debug("GPU Metrics table for %s | %s", gpu_id, gpu_metric_str) - except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Unabled to load GPU Metrics table for %s | %s", gpu_id, e.err_info) + if args.loglevel == "DEBUG": + try: + # Get GPU Metrics table version + gpu_metric_version_info = amdsmi_interface.amdsmi_get_gpu_metrics_header_info(args.gpu) + gpu_metric_version_str = json.dumps(gpu_metric_version_info, indent=4) + logging.debug("GPU Metrics table Version for GPU %s | %s", gpu_id, gpu_metric_version_str) + + # Get GPU Metrics table + gpu_metric_debug_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu) + gpu_metric_str = json.dumps(gpu_metric_debug_info, indent=4) + logging.debug("GPU Metrics table for GPU %s | %s", gpu_id, gpu_metric_str) + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Unabled to load GPU Metrics table for %s | %s", gpu_id, e.err_info) logging.debug(f"Metric Arg information for GPU {gpu_id} on {self.helpers.os_info()}") logging.debug(f"Args: {current_platform_args}") @@ -1319,6 +1362,88 @@ class AMDSMICommands(): # Add timestamp and store values for specified arguments values_dict = {} + # Populate the pcie_dict first due to multiple gpu metrics calls incorrectly increasing bandwidth + if "pcie" in current_platform_args: 
+ if args.pcie: + pcie_dict = {"width": "N/A", + "speed": "N/A", + "bandwidth": "N/A", + "replay_count" : "N/A", + "l0_to_recovery_count" : "N/A", + "replay_roll_over_count" : "N/A", + "nak_sent_count" : "N/A", + "nak_received_count" : "N/A", + "current_bandwidth_sent": "N/A", + "current_bandwidth_received": "N/A", + "max_packet_size": "N/A"} + + try: + pcie_metric = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)['pcie_metric'] + logging.debug("PCIE Metric for %s | %s", gpu_id, pcie_metric) + + pcie_dict['width'] = pcie_metric['pcie_width'] + + if pcie_metric['pcie_speed'] != "N/A": + if pcie_metric['pcie_speed'] % 1000 != 0: + pcie_speed_GTs_value = round(pcie_metric['pcie_speed'] / 1000, 1) + else: + pcie_speed_GTs_value = round(pcie_metric['pcie_speed'] / 1000) + pcie_dict['speed'] = pcie_speed_GTs_value + + pcie_dict['bandwidth'] = pcie_metric['pcie_bandwidth'] + pcie_dict['replay_count'] = pcie_metric['pcie_replay_count'] + pcie_dict['l0_to_recovery_count'] = pcie_metric['pcie_l0_to_recovery_count'] + pcie_dict['replay_roll_over_count'] = pcie_metric['pcie_replay_roll_over_count'] + pcie_dict['nak_received_count'] = pcie_metric['pcie_nak_received_count'] + pcie_dict['nak_sent_count'] = pcie_metric['pcie_nak_sent_count'] + + pcie_speed_unit = 'GT/s' + pcie_bw_unit = 'Mb/s' + if self.logger.is_human_readable_format(): + if pcie_dict['speed'] != "N/A": + pcie_dict['speed'] = f"{pcie_dict['speed']} {pcie_speed_unit}" + if pcie_dict['bandwidth'] != "N/A": + pcie_dict['bandwidth'] = f"{pcie_dict['bandwidth']} {pcie_bw_unit}" + if self.logger.is_json_format(): + if pcie_dict['speed'] != "N/A": + pcie_dict['speed'] = {"value" : pcie_dict['speed'], + "unit" : pcie_speed_unit} + if pcie_dict['bandwidth'] != "N/A": + pcie_dict['bandwidth'] = {"value" : pcie_dict['bandwidth'], + "unit" : pcie_bw_unit} + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get pcie link status for gpu %s | %s", gpu_id, e.get_error_info()) + + try: + pcie_bw = 
amdsmi_interface.amdsmi_get_gpu_pci_throughput(args.gpu) + sent = pcie_bw['sent'] * pcie_bw['max_pkt_sz'] + received = pcie_bw['received'] * pcie_bw['max_pkt_sz'] + + bw_unit = "Mb/s" + packet_size_unit = "B" + if sent > 0: + sent = sent // 1024 // 1024 + if received > 0: + received = received // 1024 // 1024 + + if self.logger.is_human_readable_format(): + sent = f"{sent} {bw_unit}" + received = f"{received} {bw_unit}" + pcie_bw['max_pkt_sz'] = f"{pcie_bw['max_pkt_sz']} {packet_size_unit}" + if self.logger.is_json_format(): + sent = {"value" : sent, + "unit" : bw_unit} + received = {"value" : received, + "unit" : bw_unit} + pcie_bw['max_pkt_sz'] = {"value" : pcie_bw['max_pkt_sz'], + "unit" : packet_size_unit} + + pcie_dict['current_bandwidth_sent'] = sent + pcie_dict['current_bandwidth_received'] = received + pcie_dict['max_packet_size'] = pcie_bw['max_pkt_sz'] + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get pcie bandwidth for gpu %s | %s", gpu_id, e.get_error_info()) + if "usage" in current_platform_args: if args.usage: try: @@ -1648,89 +1773,12 @@ class AMDSMICommands(): "unit" : temp_unit_json} values_dict['temperature'] = temperatures + + # Since pcie bw may increase based on frequent metrics calls, we add it to the output here, but the populate the values first if "pcie" in current_platform_args: if args.pcie: - pcie_dict = {"width": "N/A", - "speed": "N/A", - "bandwidth": "N/A", - "replay_count" : "N/A", - "l0_to_recovery_count" : "N/A", - "replay_roll_over_count" : "N/A", - "nak_sent_count" : "N/A", - "nak_received_count" : "N/A", - "current_bandwidth_sent": "N/A", - "current_bandwidth_received": "N/A", - "max_packet_size": "N/A"} - - try: - pcie_metric = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)['pcie_metric'] - logging.debug("PCIE Metric for %s | %s", gpu_id, pcie_metric) - - pcie_dict['width'] = pcie_metric['pcie_width'] - - if pcie_metric['pcie_speed'] != "N/A": - if pcie_metric['pcie_speed'] % 1000 != 0: - 
pcie_speed_GTs_value = round(pcie_metric['pcie_speed'] / 1000, 1) - else: - pcie_speed_GTs_value = round(pcie_metric['pcie_speed'] / 1000) - pcie_dict['speed'] = pcie_speed_GTs_value - - pcie_dict['bandwidth'] = pcie_metric['pcie_bandwidth'] - pcie_dict['replay_count'] = pcie_metric['pcie_replay_count'] - pcie_dict['l0_to_recovery_count'] = pcie_metric['pcie_l0_to_recovery_count'] - pcie_dict['replay_roll_over_count'] = pcie_metric['pcie_replay_roll_over_count'] - pcie_dict['nak_received_count'] = pcie_metric['pcie_nak_received_count'] - pcie_dict['nak_sent_count'] = pcie_metric['pcie_nak_sent_count'] - - pcie_speed_unit = 'GT/s' - pcie_bw_unit = 'Mb/s' - if self.logger.is_human_readable_format(): - if pcie_dict['speed'] != "N/A": - pcie_dict['speed'] = f"{pcie_dict['speed']} {pcie_speed_unit}" - if pcie_dict['bandwidth'] != "N/A": - pcie_dict['bandwidth'] = f"{pcie_dict['bandwidth']} {pcie_bw_unit}" - if self.logger.is_json_format(): - if pcie_dict['speed'] != "N/A": - pcie_dict['speed'] = {"value" : pcie_dict['speed'], - "unit" : pcie_speed_unit} - if pcie_dict['bandwidth'] != "N/A": - pcie_dict['bandwidth'] = {"value" : pcie_dict['bandwidth'], - "unit" : pcie_bw_unit} - - except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get pcie link status for gpu %s | %s", gpu_id, e.get_error_info()) - - try: - pcie_bw = amdsmi_interface.amdsmi_get_gpu_pci_throughput(args.gpu) - sent = pcie_bw['sent'] * pcie_bw['max_pkt_sz'] - received = pcie_bw['received'] * pcie_bw['max_pkt_sz'] - - bw_unit = "Mb/s" - packet_size_unit = "B" - if sent > 0: - sent = sent // 1024 // 1024 - if received > 0: - received = received // 1024 // 1024 - - if self.logger.is_human_readable_format(): - sent = f"{sent} {bw_unit}" - received = f"{received} {bw_unit}" - pcie_bw['max_pkt_sz'] = f"{pcie_bw['max_pkt_sz']} {packet_size_unit}" - if self.logger.is_json_format(): - sent = {"value" : sent, - "unit" : bw_unit} - received = {"value" : received, - "unit" : bw_unit} - 
pcie_bw['max_pkt_sz'] = {"value" : pcie_bw['max_pkt_sz'], - "unit" : packet_size_unit} - - pcie_dict['current_bandwidth_sent'] = sent - pcie_dict['current_bandwidth_received'] = received - pcie_dict['max_packet_size'] = pcie_bw['max_pkt_sz'] - except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get pcie bandwidth for gpu %s | %s", gpu_id, e.get_error_info()) - values_dict['pcie'] = pcie_dict + if "ecc" in current_platform_args: if args.ecc: ecc_count = {} @@ -2806,23 +2854,28 @@ class AMDSMICommands(): args.gpu = [args.gpu] print('EVENT LISTENING:\n') - print('Press q and hit ENTER when you want to stop (listening will stop within 10 seconds)') - + print('Press q and hit ENTER when you want to stop.') + self.stop = False threads = [] for device_handle in range(len(args.gpu)): x = threading.Thread(target=self._event_thread, args=(self, device_handle)) threads.append(x) x.start() - while self.stop!= 'q': - self.stop = input("") + while True: + user_input = input() + if user_input == 'q': + print("Escape Sequence Detected; Exiting") + self.stop = True + break for thread in threads: thread.join() def topology(self, args, multiple_devices=False, gpu=None, access=None, - weight=None, hops=None, link_type=None, numa_bw=None): + weight=None, hops=None, link_type=None, numa_bw=None, + coherent=None, atomics=None, dma=None, bi_dir=None): """ Get topology information for target gpus params: args - argparser args to pass to subcommand @@ -2833,6 +2886,10 @@ class AMDSMICommands(): hops (bool) - Value override for args.hops type (bool) - Value override for args.type numa_bw (bool) - Value override for args.numa_bw + coherent (bool) - Value override for args.coherent + atomics (bool) - Value override for args.atomics + dma (bool) - Value override for args.dma + bi_dir (bool) - Value override for args.bi_dir return: Nothing """ @@ -2849,6 +2906,14 @@ class AMDSMICommands(): args.link_type = link_type if numa_bw: args.numa_bw = numa_bw + if coherent: + 
args.coherent = coherent + if atomics: + args.atomics = atomics + if dma: + args.dma = dma + if bi_dir: + args.bi_dir = bi_dir # Handle No GPU passed if args.gpu == None: @@ -2858,8 +2923,10 @@ class AMDSMICommands(): args.gpu = [args.gpu] # Handle all args being false - if not any([args.access, args.weight, args.hops, args.link_type, args.numa_bw]): - args.access = args.weight = args.hops = args.link_type= args.numa_bw = True + if not any([args.access, args.weight, args.hops, args.link_type, args.numa_bw, + args.coherent, args.atomics, args.dma, args.bi_dir]): + args.access = args.weight = args.hops = args.link_type= args.numa_bw = \ + args.coherent = args.atomics = args.dma = args.bi_dir = True # Clear the table header self.logger.table_header = ''.rjust(12) @@ -2890,6 +2957,10 @@ class AMDSMICommands(): # "num_hops": num_hops - # of hops between devices # "bandwidth": numa_bw - The NUMA "minimum bandwidth-maximum bandwidth" beween src and dest nodes # "N/A" - self node or not connected devices + # "coherent": coherent - Coherant / Non-Coherant io links + # "atomics": atomics - 32 and 64-bit atomic io link capability between nodes + # "dma": dma - P2P direct memory access (DMA) link capability between nodes + # "bi_dir": bi_dir - P2P bi-directional link capability between nodes # } for dest_gpu_index, dest_gpu in enumerate(args.gpu): @@ -2928,6 +2999,42 @@ class AMDSMICommands(): else: link_status = "DISABLED" + link_coherent = "SELF" + link_atomics = "SELF" + link_dma = "SELF" + link_bi_dir = "SELF" + + if src_gpu != dest_gpu: + try: + cap = amdsmi_interface.amdsmi_topo_get_p2p_status(src_gpu, dest_gpu)['cap'] + link_coherent = ( + "C" if cap['is_iolink_coherent'] == 1 else + "NC" if cap['is_iolink_coherent'] == 0 else + "N/A" + ) + link_atomics = ( + "64,32" if cap['is_iolink_atomics_32bit'] == 1 and cap['is_iolink_atomics_64bit'] == 1 else + "32" if cap['is_iolink_atomics_32bit'] == 1 else + "64" if cap['is_iolink_atomics_64bit'] == 1 else + "N/A" + ) + 
link_dma = ( + "T" if cap['is_iolink_dma'] == 1 else + "F" if cap['is_iolink_dma'] == 0 else + "N/A" + ) + link_bi_dir = ( + "T" if cap['is_iolink_bi_directional'] == 1 else + "F" if cap['is_iolink_bi_directional'] == 0 else + "N/A" + ) + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get link status for %s to %s | %s", + self.helpers.get_gpu_id_from_device_handle(src_gpu), + self.helpers.get_gpu_id_from_device_handle(dest_gpu), + e.get_error_info()) + + # link_status = amdsmi_is_P2P_accessible(src,dest) dest_gpu_links = { "gpu": self.helpers.get_gpu_id_from_device_handle(dest_gpu), @@ -2937,6 +3044,10 @@ class AMDSMICommands(): "link_type": link_type, "num_hops": num_hops, "bandwidth": numa_bw, + "coherent": link_coherent, + "atomics": link_atomics, + "dma": link_dma, + "bi_dir": link_bi_dir } if not args.access: del dest_gpu_links['link_status'] @@ -2948,6 +3059,14 @@ class AMDSMICommands(): del dest_gpu_links['num_hops'] if not args.numa_bw: del dest_gpu_links['bandwidth'] + if not args.coherent: + del dest_gpu_links['coherent'] + if not args.atomics: + del dest_gpu_links['atomics'] + if not args.dma: + del dest_gpu_links['dma'] + if not args.bi_dir: + del dest_gpu_links['bi_dir'] links.append(dest_gpu_links) dest_end = dest_gpu_index+1 == len(args.gpu) isEndOfSrc = src_gpu_index+1 == len(args.gpu) @@ -3165,6 +3284,175 @@ class AMDSMICommands(): self.logger.table_title = "NUMA BW TABLE" self.logger.print_output(multiple_device_enabled=True, tabular=True) + if args.coherent: + tabular_output = [] + for src_gpu_index, src_gpu in enumerate(args.gpu): + src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) + if self.logger.is_human_readable_format(): + tabular_output_dict = {'gpu' : f"{src_gpu_bdf} "} + else: + tabular_output_dict = {'gpu' : src_gpu_bdf} + src_gpu_coherent = {} + for dest_gpu in args.gpu: + dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu) + dest_gpu_key = f'gpu_{dest_gpu_id}' + + if 
src_gpu == dest_gpu: + src_gpu_coherent[dest_gpu_key] = "SELF" + continue + try: + iolink_coherent = amdsmi_interface.amdsmi_topo_get_p2p_status(src_gpu, dest_gpu)['cap']['is_iolink_coherent'] + src_gpu_coherent[dest_gpu_key] = "C" if iolink_coherent == 1 else "NC" if iolink_coherent == 0 else "N/A" + except amdsmi_exception.AmdSmiLibraryException as e: + src_gpu_coherent[dest_gpu_key] = "N/A" + logging.debug("Failed to get link coherent for %s to %s | %s", + self.helpers.get_gpu_id_from_device_handle(src_gpu), + self.helpers.get_gpu_id_from_device_handle(dest_gpu), + e.get_error_info()) + + topo_values[src_gpu_index]['coherent'] = src_gpu_coherent + + tabular_output_dict.update(src_gpu_coherent) + tabular_output.append(tabular_output_dict) + + if self.logger.is_human_readable_format(): + self.logger.multiple_device_output = tabular_output + self.logger.table_title = "CACHE COHERANCY TABLE" + self.logger.print_output(multiple_device_enabled=True, tabular=True) + + if args.atomics: + tabular_output = [] + for src_gpu_index, src_gpu in enumerate(args.gpu): + src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) + if self.logger.is_human_readable_format(): + tabular_output_dict = {'gpu' : f"{src_gpu_bdf} "} + else: + tabular_output_dict = {'gpu' : src_gpu_bdf} + src_gpu_atomics = {} + for dest_gpu in args.gpu: + dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu) + dest_gpu_key = f'gpu_{dest_gpu_id}' + + if src_gpu == dest_gpu: + src_gpu_atomics[dest_gpu_key] = "SELF" + continue + try: + cap = amdsmi_interface.amdsmi_topo_get_p2p_status(src_gpu, dest_gpu)['cap'] + src_gpu_atomics[dest_gpu_key] = ( + "64,32" if cap['is_iolink_atomics_32bit'] == 1 and cap['is_iolink_atomics_64bit'] == 1 else + "32" if cap['is_iolink_atomics_32bit'] == 1 else + "64" if cap['is_iolink_atomics_64bit'] == 1 else + "N/A" + ) + except amdsmi_exception.AmdSmiLibraryException as e: + src_gpu_atomics[dest_gpu_key] = "N/A" + logging.debug("Failed to get link atomics for 
%s to %s | %s", + self.helpers.get_gpu_id_from_device_handle(src_gpu), + self.helpers.get_gpu_id_from_device_handle(dest_gpu), + e.get_error_info()) + + topo_values[src_gpu_index]['atomics'] = src_gpu_atomics + + tabular_output_dict.update(src_gpu_atomics) + tabular_output.append(tabular_output_dict) + + if self.logger.is_human_readable_format(): + self.logger.multiple_device_output = tabular_output + self.logger.table_title = "ATOMICS TABLE" + self.logger.print_output(multiple_device_enabled=True, tabular=True) + + if args.dma: + tabular_output = [] + for src_gpu_index, src_gpu in enumerate(args.gpu): + src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) + if self.logger.is_human_readable_format(): + tabular_output_dict = {'gpu' : f"{src_gpu_bdf} "} + else: + tabular_output_dict = {'gpu' : src_gpu_bdf} + src_gpu_dma = {} + for dest_gpu in args.gpu: + dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu) + dest_gpu_key = f'gpu_{dest_gpu_id}' + + if src_gpu == dest_gpu: + src_gpu_dma[dest_gpu_key] = "SELF" + continue + try: + iolink_dma = amdsmi_interface.amdsmi_topo_get_p2p_status(src_gpu, dest_gpu)['cap']['is_iolink_dma'] + src_gpu_dma[dest_gpu_key] = "T" if iolink_dma == 1 else "F" if iolink_dma == 0 else "N/A" + except amdsmi_exception.AmdSmiLibraryException as e: + src_gpu_dma[dest_gpu_key] = "N/A" + logging.debug("Failed to get link dma for %s to %s | %s", + self.helpers.get_gpu_id_from_device_handle(src_gpu), + self.helpers.get_gpu_id_from_device_handle(dest_gpu), + e.get_error_info()) + + topo_values[src_gpu_index]['dma'] = src_gpu_dma + + tabular_output_dict.update(src_gpu_dma) + tabular_output.append(tabular_output_dict) + + if self.logger.is_human_readable_format(): + self.logger.multiple_device_output = tabular_output + self.logger.table_title = "DMA TABLE" + self.logger.print_output(multiple_device_enabled=True, tabular=True) + + if args.bi_dir: + tabular_output = [] + for src_gpu_index, src_gpu in enumerate(args.gpu): + 
src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) + if self.logger.is_human_readable_format(): + tabular_output_dict = {'gpu' : f"{src_gpu_bdf} "} + else: + tabular_output_dict = {'gpu' : src_gpu_bdf} + src_gpu_bi_dir = {} + for dest_gpu in args.gpu: + dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu) + dest_gpu_key = f'gpu_{dest_gpu_id}' + + if src_gpu == dest_gpu: + src_gpu_bi_dir[dest_gpu_key] = "SELF" + continue + try: + iolink_bi_dir = amdsmi_interface.amdsmi_topo_get_p2p_status(src_gpu, dest_gpu)['cap']['is_iolink_bi_directional'] + src_gpu_bi_dir[dest_gpu_key] = "T" if iolink_bi_dir == 1 else "F" if iolink_bi_dir == 0 else "N/A" + except amdsmi_exception.AmdSmiLibraryException as e: + src_gpu_bi_dir[dest_gpu_key] = "N/A" + logging.debug("Failed to get link bi-directional for %s to %s | %s", + self.helpers.get_gpu_id_from_device_handle(src_gpu), + self.helpers.get_gpu_id_from_device_handle(dest_gpu), + e.get_error_info()) + + topo_values[src_gpu_index]['bi_dir'] = src_gpu_bi_dir + + tabular_output_dict.update(src_gpu_bi_dir) + tabular_output.append(tabular_output_dict) + + if self.logger.is_human_readable_format(): + self.logger.multiple_device_output = tabular_output + self.logger.table_title = "BI-DIRECTIONAL TABLE" + self.logger.print_output(multiple_device_enabled=True, tabular=True) + + if self.logger.is_human_readable_format(): + # Populate the legend output + legend_parts = [ + "\n\nLegend:", + " SELF = Current GPU", + " ENABLED / DISABLED = Link is enabled or disabled", + " N/A = Not supported", + " T/F = True / False", + " C/NC = Coherant / Non-Coherant io links", + " 64,32 = 64 bit and 32 bit atomic support", + " -" + ] + legend_output = "\n".join(legend_parts) + + if self.logger.destination == 'stdout': + print(legend_output) + else: + with self.logger.destination.open('a', encoding="utf-8") as output_file: + output_file.write(legend_output + '\n') + self.logger.multiple_device_output = topo_values if 
self.logger.is_csv_format(): @@ -3419,7 +3707,7 @@ class AMDSMICommands(): def set_gpu(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=None, profile=None, perf_determinism=None, compute_partition=None, memory_partition=None, power_cap=None, soc_pstate=None, xgmi_plpd = None, - process_isolation=None): + process_isolation=None, clk_limit=None): """Issue reset commands to target gpu(s) Args: @@ -3466,6 +3754,8 @@ class AMDSMICommands(): args.xgmi_plpd = xgmi_plpd if process_isolation: args.process_isolation = process_isolation + if clk_limit: + args.clk_limit = clk_limit # Handle No GPU passed if args.gpu == None: @@ -3488,7 +3778,8 @@ class AMDSMICommands(): args.power_cap is not None, args.soc_pstate is not None, args.xgmi_plpd is not None, - args.process_isolation is not None]): + args.process_isolation is not None, + args.clk_limit is not None]): command = " ".join(sys.argv[1:]) raise AmdSmiRequiredCommandException(command, self.logger.format) @@ -3614,6 +3905,17 @@ class AMDSMICommands(): raise ValueError(f"Unable to set process isolation to {status_string} on {gpu_string}") from e self.logger.store_output(args.gpu, 'process_isolation', result) + if isinstance(args.clk_limit, tuple): + try: + clk_type = args.clk_limit.clk_type + lim_type = args.clk_limit.lim_type + val = args.clk_limit.val + amdsmi_interface.amdsmi_set_gpu_clk_limit(args.gpu, clk_type, lim_type, val) + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + raise ValueError(f"Unable to set {args.clk_limit.lim_type} of {args.clk_limit.clk_type} to {args.clk_limit.val} on {gpu_string}") from e + self.logger.store_output(args.gpu, 'clk_limit', f"Successfully changed {args.clk_limit.lim_type} of {args.clk_limit.clk_type} to {args.clk_limit.val}") if multiple_devices: self.logger.store_multiple_device_output() @@ -3629,7 +3931,7 @@ class 
AMDSMICommands(): cpu_pwr_eff_mode=None, cpu_gmi3_link_width=None, cpu_pcie_link_rate=None, cpu_df_pstate_range=None, cpu_enable_apb=None, cpu_disable_apb=None, soc_boost_limit=None, core=None, core_boost_limit=None, soc_pstate=None, xgmi_plpd=None, - process_isolation=None): + process_isolation=None, clk_limit=None): """Issue reset commands to target gpu(s) Args: @@ -3680,8 +3982,8 @@ class AMDSMICommands(): # Check if a GPU argument has been set gpu_args_enabled = False gpu_attributes = ["fan", "perf_level", "profile", "perf_determinism", "compute_partition", - "memory_partition", "power_cap", "soc_pstate", "xgmi_plpd", "process_isolation", - ] + "memory_partition", "power_cap", "soc_pstate", "xgmi_plpd", + "process_isolation", "clk_limit"] for attr in gpu_attributes: if hasattr(args, attr): if getattr(args, attr) is not None: @@ -3708,9 +4010,9 @@ class AMDSMICommands(): break # Only allow one device's arguments to be set at a time - if gpu_args_enabled == cpu_args_enabled == core_args_enabled == False: + if not any([gpu_args_enabled, cpu_args_enabled, core_args_enabled]): raise ValueError('No GPU, CPU, or CORE arguments provided, specific arguments are needed') - elif gpu_args_enabled == cpu_args_enabled == core_args_enabled == True: + elif all([gpu_args_enabled, cpu_args_enabled, core_args_enabled]): raise ValueError('Cannot set GPU, CPU, and CORE arguments at the same time') elif not (gpu_args_enabled ^ cpu_args_enabled ^ core_args_enabled): raise ValueError('Cannot set GPU, CPU, or CORE arguments at the same time') @@ -3737,7 +4039,7 @@ class AMDSMICommands(): self.set_gpu(args, multiple_devices, gpu, fan, perf_level, profile, perf_determinism, compute_partition, memory_partition, power_cap, soc_pstate, xgmi_plpd, - process_isolation) + process_isolation, clk_limit) elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized if args.cpu == None and args.core == None: raise ValueError('No CPU or CORE provided, specific target(s) are needed') @@ 
-3757,7 +4059,7 @@ class AMDSMICommands(): self.set_gpu(args, multiple_devices, gpu, fan, perf_level, profile, perf_determinism, compute_partition, memory_partition, power_cap, soc_pstate, xgmi_plpd, - process_isolation) + process_isolation, clk_limit) def reset(self, args, multiple_devices=False, gpu=None, gpureset=None, @@ -4124,6 +4426,15 @@ class AMDSMICommands(): self.logger.store_output(args.gpu, 'timestamp', int(time.time())) self.logger.table_header = 'TIMESTAMP'.rjust(10) + ' ' + self.logger.table_header + # Store the pcie_bw values due to possible increase in bandwidth due to repeated gpu_metrics calls + if args.pcie: + try: + pcie_info = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)['pcie_metric'] + except amdsmi_exception.AmdSmiLibraryException as e: + pcie_info = "N/A" + logging.debug("Failed to get pci bandwidth on gpu %s | %s", gpu_id, e.get_error_info()) + + # Resume regular ordering of values if args.power_usage: try: gpu_metrics_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu) @@ -4365,15 +4676,13 @@ class AMDSMICommands(): self.logger.table_header += 'VRAM_USED'.rjust(11) self.logger.table_header += 'VRAM_TOTAL'.rjust(12) if args.pcie: - try: - pcie_info = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)['pcie_metric'] + if pcie_info != "N/A": pcie_bw_unit = 'Mb/s' monitor_values['pcie_bw'] = self.helpers.unit_format(self.logger, pcie_info['pcie_bandwidth'], pcie_bw_unit) - except amdsmi_exception.AmdSmiLibraryException as e: - monitor_values['pcie_bw'] = "N/A" - logging.debug("Failed to get pci bandwidth on gpu %s | %s", gpu_id, e.get_error_info()) + else: + monitor_values['pcie_bw'] = pcie_info - self.logger.table_header += 'PCIE_BW'.rjust(10) + self.logger.table_header += 'PCIE_BW'.rjust(12) self.logger.store_output(args.gpu, 'values', monitor_values) @@ -4689,9 +4998,9 @@ class AMDSMICommands(): amdsmi_interface.AmdSmiEvtNotificationType) values_dict = {} - while self.stop!='q': + while not self.stop: try: - events = 
listener.read(10000) + events = listener.read(2000) for event in events: values_dict["event"] = event["event"] values_dict["message"] = event["message"] diff --git a/amdsmi_cli/amdsmi_helpers.py b/amdsmi_cli/amdsmi_helpers.py index d1f1da1184..b9dcca61ba 100644 --- a/amdsmi_cli/amdsmi_helpers.py +++ b/amdsmi_cli/amdsmi_helpers.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -# Copyright (C) 2023 Advanced Micro Devices. All rights reserved. +# Copyright (C) 2024 Advanced Micro Devices. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy of # this software and associated documentation files (the "Software"), to deal in diff --git a/amdsmi_cli/amdsmi_init.py b/amdsmi_cli/amdsmi_init.py index a5852293a4..1fd6de03f5 100644 --- a/amdsmi_cli/amdsmi_init.py +++ b/amdsmi_cli/amdsmi_init.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -# Copyright (C) 2023 Advanced Micro Devices. All rights reserved. +# Copyright (C) 2024 Advanced Micro Devices. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy of # this software and associated documentation files (the "Software"), to deal in diff --git a/amdsmi_cli/amdsmi_logger.py b/amdsmi_cli/amdsmi_logger.py index 12fdd0faf8..8234f99eac 100644 --- a/amdsmi_cli/amdsmi_logger.py +++ b/amdsmi_cli/amdsmi_logger.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -# Copyright (C) 2023 Advanced Micro Devices. All rights reserved. +# Copyright (C) 2024 Advanced Micro Devices. All rights reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy of # this software and associated documentation files (the "Software"), to deal in @@ -27,10 +27,19 @@ import time from typing import Dict from enum import Enum import yaml +import inspect from amdsmi_helpers import AMDSMIHelpers import amdsmi_cli_exceptions +### Custom YAML Functions +# Dumper class to preserve order of yaml.dump +class CustomDumper(yaml.Dumper): + def represent_dict_preserve_order(self, data): + return self.represent_dict(data.items()) +def has_sort_keys_option(): # to check if sort_keys is available + return 'sort_keys' in inspect.signature(yaml.dump).parameters + class AMDSMILogger(): def __init__(self, format='human_readable', destination='stdout') -> None: self.output = {} @@ -121,7 +130,7 @@ class AMDSMILogger(): table_values += string_value.rjust(7) elif key in ('gfx_clock', 'mem_clock', 'encoder_clock', 'decoder_clock', 'vram_used'): table_values += string_value.rjust(11) - elif key == 'vram_total' or 'ecc' in key: + elif key == 'vram_total' or 'ecc' in key or key == 'pcie_bw': table_values += string_value.rjust(12) elif key in ['pcie_replay']: table_values += string_value.rjust(13) @@ -202,8 +211,14 @@ class AMDSMILogger(): capitalized_json["AMDSMI_SPACING_REMOVAL"] = tabbed_dictionary json_string = json.dumps(capitalized_json, indent=4) - yaml_data = yaml.safe_load(json_string) - yaml_output = yaml.dump(yaml_data, sort_keys=False, allow_unicode=True) + + if has_sort_keys_option(): + yaml_data = yaml.safe_load(json_string) + yaml_output = yaml.dump(yaml_data, sort_keys=False, allow_unicode=True) + else: + CustomDumper.add_representer(dict, CustomDumper.represent_dict_preserve_order) + yaml_data = yaml.safe_load(json_string) + yaml_output = yaml.dump(yaml_data, Dumper=CustomDumper, allow_unicode=True, default_flow_style=False) # Remove a key line if it is a spacer yaml_output = yaml_output.replace("AMDSMI_SPACING_REMOVAL:\n", "") diff --git 
a/amdsmi_cli/amdsmi_parser.py b/amdsmi_cli/amdsmi_parser.py index 0a85ee7b01..be58f7b0fe 100644 --- a/amdsmi_cli/amdsmi_parser.py +++ b/amdsmi_cli/amdsmi_parser.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -# Copyright (C) 2023 Advanced Micro Devices. All rights reserved. +# Copyright (C) 2024 Advanced Micro Devices. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy of # this software and associated documentation files (the "Software"), to deal in @@ -26,6 +26,9 @@ import errno import os import sys import time +import collections +from typing import Optional +from typing import Union from pathlib import Path @@ -172,6 +175,27 @@ class AMDSMIParser(argparse.ArgumentParser): raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(string_value, outputformat) + def _limit_select(self): + """Custom action for setting clock limits""" + output_format = self.helpers.get_output_format() + + class AMDSMILimitArgs(argparse.Action): + def __call__(self, parser: AMDSMIParser, namespace: argparse.Namespace, + values: Union[str, list, None], option_string: Optional[str] = None) -> None: + # valid values + valid_clk_types = ('sclk', 'mclk') + valid_lim_types = ('min', 'max') + clk_type, lim_type, val = values + if clk_type not in valid_clk_types: + raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(clk_type, output_format) + if lim_type not in valid_lim_types: + raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(lim_type, output_format) + val = int(val) + clk_limit_args = collections.namedtuple('clk_limit_args', ['clk_type', 'lim_type', 'val']) + setattr(namespace, self.dest, clk_limit_args(clk_type, lim_type, val)) + return AMDSMILimitArgs + + def _check_output_file_path(self): """ Argument action validator: Returns a path to a file from the output file path provided. 
@@ -398,15 +422,6 @@ class AMDSMIParser(argparse.ArgumentParser): return value - def _validate_positive(self, value): - i_value = int(value) - if i_value < 0: - outputformat = self.helpers.get_output_format() - raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(i_value, outputformat) - - return i_value - - def _add_device_arguments(self, subcommand_parser, required=False): # Device arguments help text gpu_help = f"Select a GPU ID, BDF, or UUID from the possible choices:\n{self.gpu_choices_str}" @@ -551,9 +566,8 @@ class AMDSMIParser(argparse.ArgumentParser): # Subparser help text list_help = "List GPU information" - list_subcommand_help = "Lists all the devices on the system and the links between devices.\ - \nLists all the sockets and for each socket, GPUs and/or CPUs associated to\ - \nthat socket alongside some basic information for each device.\ + list_subcommand_help = "Lists all detected devices on the system\ + \nLists the BDF, UUID, KFD_ID, and NODE_ID for each GPU and/or CPUs\ \nIn virtualization environments, it can also list VFs associated to each\ \nGPU with some basic information for each VF." 
@@ -834,7 +848,7 @@ class AMDSMIParser(argparse.ArgumentParser): cpu_group.add_argument('--cpu-prochot', action='store_true', required=False, help=cpu_proc_help) cpu_group.add_argument('--cpu-freq-metrics', action='store_true', required=False, help=cpu_freq_help) cpu_group.add_argument('--cpu-c0-res', action='store_true', required=False, help=cpu_c0_res_help) - cpu_group.add_argument('--cpu-lclk-dpm-level', action='append', required=False, type=self._validate_positive, + cpu_group.add_argument('--cpu-lclk-dpm-level', action='append', required=False, type=self._not_negative_int, nargs=1, metavar=("NBIOID"), help=cpu_lclk_dpm_help) cpu_group.add_argument('--cpu-pwr-svi-telemtry-rails', action='store_true', required=False, help=cpu_pwr_svi_telemtry_rails_help) @@ -972,6 +986,10 @@ class AMDSMIParser(argparse.ArgumentParser): hops_help = "Displays the number of hops between GPUs" link_type_help = "Displays the link type between GPUs" numa_bw_help = "Display max and min bandwidth between nodes" + coherent_help = "Display cache coherant (or non-coherant) link capability between nodes" + atomics_help = "Display 32 and 64-bit atomic io link capability between nodes" + dma_help = "Display P2P direct memory access (DMA) link capability between nodes" + bi_dir_help = "Display P2P bi-directional link capability between nodes" # Create topology subparser topology_parser = subparsers.add_parser('topology', help=topology_help, description=topology_subcommand_help) @@ -989,6 +1007,10 @@ class AMDSMIParser(argparse.ArgumentParser): topology_parser.add_argument('-o', '--hops', action='store_true', required=False, help=hops_help) topology_parser.add_argument('-t', '--link-type', action='store_true', required=False, help=link_type_help) topology_parser.add_argument('-b', '--numa-bw', action='store_true', required=False, help=numa_bw_help) + topology_parser.add_argument('-c', '--coherent', action='store_true', required=False, help=coherent_help) + topology_parser.add_argument('-n', 
'--atomics', action='store_true', required=False, help=atomics_help) + topology_parser.add_argument('-d', '--dma', action='store_true', required=False, help=dma_help) + topology_parser.add_argument('-z', '--bi-dir', action='store_true', required=False, help=bi_dir_help) def _add_set_value_parser(self, subparsers, func): @@ -1015,6 +1037,7 @@ class AMDSMIParser(argparse.ArgumentParser): set_soc_pstate_help = "Set the GPU soc pstate policy using policy id\n" set_xgmi_plpd_help = "Set the GPU XGMI per-link power down policy using policy id\n" set_process_isolation_help = "Enable or disable the GPU process isolation: 0 for disable and 1 for enable.\n" + set_clk_limit_help = "Sets the sclk (aka gfxclk) or mclk minimum and maximum frequencies. \nOf form: amd-smi set -L (sclk | mclk) (min | max) value" # Help text for CPU set options set_cpu_pwr_limit_help = "Set power limit for the given socket. Input parameter is power limit value." @@ -1053,6 +1076,7 @@ class AMDSMIParser(argparse.ArgumentParser): set_value_parser.add_argument('-o', '--power-cap', action='store', type=self._positive_int, required=False, help=set_power_cap_help, metavar='WATTS') set_value_parser.add_argument('-p', '--soc-pstate', action='store', required=False, type=self._not_negative_int, help=set_soc_pstate_help, metavar='POLICY_ID') set_value_parser.add_argument('-x', '--xgmi-plpd', action='store', required=False, type=self._not_negative_int, help=set_xgmi_plpd_help, metavar='POLICY_ID') + set_value_parser.add_argument('-L', '--clk-limit', action=self._limit_select(), nargs=3, required=False, help=set_clk_limit_help, metavar=('CLK_TYPE', 'LIM_TYPE', 'VALUE')) set_value_parser.add_argument('-R', '--process-isolation', action='store', choices=[0,1], type=self._not_negative_int, required=False, help=set_process_isolation_help, metavar='STATUS') @@ -1060,20 +1084,20 @@ class AMDSMIParser(argparse.ArgumentParser): if self.helpers.is_baremetal(): # Optional CPU Args cpu_group = 
set_value_parser.add_argument_group("CPU Arguments") - cpu_group.add_argument('--cpu-pwr-limit', action='append', required=False, type=self._validate_positive, nargs=1, metavar=("PWR_LIMIT"), help=set_cpu_pwr_limit_help) - cpu_group.add_argument('--cpu-xgmi-link-width', action='append', required=False, type=self._validate_positive, nargs=2, metavar=("MIN_WIDTH", "MAX_WIDTH"), help=set_cpu_xgmi_link_width_help) - cpu_group.add_argument('--cpu-lclk-dpm-level', action='append', required=False, type=self._validate_positive, nargs=3, metavar=("NBIOID", "MIN_DPM", "MAX_DPM"), help=set_cpu_lclk_dpm_level_help) - cpu_group.add_argument('--cpu-pwr-eff-mode', action='append', required=False, type=self._validate_positive, nargs=1, metavar=("MODE"), help=set_cpu_pwr_eff_mode_help) - cpu_group.add_argument('--cpu-gmi3-link-width', action='append', required=False, type=self._validate_positive, nargs=2, metavar=("MIN_LW", "MAX_LW"), help=set_cpu_gmi3_link_width_help) - cpu_group.add_argument('--cpu-pcie-link-rate', action='append', required=False, type=self._validate_positive, nargs=1, metavar=("LINK_RATE"), help=set_cpu_pcie_link_rate_help) - cpu_group.add_argument('--cpu-df-pstate-range', action='append', required=False, type=self._validate_positive, nargs=2, metavar=("MAX_PSTATE", "MIN_PSTATE"), help=set_cpu_df_pstate_range_help) + cpu_group.add_argument('--cpu-pwr-limit', action='append', required=False, type=self._positive_int, nargs=1, metavar=("PWR_LIMIT"), help=set_cpu_pwr_limit_help) + cpu_group.add_argument('--cpu-xgmi-link-width', action='append', required=False, type=self._not_negative_int, nargs=2, metavar=("MIN_WIDTH", "MAX_WIDTH"), help=set_cpu_xgmi_link_width_help) + cpu_group.add_argument('--cpu-lclk-dpm-level', action='append', required=False, type=self._not_negative_int, nargs=3, metavar=("NBIOID", "MIN_DPM", "MAX_DPM"), help=set_cpu_lclk_dpm_level_help) + cpu_group.add_argument('--cpu-pwr-eff-mode', action='append', required=False, type=self._not_negative_int, 
nargs=1, metavar=("MODE"), help=set_cpu_pwr_eff_mode_help) + cpu_group.add_argument('--cpu-gmi3-link-width', action='append', required=False, type=self._not_negative_int, nargs=2, metavar=("MIN_LW", "MAX_LW"), help=set_cpu_gmi3_link_width_help) + cpu_group.add_argument('--cpu-pcie-link-rate', action='append', required=False, type=self._not_negative_int, nargs=1, metavar=("LINK_RATE"), help=set_cpu_pcie_link_rate_help) + cpu_group.add_argument('--cpu-df-pstate-range', action='append', required=False, type=self._not_negative_int, nargs=2, metavar=("MAX_PSTATE", "MIN_PSTATE"), help=set_cpu_df_pstate_range_help) cpu_group.add_argument('--cpu-enable-apb', action='store_true', required=False, help=set_cpu_enable_apb_help) - cpu_group.add_argument('--cpu-disable-apb', action='append', required=False, type=self._validate_positive, nargs=1, metavar=("DF_PSTATE"), help=set_cpu_disable_apb_help) - cpu_group.add_argument('--soc-boost-limit', action='append', required=False, type=self._validate_positive, nargs=1, metavar=("BOOST_LIMIT"), help=set_soc_boost_limit_help) + cpu_group.add_argument('--cpu-disable-apb', action='append', required=False, type=self._not_negative_int, nargs=1, metavar=("DF_PSTATE"), help=set_cpu_disable_apb_help) + cpu_group.add_argument('--soc-boost-limit', action='append', required=False, type=self._positive_int, nargs=1, metavar=("BOOST_LIMIT"), help=set_soc_boost_limit_help) # Optional CPU Core Args core_group = set_value_parser.add_argument_group("CPU Core Arguments") - core_group.add_argument('--core-boost-limit', action='append', required=False, type=self._validate_positive, nargs=1, metavar=("BOOST_LIMIT"), help=set_core_boost_limit_help) + core_group.add_argument('--core-boost-limit', action='append', required=False, type=self._positive_int, nargs=1, metavar=("BOOST_LIMIT"), help=set_core_boost_limit_help) # Add command modifiers to the bottom self._add_command_modifiers(set_value_parser) diff --git a/cmake_modules/utils.cmake 
b/cmake_modules/utils.cmake old mode 100755 new mode 100644 index 7131761b67..40a1cfc9de --- a/cmake_modules/utils.cmake +++ b/cmake_modules/utils.cmake @@ -3,7 +3,7 @@ ## The University of Illinois/NCSA ## Open Source License (NCSA) ## -## Copyright (c) 2014-2017, Advanced Micro Devices, Inc. All rights reserved. +## Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved. ## ## Developed by: ## diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile index 0b62c6bc91..a322c17a14 100644 --- a/docs/doxygen/Doxyfile +++ b/docs/doxygen/Doxyfile @@ -48,7 +48,7 @@ PROJECT_NAME = AMD SMI # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = "24.6.3.0" +PROJECT_NUMBER = "24.6.5.0" # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/docs/how-to/using-AMD-SMI-CLI-tool.md b/docs/how-to/using-AMD-SMI-CLI-tool.md index e35f0a7c6e..0ad610a81b 100644 --- a/docs/how-to/using-AMD-SMI-CLI-tool.md +++ b/docs/how-to/using-AMD-SMI-CLI-tool.md @@ -8,7 +8,7 @@ AMD-SMI reports the version and current platform detected when running the comma ~$ amd-smi usage: amd-smi [-h] ... 
-AMD System Management Interface | Version: 24.6.3.0 | ROCm version: 6.2.1 | Platform: Linux Baremetal +AMD System Management Interface | Version: 24.6.5.0 | ROCm version: 6.2.2 | Platform: Linux Baremetal options: -h, --help show this help message and exit diff --git a/docs/how-to/using-amdsmi-for-python.md b/docs/how-to/using-amdsmi-for-python.md index 7a454b1025..40edc84f8c 100644 --- a/docs/how-to/using-amdsmi-for-python.md +++ b/docs/how-to/using-amdsmi-for-python.md @@ -377,6 +377,8 @@ Field | Content `rev_id` | revision id `asic_serial` | asic serial `oam_id` | oam id +`num_of_compute_units` | number of compute units on asic +`target_graphics_version` | hardware graphics version Exceptions that can be thrown by `amdsmi_get_gpu_asic_info` function: @@ -394,13 +396,44 @@ try: else: for device in devices: asic_info = amdsmi_get_gpu_asic_info(device) - print(asic_info['market_name']) - print(hex(asic_info['vendor_id'])) - print(asic_info['vendor_name']) - print(hex(asic_info['device_id'])) - print(hex(asic_info['rev_id'])) - print(asic_info['asic_serial']) - print(asic_info['oam_id']) + print(asic_info) +except AmdSmiException as e: + print(e) +``` + +### amdsmi_get_gpu_kfd_info + +Description: Returns KFD(kernel fusion driver) information for the given GPU +This correlates to GUID in rocm-smi + +Input parameters: + +* `processor_handle` device which to query + +Output: Dictionary with fields + +Field | Content +---|--- +`kfd_id` | KFD's unique GPU identifier +`node_id` | KFD's internal GPU index + +Exceptions that can be thrown by `amdsmi_get_gpu_kfd_info` function: + +* `AmdSmiLibraryException` +* `AmdSmiRetryException` +* `AmdSmiParameterException` + +Example: + +```python +try: + devices = amdsmi_get_processor_handles() + if len(devices) == 0: + print("No GPUs on machine") + else: + for device in devices: + kfd_info = amdsmi_get_gpu_kfd_info(device) + print(kfd_info) except AmdSmiException as e: print(e) ``` @@ -810,7 +843,7 @@ except AmdSmiException as e: 
### amdsmi_get_pcie_info -Description: Returns the pcie metric and static information for the given GPU. +Description: Returns the pcie metric and static information for the given GPU. For accurate PCIe Bandwidth measurements, it is recommended to use this function once per 1000ms. It is not supported on virtual machine guest Input parameters: @@ -1925,19 +1958,19 @@ except AmdSmiException as e: ### amdsmi_get_utilization_count -Description: Get coarse grain utilization counter of the specified device +Description: Get coarse/fine grain utilization counter of the specified device Input parameters: * `processor_handle` handle for the given device -* `counter_types` variable number of counter types desired +* `counter_types` List of AmdSmiUtilizationCounterType counters requested Output: List containing dictionaries with fields Field | Description ---|--- `timestamp` | The timestamp when the counter is retreived - Resolution: 1 ns -`Dictionary for each counter` |
Subfield Description
`type`Type of utilization counter
`value`Value gotten for utilization counter
+`Dictionary for each counter` |
Subfield Description
`type`Counter that was requested
`value`Value gotten for utilization counter
Exceptions that can be thrown by `amdsmi_get_utilization_count` function: @@ -1957,13 +1990,17 @@ try: utilization = amdsmi_get_utilization_count( device, AmdSmiUtilizationCounterType.COARSE_GRAIN_GFX_ACTIVITY - ) + ) print(utilization) utilization = amdsmi_get_utilization_count( device, - AmdSmiUtilizationCounterType.COARSE_GRAIN_GFX_ACTIVITY, - AmdSmiUtilizationCounterType.COARSE_GRAIN_MEM_ACTIVITY - ) + [AmdSmiUtilizationCounterType.COARSE_GRAIN_GFX_ACTIVITY, + AmdSmiUtilizationCounterType.COARSE_GRAIN_MEM_ACTIVITY, + AmdSmiUtilizationCounterType.COARSE_DECODER_ACTIVITY, + AmdSmiUtilizationCounterType.FINE_GRAIN_GFX_ACTIVITY, + AmdSmiUtilizationCounterType.FINE_GRAIN_MEM_ACTIVITY, + AmdSmiUtilizationCounterType.FINE_DECODER_ACTIVITY] + ) print(utilization) except AmdSmiException as e: print(e) diff --git a/example/amd_smi_drm_example.cc b/example/amd_smi_drm_example.cc index e8ef3d80d9..8be267e6f6 100644 --- a/example/amd_smi_drm_example.cc +++ b/example/amd_smi_drm_example.cc @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/example/amd_smi_nodrm_example.cc b/example/amd_smi_nodrm_example.cc index 19e8cf5947..bcfca83681 100644 --- a/example/amd_smi_nodrm_example.cc +++ b/example/amd_smi_nodrm_example.cc @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/example/amdsmi_esmi_intg_example.cc b/example/amdsmi_esmi_intg_example.cc index 0041e0b11f..03d1f4454d 100644 --- a/example/amdsmi_esmi_intg_example.cc +++ b/example/amdsmi_esmi_intg_example.cc @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. 
+ * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/goamdsmi.go b/goamdsmi.go new file mode 100644 index 0000000000..4dffe3784c --- /dev/null +++ b/goamdsmi.go @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright (c) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Developed by: + * + * AMD Research and AMD Software Development + * + * Advanced Micro Devices, Inc. + * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sellcopies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * - The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * Except as contained in this notice, the name of the Advanced Micro Devices, + * Inc. shall not be used in advertising or otherwise to promote the sale, use + * or other dealings in this Software without prior written authorization from + * the Advanced Micro Devices, Inc. 
+ * + */ + +package goamdsmi + +/* +#cgo CFLAGS: -Wall -I/opt/rocm/include +#cgo LDFLAGS: -L/opt/rocm/lib -L/opt/rocm/lib64 -lgoamdsmi_shim64 -Wl,--unresolved-symbols=ignore-in-object-files +#include +#include +*/ +import "C" + +//GPU ROCM or AMDSMI calls +func GO_gpu_init() (bool) { + return bool(C.goamdsmi_gpu_init()) +} + +func GO_gpu_shutdown() (bool) { + return bool(C.goamdsmi_gpu_shutdown()) +} + +func GO_gpu_num_monitor_devices() (uint) { + return uint(C.goamdsmi_gpu_num_monitor_devices()) +} + +func GO_gpu_dev_name_get(i int) (*C.char) { + return C.goamdsmi_gpu_dev_name_get(C.uint(i)) +} + +func GO_gpu_dev_id_get(i int) (C.uint16_t) { + return C.uint16_t(C.goamdsmi_gpu_dev_id_get(C.uint(i))) +} + +func GO_gpu_dev_pci_id_get(i int) (C.uint64_t) { + return C.goamdsmi_gpu_dev_pci_id_get(C.uint(i)) +} + +func GO_gpu_dev_vbios_version_get(i int) (*C.char) { + return C.goamdsmi_gpu_dev_vbios_version_get(C.uint(i)) +} + +func GO_gpu_dev_vendor_name_get(i int) (*C.char) { + return C.goamdsmi_gpu_dev_vendor_name_get(C.uint(i)) +} + +func GO_gpu_dev_power_cap_get(i int) (C.uint64_t) { + return C.goamdsmi_gpu_dev_power_cap_get(C.uint(i)) +} + +func GO_gpu_dev_power_get(i int) (C.uint64_t) { + return C.goamdsmi_gpu_dev_power_get(C.uint(i)) +} + +func GO_gpu_dev_temp_metric_get(i int, sensor int, metric int) (C.uint64_t) { + return C.goamdsmi_gpu_dev_temp_metric_get(C.uint(i), C.uint(sensor), C.uint(metric)) +} + +func GO_gpu_dev_perf_level_get(i int) (C.uint32_t) { + return C.goamdsmi_gpu_dev_perf_level_get(C.uint(i)) +} + +func GO_gpu_dev_overdrive_level_get(i int) (C.uint32_t) { + return C.goamdsmi_gpu_dev_overdrive_level_get(C.uint(i)) +} + +func GO_gpu_dev_mem_overdrive_level_get(i int) (C.uint32_t) { + return C.goamdsmi_gpu_dev_mem_overdrive_level_get(C.uint(i)) +} + +func GO_gpu_dev_gpu_clk_freq_get_sclk(i int) (C.uint64_t) { + return C.goamdsmi_gpu_dev_gpu_clk_freq_get_sclk(C.uint(i)) +} + +func GO_gpu_dev_gpu_clk_freq_get_mclk(i int) (C.uint64_t) { + return
C.goamdsmi_gpu_dev_gpu_clk_freq_get_mclk(C.uint(i)) +} + +func GO_gpu_od_volt_freq_range_min_get_sclk(i int) (C.uint64_t) { + return C.goamdsmi_gpu_od_volt_freq_range_min_get_sclk(C.uint(i)) +} + +func GO_gpu_od_volt_freq_range_min_get_mclk(i int) (C.uint64_t) { + return C.goamdsmi_gpu_od_volt_freq_range_min_get_mclk(C.uint(i)) +} + +func GO_gpu_od_volt_freq_range_max_get_sclk(i int) (C.uint64_t) { + return C.goamdsmi_gpu_od_volt_freq_range_max_get_sclk(C.uint(i)) +} + +func GO_gpu_od_volt_freq_range_max_get_mclk(i int) (C.uint64_t) { + return C.goamdsmi_gpu_od_volt_freq_range_max_get_mclk(C.uint(i)) +} + +func GO_gpu_dev_gpu_busy_percent_get(i int) (C.uint32_t) { + return C.goamdsmi_gpu_dev_gpu_busy_percent_get(C.uint(i)) +} + +func GO_gpu_dev_gpu_memory_busy_percent_get(i int) (C.uint64_t) { + return C.goamdsmi_gpu_dev_gpu_memory_busy_percent_get(C.uint(i)) +} + +func GO_gpu_dev_gpu_memory_usage_get (i int) (C.uint64_t) { + return C.goamdsmi_gpu_dev_gpu_memory_usage_get(C.uint(i)) +} + +func GO_gpu_dev_gpu_memory_total_get (i int) (C.uint64_t) { + return C.goamdsmi_gpu_dev_gpu_memory_total_get(C.uint(i)) +} + +//CPU ESMI or AMDSMI calls +func GO_cpu_init() (bool) { + return bool(C.goamdsmi_cpu_init()) +} + +func GO_cpu_number_of_sockets_get() (uint) { + return uint(C.goamdsmi_cpu_number_of_sockets_get()) +} + +func GO_cpu_number_of_threads_get() (uint) { + return uint(C.goamdsmi_cpu_number_of_threads_get()) +} + +func GO_cpu_threads_per_core_get() (uint) { + return uint(C.goamdsmi_cpu_threads_per_core_get()) +} + +func GO_cpu_core_energy_get(i int) (C.uint64_t) { + return C.goamdsmi_cpu_core_energy_get(C.uint(i)) +} + +func GO_cpu_core_boostlimit_get(i int) (C.uint32_t) { + return C.goamdsmi_cpu_core_boostlimit_get(C.uint(i)) +} + +func GO_cpu_socket_energy_get(i int) (C.uint64_t) { + return C.goamdsmi_cpu_socket_energy_get(C.uint(i)) +} + +func GO_cpu_socket_power_get(i int) (C.uint32_t) { + return C.goamdsmi_cpu_socket_power_get(C.uint(i)) +} + +func 
GO_cpu_socket_power_cap_get(i int) (C.uint32_t) { + return C.goamdsmi_cpu_socket_power_cap_get(C.uint(i)) +} + +func GO_cpu_prochot_status_get(i int) (C.uint32_t) { + return C.goamdsmi_cpu_prochot_status_get(C.uint(i)) +} diff --git a/goamdsmi_shim/CMakeLists.txt b/goamdsmi_shim/CMakeLists.txt new file mode 100644 index 0000000000..7a4a202bb0 --- /dev/null +++ b/goamdsmi_shim/CMakeLists.txt @@ -0,0 +1,142 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2024, Advanced Micro Devices, Inc. + +# +# Minimum version of cmake required +# +cmake_minimum_required(VERSION 3.5.0) + +message("*******************************************************************") +message(" CMake AMD goamdsmi_shim Library ") +message("*******************************************************************") + +option(WITH_AMDSMI "Support unified cpu and gpu" ON) + +set(AMDSMI_DIR "" CACHE PATH "path to amdsmi installation") + +if(WITH_AMDSMI) +add_definitions(-DAMDSMI_BUILD) +add_definitions(-DENABLE_ESMI_LIB) +#include(deps/SetupAmdsmi.cmake) +endif() + +if(ENABLE_DEBUG_LEVEL) +add_definitions(-DENABLE_DEBUG_LEVEL=${ENABLE_DEBUG_LEVEL}) +message("**** Enabling Debug Level=${ENABLE_DEBUG_LEVEL} ****") +else() +add_definitions(-DENABLE_DEBUG_LEVEL=0) +endif() + +set(GOAMDSMI_SHIM "goamdsmi_shim") +set(GOAMDSMI_SHIM_LIB "goamdsmi") +set(GOAMDSMI_SHIM_COMPONENT "lib${GOAMDSMI_SHIM}") +set(GOAMDSMI_SHIM_TARGET "${GOAMDSMI_SHIM}64") + +# The following default version values should be updated as appropriate for +# ABI breaks (update MAJOR and MINOR), and ABI/API additions (update MINOR). +# Until ABI stabilizes VERSION_MAJOR will be 0. This should be over-ridden +# by git tags (through "git describe") when they are present. 
+set(VERSION_MAJOR 1) +set(VERSION_MINOR 0) +set(VERSION_PATCH 0) +set(VERSION_NUM_COMMIT 0) + +set(SO_VERSION_STRING "${VERSION_MAJOR}.${VERSION_MINOR}") + +set(${GOAMDSMI_SHIM}_VERSION_MAJOR "${VERSION_MAJOR}") +set(${GOAMDSMI_SHIM}_VERSION_MINOR "${VERSION_MINOR}") +set(${GOAMDSMI_SHIM}_VERSION_PATCH "0") +set(${GOAMDSMI_SHIM}_VERSION_BUILD "0") +message("SOVERSION: ${SO_VERSION_STRING}") + +project(${GOAMDSMI_SHIM_TARGET}) + +# Create a configure file to get version info from within library +configure_file( + "${PROJECT_SOURCE_DIR}/${GOAMDSMI_SHIM_TARGET}Config.in" + "${PROJECT_SOURCE_DIR}/include/${GOAMDSMI_SHIM_TARGET}Config.h") + +if (NOT DEFINED CPACK_PACKAGE_VENDOR) + set(CPACK_PACKAGE_VENDOR "AMD") +endif() + +if (NOT DEFINED CPACK_PACKAGE_CONTACT) + set(CPACK_PACKAGE_CONTACT "Advanced Micro Devices Inc.") +endif() + +if (NOT DEFINED CPACK_PACKAGE_DESCRIPTION_SUMMARY) +set(CPACK_PACKAGE_DESCRIPTION_SUMMARY + "AMD CGO wrapper") +endif() + +if (NOT GOAMDSMI_SHIM_PACKAGE) + set(GOAMDSMI_SHIM_PACKAGE goamdsmi_shim_lib64) +endif() + +set(CPACK_PACKAGE_FILE_NAME "${GOAMDSMI_SHIM_PACKAGE}-${SO_VERSION_STRING}") +## Verbose output. 
+set(CMAKE_VERBOSE_MAKEFILE on) + +## Compiler flags +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -fpic -fno-rtti -m64") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse -msse2 -std=c++11 ") +# Use this instead of above for 32 bit +# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m32") + +if ("${CMAKE_BUILD_TYPE}" STREQUAL Release) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") +else () + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb -O0 -DDEBUG") +endif () + +set(go_amd_smi_headers) + +set(go_amd_smi_sources) + +set(go_amd_smi_includes) + + +add_subdirectory(smiwrapper) +list(APPEND go_amd_smi_headers smiwrapper/goamdsmi.h ${go_amd_smi_headers}) +list(APPEND go_amd_smi_headers smiwrapper/amdsmi_go_shim.h ${go_amd_smi_headers}) +list(APPEND go_amd_smi_sources smiwrapper/amdsmi_go_shim.c) +list(APPEND go_amd_smi_includes ${CMAKE_CURRENT_SOURCE_DIR}/smiwrapper) + +add_library(${GOAMDSMI_SHIM_TARGET} SHARED + ${go_amd_smi_sources} + ${go_amd_smi_headers} + ${go_amd_smi_includes}) + +target_link_libraries(${GOAMDSMI_SHIM_TARGET} pthread rt m) + +if(WITH_AMDSMI) + target_link_libraries(${GOAMDSMI_SHIM_TARGET} amd_smi) + target_link_libraries(${GOAMDSMI_SHIM_TARGET} -L${AMDSMI_DIR}/lib) + target_link_libraries(${GOAMDSMI_SHIM_TARGET} -L${AMDSMI_DIR}/lib64) +endif() + +## Set the VERSION and SOVERSION values +set_property(TARGET ${GOAMDSMI_SHIM_TARGET} PROPERTY + SOVERSION "${VERSION_MAJOR}") +set_property(TARGET ${GOAMDSMI_SHIM_TARGET} PROPERTY + VERSION "${SO_VERSION_STRING}") + +## If the library is a release, strip the target library +if ("${CMAKE_BUILD_TYPE}" STREQUAL Release) + add_custom_command( + TARGET ${GOAMDSMI_SHIM_TARGET} + POST_BUILD COMMAND ${CMAKE_STRIP} lib${GOAMDSMI_SHIM_TARGET}.so) +endif () + +set(go_amd_smi_install_headers + smiwrapper/goamdsmi.h + smiwrapper/amdsmi_go_shim.h +) + +## Add the install directives for the runtime library. 
+install(TARGETS ${GOAMDSMI_SHIM_TARGET} + LIBRARY DESTINATION lib COMPONENT ${GOAMDSMI_SHIM_COMPONENT}) +install(FILES ${go_amd_smi_install_headers} + DESTINATION include) + +include_directories(${go_amd_smi_includes}) diff --git a/goamdsmi_shim/goamdsmi_shim64Config.in b/goamdsmi_shim/goamdsmi_shim64Config.in new file mode 100644 index 0000000000..b12a6f0b59 --- /dev/null +++ b/goamdsmi_shim/goamdsmi_shim64Config.in @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. + +#ifndef INCLUDE_GOAMDSMI_SHIM64CONFIG_H_ +#define INCLUDE_GOAMDSMI_SHIM64CONFIG_H_ + +// This file is generated on build. + +#define goamdsmi_shim_VERSION_MAJOR @goamdsmi_shim_VERSION_MAJOR@ +#define goamdsmi_shim_VERSION_MINOR @goamdsmi_shim_VERSION_MINOR@ +#define goamdsmi_shim_VERSION_PATCH @goamdsmi_shim_VERSION_PATCH@ +#define goamdsmi_shim_VERSION_BUILD "@goamdsmi_shim_VERSION_BUILD@" + +#endif // INCLUDE_GOAMDSMI_SHIM64CONFIG_H_ diff --git a/goamdsmi_shim/smiwrapper/CMakeLists.txt b/goamdsmi_shim/smiwrapper/CMakeLists.txt new file mode 100644 index 0000000000..8274335abb --- /dev/null +++ b/goamdsmi_shim/smiwrapper/CMakeLists.txt @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2024, Advanced Micro Devices, Inc.
+ +set(go_amd_smi_headers + ${CMAKE_CURRENT_SOURCE_DIR}/goamdsmi.h + ${CMAKE_CURRENT_SOURCE_DIR}/amdsmi_go_shim.h + CACHE INTERNAL "") + +set(go_amd_smi_sources + ${CMAKE_CURRENT_SOURCE_DIR}/amdsmi_go_shim.c + CACHE INTERNAL "") + +include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${go_amd_smi_amdsmi_includes}) + +add_library(go_amd_smi_ OBJECT + ${go_amd_smi_sources} + ${go_amd_smi_headers}) + +### Shared libraries need PIC +set_property(TARGET go_amd_smi_ PROPERTY POSITION_INDEPENDENT_CODE 1) + diff --git a/goamdsmi_shim/smiwrapper/amdsmi_go_shim.c b/goamdsmi_shim/smiwrapper/amdsmi_go_shim.c new file mode 100644 index 0000000000..599b44a307 --- /dev/null +++ b/goamdsmi_shim/smiwrapper/amdsmi_go_shim.c @@ -0,0 +1,734 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright (c) 2024, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Developed by: + * + * AMD Research and AMD Software Development + * + * Advanced Micro Devices, Inc. + * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sellcopies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * - The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * Except as contained in this notice, the name of the Advanced Micro Devices, + * Inc. shall not be used in advertising or otherwise to promote the sale, use + * or other dealings in this Software without prior written authorization from + * the Advanced Micro Devices, Inc. + * + */ + +#include +#include +#include "amdsmi_go_shim.h" +#ifdef AMDSMI_BUILD +#include +#endif +#include +#define nullptr ((void*)0) + +#ifdef AMDSMI_BUILD +#define MAX_SOCKET_ACROSS_SYSTEM 4 +#define CPU_0 0 +#define GPU_SENSOR_0 0 +#define MAX_CPU_PER_SOCKET 4 +#define MAX_PHYSICALCORE_ACROSS_SYSTEM 384 +#define MAX_LOGICALCORE_ACROSS_SYSTEM 768 +#define MAX_GPU_DEVICE_ACROSS_SYSTEM 24 +#define MAX_GPU_POWER_FROM_DRIVER 0xFFFF + +#define AMDSMI_DRIVER_NAME "AMDSMI" +#define AMDSMI_LIB_FILE "/opt/rocm/lib/libamd_smi.so" +#define AMDSMI_LIB64_FILE "/opt/rocm/lib64/libamd_smi.so" + +#define AMDGPU_DRIVER_NAME "AMDGPUDriver" +#define AMDGPU_INITSTATE_FILE "/sys/module/amdgpu/initstate" + +#define AMDHSMP_DRIVER_NAME "AMDHSMPDriver" +#define AMDHSMP_INITSTATE_FILE "/sys/module/amd_hsmp/initstate" + +static uint32_t num_apuSockets = GOAMDSMI_VALUE_0; +static uint32_t num_cpuSockets = GOAMDSMI_VALUE_0; +static uint32_t num_gpuSockets = GOAMDSMI_VALUE_0; +static uint32_t cpuInitCompleted = false; +static uint32_t gpuInitCompleted = false; +static uint32_t apuInitCompleted = false; + +static uint32_t num_cpu_inAllSocket = GOAMDSMI_VALUE_0; +static uint32_t num_cpu_physicalCore_inAllSocket = GOAMDSMI_VALUE_0; +static uint32_t num_gpu_devices_inAllSocket = GOAMDSMI_VALUE_0; + +static amdsmi_socket_handle amdsmi_apusocket_handle_all_socket[MAX_SOCKET_ACROSS_SYSTEM+MAX_GPU_DEVICE_ACROSS_SYSTEM] = {0}; +static 
amdsmi_socket_handle amdsmi_cpusocket_handle_all_socket[MAX_SOCKET_ACROSS_SYSTEM] = {0}; +static amdsmi_socket_handle amdsmi_gpusocket_handle_all_socket[MAX_GPU_DEVICE_ACROSS_SYSTEM] = {0}; +static amdsmi_processor_handle amdsmi_processor_handle_all_cpu_across_socket[MAX_SOCKET_ACROSS_SYSTEM*MAX_CPU_PER_SOCKET] = {0}; +static amdsmi_processor_handle amdsmi_processor_handle_all_cpu_physicalCore_across_socket[MAX_PHYSICALCORE_ACROSS_SYSTEM] = {0}; +static amdsmi_processor_handle amdsmi_processor_handle_all_gpu_device_across_socket[MAX_GPU_DEVICE_ACROSS_SYSTEM] = {0}; + +goamdsmi_status_t is_file_present(const char* driver_name, const char* file_name) +{ + if(0 == access(file_name, F_OK)) + { + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_1)) {printf("AMDSMI, Success, %s found \"%s\" and returns:%d\n", driver_name, file_name, GOAMDSMI_STATUS_SUCCESS);} + return GOAMDSMI_STATUS_SUCCESS; + } + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_2)) {printf("AMDSMI, Status, %s not found, missing \"%s\" and returns:%d\n", driver_name, file_name, GOAMDSMI_STATUS_FAILURE);} + return GOAMDSMI_STATUS_FAILURE; +} + +goamdsmi_status_t go_shim_amdsmi_present() +{ + if((GOAMDSMI_STATUS_SUCCESS == is_file_present(AMDSMI_DRIVER_NAME, AMDSMI_LIB_FILE)) || (GOAMDSMI_STATUS_SUCCESS == is_file_present(AMDSMI_DRIVER_NAME, AMDSMI_LIB64_FILE))) + { + return GOAMDSMI_STATUS_SUCCESS; + } + return GOAMDSMI_STATUS_FAILURE; +} + +goamdsmi_status_t check_amdgpu_driver() +{ + return is_file_present(AMDGPU_DRIVER_NAME, AMDGPU_INITSTATE_FILE); +} + +goamdsmi_status_t check_hsmp_driver() +{ + return is_file_present(AMDHSMP_DRIVER_NAME, AMDHSMP_INITSTATE_FILE); +} + +goamdsmi_status_t go_shim_amdsmiapu_init(goamdsmi_Init_t goamdsmi_Init) +{ + if((GOAMDSMI_CPU_INIT == goamdsmi_Init) && (true == cpuInitCompleted)) + { + if((GOAMDSMI_VALUE_0 == num_cpuSockets)||(GOAMDSMI_VALUE_0 == num_cpu_inAllSocket)||(GOAMDSMI_VALUE_0 == num_cpu_physicalCore_inAllSocket)) + { + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_1)) 
{printf("AMDSMI, Failed, Returns previous enumurated AMDSMICPUInit:%d, CpuSocketCount:%d, CpuCount:%d, CpuPhysicalCoreCount:%d\n", GOAMDSMI_STATUS_FAILURE, num_cpuSockets, num_cpu_inAllSocket, num_cpu_physicalCore_inAllSocket);} + return GOAMDSMI_STATUS_FAILURE; + } + else + { + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_1)) {printf("AMDSMI, Success, Returns previous enumurated AMDSMICPUInit:%d, CpuSocketCount:%d, CpuCount:%d, CpuPhysicalCoreCount:%d\n", GOAMDSMI_STATUS_SUCCESS, num_cpuSockets, num_cpu_inAllSocket, num_cpu_physicalCore_inAllSocket);} + return GOAMDSMI_STATUS_SUCCESS; + } + } + + if((GOAMDSMI_GPU_INIT == goamdsmi_Init) && (true == gpuInitCompleted)) + { + if((GOAMDSMI_VALUE_0 == num_gpuSockets)||(GOAMDSMI_VALUE_0 == num_gpu_devices_inAllSocket)) + { + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_1)) {printf("AMDSMI, Failed, Returns previous enumurated AMDSMIGPUInit:%d, GpuSocketCount:%d, GpuCount:%d\n", GOAMDSMI_STATUS_FAILURE, num_gpuSockets, num_gpu_devices_inAllSocket);} + return GOAMDSMI_STATUS_FAILURE; + } + else + { + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_1)) {printf("AMDSMI, Success, Returns previous enumurated AMDSMIGPUInit:%d, GpuSocketCount:%d, GpuCount:%d\n", GOAMDSMI_STATUS_SUCCESS, num_gpuSockets, num_gpu_devices_inAllSocket);} + return GOAMDSMI_STATUS_SUCCESS; + } + } + +#if 0 + if(GOAMDSMI_STATUS_FAILURE == go_shim_amdsmi_present()) + { + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_1)) {printf("AMDSMI, Failed, AMDSMI not present in the System, missing \"%s\" (or) \"%s\"\n", AMDSMI_LIB_FILE, AMDSMI_LIB64_FILE);} + return GOAMDSMI_STATUS_FAILURE; + } +#endif + + if ((GOAMDSMI_STATUS_SUCCESS == check_amdgpu_driver()) && (GOAMDSMI_STATUS_SUCCESS == check_hsmp_driver())) + { + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_2)) {printf("AMDSMI, Status, Identified APU machine and going to enumurate APU\n");} + + if( (AMDSMI_STATUS_SUCCESS == amdsmi_init(AMDSMI_INIT_AMD_APUS)) && + (AMDSMI_STATUS_SUCCESS == 
amdsmi_get_socket_handles(&num_apuSockets, nullptr)) && + (AMDSMI_STATUS_SUCCESS == amdsmi_get_socket_handles(&num_apuSockets, &amdsmi_apusocket_handle_all_socket[0])) && + (GOAMDSMI_VALUE_0 != num_apuSockets)) + { + cpuInitCompleted = true; + gpuInitCompleted = true; + apuInitCompleted = true; + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_1)) {printf("AMDSMI, Success, Identified APU machine ApuNumSockets=%d\n",num_apuSockets);} + for(uint32_t socket_counter = 0; socket_counter < num_apuSockets; socket_counter++) + { + uint32_t num_cpu = GOAMDSMI_VALUE_0; + uint32_t num_cpu_physicalCores = GOAMDSMI_VALUE_0; + uint32_t num_gpu_devices = GOAMDSMI_VALUE_0; + + //CPU + processor_type_t cpu_processor_type = AMDSMI_PROCESSOR_TYPE_AMD_CPU; + processor_type_t cpu_core_processor_type = AMDSMI_PROCESSOR_TYPE_AMD_CPU_CORE; + if( (AMDSMI_STATUS_SUCCESS == amdsmi_get_processor_handles_by_type(amdsmi_apusocket_handle_all_socket[socket_counter], cpu_processor_type, nullptr, &num_cpu)) && + (GOAMDSMI_VALUE_0 != num_cpu) && + (AMDSMI_STATUS_SUCCESS == amdsmi_get_processor_handles_by_type(amdsmi_apusocket_handle_all_socket[socket_counter], cpu_processor_type, &amdsmi_processor_handle_all_cpu_across_socket[num_cpu_inAllSocket], &num_cpu))) + { + if( (AMDSMI_STATUS_SUCCESS == amdsmi_get_processor_handles_by_type(amdsmi_apusocket_handle_all_socket[socket_counter], cpu_core_processor_type, nullptr, &num_cpu_physicalCores)) && + (GOAMDSMI_VALUE_0 != num_cpu_physicalCores) && + (AMDSMI_STATUS_SUCCESS == amdsmi_get_processor_handles_by_type(amdsmi_apusocket_handle_all_socket[socket_counter], cpu_core_processor_type, &amdsmi_processor_handle_all_cpu_physicalCore_across_socket[num_cpu_physicalCore_inAllSocket], &num_cpu_physicalCores))) + { + num_cpu_physicalCore_inAllSocket = num_cpu_physicalCore_inAllSocket+num_cpu_physicalCores; + } + num_cpu_inAllSocket = num_cpu_inAllSocket+num_cpu; + num_cpuSockets = num_cpuSockets+1; + } + + //GPU + processor_type_t gpu_device_processor_type = 
AMDSMI_PROCESSOR_TYPE_AMD_GPU; + if( (AMDSMI_STATUS_SUCCESS == amdsmi_get_processor_handles_by_type(amdsmi_apusocket_handle_all_socket[socket_counter], gpu_device_processor_type, nullptr, &num_gpu_devices)) && + (GOAMDSMI_VALUE_0 != num_gpu_devices) && + (AMDSMI_STATUS_SUCCESS == amdsmi_get_processor_handles_by_type(amdsmi_apusocket_handle_all_socket[socket_counter], gpu_device_processor_type, &amdsmi_processor_handle_all_gpu_device_across_socket[num_gpu_devices_inAllSocket], &num_gpu_devices))) + { + num_gpu_devices_inAllSocket = num_gpu_devices_inAllSocket+num_gpu_devices; + num_gpuSockets = num_gpuSockets+1; + } + } + } + } + else if(GOAMDSMI_CPU_INIT == goamdsmi_Init) + { + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_2)) {printf("AMDSMI, Status, Going to enumurate only CPU\n");} + cpuInitCompleted = true; + + if (GOAMDSMI_STATUS_SUCCESS == check_hsmp_driver()) + { + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_2)) {printf("AMDSMI, Status, Identified CPU Driver and going to enumurate only CPU\n");} + + if( (AMDSMI_STATUS_SUCCESS != amdsmi_init(AMDSMI_INIT_AMD_CPUS)) || + (AMDSMI_STATUS_SUCCESS != amdsmi_get_socket_handles(&num_cpuSockets, nullptr)) || + (AMDSMI_STATUS_SUCCESS != amdsmi_get_socket_handles(&num_cpuSockets, &amdsmi_cpusocket_handle_all_socket[0])) || + (GOAMDSMI_VALUE_0 == num_cpuSockets)) + { + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_1)) {printf("AMDSMI, Failed, AMDSMICPUInit:0, CpuNumSockets=0\n");} + return GOAMDSMI_STATUS_FAILURE; + } + } + else + { + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_3)) {printf("AMDSMI, Status, Missing CPU Driver and not going to enumurate only CPU\n");} + } + //CPU + for(uint32_t cpu_socket_counter = 0; cpu_socket_counter < num_cpuSockets; cpu_socket_counter++) + { + uint32_t num_cpu = GOAMDSMI_VALUE_0; + uint32_t num_cpu_physicalCores = GOAMDSMI_VALUE_0; + + processor_type_t cpu_processor_type = AMDSMI_PROCESSOR_TYPE_AMD_CPU; + processor_type_t cpu_core_processor_type = AMDSMI_PROCESSOR_TYPE_AMD_CPU_CORE; + if( 
(AMDSMI_STATUS_SUCCESS == amdsmi_get_processor_handles_by_type(amdsmi_cpusocket_handle_all_socket[cpu_socket_counter], cpu_processor_type, nullptr, &num_cpu)) && + (GOAMDSMI_VALUE_0 != num_cpu) && + (AMDSMI_STATUS_SUCCESS == amdsmi_get_processor_handles_by_type(amdsmi_cpusocket_handle_all_socket[cpu_socket_counter], cpu_processor_type, &amdsmi_processor_handle_all_cpu_across_socket[num_cpu_inAllSocket], &num_cpu))) + { + if( (AMDSMI_STATUS_SUCCESS == amdsmi_get_processor_handles_by_type(amdsmi_cpusocket_handle_all_socket[cpu_socket_counter], cpu_core_processor_type, nullptr, &num_cpu_physicalCores)) && + (GOAMDSMI_VALUE_0 != num_cpu_physicalCores) && + (AMDSMI_STATUS_SUCCESS == amdsmi_get_processor_handles_by_type(amdsmi_cpusocket_handle_all_socket[cpu_socket_counter], cpu_core_processor_type, &amdsmi_processor_handle_all_cpu_physicalCore_across_socket[num_cpu_physicalCore_inAllSocket], &num_cpu_physicalCores))) + { + num_cpu_physicalCore_inAllSocket = num_cpu_physicalCore_inAllSocket+num_cpu_physicalCores; + } + num_cpu_inAllSocket = num_cpu_inAllSocket+num_cpu; + } + } + } + else if(GOAMDSMI_GPU_INIT == goamdsmi_Init) + { + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_2)) {printf("AMDSMI, Status, Going to enumurate only GPU\n");} + gpuInitCompleted = true; + + if (GOAMDSMI_STATUS_SUCCESS == check_amdgpu_driver()) + { + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_2)) {printf("AMDSMI, Status, Identified GPU Driver and going to enumurate only GPU\n");} + + if( (AMDSMI_STATUS_SUCCESS != amdsmi_init(AMDSMI_INIT_AMD_GPUS)) || + (AMDSMI_STATUS_SUCCESS != amdsmi_get_socket_handles(&num_gpuSockets, nullptr)) || + (AMDSMI_STATUS_SUCCESS != amdsmi_get_socket_handles(&num_gpuSockets, &amdsmi_gpusocket_handle_all_socket[0])) || + (GOAMDSMI_VALUE_0 == num_gpuSockets)) + { + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_1)) {printf("AMDSMI, Failed, AMDSMIGPUInit:0, GpuNumSockets=0\n");} + return GOAMDSMI_STATUS_FAILURE; + } + } + else + { + if 
(enable_debug_level(GOAMDSMI_DEBUG_LEVEL_3)) {printf("AMDSMI, Status, Missing GPU Driver and not going to enumurate only GPU\n");} + } + + //GPU + for(uint32_t gpu_socket_counter = 0; gpu_socket_counter < num_gpuSockets; gpu_socket_counter++) + { + uint32_t num_gpu_devices = GOAMDSMI_VALUE_0; + + processor_type_t gpu_device_processor_type = AMDSMI_PROCESSOR_TYPE_AMD_GPU; + if( (AMDSMI_STATUS_SUCCESS == amdsmi_get_processor_handles_by_type(amdsmi_gpusocket_handle_all_socket[gpu_socket_counter], gpu_device_processor_type, nullptr, &num_gpu_devices)) && + (GOAMDSMI_VALUE_0 != num_gpu_devices) && + (AMDSMI_STATUS_SUCCESS == amdsmi_get_processor_handles_by_type(amdsmi_gpusocket_handle_all_socket[gpu_socket_counter], gpu_device_processor_type, &amdsmi_processor_handle_all_gpu_device_across_socket[num_gpu_devices_inAllSocket], &num_gpu_devices))) + { + num_gpu_devices_inAllSocket = num_gpu_devices_inAllSocket+num_gpu_devices; + } + } + } + + //CPU + if((GOAMDSMI_CPU_INIT == goamdsmi_Init) && ((GOAMDSMI_VALUE_0 == num_cpuSockets)||(GOAMDSMI_VALUE_0 == num_cpu_inAllSocket)||(GOAMDSMI_VALUE_0 == num_cpu_physicalCore_inAllSocket))) + { + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_1)) {printf("AMDSMI, Failed, CPU Enumuration Failed AMDSMICPUInit:%d, CpuSocketCount:%d, CpuCount:%d, CpuPhysicalCoreCount:%d,\n", GOAMDSMI_STATUS_FAILURE, num_cpuSockets, num_cpu_inAllSocket, num_cpu_physicalCore_inAllSocket);} + return GOAMDSMI_STATUS_FAILURE; + } + + //GPU + if((GOAMDSMI_GPU_INIT == goamdsmi_Init) && ((GOAMDSMI_VALUE_0 == num_gpuSockets)||(GOAMDSMI_VALUE_0 == num_gpu_devices_inAllSocket))) + { + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_1)) {printf("AMDSMI, Failed, GPU Enumuration Failed AMDSMIGPUInit:%d, GpuSocketCount:%d, GpuCount:%d\n", GOAMDSMI_STATUS_FAILURE, num_gpuSockets, num_gpu_devices_inAllSocket);} + return GOAMDSMI_STATUS_FAILURE; + } + + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_1)) + { + if((GOAMDSMI_CPU_INIT == goamdsmi_Init) || apuInitCompleted) 
printf("AMDSMI, Status, AMDSMICPUInit:%d, CpuSocketCount:%d, CpuCount:%d, CpuPhysicalCoreCount:%d,\n", GOAMDSMI_STATUS_SUCCESS, num_cpuSockets, num_cpu_inAllSocket, num_cpu_physicalCore_inAllSocket); + if((GOAMDSMI_GPU_INIT == goamdsmi_Init) || apuInitCompleted) printf("AMDSMI, Status, AMDSMIGPUInit:%d, GpuSocketCount:%d, GpuCount:%d\n", GOAMDSMI_STATUS_SUCCESS, num_gpuSockets, num_gpu_devices_inAllSocket); + } + + return GOAMDSMI_STATUS_SUCCESS; +} +////////////////////////////////////////////////------------CPU------------//////////////////////////////////////////////// +bool goamdsmi_cpu_init() +{ + bool cpu_init_success = false; + if(GOAMDSMI_STATUS_SUCCESS == go_shim_amdsmiapu_init(GOAMDSMI_CPU_INIT)) + { + if((num_cpu_inAllSocket) && (num_cpu_physicalCore_inAllSocket)) cpu_init_success = true; + } + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_1)) {printf("AMDSMI, %s, InitAMDSMICPUInit:%d, CpuSocketCount:%d, CpuCount:%d, CpuPhysicalCoreCount:%d,\n", cpu_init_success?"Success":"Failed", cpu_init_success?1:0, num_cpuSockets, num_cpu_inAllSocket, num_cpu_physicalCore_inAllSocket);} + return cpu_init_success; +} + +uint32_t goamdsmi_cpu_threads_per_core_get() +{ + bool readSuccess = false; + uint32_t threads_per_core_temp = GOAMDSMI_VALUE_0; + + if((AMDSMI_STATUS_SUCCESS == amdsmi_get_threads_per_core(&threads_per_core_temp))) readSuccess = true; + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_1)) {printf("AMDSMI, %s, CpuThreadsPerCore:%lu\n", readSuccess?"Success":"Failed", (unsigned long)(threads_per_core_temp));} + + return threads_per_core_temp; +} + +uint32_t goamdsmi_cpu_number_of_threads_get() +{ + bool readSuccess = false; + uint32_t number_of_threads = GOAMDSMI_VALUE_0; + uint32_t num_threads_per_core = goamdsmi_cpu_threads_per_core_get(); + if(0 != num_threads_per_core) + { + readSuccess = true; + number_of_threads = num_cpu_physicalCore_inAllSocket*num_threads_per_core; + } + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_1)) {printf("AMDSMI, %s, 
CpuNumThreads:%lu\n", readSuccess?"Success":"Failed", (unsigned long)(number_of_threads));} + return number_of_threads; +} + +uint32_t goamdsmi_cpu_number_of_sockets_get() +{ + uint32_t number_of_sockets = num_cpuSockets; + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_1)) {printf("AMDSMI, Success, CpuNumSockets:%lu\n", (unsigned long)(number_of_sockets));} + return number_of_sockets; +} + +uint64_t goamdsmi_cpu_core_energy_get(uint32_t thread_index) +{ + bool readSuccess = false; + uint64_t core_energy_temp = GOAMDSMI_UINT64_MAX; + uint32_t physicalCore_index = thread_index%num_cpu_physicalCore_inAllSocket; + + if (AMDSMI_STATUS_SUCCESS == amdsmi_get_cpu_core_energy(amdsmi_processor_handle_all_cpu_physicalCore_across_socket[physicalCore_index], &core_energy_temp)) readSuccess = true; + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_1)) {printf("AMDSMI, %s for Thread:%d PC:%d, CpuCoreEnergy:%llu, CpuCoreEnergyJoules:%.6f, CpuCoreEnergyKJoules:%.9f\n", readSuccess?"Success":"Failed", thread_index, physicalCore_index, (unsigned long long)(core_energy_temp), ((double)(core_energy_temp))/1000000, ((double)(core_energy_temp))/1000000000);} + + return core_energy_temp; +} + +uint64_t goamdsmi_cpu_socket_energy_get(uint32_t socket_index) +{ + bool readSuccess = false; + uint64_t socket_energy_temp = GOAMDSMI_UINT64_MAX; + if ((AMDSMI_STATUS_SUCCESS == amdsmi_get_cpu_socket_energy(amdsmi_processor_handle_all_cpu_across_socket[socket_index], &socket_energy_temp))) readSuccess = true; + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_1)) {printf("AMDSMI, %s for Socket:%d, CpuSocketEnergy:%llu, CpuSocketEnergyJoules:%.6f, CpuSocketEnergyKJoules:%.9f\n", readSuccess?"Success":"Failed", socket_index, (unsigned long long)(socket_energy_temp), ((double)(socket_energy_temp))/1000000, ((double)(socket_energy_temp))/1000000000);} + + return socket_energy_temp; +} + +uint32_t goamdsmi_cpu_prochot_status_get(uint32_t socket_index) +{ + bool readSuccess = false; + uint32_t prochot_temp = 
GOAMDSMI_UINT32_MAX; + if ((AMDSMI_STATUS_SUCCESS == amdsmi_get_cpu_prochot_status(amdsmi_processor_handle_all_cpu_across_socket[socket_index], &prochot_temp))) readSuccess = true; + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_1)) {printf("AMDSMI, %s for Socket:%d, CpuProchotStatus:%lu\n", readSuccess?"Success":"Failed", socket_index, (unsigned long)(prochot_temp));} + + return prochot_temp; +} + +uint32_t goamdsmi_cpu_socket_power_get(uint32_t socket_index) +{ + bool readSuccess = false; + uint32_t socket_power_temp = GOAMDSMI_UINT32_MAX; + if ((AMDSMI_STATUS_SUCCESS == amdsmi_get_cpu_socket_power(amdsmi_processor_handle_all_cpu_across_socket[socket_index], &socket_power_temp))) readSuccess = true; + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_1)) {printf("AMDSMI, %s for Socket:%d, CpuSocketPower:%lu, CpuSocketPowerWatt:%.3f\n", readSuccess?"Success":"Failed", socket_index, (unsigned long)(socket_power_temp), ((double)(socket_power_temp))/1000);} + + return socket_power_temp; +} + +uint32_t goamdsmi_cpu_socket_power_cap_get(uint32_t socket_index) +{ + bool readSuccess = false; + uint32_t socket_power_cap_temp = GOAMDSMI_UINT32_MAX; + if ((AMDSMI_STATUS_SUCCESS == amdsmi_get_cpu_socket_power_cap(amdsmi_processor_handle_all_cpu_across_socket[socket_index], &socket_power_cap_temp))) readSuccess = true; + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_1)) {printf("AMDSMI, %s for Socket:%d, CpuSocketPowerCap:%lu, CpuSocketPowerCapWatt:%.3f\n", readSuccess?"Success":"Failed", socket_index, (unsigned long)(socket_power_cap_temp), ((double)(socket_power_cap_temp))/1000);} + + return socket_power_cap_temp; +} + +uint32_t goamdsmi_cpu_core_boostlimit_get(uint32_t thread_index) +{ + bool readSuccess = false; + uint32_t core_boostlimit_temp = GOAMDSMI_UINT32_MAX; + uint32_t physicalCore_index = thread_index%num_cpu_physicalCore_inAllSocket; + + if (AMDSMI_STATUS_SUCCESS == 
amdsmi_get_cpu_core_boostlimit(amdsmi_processor_handle_all_cpu_physicalCore_across_socket[physicalCore_index], &core_boostlimit_temp)) readSuccess = true; + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_1)) {printf("AMDSMI, %s for Thread:%d PC:%d, CpuCoreBoostLimit:%lu\n", readSuccess?"Success":"Failed", thread_index, physicalCore_index, (unsigned long)(core_boostlimit_temp));} + + return core_boostlimit_temp; +} + +////////////////////////////////////////////////------------GPU------------//////////////////////////////////////////////// +bool goamdsmi_gpu_init() +{ + bool gpu_init_success = false; + if(GOAMDSMI_STATUS_SUCCESS == go_shim_amdsmiapu_init(GOAMDSMI_GPU_INIT)) + { + if((num_gpu_devices_inAllSocket)) gpu_init_success = true; + } + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_1)) {printf("AMDSMI, %s, InitAMDSMIGPUInit:%d, GpuSocketCount:%d, GpuCount:%d\n", gpu_init_success?"Success":"Failed", gpu_init_success?1:0, num_gpuSockets, num_gpu_devices_inAllSocket);} + + return gpu_init_success; +} + +bool goamdsmi_gpu_shutdown() +{ + return false; +} + +uint32_t goamdsmi_gpu_num_monitor_devices() +{ + uint32_t gpu_num_monitor_devices = num_gpu_devices_inAllSocket; + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_1)) {printf("AMDSMI, Success, GpuNumMonitorDevices:%lu\n", (unsigned long)(gpu_num_monitor_devices));} + return gpu_num_monitor_devices; +} + +char* goamdsmi_gpu_dev_name_get(uint32_t dv_ind) +{ + uint32_t len = 256; + char* dev_name = (char*)malloc(sizeof(char)*len);dev_name[0] = '\0'; + strcpy(dev_name, GOAMDSMI_STRING_NA); + + return dev_name; +} + +uint16_t goamdsmi_gpu_dev_id_get(uint32_t dv_ind) +{ + bool readSuccess = false; + uint16_t gpu_dev_id_temp = GOAMDSMI_UINT16_MAX; + + if((dv_ind < num_gpu_devices_inAllSocket) && (AMDSMI_STATUS_SUCCESS == amdsmi_get_gpu_id(amdsmi_processor_handle_all_gpu_device_across_socket[dv_ind], &gpu_dev_id_temp))) readSuccess = true; + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_1)) {printf("AMDSMI, %s for Gpu:%d, 
GpuDevId:%d\n", readSuccess?"Success":"Failed", dv_ind, gpu_dev_id_temp);} + + return gpu_dev_id_temp; +} + +uint64_t goamdsmi_gpu_dev_pci_id_get(uint32_t dv_ind) +{ + uint64_t gpu_pci_id = GOAMDSMI_UINT64_MAX; + return gpu_pci_id; +} + +char* goamdsmi_gpu_dev_vendor_name_get(uint32_t dv_ind) +{ + uint32_t len = 256; + char* gpu_vendor_name = (char*)malloc(sizeof(char)*len);gpu_vendor_name[0] = '\0'; + strcpy(gpu_vendor_name, GOAMDSMI_STRING_NA); + + return gpu_vendor_name; +} + +char* goamdsmi_gpu_dev_vbios_version_get(uint32_t dv_ind) +{ + uint32_t len = 256; + char* vbios_version = (char*)malloc(sizeof(char)*len);vbios_version[0] = '\0'; + strcpy(vbios_version, GOAMDSMI_STRING_NA); + + return vbios_version; +} + +uint64_t goamdsmi_gpu_dev_power_cap_get(uint32_t dv_ind) +{ + bool readSuccess = false; + uint64_t gpu_power_cap = GOAMDSMI_UINT64_MAX; + amdsmi_power_cap_info_t amdsmi_power_cap_info_temp = {0}; + + if((dv_ind < num_gpu_devices_inAllSocket) && (AMDSMI_STATUS_SUCCESS == amdsmi_get_power_cap_info(amdsmi_processor_handle_all_gpu_device_across_socket[dv_ind], GPU_SENSOR_0, &amdsmi_power_cap_info_temp))) + { + readSuccess = true; + gpu_power_cap = amdsmi_power_cap_info_temp.power_cap; + } + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_1)) {printf("AMDSMI, %s for Gpu:%d, GpuPowerCap:%llu, GpuPowerCapInWatt:%.6f\n", readSuccess?"Success":"Failed", dv_ind, (unsigned long long)(gpu_power_cap), ((double)(gpu_power_cap))/1000000);} + return gpu_power_cap; +} + +uint64_t goamdsmi_gpu_dev_power_get(uint32_t dv_ind) +{ + uint64_t gpu_power = GOAMDSMI_UINT64_MAX; + uint64_t gpu_power_temp = GOAMDSMI_UINT64_MAX; + amdsmi_power_info_t amdsmi_power_info_temp = {0}; + + if((dv_ind < num_gpu_devices_inAllSocket) && (AMDSMI_STATUS_SUCCESS == amdsmi_get_power_info(amdsmi_processor_handle_all_gpu_device_across_socket[dv_ind], &amdsmi_power_info_temp))) + { + gpu_power_temp = amdsmi_power_info_temp.average_socket_power; + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_2)) 
{printf("AMDSMI, Success for Gpu:%d, GpuPowerAverage:%llu, GpuPowerAverageinWatt:%.6f\n", dv_ind, (unsigned long long)(gpu_power_temp), ((double)(gpu_power_temp))/1000000);} + + if(MAX_GPU_POWER_FROM_DRIVER == gpu_power_temp) + { + gpu_power_temp = amdsmi_power_info_temp.current_socket_power; + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_2)) {printf("AMDSMI, Success for Gpu:%d, GpuPowerCurrent:%llu, GpuPowerCurrentinWatt:%.6f\n", dv_ind, (unsigned long long)(gpu_power_temp), ((double)(gpu_power_temp))/1000000);} + } + gpu_power = gpu_power_temp; + gpu_power = (gpu_power)*1000000;//to maintain backward compatibity with old ROCM SMI + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_1)) {printf("AMDSMI, Success for Gpu:%d, GpuPower:%llu, GpuPowerinWatt:%.6f\n", dv_ind, (unsigned long long)(gpu_power), ((double)(gpu_power))/1000000);} + return gpu_power; + } + + amdsmi_gpu_metrics_t metrics = {0}; + if((dv_ind < num_gpu_devices_inAllSocket) && (AMDSMI_STATUS_SUCCESS == amdsmi_get_gpu_metrics_info(amdsmi_processor_handle_all_gpu_device_across_socket[dv_ind], &metrics))) + { + gpu_power_temp = metrics.average_socket_power; + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_2)) {printf("AMDSMI, Success for Gpu:%d, GpuPowerAverageFromMetrics:%llu, GpuPowerAverageFromMetricsinWatt:%.6f\n", dv_ind, (unsigned long long)gpu_power_temp, ((double)(gpu_power_temp))/1000000);} + + if(MAX_GPU_POWER_FROM_DRIVER == gpu_power_temp) + { + gpu_power_temp = metrics.current_socket_power; + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_2)) {printf("AMDSMI, Success for Gpu:%d, GpuPowerCurrentFromMetrics:%llu, GpuPowerCurrentFromMetricsinWatt:%.6f\n", dv_ind, (unsigned long long)gpu_power_temp, ((double)(gpu_power_temp))/1000000);} + } + gpu_power = gpu_power_temp; + gpu_power = (gpu_power)*1000000;//to maintain backward compatibity with old ROCM SMI + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_1)) {printf("AMDSMI, Success for Gpu:%d, GpuPowerFromMetrics:%llu, GpuPowerFromMetricsinWatt:%.6f\n", 
dv_ind, (unsigned long long)(gpu_power), ((double)(gpu_power))/1000000);} + return gpu_power; + } + + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_1)) {printf("AMDSMI, Failed for Gpu:%d, GpuPower:%llu, GpuPowerinWatt:%.6f\n", dv_ind, (unsigned long long)(gpu_power), ((double)(gpu_power))/1000000);} + return gpu_power; +} + +uint64_t goamdsmi_gpu_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor, uint32_t metric) +{ + bool readSuccess = false; + uint64_t gpu_temperature = GOAMDSMI_UINT64_MAX; + uint64_t gpu_temperature_temp = GOAMDSMI_UINT64_MAX; + + if((dv_ind < num_gpu_devices_inAllSocket) && (AMDSMI_STATUS_SUCCESS == amdsmi_get_temp_metric(amdsmi_processor_handle_all_gpu_device_across_socket[dv_ind], sensor, metric, &gpu_temperature_temp))) + { + readSuccess = true; + gpu_temperature = gpu_temperature_temp; + gpu_temperature = (gpu_temperature)*1000;//to maintain backward compatibity with old ROCM SMI + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_1)) {printf("AMDSMI, %s for Gpu:%d Sensor:%d Metric:%d, GpuTemperature:%llu, GpuTemperatureInDegree:%.3f\n", readSuccess?"Success":"Failed", dv_ind, sensor, metric, (unsigned long long)(gpu_temperature), ((double)(gpu_temperature))/1000);} + } + return gpu_temperature; +} + +uint32_t goamdsmi_gpu_dev_overdrive_level_get(uint32_t dv_ind) +{ + uint32_t gpu_overdrive_level = GOAMDSMI_UINT32_MAX; + return gpu_overdrive_level; +} + +uint32_t goamdsmi_gpu_dev_mem_overdrive_level_get(uint32_t dv_ind) +{ + uint32_t gpu_mem_overdrive_level = GOAMDSMI_UINT32_MAX; + return gpu_mem_overdrive_level; +} + +uint32_t goamdsmi_gpu_dev_perf_level_get(uint32_t dv_ind) +{ + uint32_t gpu_perf = GOAMDSMI_UINT32_MAX; + return gpu_perf; +} + +uint64_t goamdsmi_gpu_dev_gpu_clk_freq_get_sclk(uint32_t dv_ind) +{ + bool readSuccess = false; + uint64_t gpu_sclk_freq = GOAMDSMI_UINT64_MAX; + amdsmi_frequencies_t freq = {0}; + + if((dv_ind < num_gpu_devices_inAllSocket) && (AMDSMI_STATUS_SUCCESS == 
amdsmi_get_clk_freq(amdsmi_processor_handle_all_gpu_device_across_socket[dv_ind], AMDSMI_CLK_TYPE_SYS, &freq))) + { + readSuccess = true; + gpu_sclk_freq = freq.frequency[freq.current]; + } + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_1)) {printf("AMDSMI, %s for Gpu:%d, GpuSclkFreq:%llu, GpuSclkFreqMhz:%.6f\n", readSuccess?"Success":"Failed", dv_ind, (unsigned long long)(gpu_sclk_freq), ((double)(gpu_sclk_freq))/1000000);} + + return gpu_sclk_freq; +} + +uint64_t goamdsmi_gpu_dev_gpu_clk_freq_get_mclk(uint32_t dv_ind) +{ + bool readSuccess = false; + uint64_t gpu_memclk_freq = GOAMDSMI_UINT64_MAX; + amdsmi_frequencies_t freq = {0}; + + if((dv_ind < num_gpu_devices_inAllSocket) && (AMDSMI_STATUS_SUCCESS == amdsmi_get_clk_freq(amdsmi_processor_handle_all_gpu_device_across_socket[dv_ind], AMDSMI_CLK_TYPE_MEM, &freq))) + { + readSuccess = true; + gpu_memclk_freq = freq.frequency[freq.current]; + } + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_1)) {printf("AMDSMI, %s for Gpu:%d, GpuMclkFreq:%llu, GpuMclkFreqMhz:%.6f\n", readSuccess?"Success":"Failed", dv_ind, (unsigned long long)(gpu_memclk_freq), ((double)(gpu_memclk_freq))/1000000);} + + return gpu_memclk_freq; +} + +uint64_t goamdsmi_gpu_od_volt_freq_range_min_get_sclk(uint32_t dv_ind) +{ + uint64_t gpu_min_sclk = GOAMDSMI_UINT64_MAX; + return gpu_min_sclk; +} + +uint64_t goamdsmi_gpu_od_volt_freq_range_min_get_mclk(uint32_t dv_ind) +{ + uint64_t gpu_min_memclk = GOAMDSMI_UINT64_MAX; + return gpu_min_memclk; +} + +uint64_t goamdsmi_gpu_od_volt_freq_range_max_get_sclk(uint32_t dv_ind) +{ + uint64_t gpu_max_sclk = GOAMDSMI_UINT64_MAX; + return gpu_max_sclk; +} + +uint64_t goamdsmi_gpu_od_volt_freq_range_max_get_mclk(uint32_t dv_ind) +{ + uint64_t gpu_max_memclk = GOAMDSMI_UINT64_MAX; + return gpu_max_memclk; +} + +uint32_t goamdsmi_gpu_dev_gpu_busy_percent_get(uint32_t dv_ind) +{ + bool readSuccess = false; + uint32_t gpu_busy_percent = GOAMDSMI_UINT32_MAX; + amdsmi_engine_usage_t amdsmi_engine_usage_temp; + + 
if(AMDSMI_STATUS_SUCCESS == amdsmi_get_gpu_activity(amdsmi_processor_handle_all_gpu_device_across_socket[dv_ind], &amdsmi_engine_usage_temp)) + { + readSuccess = true; + gpu_busy_percent = amdsmi_engine_usage_temp.gfx_activity; + } + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_1)) {printf("AMDSMI, %s for Gpu:%d, GpuBusyPerc:%lu\n", readSuccess?"Success":"Failed", dv_ind, (unsigned long)(gpu_busy_percent));} + + return gpu_busy_percent; +} + +uint64_t goamdsmi_gpu_dev_gpu_memory_busy_percent_get(uint32_t dv_ind) +{ + bool readSuccess = false; + uint64_t gpu_memory_busy_percent = GOAMDSMI_UINT64_MAX; + uint64_t gpu_memory_usage_temp = GOAMDSMI_UINT64_MAX; + uint64_t gpu_memory_total_temp = GOAMDSMI_UINT64_MAX; + + if( (AMDSMI_STATUS_SUCCESS == amdsmi_get_gpu_memory_usage(amdsmi_processor_handle_all_gpu_device_across_socket[dv_ind], AMDSMI_MEM_TYPE_VRAM, &gpu_memory_usage_temp))&& + (AMDSMI_STATUS_SUCCESS == amdsmi_get_gpu_memory_total(amdsmi_processor_handle_all_gpu_device_across_socket[dv_ind], AMDSMI_MEM_TYPE_VRAM, &gpu_memory_total_temp))) + { + readSuccess = true; + gpu_memory_busy_percent = (uint64_t)(gpu_memory_usage_temp*100)/gpu_memory_total_temp; + } + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_1)) {printf("AMDSMI, %s for Gpu:%d, GpuMemoryBusyPerc:%llu\n", readSuccess?"Success":"Failed", dv_ind, (unsigned long long)(gpu_memory_busy_percent));} + + return gpu_memory_busy_percent; +} + +uint64_t goamdsmi_gpu_dev_gpu_memory_usage_get(uint32_t dv_ind) +{ + bool readSuccess = false; + uint64_t gpu_memory_usage = GOAMDSMI_UINT64_MAX; + uint64_t gpu_memory_usage_temp = GOAMDSMI_UINT64_MAX; + + if(AMDSMI_STATUS_SUCCESS == amdsmi_get_gpu_memory_usage(amdsmi_processor_handle_all_gpu_device_across_socket[dv_ind], AMDSMI_MEM_TYPE_VRAM, &gpu_memory_usage_temp)) + { + readSuccess = true; + gpu_memory_usage = (uint64_t)gpu_memory_usage_temp; + } + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_1)) {printf("AMDSMI, %s for Gpu:%d, GpuMemoryUsage:%llu\n", 
readSuccess?"Success":"Failed", dv_ind, (unsigned long long)(gpu_memory_usage));} + + return gpu_memory_usage; +} + +uint64_t goamdsmi_gpu_dev_gpu_memory_total_get(uint32_t dv_ind) +{ + bool readSuccess = false; + uint64_t gpu_memory_total = GOAMDSMI_UINT64_MAX; + uint64_t gpu_memory_total_temp = GOAMDSMI_UINT64_MAX; + + if(AMDSMI_STATUS_SUCCESS == amdsmi_get_gpu_memory_total(amdsmi_processor_handle_all_gpu_device_across_socket[dv_ind], AMDSMI_MEM_TYPE_VRAM, &gpu_memory_total_temp)) + { + readSuccess = true; + gpu_memory_total = (uint64_t)gpu_memory_total_temp; + } + if (enable_debug_level(GOAMDSMI_DEBUG_LEVEL_1)) {printf("AMDSMI, %s for Gpu:%d, GpuMemoryTotal:%llu\n", readSuccess?"Success":"Failed", dv_ind, (unsigned long long)(gpu_memory_total));} + + return gpu_memory_total; +} +#else +////////////////////////////////////////////////------------CPU------------//////////////////////////////////////////////// +bool goamdsmi_cpu_init() {return false;} +uint32_t goamdsmi_cpu_threads_per_core_get() {return GOAMDSMI_VALUE_0;} +uint32_t goamdsmi_cpu_number_of_threads_get() {return GOAMDSMI_VALUE_0;} +uint32_t goamdsmi_cpu_number_of_sockets_get() {return GOAMDSMI_VALUE_0;} +uint64_t goamdsmi_cpu_core_energy_get(uint32_t thread_index) {return GOAMDSMI_UINT64_MAX;} +uint64_t goamdsmi_cpu_socket_energy_get(uint32_t socket_index) {return GOAMDSMI_UINT64_MAX;} +uint32_t goamdsmi_cpu_prochot_status_get(uint32_t socket_index) {return GOAMDSMI_UINT32_MAX;} +uint32_t goamdsmi_cpu_socket_power_get(uint32_t socket_index) {return GOAMDSMI_UINT32_MAX;} +uint32_t goamdsmi_cpu_socket_power_cap_get(uint32_t socket_index) {return GOAMDSMI_UINT32_MAX;} +uint32_t goamdsmi_cpu_core_boostlimit_get(uint32_t thread_index) {return GOAMDSMI_UINT32_MAX;} + +////////////////////////////////////////////////------------GPU------------//////////////////////////////////////////////// +bool goamdsmi_gpu_init() {return false;} +bool goamdsmi_gpu_shutdown() {return false;} +uint32_t 
goamdsmi_gpu_num_monitor_devices() {return GOAMDSMI_VALUE_0;} +char* goamdsmi_gpu_dev_name_get(uint32_t dv_ind) {return NULL;} +uint16_t goamdsmi_gpu_dev_id_get(uint32_t dv_ind) {return GOAMDSMI_UINT16_MAX;} +uint64_t goamdsmi_gpu_dev_pci_id_get(uint32_t dv_ind) {return GOAMDSMI_UINT64_MAX;} +char* goamdsmi_gpu_dev_vendor_name_get(uint32_t dv_ind) {return NULL;} +char* goamdsmi_gpu_dev_vbios_version_get(uint32_t dv_ind) {return NULL;} +uint64_t goamdsmi_gpu_dev_power_cap_get(uint32_t dv_ind) {return GOAMDSMI_UINT64_MAX;} +uint64_t goamdsmi_gpu_dev_power_get(uint32_t dv_ind) {return GOAMDSMI_UINT64_MAX;} +uint64_t goamdsmi_gpu_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor, uint32_t metric) {return GOAMDSMI_UINT64_MAX;} +uint32_t goamdsmi_gpu_dev_overdrive_level_get(uint32_t dv_ind) {return GOAMDSMI_UINT32_MAX;} +uint32_t goamdsmi_gpu_dev_mem_overdrive_level_get(uint32_t dv_ind) {return GOAMDSMI_UINT32_MAX;} +uint32_t goamdsmi_gpu_dev_perf_level_get(uint32_t dv_ind) {return GOAMDSMI_UINT32_MAX;} +uint64_t goamdsmi_gpu_dev_gpu_clk_freq_get_sclk(uint32_t dv_ind) {return GOAMDSMI_UINT64_MAX;} +uint64_t goamdsmi_gpu_dev_gpu_clk_freq_get_mclk(uint32_t dv_ind) {return GOAMDSMI_UINT64_MAX;} +uint64_t goamdsmi_gpu_od_volt_freq_range_min_get_sclk(uint32_t dv_ind) {return GOAMDSMI_UINT64_MAX;} +uint64_t goamdsmi_gpu_od_volt_freq_range_min_get_mclk(uint32_t dv_ind) {return GOAMDSMI_UINT64_MAX;} +uint64_t goamdsmi_gpu_od_volt_freq_range_max_get_sclk(uint32_t dv_ind) {return GOAMDSMI_UINT64_MAX;} +uint64_t goamdsmi_gpu_od_volt_freq_range_max_get_mclk(uint32_t dv_ind) {return GOAMDSMI_UINT64_MAX;} +uint32_t goamdsmi_gpu_dev_gpu_busy_percent_get(uint32_t dv_ind) {return GOAMDSMI_UINT32_MAX;} +uint64_t goamdsmi_gpu_dev_gpu_memory_busy_percent_get(uint32_t dv_ind) {return GOAMDSMI_UINT64_MAX;} +uint64_t goamdsmi_gpu_dev_gpu_memory_usage_get(uint32_t dv_ind) {return GOAMDSMI_UINT64_MAX;} +uint64_t goamdsmi_gpu_dev_gpu_memory_total_get(uint32_t dv_ind) {return 
GOAMDSMI_UINT64_MAX;} +#endif diff --git a/goamdsmi_shim/smiwrapper/amdsmi_go_shim.h b/goamdsmi_shim/smiwrapper/amdsmi_go_shim.h new file mode 100644 index 0000000000..14457d1a95 --- /dev/null +++ b/goamdsmi_shim/smiwrapper/amdsmi_go_shim.h @@ -0,0 +1,574 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright (c) 2024, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Developed by: + * + * AMD Research and AMD Software Development + * + * Advanced Micro Devices, Inc. + * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sellcopies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * - The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * Except as contained in this notice, the name of the Advanced Micro Devices, + * Inc. shall not be used in advertising or otherwise to promote the sale, use + * or other dealings in this Software without prior written authorization from + * the Advanced Micro Devices, Inc. 
+ * + */ + +#include "goamdsmi.h" +////////////////////////////////////////////////------------CPU------------//////////////////////////////////////////////// +/** + * @brief Go language stub to initialize the AMDSMI library + * + * @retval ::bool value of true upon success + * @retval false is returned upon failure. + * + */ +bool goamdsmi_cpu_init(); + +/** + * @brief Go language stub to get the core energy for a given core + * + * @details Given a core index @p num, this function will call the + * esmi_core_energy_get() function to update the @p penergy in micro Joules. + * This value is then passed as a uint64_t val to the Go routine that called it. + * + * @param[in] num is the core index + * + * @retval ::uint64_t value of the penergy in micro Joules. + * @retval zero is returned upon failure. + * + */ +uint64_t goamdsmi_cpu_core_energy_get(uint32_t num); + +/** + * @brief Go language stub to get the socket energy for a given socket + * + * @details Given a socket index @p socket_idx, this function will call the + * esmi_socket_energy_get() function to get the socket energy counter of an + * online cpu in that socket. This value is then passed as a uint64_t val to + * the Go routine that called it. + * + * @param[in] socket_idx is the socket index + * + * @retval ::uint64_t value of the socket energy counter + * @retval zero is returned upon failure. + * + */ +uint64_t goamdsmi_cpu_socket_energy_get(uint32_t socket_idx); + +/** + * @brief Go language stub to get normalized status of + * the processor's PROCHOT status. + * 1 - PROCHOT active, 0 - PROCHOT inactive + * + * @details Given a socket index @p socket_idx and this function will get + * PROCHOT at @p prochot. + * + * @param[in] socket_idx a socket index + * + * @retval ::uint32_t value of the prochot status + * @retval -1 is returned upon failure or if status is inactive. 
+ * + */ +uint32_t goamdsmi_cpu_prochot_status_get(uint32_t socket_idx); + +/** + * @brief Go language stub to get the instantaneous power + * consumption of the provided socket. + * + * @details Given a socket index @p sock_ind this function will + * get the current power consumption (in milliwatts). + * + * @param[in] sock_ind a socket index + * + * @retval ::uint32_t value of the socket power + * @retval -1 is returned upon failure. + * + */ +uint32_t goamdsmi_cpu_socket_power_get(uint32_t sock_ind); + +/** + * @brief Go language stub to get the current power cap value + * for a given socket. + * + * @details This function will return the valid power cap @p pcap for a given + * socket @p sock_ind, this value will be used by the system to limit + * the power usage (in milliwatts). + * + * @param[in] sock_ind a socket index + * + * @retval ::uint32_t value of the socket power cap + * @retval -1 is returned upon failure. + * + */ +uint32_t goamdsmi_cpu_socket_power_cap_get(uint32_t sock_ind); + +/** + * @brief Go language stub to get the boostlimit value for a given core + * + * @details This function will return the core's current boost limit + * @p boostlimit for a particular @p socket + * + * @param[in] socket a socket index + * + * @retval ::uint32_t value of the boostlimit + * @retval -1 is returned upon failure.. + * + */ +uint32_t goamdsmi_cpu_core_boostlimit_get(uint32_t socket); + +/** + * @brief Go stub to get the number of threads per core in the system + * + * @retval ::Number of threads per core + * @retval Zero is returned upon failure. + */ +uint32_t goamdsmi_cpu_threads_per_core_get(); + +/** + * @brief Go stub to get the number of threads available in the system + * + * @retval ::Number of threads + * @retval Zero is returned upon failure. 
+ */ +uint32_t goamdsmi_cpu_number_of_threads_get(); + +/** + * @brief Go stub to get the number of threads per core + * in the system + * + * @retval ::Number of threads per core + * @retval Zero is returned upon failure. + */ +uint32_t goamdsmi_cpu_threads_per_core_get(); + +/** + * @brief Go stub to get the number of threads available in the system + * + * @retval ::Number of threads + * @retval Zero is returned upon failure. + */ +uint32_t goamdsmi_cpu_number_of_threads_get(); + +/** + * @brief Go stub to get the total number of processor sockets + * available in the system + * + * @retval ::uint32_t number of sockets + * @retval Zero is returned upon failure. + */ +uint32_t goamdsmi_cpu_number_of_sockets_get(); + +////////////////////////////////////////////////------------GPU------------//////////////////////////////////////////////// +/** + * @brief Go language stub to initialize the ROCm-SMI library + * + * @retval ::bool value of true upon success + * @retval false is returned upon failure. + * + */ +bool goamdsmi_gpu_init(); + +/** + * @brief Go language stub to shut down the ROCm-SMI library + * and do necessary clean up + * + * @retval ::bool value of true upon success + * @retval false is returned upon failure. + * + */ +bool goamdsmi_gpu_shutdown(); + +/** + * @brief Go language stub to get the number of GPU devices + * + * @details This function will call the rsmi_num_monitor_devices() + * function to return the number of GPU devices to be monitored. + * This value is then passed as a uint32_t val to the Go routine that + * called it. + * + * @retval ::uint32_t value of num GPUs + * @retval zero is returned upon failure. + * + */ +uint32_t goamdsmi_gpu_num_monitor_devices(); + +/** + * @brief Go language stub to get the gpu device name string + * + * @details This function will call the rsmi_dev_name_get() + * function to write the gpu device name string (up to len characters) + * for device dv_ind and return a char pointer.
This value is then + * passed as char * to the Go routine that called it. The caller of this + * function must free the allocated buffer for the device name. + * + * @param[in] ::uint32_t device index + * + * @retval ::char* device name + * @retval NA is returned upon failure. + * + */ +char* goamdsmi_gpu_dev_name_get(uint32_t dv_ind); + +/** + * @brief Go language stub to get the GPU device id + * + * @details This function will call the rsmi_dev_id_get() + * function to return the GPU device id. This value is then + * passed as a uint16_t val to the Go routine that + * called it. + * + * @param[in] ::uint32_t device index + * + * @retval ::uint16_t GPU device id + * @retval -1 is returned upon failure. + * + */ +uint16_t goamdsmi_gpu_dev_id_get(uint32_t dv_ind); + +/** + * @brief Go language stub to get the GPU unique pci id + * + * @details This function will call the rsmi_dev_pci_id_get() + * function to return the unique PCI device identifier + * associated with a device. This value is then passed as + * a uint64_t val to the Go routine that called it. + * + * @param[in] ::uint32_t device index + * + * @retval ::uint64_t value of pci id + * @retval -1 is returned upon failure. + * + */ +uint64_t goamdsmi_gpu_dev_pci_id_get(uint32_t dv_ind); + +/** + * @brief Go language stub to get the VBIOS identifier string + * + * @details This function will call the rsmi_dev_vbios_ver_get() + * function to write the VBIOS char array (up to len characters) + * for device dv_ind and return a char pointer. This value is then + * passed as char pointer to the Go routine that called it.
The caller + * of this function must free the allocated buffer for the vbios + * identifier. + * + * @param[in] ::uint32_t device index + * @param[in] ::char* vbios buffer of length + * + * @retval ::char* VBIOS identifier + * @retval NA is returned upon failure. + * + */ +char* goamdsmi_gpu_dev_vbios_version_get(uint32_t dv_ind); + +/** + * @brief Go language stub to get the vendor name + * + * @details This function will call the rsmi_dev_vendor_name_get() + * function to write the name of the vendor char array (up to len + * characters) for a device dv_ind and return a char pointer. This + * value is then passed as a char pointer to the Go routine that + * called it. The caller of this function must free the allocated + * buffer for the vendor name. + * + * @param[in] ::uint32_t device index + * + * @retval ::char* vendor name + * @retval NA is returned upon failure. + * + */ +char* goamdsmi_gpu_dev_vendor_name_get(uint32_t dv_ind); + +/** + * @brief Go language stub to get the GPU power cap + * + * @details This function will call the rsmi_dev_power_cap_get() + * function to return the gpu power cap. This value is then + * passed as a uint64_t val to the Go routine that + * called it. + * + * @param[in] ::uint32_t device index + * + * @retval ::uint64_t GPU power cap + * @retval -1 is returned upon failure. + * + */ +uint64_t goamdsmi_gpu_dev_power_cap_get(uint32_t dv_ind); + +/** + * @brief Go language stub to get the GPU power + * + * @details This function will call the rsmi_dev_power_get() + * function to return the gpu power. This value is then + * passed as a uint64_t val to the Go routine that + * called it. + * + * @param[in] ::uint32_t device index + * + * @retval ::uint64_t GPU power + * @retval -1 is returned upon failure.
+ * + */ +uint64_t goamdsmi_gpu_dev_power_get(uint32_t dv_ind); + +/** + * @brief Go language stub to get the GPU current temperature + * + * @details This function will call the rsmi_dev_temp_metric_get() + * function to return the gpu current temperature. This value is then + * passed as a uint64_t val to the Go routine that + * called it. + * + * @param[in] ::uint32_t device index, uint32_t sensor, uint32_t metric + * + * @retval ::uint64_t GPU current temperature + * @retval -1 is returned upon failure. + * + */ +uint64_t goamdsmi_gpu_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor, uint32_t metric); + +/** + * @brief Go language stub to get the overdrive level of the device + * + * @details This function will call the rsmi_dev_overdrive_level_get() + * function to return the overdrive percentage. This value is then + * passed as a uint32_t val to the Go routine that + * called it. + * + * @param[in] ::uint32_t device index + * + * @retval ::uint32_t overdrive level + * @retval -1 is returned upon failure. + * + */ +uint32_t goamdsmi_gpu_dev_overdrive_level_get(uint32_t dv_ind); + +/** + * @brief Go language stub to get the memory overdrive level of the device + * + * @details This function will call the rsmi_dev_mem_overdrive_level_get() + * function to return the memory overdrive percentage. This value is then + * passed as a uint32_t val to the Go routine that + * called it. + * + * @param[in] ::uint32_t device index + * + * @retval ::uint32_t memory overdrive level + * @retval -1 is returned upon failure. + * + */ +uint32_t goamdsmi_gpu_dev_mem_overdrive_level_get(uint32_t dv_ind); + +/** + * @brief Go language stub to get the performance level of the device + * + * @details This function will call the rsmi_dev_perf_level_get() + * function to return the rsmi_dev_perf_level_t. This value is then + * passed as a uint32_t val to the Go routine that + * called it. 
+ * + * @param[in] ::uint32_t device index + * + * @retval ::uint32_t performance level (rsmi_dev_perf_level_t) + * @retval -1 is returned upon failure. + * + */ +uint32_t goamdsmi_gpu_dev_perf_level_get(uint32_t dv_ind); + +/** + * @brief Go language stub to get the GPU SCLK limit + * + * @details This function will call the rsmi_dev_gpu_clk_freq_get() + * function to return the gpu SCLK Limit. This value is then + * passed as a uint64_t val to the Go routine that + * called it. + * + * @param[in] ::uint32_t device index + * + * @retval ::uint64_t GPU SCLK Limit + * @retval -1 is returned upon failure. + * + */ +uint64_t goamdsmi_gpu_dev_gpu_clk_freq_get_sclk(uint32_t dv_ind); + +/** + * @brief Go language stub to get the GPU MCLK limit + * + * @details This function will call the rsmi_dev_gpu_clk_freq_get() + * function to return the gpu MCLK Limit. This value is then + * passed as a uint64_t val to the Go routine that + * called it. + * + * @param[in] ::uint32_t device index + * + * @retval ::uint64_t GPU MCLK Limit + * @retval -1 is returned upon failure. + * + */ +uint64_t goamdsmi_gpu_dev_gpu_clk_freq_get_mclk(uint32_t dv_ind); + +/** + * @brief Go language stub to get the minimum supported SCLK frequency + * + * @details This function will call the rsmi_od_volt_freq_data_get() + * function to return the minimum supported SCLK frequency. + * This value is then passed as a uint64_t val to the Go routine that + * called it. + * + * @param[in] ::uint32_t device index + * + * @retval ::uint64_t minimum supported sclk frequency + * @retval -1 is returned upon failure. + * + */ +uint64_t goamdsmi_gpu_od_volt_freq_range_min_get_sclk(uint32_t dv_ind); + +/** + * @brief Go language stub to get the minimum supported MCLK frequency + * + * @details This function will call the rsmi_od_volt_freq_data_get() + * function to return the minimum supported MCLK frequency.
+ * This value is then passed as a uint64_t val to the Go routine that + * called it. + * + * @param[in] ::uint32_t device index + * + * @retval ::uint64_t minimum supported mclk frequency + * @retval -1 is returned upon failure. + * + */ +uint64_t goamdsmi_gpu_od_volt_freq_range_min_get_mclk(uint32_t dv_ind); + +/** + * @brief Go language stub to get the maximum supported SCLK frequency + * + * @details This function will call the rsmi_od_volt_freq_data_get() + * function to return the maximum supported SCLK frequency. + * This value is then passed as a uint64_t val to the Go routine that + * called it. + * + * @param[in] ::uint32_t device index + * + * @retval ::uint64_t maximum supported sclk frequency + * @retval -1 is returned upon failure. + * + */ +uint64_t goamdsmi_gpu_od_volt_freq_range_max_get_sclk(uint32_t dv_ind); + +/** + * @brief Go language stub to get the maximum supported MCLK frequency + * + * @details This function will call the rsmi_od_volt_freq_data_get() + * function to return the maximum supported MCLK frequency. + * This value is then passed as a uint64_t val to the Go routine that + * called it. + * + * @param[in] ::uint32_t device index + * + * @retval ::uint64_t maximum supported mclk frequency + * @retval -1 is returned upon failure. + * + */ +uint64_t goamdsmi_gpu_od_volt_freq_range_max_get_mclk(uint32_t dv_ind); + +/** + * @brief Go language stub to get the GPU Activity + * + * @details This function will call the rsmi_dev_gpu_activity_get() + * function to return the current GPU use. This value is then + * passed as a uint32_t val to the Go routine that + * called it. + * + * @param[in] ::uint32_t device index + * + * @retval ::uint32_t GPU Activity use + * @retval -1 is returned upon failure.
+ * + */ +uint32_t goamdsmi_gpu_dev_gpu_busy_percent_get(uint32_t dv_ind); + +/** + * @brief Go language stub to get the GPU Activity + * + * @details This function will call the rsmi_dev_gpu_activity_get() + * function to return the current GPU use. This value is then + * passed as a uint32_t val to the Go routine that + * called it. + * + * @param[in] ::uint32_t device index + * + * @retval ::uint32_t GPU Activity use + * @retval -1 is returned upon failure. + * + */ +uint32_t goamdsmi_gpu_dev_gpu_busy_percent_get(uint32_t dv_ind); + +/** + * @brief Go language stub to get the GPU Memory Use percent + * + * @details This function will call the rsmi_dev_memory_busy_percent_get() + * function to return the current device memory use percent. This value is then + * passed as a uint64_t val to the Go routine that + * called it. + * + * @param[in] ::uint32_t device index + * + * @retval ::uint64_t GPU memory use percent + * @retval -1 is returned upon failure. + * + */ +uint64_t goamdsmi_gpu_dev_gpu_memory_busy_percent_get(uint32_t dv_ind); + +/** + * @brief Go language stub to get the GPU Memory Usage + * + * @details This function will call the rsmi_dev_memory_usage_get() + * function to return the amount of memory currently being used. This value is then + * passed as a uint64_t val to the Go routine that + * called it. + * + * @param[in] ::uint32_t device index + * + * @retval ::uint64_t GPU memory usage + * @retval -1 is returned upon failure. + * + */ +uint64_t goamdsmi_gpu_dev_gpu_memory_usage_get(uint32_t dv_ind); + +/** + * @brief Go language stub to get the Total amount of GPU Memory + * + * @details This function will call the rsmi_dev_memory_total_get() + * function to return the total amount of memory. This value is then + * passed as a uint64_t val to the Go routine that + * called it.
+ * + * @param[in] ::uint32_t device index, flag, ptr to rsmi_frequencies_t + * + * @retval ::uint64_t Total GPU memory + * @retval -1 is returned upon failure. + * + */ +uint64_t goamdsmi_gpu_dev_gpu_memory_total_get(uint32_t dv_ind); diff --git a/goamdsmi_shim/smiwrapper/goamdsmi.h b/goamdsmi_shim/smiwrapper/goamdsmi.h new file mode 100644 index 0000000000..cad497285c --- /dev/null +++ b/goamdsmi_shim/smiwrapper/goamdsmi.h @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright (c) 2024, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Developed by: + * + * AMD Research and AMD Software Development + * + * Advanced Micro Devices, Inc. + * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sellcopies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * - The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * Except as contained in this notice, the name of the Advanced Micro Devices, + * Inc. 
shall not be used in advertising or otherwise to promote the sale, use + * or other dealings in this Software without prior written authorization from + * the Advanced Micro Devices, Inc. + * + */ + +#ifndef GO_AMD_SMI_H_ +#define GO_AMD_SMI_H_ + +#include +#include + +#define GOAMDSMI_VALUE_0 0 +#define GOAMDSMI_UINT16_MAX 0xFFFF +#define GOAMDSMI_UINT32_MAX 0xFFFFFFFF +#define GOAMDSMI_UINT64_MAX 0xFFFFFFFFFFFFFFFF +#define GOAMDSMI_STRING_NA "NA" + +/** + * @brief Go language stub to initialize the Debug Level prints + * -DENABLE_DEBUG_LEVEL=1 (or) -DENABLE_DEBUG_LEVEL= must be passed at cmake time + * + * @retval ::bool value of true upon enabling logs + * @retval false is returned upon if user does not want to enable logs. + * + */ +#define enable_debug_level(debug_level) ((ENABLE_DEBUG_LEVEL >= debug_level)?true:false) + +typedef enum { + GOAMDSMI_STATUS_SUCCESS = 0x0, //!< Operation successful + GOAMDSMI_STATUS_FAILURE = 0x1, //!< Operation failed +} goamdsmi_status_t; + +typedef enum { + GOAMDSMI_CPU_INIT = 0x0, //!< CPU Init + GOAMDSMI_GPU_INIT = 0x1, //!< GPU Init +} goamdsmi_Init_t; + +typedef enum { + GOAMDSMI_DEBUG_LEVEL_0 = 0x0, //!< Debug Level as 0 + GOAMDSMI_DEBUG_LEVEL_1 = 0x1, //!< Debug Level as 1 + GOAMDSMI_DEBUG_LEVEL_2 = 0x2, //!< Debug Level as 2 + GOAMDSMI_DEBUG_LEVEL_3 = 0x3, //!< Debug Level as 3 +} goamdsmi_Enable_Debug_Level_t; + +#endif diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h index 9e5b988614..4326f47115 100644 --- a/include/amd_smi/amdsmi.h +++ b/include/amd_smi/amdsmi.h @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. 
* * Developed by: @@ -87,6 +87,9 @@ typedef enum { #define AMDSMI_MAX_CONTAINER_TYPE 2 #define AMDSMI_MAX_CACHE_TYPES 10 #define AMDSMI_MAX_NUM_XGMI_PHYSICAL_LINK 64 +#define AMDSMI_MAX_ACCELERATOR_PROFILE 32 +#define AMDSMI_MAX_CP_PROFILE_RESOURCES 32 +#define AMDSMI_MAX_ACCELERATOR_PARTITIONS 8 #define AMDSMI_GPU_UUID_SIZE 38 @@ -154,7 +157,7 @@ typedef enum { #define AMDSMI_LIB_VERSION_MAJOR 6 //! Minor version should be updated for each API change, but without changing headers -#define AMDSMI_LIB_VERSION_MINOR 3 +#define AMDSMI_LIB_VERSION_MINOR 5 //! Release version should be set to 0 as default and can be updated by the PMs for each CSP point release #define AMDSMI_LIB_VERSION_RELEASE 0 @@ -275,22 +278,40 @@ typedef enum { AMDSMI_CLK_TYPE__MAX = AMDSMI_CLK_TYPE_DCLK1 } amdsmi_clk_type_t; +/** + * @brief Accelerator Partition. This enum is used to identify + * various accelerator partitioning settings. + */ +typedef enum { + AMDSMI_ACCELERATOR_PARTITION_INVALID = 0, + AMDSMI_ACCELERATOR_PARTITION_SPX, //!< Single GPU mode (SPX)- All XCCs work + //!< together with shared memory + AMDSMI_ACCELERATOR_PARTITION_DPX, //!< Dual GPU mode (DPX)- Half XCCs work + //!< together with shared memory + AMDSMI_ACCELERATOR_PARTITION_TPX, //!< Triple GPU mode (TPX)- One-third XCCs + //!< work together with shared memory + AMDSMI_ACCELERATOR_PARTITION_QPX, //!< Quad GPU mode (QPX)- Quarter XCCs + //!< work together with shared memory + AMDSMI_ACCELERATOR_PARTITION_CPX, //!< Core mode (CPX)- Per-chip XCC with + //!< shared memory +} amdsmi_accelerator_partition_type_t; + /** * @brief Compute Partition. This enum is used to identify * various compute partitioning settings. 
*/ typedef enum { AMDSMI_COMPUTE_PARTITION_INVALID = 0, - AMDSMI_COMPUTE_PARTITION_CPX, //!< Core mode (CPX)- Per-chip XCC with - //!< shared memory - AMDSMI_COMPUTE_PARTITION_SPX, //!< Single GPU mode (SPX)- All XCCs work - //!< together with shared memory - AMDSMI_COMPUTE_PARTITION_DPX, //!< Dual GPU mode (DPX)- Half XCCs work - //!< together with shared memory - AMDSMI_COMPUTE_PARTITION_TPX, //!< Triple GPU mode (TPX)- One-third XCCs - //!< work together with shared memory - AMDSMI_COMPUTE_PARTITION_QPX //!< Quad GPU mode (QPX)- Quarter XCCs - //!< work together with shared memory + AMDSMI_COMPUTE_PARTITION_SPX, //!< Single GPU mode (SPX)- All XCCs work + //!< together with shared memory + AMDSMI_COMPUTE_PARTITION_DPX, //!< Dual GPU mode (DPX)- Half XCCs work + //!< together with shared memory + AMDSMI_COMPUTE_PARTITION_TPX, //!< Triple GPU mode (TPX)- One-third XCCs + //!< work together with shared memory + AMDSMI_COMPUTE_PARTITION_QPX, //!< Quad GPU mode (QPX)- Quarter XCCs + //!< work together with shared memory + AMDSMI_COMPUTE_PARTITION_CPX, //!< Core mode (CPX)- Per-chip XCC with + //!< shared memory } amdsmi_compute_partition_type_t; /** @@ -589,9 +610,46 @@ typedef struct { char asic_serial[AMDSMI_NORMAL_STRING_LENGTH]; uint32_t oam_id; //< 0xFFFF if not supported uint32_t num_of_compute_units; //< 0xFFFFFFFF if not supported - uint32_t reserved[17]; + uint64_t target_graphics_version; //< 0xFFFFFFFFFFFFFFFF if not supported + uint32_t reserved[15]; } amdsmi_asic_info_t; +typedef struct { + uint64_t kfd_id; //< 0xFFFFFFFFFFFFFFFF if not supported + uint32_t node_id; //< 0xFFFFFFFF if not supported + uint32_t reserved[13]; +} amdsmi_kfd_info_t; + +/** + * @brief Possible Memory Partition Modes. + * This union is used to identify various memory partitioning settings. 
+ */ +typedef union { + struct nps_flags_ { + uint32_t nps1_cap :1; // bool 1 = true; 0 = false; Max uint32 means unsupported + uint32_t nps2_cap :1; // bool 1 = true; 0 = false; Max uint32 means unsupported + uint32_t nps4_cap :1; // bool 1 = true; 0 = false; Max uint32 means unsupported + uint32_t nps8_cap :1; // bool 1 = true; 0 = false; Max uint32 means unsupported + uint32_t reserved :28; + } amdsmi_nps_flags_t; + + uint32_t nps_cap_mask; +} amdsmi_nps_caps_t; + +/** + * @brief Accelerator Partition Profile. + * This struct is used to describe an accelerator partition profile and its resources. + */ +typedef struct { + amdsmi_accelerator_partition_type_t profile_type; // SPX, DPX, QPX, CPX and so on + uint32_t num_partitions; // On MI300X, SPX: 1, DPX: 2, QPX: 4, CPX: 8, length of resources array + uint32_t profile_index; + amdsmi_nps_caps_t memory_caps; // Possible memory partition capabilities + uint32_t num_resources; // length of index_of_resources_profile + uint32_t resources[AMDSMI_MAX_ACCELERATOR_PARTITIONS][AMDSMI_MAX_CP_PROFILE_RESOURCES]; + uint64_t reserved[6]; +} amdsmi_accelerator_partition_profile_t; + typedef enum { AMDSMI_LINK_TYPE_PCIE, AMDSMI_LINK_TYPE_XGMI, @@ -690,6 +748,17 @@ typedef struct { uint32_t reserved[4]; } amdsmi_proc_info_t; +/** + * @brief IO Link P2P Capability + */ +typedef struct { + uint8_t is_iolink_coherent; // 1 = true, 0 = false, UINT8_MAX = Not defined. + uint8_t is_iolink_atomics_32bit; + uint8_t is_iolink_atomics_64bit; + uint8_t is_iolink_dma; + uint8_t is_iolink_bi_directional; +} amdsmi_p2p_capability_t; + //! 
Guaranteed maximum possible number of supported frequencies #define AMDSMI_MAX_NUM_FREQUENCIES 33 @@ -2222,16 +2291,18 @@ amdsmi_get_gpu_pci_bandwidth(amdsmi_processor_handle processor_handle, * * The format of @p bdfid will be as follows: * - * BDFID = ((DOMAIN & 0xffffffff) << 32) | ((BUS & 0xff) << 8) | - * ((DEVICE & 0x1f) <<3 ) | (FUNCTION & 0x7) + * BDFID = ((DOMAIN & 0xFFFFFFFF) << 32) | ((Partition & 0xF) << 28) + * | ((BUS & 0xFF) << 8) | ((DEVICE & 0x1F) <<3 ) + * | (FUNCTION & 0x7) * - * | Name | Field | - * ---------- | ------- | - * | Domain | [64:32] | - * | Reserved | [31:16] | - * | Bus | [15: 8] | - * | Device | [ 7: 3] | - * | Function | [ 2: 0] | + * | Name | Field | KFD property KFD -> PCIe ID (uint64_t) + * -------------- | ------- | ---------------- | ---------------------------- | + * | Domain | [63:32] | "domain" | (DOMAIN & 0xFFFFFFFF) << 32 | + * | Partition id | [31:28] | "location id" | (LOCATION & 0xF0000000) | + * | Reserved | [27:16] | "location id" | N/A | + * | Bus | [15: 8] | "location id" | (LOCATION & 0xFF00) | + * | Device | [ 7: 3] | "location id" | (LOCATION & 0xF8) | + * | Function | [ 2: 0] | "location id" | (LOCATION & 0x7) | * * @param[in] processor_handle a processor handle * @@ -4283,6 +4354,36 @@ amdsmi_is_P2P_accessible(amdsmi_processor_handle processor_handle_src, amdsmi_processor_handle processor_handle_dst, bool *accessible); + +/** + * @brief Retrieve connection type and P2P capabilities between 2 GPUs + * + * @platform{gpu_bm_linux} @platform{host} @platform{guest_1vf} @platform{guest_mvf} + * + * @details Given a source processor handle @p processor_handle_src and + * a destination processor handle @p processor_handle_dst, a pointer to an amdsmi_io_link_type_t @p type, + * and a pointer to amdsmi_p2p_capability_t @p cap. 
This function will write the connection type, + * and io link capabilities between the devices + * @p processor_handle_src and @p processor_handle_dst to the memory + * pointed to by @p cap and @p type. + * + * @param[in] processor_handle_src the source processor handle + * + * @param[in] processor_handle_dst the destination processor handle + * + * @param[in,out] type A pointer to an ::amdsmi_io_link_type_t to which the + * type for the connection should be written. + * + * @param[in,out] cap A pointer to an ::amdsmi_p2p_capability_t to which the + * io link capabilities should be written. + * + * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail + */ +amdsmi_status_t +amdsmi_topo_get_p2p_status(amdsmi_processor_handle processor_handle_src, + amdsmi_processor_handle processor_handle_dst, + amdsmi_io_link_type_t *type, amdsmi_p2p_capability_t *cap); + /** @} End HWTopo */ /*****************************************************************************/ @@ -4466,6 +4567,23 @@ amdsmi_status_t amdsmi_reset_gpu_memory_partition(amdsmi_processor_handle proces /** @} */ // end of memory_partition +/*****************************************************************************/ +/** @defgroup accelerator_partition_profile Accelerator Partition Profile Functions + * These functions are used to configure and query the device's + * accelerator partition profile setting. + * @{ + */ +// TODO: declare rest of partition profile functions and complete doc commentary. +/* + Get the current accelerator partition profile. The function will return current profile.
+*/ +amdsmi_status_t +amdsmi_get_gpu_accelerator_partition_profile(amdsmi_processor_handle processor_handle, + amdsmi_accelerator_partition_profile_t *profile, + uint32_t *partition_id); + +/** @} */ // end of accelerator_partition_profile + /*****************************************************************************/ /** @defgroup EvntNotif Event Notification Functions * These functions are used to configure for and get asynchronous event @@ -4669,6 +4787,25 @@ amdsmi_get_gpu_driver_info(amdsmi_processor_handle processor_handle, amdsmi_driv amdsmi_status_t amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_info_t *info); +/** + * @brief Returns the KFD (Kernel Fusion Driver) information for the device + * + * @platform{gpu_bm_linux} @platform{guest_1vf} @platform{guest_mvf} + * + * @details This function returns KFD information populated into the amdsmi_kfd_info_t. + * This contains the kfd_id and node_id which allow for the ID and + * index of this device in the KFD. + * + * @param[in] processor_handle Device which to query + * + * @param[out] info Reference to kfd information structure. + * Must be allocated by user. + * + * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail + */ +amdsmi_status_t +amdsmi_get_gpu_kfd_info(amdsmi_processor_handle processor_handle, amdsmi_kfd_info_t *info); + /** * @brief Returns vram info * diff --git a/include/amd_smi/impl/amd_smi_common.h b/include/amd_smi/impl/amd_smi_common.h index 6d4be4b364..9b85445243 100644 --- a/include/amd_smi/impl/amd_smi_common.h +++ b/include/amd_smi/impl/amd_smi_common.h @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. 
* * Developed by: diff --git a/include/amd_smi/impl/amd_smi_drm.h b/include/amd_smi/impl/amd_smi_drm.h index 4f25838ad8..7f81aad4b6 100644 --- a/include/amd_smi/impl/amd_smi_drm.h +++ b/include/amd_smi/impl/amd_smi_drm.h @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/include/amd_smi/impl/amd_smi_gpu_device.h b/include/amd_smi/impl/amd_smi_gpu_device.h index b50159e89c..5c7c928071 100644 --- a/include/amd_smi/impl/amd_smi_gpu_device.h +++ b/include/amd_smi/impl/amd_smi_gpu_device.h @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/include/amd_smi/impl/amd_smi_processor.h b/include/amd_smi/impl/amd_smi_processor.h index 8188438e96..baafc2d77f 100644 --- a/include/amd_smi/impl/amd_smi_processor.h +++ b/include/amd_smi/impl/amd_smi_processor.h @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/include/amd_smi/impl/amd_smi_socket.h b/include/amd_smi/impl/amd_smi_socket.h index 282859cbea..3011585329 100644 --- a/include/amd_smi/impl/amd_smi_socket.h +++ b/include/amd_smi/impl/amd_smi_socket.h @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. 
* * Developed by: diff --git a/include/amd_smi/impl/amd_smi_system.h b/include/amd_smi/impl/amd_smi_system.h index 18f93e7ab1..3c0e544a23 100644 --- a/include/amd_smi/impl/amd_smi_system.h +++ b/include/amd_smi/impl/amd_smi_system.h @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/include/amd_smi/impl/amd_smi_utils.h b/include/amd_smi/impl/amd_smi_utils.h index 712767287c..af270252c5 100644 --- a/include/amd_smi/impl/amd_smi_utils.h +++ b/include/amd_smi/impl/amd_smi_utils.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2023 Advanced Micro Devices. All rights reserved. + * Copyright (C) 2024 Advanced Micro Devices. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy of * this software and associated documentation files (the "Software"), to deal in diff --git a/include/amd_smi/impl/amd_smi_uuid.h b/include/amd_smi/impl/amd_smi_uuid.h index 0027936473..6b8e37c3a6 100644 --- a/include/amd_smi/impl/amd_smi_uuid.h +++ b/include/amd_smi/impl/amd_smi_uuid.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/py-interface/README.md b/py-interface/README.md index 7a454b1025..dc4001403e 100644 --- a/py-interface/README.md +++ b/py-interface/README.md @@ -377,6 +377,8 @@ Field | Content `rev_id` | revision id `asic_serial` | asic serial `oam_id` | oam id +`num_of_compute_units` | number of compute units on asic +`target_graphics_version` | hardware graphics version Exceptions that can be thrown by `amdsmi_get_gpu_asic_info` function: @@ -394,13 +396,44 @@ try: else: for device in devices: asic_info = amdsmi_get_gpu_asic_info(device) - print(asic_info['market_name']) - print(hex(asic_info['vendor_id'])) - print(asic_info['vendor_name']) - print(hex(asic_info['device_id'])) - print(hex(asic_info['rev_id'])) - print(asic_info['asic_serial']) - print(asic_info['oam_id']) + print(asic_info) +except AmdSmiException as e: + print(e) +``` + +### amdsmi_get_gpu_kfd_info + +Description: Returns KFD(kernel fusion driver) information for the given GPU +This correlates to GUID in rocm-smi + +Input parameters: + +* `processor_handle` device which to query + +Output: Dictionary with fields + +Field | Content +---|--- +`kfd_id` | KFD's unique GPU identifier +`node_id` | KFD's internal GPU index + +Exceptions that can be thrown by `amdsmi_get_gpu_kfd_info` function: + +* `AmdSmiLibraryException` +* `AmdSmiRetryException` +* `AmdSmiParameterException` + +Example: + +```python +try: + devices = amdsmi_get_processor_handles() + if len(devices) == 0: + print("No GPUs on machine") + else: + for device in devices: + kfd_info = amdsmi_get_gpu_kfd_info(device) + print(kfd_info) except AmdSmiException as e: print(e) ``` @@ -810,7 +843,7 @@ except AmdSmiException as e: ### amdsmi_get_pcie_info -Description: Returns the pcie metric and static information for the given GPU. 
+Description: Returns the pcie metric and static information for the given GPU. For accurate PCIe Bandwidth measurements it is recommended to use this function once per 1000ms It is not supported on virtual machine guest Input parameters: @@ -1925,19 +1958,19 @@ except AmdSmiException as e: ### amdsmi_get_utilization_count -Description: Get coarse grain utilization counter of the specified device +Description: Get coarse/fine grain utilization counter of the specified device Input parameters: * `processor_handle` handle for the given device -* `counter_types` variable number of counter types desired +* `counter_types` List of AmdSmiUtilizationCounterType counters requested Output: List containing dictionaries with fields Field | Description ---|--- `timestamp` | The timestamp when the counter is retreived - Resolution: 1 ns -`Dictionary for each counter` |
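<table><thead><tr><th> Subfield </th><th> Description</th></tr></thead><tbody><tr><td>`type`</td><td>Type of utilization counter</td></tr><tr><td>`value`</td><td>Value gotten for utilization counter</td></tr></tbody></table>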
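+`Dictionary for each counter` | <table><thead><tr><th> Subfield </th><th> Description</th></tr></thead><tbody><tr><td>`type`</td><td>Counter that was requested</td></tr><tr><td>`value`</td><td>Value gotten for utilization counter</td></tr></tbody></table>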
Exceptions that can be thrown by `amdsmi_get_utilization_count` function: @@ -1957,13 +1990,17 @@ try: utilization = amdsmi_get_utilization_count( device, AmdSmiUtilizationCounterType.COARSE_GRAIN_GFX_ACTIVITY - ) + ) print(utilization) utilization = amdsmi_get_utilization_count( device, - AmdSmiUtilizationCounterType.COARSE_GRAIN_GFX_ACTIVITY, - AmdSmiUtilizationCounterType.COARSE_GRAIN_MEM_ACTIVITY - ) + [AmdSmiUtilizationCounterType.COARSE_GRAIN_GFX_ACTIVITY, + AmdSmiUtilizationCounterType.COARSE_GRAIN_MEM_ACTIVITY, + AmdSmiUtilizationCounterType.COARSE_DECODER_ACTIVITY, + AmdSmiUtilizationCounterType.FINE_GRAIN_GFX_ACTIVITY, + AmdSmiUtilizationCounterType.FINE_GRAIN_MEM_ACTIVITY, + AmdSmiUtilizationCounterType.FINE_DECODER_ACTIVITY] + ) print(utilization) except AmdSmiException as e: print(e) @@ -2065,6 +2102,7 @@ except AmdSmiException as e: ``` ### amdsmi_set_gpu_process_isolation + Description: Enable/disable the system Process Isolation for the given device handle. Input parameters: @@ -2095,6 +2133,7 @@ except AmdSmiException as e: ``` ### amdsmi_clean_gpu_local_data + Description: Clear the SRAM data of the given device. This can be called between user logins to prevent information leak. 
Input parameters: @@ -2123,7 +2162,6 @@ except AmdSmiException as e: print(e) ``` - ### amdsmi_get_gpu_overdrive_level Description: Get the overdrive percent associated with the device with provided @@ -3789,6 +3827,44 @@ except AmdSmiException as e: print(e) ``` +### amdsmi_get_gpu_accelerator_partition_profile + +**Note: CURRENTLY HARDCODED TO RETURN EMPTY VALUES** + +Description: Get partition information for target device + +Input parameters: + +* `processor_handle` the device handle + +Output: Dictionary with fields: + +Field | Description +---|--- +`partition_id` | ID of the partition on the GPU provided +`partition_profile` | Dict containing partition data (TBD) + +Exceptions that can be thrown by `amdsmi_get_gpu_accelerator_partition_profile` function: + +* `AmdSmiLibraryException` +* `AmdSmiRetryException` +* `AmdSmiParameterException` + +Example: + +```python +try: + devices = amdsmi_get_processor_handles() + if len(devices) == 0: + print("No GPUs on machine") + else: + for device in devices: + partition_id = amdsmi_get_gpu_accelerator_partition_profile(device)["partition_id"] + print(partition_id) +except AmdSmiException as e: + print(e) +``` + ### amdsmi_get_xgmi_info Description: Returns XGMI information for the GPU. diff --git a/py-interface/__init__.py b/py-interface/__init__.py index 5e208aadcc..e0ffcd2c28 100644 --- a/py-interface/__init__.py +++ b/py-interface/__init__.py @@ -1,5 +1,5 @@ # -# Copyright (C) 2023 Advanced Micro Devices. All rights reserved. +# Copyright (C) 2024 Advanced Micro Devices. All rights reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy of # this software and associated documentation files (the "Software"), to deal in @@ -89,6 +89,7 @@ from .amdsmi_interface import amdsmi_get_gpu_driver_info # # ASIC and Bus Static Information from .amdsmi_interface import amdsmi_get_gpu_asic_info +from .amdsmi_interface import amdsmi_get_gpu_kfd_info from .amdsmi_interface import amdsmi_get_power_cap_info from .amdsmi_interface import amdsmi_get_gpu_vram_info from .amdsmi_interface import amdsmi_get_gpu_cache_info @@ -124,6 +125,7 @@ from .amdsmi_interface import amdsmi_set_gpu_pci_bandwidth from .amdsmi_interface import amdsmi_set_power_cap from .amdsmi_interface import amdsmi_set_gpu_power_profile from .amdsmi_interface import amdsmi_set_gpu_clk_range +from .amdsmi_interface import amdsmi_set_gpu_clk_limit from .amdsmi_interface import amdsmi_set_gpu_od_clk_info from .amdsmi_interface import amdsmi_set_gpu_od_volt_info from .amdsmi_interface import amdsmi_set_gpu_perf_level @@ -211,6 +213,7 @@ from .amdsmi_interface import amdsmi_topo_get_numa_node_number from .amdsmi_interface import amdsmi_topo_get_link_weight from .amdsmi_interface import amdsmi_get_minmax_bandwidth_between_processors from .amdsmi_interface import amdsmi_topo_get_link_type +from .amdsmi_interface import amdsmi_topo_get_p2p_status from .amdsmi_interface import amdsmi_is_P2P_accessible from .amdsmi_interface import amdsmi_get_xgmi_info @@ -221,6 +224,7 @@ from .amdsmi_interface import amdsmi_reset_gpu_compute_partition from .amdsmi_interface import amdsmi_get_gpu_memory_partition from .amdsmi_interface import amdsmi_set_gpu_memory_partition from .amdsmi_interface import amdsmi_reset_gpu_memory_partition +from .amdsmi_interface import amdsmi_get_gpu_accelerator_partition_profile # # Individual GPU Metrics Functions from .amdsmi_interface import amdsmi_get_gpu_metrics_header_info diff --git a/py-interface/amdsmi_exception.py b/py-interface/amdsmi_exception.py index 
e77e9a9d88..ab7c33e576 100644 --- a/py-interface/amdsmi_exception.py +++ b/py-interface/amdsmi_exception.py @@ -1,5 +1,5 @@ # -# Copyright (C) 2023 Advanced Micro Devices. All rights reserved. +# Copyright (C) 2024 Advanced Micro Devices. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy of # this software and associated documentation files (the "Software"), to deal in diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py index a7214776b4..cae4a2d1b7 100644 --- a/py-interface/amdsmi_interface.py +++ b/py-interface/amdsmi_interface.py @@ -1,5 +1,5 @@ # -# Copyright (C) 2023 Advanced Micro Devices. All rights reserved. +# Copyright (C) 2024 Advanced Micro Devices. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy of # this software and associated documentation files (the "Software"), to deal in @@ -575,7 +575,7 @@ def _make_amdsmi_bdf_from_list(bdf): amdsmi_bdf.struct_amdsmi_bdf_t.domain_number = bdf[0] return amdsmi_bdf -def _padHexValue(value, length): +def _pad_hex_value(value, length): """ Pad a hexadecimal value with a given length of zeros :param value: A hexadecimal value to be padded with zeros @@ -590,23 +590,23 @@ def _padHexValue(value, length): return '0x' + value[2:].zfill(length) return value -class UIntegerTypes(IntEnum): +class MaxUIntegerTypes(IntEnum): UINT8_T = 0xFF UINT16_T = 0xFFFF UINT32_T = 0xFFFFFFFF UINT64_T = 0xFFFFFFFFFFFFFFFF -def _validateIfMaxUint(valToCheck, uintType: UIntegerTypes): +def _validate_if_max_uint(value, uint_type: MaxUIntegerTypes): return_val = "N/A" - if not isinstance(valToCheck, list): - if valToCheck == uintType: + if not isinstance(value, list): + if value == uint_type: return return_val else: - return valToCheck + return value else: - return_val = valToCheck - for idx, v in enumerate(valToCheck): - if v == uintType: + return_val = value + for idx, v in enumerate(value): + if v == 
uint_type: return_val[idx] = "N/A" return return_val @@ -1656,15 +1656,16 @@ def amdsmi_get_gpu_asic_info( ) asic_info = { - "market_name": _padHexValue(asic_info_struct.market_name.decode("utf-8"), 4), + "market_name": _pad_hex_value(asic_info_struct.market_name.decode("utf-8"), 4), "vendor_id": asic_info_struct.vendor_id, "vendor_name": asic_info_struct.vendor_name.decode("utf-8"), "subvendor_id": asic_info_struct.subvendor_id, "device_id": asic_info_struct.device_id, - "rev_id": _padHexValue(hex(asic_info_struct.rev_id), 2), + "rev_id": _pad_hex_value(hex(asic_info_struct.rev_id), 2), "asic_serial": asic_info_struct.asic_serial.decode("utf-8"), "oam_id": asic_info_struct.oam_id, - "num_compute_units": asic_info_struct.num_of_compute_units + "num_compute_units": asic_info_struct.num_of_compute_units, + "target_graphics_version": "gfx" + str(asic_info_struct.target_graphics_version) } string_values = ["market_name", "vendor_name"] @@ -1701,6 +1702,28 @@ def amdsmi_get_gpu_asic_info( return asic_info +def amdsmi_get_gpu_kfd_info( + processor_handle: amdsmi_wrapper.amdsmi_processor_handle, +) -> Dict[str, Any]: + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise AmdSmiParameterException( + processor_handle, amdsmi_wrapper.amdsmi_processor_handle + ) + + kfd_info_struct = amdsmi_wrapper.amdsmi_kfd_info_t() + _check_res( + amdsmi_wrapper.amdsmi_get_gpu_kfd_info( + processor_handle, ctypes.byref(kfd_info_struct)) + ) + + kfd_info = { + "kfd_id": _validate_if_max_uint(kfd_info_struct.kfd_id, MaxUIntegerTypes.UINT32_T), + "node_id": _validate_if_max_uint(kfd_info_struct.node_id, MaxUIntegerTypes.UINT64_T) + } + + return kfd_info + + def amdsmi_get_power_cap_info( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, ) -> Dict[str, Any]: @@ -1722,6 +1745,7 @@ def amdsmi_get_power_cap_info( "min_power_cap": power_info.min_power_cap, "max_power_cap": power_info.max_power_cap} + def amdsmi_get_gpu_pm_metrics_info( processor_handle: 
amdsmi_wrapper.amdsmi_processor_handle, ) -> Dict[str, Any]: @@ -1749,6 +1773,7 @@ def amdsmi_get_gpu_pm_metrics_info( amdsmi_wrapper.amdsmi_free_name_value_pairs(pm_metrics) return results + def amdsmi_get_gpu_reg_table_info( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, reg_type: amdsmi_wrapper.amdsmi_reg_type_t, @@ -1777,6 +1802,7 @@ def amdsmi_get_gpu_reg_table_info( amdsmi_wrapper.amdsmi_free_name_value_pairs(pm_metrics) return results + def amdsmi_get_gpu_vram_info( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, ) -> Dict[str, Any]: @@ -1995,10 +2021,10 @@ def amdsmi_get_gpu_board_info( ) board_info_dict = { - "model_number": _padHexValue(board_info.model_number.decode("utf-8").strip(), 4), + "model_number": _pad_hex_value(board_info.model_number.decode("utf-8").strip(), 4), "product_serial": board_info.product_serial.decode("utf-8").strip(), "fru_id": board_info.fru_id.decode("utf-8").strip(), - "product_name": _padHexValue(board_info.product_name.decode("utf-8").strip(), 4), + "product_name": _pad_hex_value(board_info.product_name.decode("utf-8").strip(), 4), "manufacturer_name": board_info.manufacturer_name.decode("utf-8").strip() } @@ -2297,20 +2323,20 @@ def amdsmi_get_pcie_info( pcie_info_dict = { "pcie_static": { - "max_pcie_width": _validateIfMaxUint(pcie_info.pcie_static.max_pcie_width, UIntegerTypes.UINT16_T), - "max_pcie_speed": _validateIfMaxUint(pcie_info.pcie_static.max_pcie_speed, UIntegerTypes.UINT32_T), - "pcie_interface_version": _validateIfMaxUint(pcie_info.pcie_static.pcie_interface_version, UIntegerTypes.UINT32_T), + "max_pcie_width": _validate_if_max_uint(pcie_info.pcie_static.max_pcie_width, MaxUIntegerTypes.UINT16_T), + "max_pcie_speed": _validate_if_max_uint(pcie_info.pcie_static.max_pcie_speed, MaxUIntegerTypes.UINT32_T), + "pcie_interface_version": _validate_if_max_uint(pcie_info.pcie_static.pcie_interface_version, MaxUIntegerTypes.UINT32_T), "slot_type": pcie_info.pcie_static.slot_type, }, "pcie_metric": { - 
"pcie_width": _validateIfMaxUint(pcie_info.pcie_metric.pcie_width, UIntegerTypes.UINT16_T), - "pcie_speed": _validateIfMaxUint(pcie_info.pcie_metric.pcie_speed, UIntegerTypes.UINT32_T), - "pcie_bandwidth": _validateIfMaxUint(pcie_info.pcie_metric.pcie_bandwidth, UIntegerTypes.UINT32_T), - "pcie_replay_count": _validateIfMaxUint(pcie_info.pcie_metric.pcie_replay_count, UIntegerTypes.UINT64_T), - "pcie_l0_to_recovery_count": _validateIfMaxUint(pcie_info.pcie_metric.pcie_l0_to_recovery_count, UIntegerTypes.UINT64_T), - "pcie_replay_roll_over_count": _validateIfMaxUint(pcie_info.pcie_metric.pcie_replay_roll_over_count, UIntegerTypes.UINT64_T), - "pcie_nak_sent_count": _validateIfMaxUint(pcie_info.pcie_metric.pcie_nak_sent_count, UIntegerTypes.UINT64_T), - "pcie_nak_received_count": _validateIfMaxUint(pcie_info.pcie_metric.pcie_nak_received_count, UIntegerTypes.UINT64_T), + "pcie_width": _validate_if_max_uint(pcie_info.pcie_metric.pcie_width, MaxUIntegerTypes.UINT16_T), + "pcie_speed": _validate_if_max_uint(pcie_info.pcie_metric.pcie_speed, MaxUIntegerTypes.UINT32_T), + "pcie_bandwidth": _validate_if_max_uint(pcie_info.pcie_metric.pcie_bandwidth, MaxUIntegerTypes.UINT32_T), + "pcie_replay_count": _validate_if_max_uint(pcie_info.pcie_metric.pcie_replay_count, MaxUIntegerTypes.UINT64_T), + "pcie_l0_to_recovery_count": _validate_if_max_uint(pcie_info.pcie_metric.pcie_l0_to_recovery_count, MaxUIntegerTypes.UINT64_T), + "pcie_replay_roll_over_count": _validate_if_max_uint(pcie_info.pcie_metric.pcie_replay_roll_over_count, MaxUIntegerTypes.UINT64_T), + "pcie_nak_sent_count": _validate_if_max_uint(pcie_info.pcie_metric.pcie_nak_sent_count, MaxUIntegerTypes.UINT64_T), + "pcie_nak_received_count": _validate_if_max_uint(pcie_info.pcie_metric.pcie_nak_received_count, MaxUIntegerTypes.UINT64_T), } } @@ -2403,7 +2429,7 @@ def amdsmi_get_gpu_subsystem_id(processor_handle: amdsmi_wrapper.amdsmi_processo processor_handle, ctypes.byref(id)) ) - return _padHexValue(hex(id.value), 4) + 
return _pad_hex_value(hex(id.value), 4) def amdsmi_get_gpu_subsystem_name(processor_handle: amdsmi_wrapper.amdsmi_processor_handle): @@ -2541,6 +2567,41 @@ def amdsmi_topo_get_link_type( return {"hops": hops.value, "type": type.value} +def amdsmi_topo_get_p2p_status( + processor_handle_src: amdsmi_wrapper.amdsmi_processor_handle, + processor_handle_dst: amdsmi_wrapper.amdsmi_processor_handle, +): + if not isinstance(processor_handle_src, amdsmi_wrapper.amdsmi_processor_handle): + raise AmdSmiParameterException( + processor_handle_src, amdsmi_wrapper.amdsmi_processor_handle + ) + + if not isinstance(processor_handle_dst, amdsmi_wrapper.amdsmi_processor_handle): + raise AmdSmiParameterException( + processor_handle_dst, amdsmi_wrapper.amdsmi_processor_handle + ) + + type = ctypes.c_uint32() + cap = amdsmi_wrapper.struct_amdsmi_p2p_capability_t() + + _check_res( + amdsmi_wrapper.amdsmi_topo_get_p2p_status( + processor_handle_src, processor_handle_dst, ctypes.byref(type), ctypes.byref(cap) + ) + ) + + return { + 'type' : type, + 'cap': { + 'is_iolink_coherent': cap.is_iolink_coherent, + 'is_iolink_atomics_32bit': cap.is_iolink_atomics_32bit, + 'is_iolink_atomics_64bit': cap.is_iolink_atomics_64bit, + 'is_iolink_dma': cap.is_iolink_dma, + 'is_iolink_bi_directional': cap.is_iolink_bi_directional + } + } + + def amdsmi_is_P2P_accessible( processor_handle_src: amdsmi_wrapper.amdsmi_processor_handle, processor_handle_dst: amdsmi_wrapper.amdsmi_processor_handle, @@ -2658,6 +2719,37 @@ def amdsmi_reset_gpu_memory_partition(processor_handle: amdsmi_wrapper.amdsmi_pr _check_res(amdsmi_wrapper.amdsmi_reset_gpu_memory_partition(processor_handle)) +def amdsmi_get_gpu_accelerator_partition_profile( + processor_handle: amdsmi_wrapper.amdsmi_processor_handle + ) -> Dict[str, Any]: + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise AmdSmiParameterException( + processor_handle, amdsmi_wrapper.amdsmi_processor_handle + ) + partition_id = 
ctypes.c_uint32() + profile = amdsmi_wrapper.amdsmi_accelerator_partition_profile_t() + + _check_res( + amdsmi_wrapper.amdsmi_get_gpu_accelerator_partition_profile(processor_handle, + ctypes.byref(profile), + ctypes.byref(partition_id)) + ) + + partition_profile_dict = { + "profile_type" : profile.profile_type, + "num_partitions" : profile.num_partitions, + "profile_index" : profile.profile_index, + "memory_caps" : "N/A", + "num_resources" : profile.num_resources, + "resources" : "N/A" + } + + return { + "partition_id" : partition_id.value, + "partition_profile" : partition_profile_dict + } + + def amdsmi_get_xgmi_info(processor_handle: amdsmi_wrapper.amdsmi_processor_handle): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( @@ -3152,6 +3244,36 @@ def amdsmi_set_gpu_clk_range( ) +def amdsmi_set_gpu_clk_limit( + processor_handle: amdsmi_wrapper.amdsmi_processor_handle, + clk_type: str, + limit_type: str, + value: int + ) -> None: + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise AmdSmiParameterException( + processor_handle, amdsmi_wrapper.amdsmi_processor_handle + ) + if not isinstance(value, int): + raise AmdSmiParameterException(value, int) + if clk_type.lower() == "sclk": + clk_type_conversion = amdsmi_wrapper.AMDSMI_CLK_TYPE_SYS + elif clk_type.lower() == "mclk": + clk_type_conversion = amdsmi_wrapper.AMDSMI_CLK_TYPE_MEM + if limit_type.lower() == "min": + limit_type_conversion = amdsmi_wrapper.CLK_LIMIT_MIN + elif limit_type.lower() == "max": + limit_type_conversion = amdsmi_wrapper.CLK_LIMIT_MAX + _check_res( + amdsmi_wrapper.amdsmi_set_gpu_clk_limit( + processor_handle, + amdsmi_wrapper.amdsmi_clk_type_t(clk_type_conversion), + amdsmi_wrapper.amdsmi_clk_limit_type_t(limit_type_conversion), + ctypes.c_uint64(value), + ) + ) + + def amdsmi_get_gpu_memory_total(processor_handle: amdsmi_wrapper.amdsmi_processor_handle, mem_type: AmdSmiMemoryType): if not 
isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( @@ -3364,6 +3486,14 @@ def amdsmi_get_utilization_count( raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) + + # Enforce List typing + if not isinstance(counter_types, list): + counter_types = [counter_types] + + counter_types = list(set(counter_types)) + + # Validate Inputs if len(counter_types) == 0: raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_INVAL) counters = [] @@ -3496,7 +3626,7 @@ def amdsmi_get_clk_freq( return { "num_supported": freq.num_supported, "current": freq.current, - "frequency": list(freq.frequency)[: freq.num_supported - 1], + "frequency": list(freq.frequency)[: freq.num_supported], } @@ -4035,12 +4165,12 @@ def amdsmi_get_gpu_metrics_header_info( header_info = amdsmi_wrapper.amd_metrics_table_header_t() _check_res( amdsmi_wrapper.amdsmi_get_gpu_metrics_header_info( - ctypes.byref(header_info) + processor_handle, ctypes.byref(header_info) ) ) return { - "structure_size": header_info.structure_size.value, - "format_revision": header_info.format_revision.value, - "content_revision": header_info.content_revision.value + "structure_size": header_info.structure_size, + "format_revision": header_info.format_revision, + "content_revision": header_info.content_revision } diff --git a/py-interface/amdsmi_wrapper.py b/py-interface/amdsmi_wrapper.py index 805d8d88d8..f7acf2026e 100644 --- a/py-interface/amdsmi_wrapper.py +++ b/py-interface/amdsmi_wrapper.py @@ -1,6 +1,6 @@ # -# Copyright (C) 2023 Advanced Micro Devices. All rights reserved. +# Copyright (C) 2024 Advanced Micro Devices. All rights reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy of # this software and associated documentation files (the "Software"), to deal in @@ -377,21 +377,38 @@ AMDSMI_CLK_TYPE_DCLK1 = 9 AMDSMI_CLK_TYPE__MAX = 9 amdsmi_clk_type_t = ctypes.c_uint32 # enum +# values for enumeration 'amdsmi_accelerator_partition_type_t' +amdsmi_accelerator_partition_type_t__enumvalues = { + 0: 'AMDSMI_ACCELERATOR_PARTITION_INVALID', + 1: 'AMDSMI_ACCELERATOR_PARTITION_SPX', + 2: 'AMDSMI_ACCELERATOR_PARTITION_DPX', + 3: 'AMDSMI_ACCELERATOR_PARTITION_TPX', + 4: 'AMDSMI_ACCELERATOR_PARTITION_QPX', + 5: 'AMDSMI_ACCELERATOR_PARTITION_CPX', +} +AMDSMI_ACCELERATOR_PARTITION_INVALID = 0 +AMDSMI_ACCELERATOR_PARTITION_SPX = 1 +AMDSMI_ACCELERATOR_PARTITION_DPX = 2 +AMDSMI_ACCELERATOR_PARTITION_TPX = 3 +AMDSMI_ACCELERATOR_PARTITION_QPX = 4 +AMDSMI_ACCELERATOR_PARTITION_CPX = 5 +amdsmi_accelerator_partition_type_t = ctypes.c_uint32 # enum + # values for enumeration 'amdsmi_compute_partition_type_t' amdsmi_compute_partition_type_t__enumvalues = { 0: 'AMDSMI_COMPUTE_PARTITION_INVALID', - 1: 'AMDSMI_COMPUTE_PARTITION_CPX', - 2: 'AMDSMI_COMPUTE_PARTITION_SPX', - 3: 'AMDSMI_COMPUTE_PARTITION_DPX', - 4: 'AMDSMI_COMPUTE_PARTITION_TPX', - 5: 'AMDSMI_COMPUTE_PARTITION_QPX', + 1: 'AMDSMI_COMPUTE_PARTITION_SPX', + 2: 'AMDSMI_COMPUTE_PARTITION_DPX', + 3: 'AMDSMI_COMPUTE_PARTITION_TPX', + 4: 'AMDSMI_COMPUTE_PARTITION_QPX', + 5: 'AMDSMI_COMPUTE_PARTITION_CPX', } AMDSMI_COMPUTE_PARTITION_INVALID = 0 -AMDSMI_COMPUTE_PARTITION_CPX = 1 -AMDSMI_COMPUTE_PARTITION_SPX = 2 -AMDSMI_COMPUTE_PARTITION_DPX = 3 -AMDSMI_COMPUTE_PARTITION_TPX = 4 -AMDSMI_COMPUTE_PARTITION_QPX = 5 +AMDSMI_COMPUTE_PARTITION_SPX = 1 +AMDSMI_COMPUTE_PARTITION_DPX = 2 +AMDSMI_COMPUTE_PARTITION_TPX = 3 +AMDSMI_COMPUTE_PARTITION_QPX = 4 +AMDSMI_COMPUTE_PARTITION_CPX = 5 amdsmi_compute_partition_type_t = ctypes.c_uint32 # enum # values for enumeration 'amdsmi_memory_partition_type_t' @@ -759,6 +776,19 @@ 
amdsmi_card_form_factor_t = ctypes.c_uint32 # enum class struct_amdsmi_pcie_info_t(Structure): pass +class struct_pcie_static_(Structure): + pass + +struct_pcie_static_._pack_ = 1 # source:False +struct_pcie_static_._fields_ = [ + ('max_pcie_width', ctypes.c_uint16), + ('PADDING_0', ctypes.c_ubyte * 2), + ('max_pcie_speed', ctypes.c_uint32), + ('pcie_interface_version', ctypes.c_uint32), + ('slot_type', amdsmi_card_form_factor_t), + ('reserved', ctypes.c_uint64 * 10), +] + class struct_pcie_metric_(Structure): pass @@ -777,19 +807,6 @@ struct_pcie_metric_._fields_ = [ ('reserved', ctypes.c_uint64 * 13), ] -class struct_pcie_static_(Structure): - pass - -struct_pcie_static_._pack_ = 1 # source:False -struct_pcie_static_._fields_ = [ - ('max_pcie_width', ctypes.c_uint16), - ('PADDING_0', ctypes.c_ubyte * 2), - ('max_pcie_speed', ctypes.c_uint32), - ('pcie_interface_version', ctypes.c_uint32), - ('slot_type', amdsmi_card_form_factor_t), - ('reserved', ctypes.c_uint64 * 10), -] - struct_amdsmi_pcie_info_t._pack_ = 1 # source:False struct_amdsmi_pcie_info_t._fields_ = [ ('pcie_static', struct_pcie_static_), @@ -902,10 +919,62 @@ struct_amdsmi_asic_info_t._fields_ = [ ('asic_serial', ctypes.c_char * 32), ('oam_id', ctypes.c_uint32), ('num_of_compute_units', ctypes.c_uint32), - ('reserved', ctypes.c_uint32 * 17), + ('PADDING_0', ctypes.c_ubyte * 4), + ('target_graphics_version', ctypes.c_uint64), + ('reserved', ctypes.c_uint32 * 15), + ('PADDING_1', ctypes.c_ubyte * 4), ] amdsmi_asic_info_t = struct_amdsmi_asic_info_t +class struct_amdsmi_kfd_info_t(Structure): + pass + +struct_amdsmi_kfd_info_t._pack_ = 1 # source:False +struct_amdsmi_kfd_info_t._fields_ = [ + ('kfd_id', ctypes.c_uint64), + ('node_id', ctypes.c_uint32), + ('reserved', ctypes.c_uint32 * 13), +] + +amdsmi_kfd_info_t = struct_amdsmi_kfd_info_t +class union_amdsmi_nps_caps_t(Union): + pass + +class struct_nps_flags_(Structure): + pass + +struct_nps_flags_._pack_ = 1 # source:False +struct_nps_flags_._fields_ 
= [ + ('nps1_cap', ctypes.c_uint32, 1), + ('nps2_cap', ctypes.c_uint32, 1), + ('nps4_cap', ctypes.c_uint32, 1), + ('nps8_cap', ctypes.c_uint32, 1), + ('reserved', ctypes.c_uint32, 28), +] + +union_amdsmi_nps_caps_t._pack_ = 1 # source:False +union_amdsmi_nps_caps_t._fields_ = [ + ('amdsmi_nps_flags_t', struct_nps_flags_), + ('nps_cap_mask', ctypes.c_uint32), +] + +amdsmi_nps_caps_t = union_amdsmi_nps_caps_t +class struct_amdsmi_accelerator_partition_profile_t(Structure): + pass + +struct_amdsmi_accelerator_partition_profile_t._pack_ = 1 # source:False +struct_amdsmi_accelerator_partition_profile_t._fields_ = [ + ('profile_type', amdsmi_accelerator_partition_type_t), + ('num_partitions', ctypes.c_uint32), + ('profile_index', ctypes.c_uint32), + ('memory_caps', amdsmi_nps_caps_t), + ('num_resources', ctypes.c_uint32), + ('resources', ctypes.c_uint32 * 32 * 8), + ('PADDING_0', ctypes.c_ubyte * 4), + ('reserved', ctypes.c_uint64 * 6), +] + +amdsmi_accelerator_partition_profile_t = struct_amdsmi_accelerator_partition_profile_t # values for enumeration 'amdsmi_link_type_t' amdsmi_link_type_t__enumvalues = { @@ -1031,6 +1100,16 @@ amdsmi_process_handle_t = ctypes.c_uint32 class struct_amdsmi_proc_info_t(Structure): pass +class struct_engine_usage_(Structure): + pass + +struct_engine_usage_._pack_ = 1 # source:False +struct_engine_usage_._fields_ = [ + ('gfx', ctypes.c_uint64), + ('enc', ctypes.c_uint64), + ('reserved', ctypes.c_uint32 * 12), +] + class struct_memory_usage_(Structure): pass @@ -1042,16 +1121,6 @@ struct_memory_usage_._fields_ = [ ('reserved', ctypes.c_uint32 * 10), ] -class struct_engine_usage_(Structure): - pass - -struct_engine_usage_._pack_ = 1 # source:False -struct_engine_usage_._fields_ = [ - ('gfx', ctypes.c_uint64), - ('enc', ctypes.c_uint64), - ('reserved', ctypes.c_uint32 * 12), -] - struct_amdsmi_proc_info_t._pack_ = 1 # source:False struct_amdsmi_proc_info_t._fields_ = [ ('name', ctypes.c_char * 32), @@ -1065,6 +1134,19 @@ 
struct_amdsmi_proc_info_t._fields_ = [ ] amdsmi_proc_info_t = struct_amdsmi_proc_info_t +class struct_amdsmi_p2p_capability_t(Structure): + pass + +struct_amdsmi_p2p_capability_t._pack_ = 1 # source:False +struct_amdsmi_p2p_capability_t._fields_ = [ + ('is_iolink_coherent', ctypes.c_ubyte), + ('is_iolink_atomics_32bit', ctypes.c_ubyte), + ('is_iolink_atomics_64bit', ctypes.c_ubyte), + ('is_iolink_dma', ctypes.c_ubyte), + ('is_iolink_bi_directional', ctypes.c_ubyte), +] + +amdsmi_p2p_capability_t = struct_amdsmi_p2p_capability_t # values for enumeration 'amdsmi_dev_perf_level_t' amdsmi_dev_perf_level_t__enumvalues = { @@ -2050,7 +2132,7 @@ amdsmi_set_gpu_fan_speed.restype = amdsmi_status_t amdsmi_set_gpu_fan_speed.argtypes = [amdsmi_processor_handle, uint32_t, uint64_t] amdsmi_get_utilization_count = _libraries['libamd_smi.so'].amdsmi_get_utilization_count amdsmi_get_utilization_count.restype = amdsmi_status_t -amdsmi_get_utilization_count.argtypes = [amdsmi_processor_handle, struct_amdsmi_utilization_counter_t * 0, uint32_t, ctypes.POINTER(ctypes.c_uint64)] +amdsmi_get_utilization_count.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_utilization_counter_t), uint32_t, ctypes.POINTER(ctypes.c_uint64)] amdsmi_get_gpu_perf_level = _libraries['libamd_smi.so'].amdsmi_get_gpu_perf_level amdsmi_get_gpu_perf_level.restype = amdsmi_status_t amdsmi_get_gpu_perf_level.argtypes = [amdsmi_processor_handle, ctypes.POINTER(amdsmi_dev_perf_level_t)] @@ -2201,6 +2283,9 @@ amdsmi_topo_get_link_type.argtypes = [amdsmi_processor_handle, amdsmi_processor_ amdsmi_is_P2P_accessible = _libraries['libamd_smi.so'].amdsmi_is_P2P_accessible amdsmi_is_P2P_accessible.restype = amdsmi_status_t amdsmi_is_P2P_accessible.argtypes = [amdsmi_processor_handle, amdsmi_processor_handle, ctypes.POINTER(ctypes.c_bool)] +amdsmi_topo_get_p2p_status = _libraries['libamd_smi.so'].amdsmi_topo_get_p2p_status +amdsmi_topo_get_p2p_status.restype = amdsmi_status_t 
+amdsmi_topo_get_p2p_status.argtypes = [amdsmi_processor_handle, amdsmi_processor_handle, ctypes.POINTER(amdsmi_io_link_type_t), ctypes.POINTER(struct_amdsmi_p2p_capability_t)] amdsmi_get_gpu_compute_partition = _libraries['libamd_smi.so'].amdsmi_get_gpu_compute_partition amdsmi_get_gpu_compute_partition.restype = amdsmi_status_t amdsmi_get_gpu_compute_partition.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.c_char), uint32_t] @@ -2219,6 +2304,9 @@ amdsmi_set_gpu_memory_partition.argtypes = [amdsmi_processor_handle, amdsmi_memo amdsmi_reset_gpu_memory_partition = _libraries['libamd_smi.so'].amdsmi_reset_gpu_memory_partition amdsmi_reset_gpu_memory_partition.restype = amdsmi_status_t amdsmi_reset_gpu_memory_partition.argtypes = [amdsmi_processor_handle] +amdsmi_get_gpu_accelerator_partition_profile = _libraries['libamd_smi.so'].amdsmi_get_gpu_accelerator_partition_profile +amdsmi_get_gpu_accelerator_partition_profile.restype = amdsmi_status_t +amdsmi_get_gpu_accelerator_partition_profile.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_accelerator_partition_profile_t), ctypes.POINTER(ctypes.c_uint32)] amdsmi_init_gpu_event_notification = _libraries['libamd_smi.so'].amdsmi_init_gpu_event_notification amdsmi_init_gpu_event_notification.restype = amdsmi_status_t amdsmi_init_gpu_event_notification.argtypes = [amdsmi_processor_handle] @@ -2243,6 +2331,9 @@ amdsmi_get_gpu_driver_info.argtypes = [amdsmi_processor_handle, ctypes.POINTER(s amdsmi_get_gpu_asic_info = _libraries['libamd_smi.so'].amdsmi_get_gpu_asic_info amdsmi_get_gpu_asic_info.restype = amdsmi_status_t amdsmi_get_gpu_asic_info.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_asic_info_t)] +amdsmi_get_gpu_kfd_info = _libraries['libamd_smi.so'].amdsmi_get_gpu_kfd_info +amdsmi_get_gpu_kfd_info.restype = amdsmi_status_t +amdsmi_get_gpu_kfd_info.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_kfd_info_t)] amdsmi_get_gpu_vram_info = 
_libraries['libamd_smi.so'].amdsmi_get_gpu_vram_info amdsmi_get_gpu_vram_info.restype = amdsmi_status_t amdsmi_get_gpu_vram_info.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_vram_info_t)] @@ -2413,7 +2504,12 @@ amdsmi_get_esmi_err_msg = _libraries['libamd_smi.so'].amdsmi_get_esmi_err_msg amdsmi_get_esmi_err_msg.restype = amdsmi_status_t amdsmi_get_esmi_err_msg.argtypes = [amdsmi_status_t, ctypes.POINTER(ctypes.POINTER(ctypes.c_char))] __all__ = \ - ['AGG_BW0', 'AMDSMI_AVERAGE_POWER', + ['AGG_BW0', 'AMDSMI_ACCELERATOR_PARTITION_CPX', + 'AMDSMI_ACCELERATOR_PARTITION_DPX', + 'AMDSMI_ACCELERATOR_PARTITION_INVALID', + 'AMDSMI_ACCELERATOR_PARTITION_QPX', + 'AMDSMI_ACCELERATOR_PARTITION_SPX', + 'AMDSMI_ACCELERATOR_PARTITION_TPX', 'AMDSMI_AVERAGE_POWER', 'AMDSMI_CACHE_PROPERTY_CPU_CACHE', 'AMDSMI_CACHE_PROPERTY_DATA_CACHE', 'AMDSMI_CACHE_PROPERTY_ENABLED', @@ -2617,21 +2713,23 @@ __all__ = \ 'AMDSMI_XGMI_STATUS_MULTIPLE_ERRORS', 'AMDSMI_XGMI_STATUS_NO_ERRORS', 'CLK_LIMIT_MAX', 'CLK_LIMIT_MIN', 'RD_BW0', 'WR_BW0', 'amd_metrics_table_header_t', - 'amdsmi_asic_info_t', 'amdsmi_bdf_t', 'amdsmi_bit_field_t', - 'amdsmi_board_info_t', 'amdsmi_cache_property_type_t', - 'amdsmi_card_form_factor_t', 'amdsmi_clean_gpu_local_data', - 'amdsmi_clk_info_t', 'amdsmi_clk_limit_type_t', - 'amdsmi_clk_type_t', 'amdsmi_compute_partition_type_t', - 'amdsmi_container_types_t', 'amdsmi_counter_command_t', - 'amdsmi_counter_value_t', 'amdsmi_cpu_apb_disable', - 'amdsmi_cpu_apb_enable', 'amdsmi_cpusocket_handle', - 'amdsmi_ddr_bw_metrics_t', 'amdsmi_dev_perf_level_t', - 'amdsmi_dimm_power_t', 'amdsmi_dimm_thermal_t', - 'amdsmi_dpm_level_t', 'amdsmi_dpm_policy_entry_t', - 'amdsmi_dpm_policy_t', 'amdsmi_driver_info_t', - 'amdsmi_engine_usage_t', 'amdsmi_error_count_t', - 'amdsmi_event_group_t', 'amdsmi_event_handle_t', - 'amdsmi_event_type_t', 'amdsmi_evt_notification_data_t', + 'amdsmi_accelerator_partition_profile_t', + 'amdsmi_accelerator_partition_type_t', 
'amdsmi_asic_info_t', + 'amdsmi_bdf_t', 'amdsmi_bit_field_t', 'amdsmi_board_info_t', + 'amdsmi_cache_property_type_t', 'amdsmi_card_form_factor_t', + 'amdsmi_clean_gpu_local_data', 'amdsmi_clk_info_t', + 'amdsmi_clk_limit_type_t', 'amdsmi_clk_type_t', + 'amdsmi_compute_partition_type_t', 'amdsmi_container_types_t', + 'amdsmi_counter_command_t', 'amdsmi_counter_value_t', + 'amdsmi_cpu_apb_disable', 'amdsmi_cpu_apb_enable', + 'amdsmi_cpusocket_handle', 'amdsmi_ddr_bw_metrics_t', + 'amdsmi_dev_perf_level_t', 'amdsmi_dimm_power_t', + 'amdsmi_dimm_thermal_t', 'amdsmi_dpm_level_t', + 'amdsmi_dpm_policy_entry_t', 'amdsmi_dpm_policy_t', + 'amdsmi_driver_info_t', 'amdsmi_engine_usage_t', + 'amdsmi_error_count_t', 'amdsmi_event_group_t', + 'amdsmi_event_handle_t', 'amdsmi_event_type_t', + 'amdsmi_evt_notification_data_t', 'amdsmi_evt_notification_type_t', 'amdsmi_first_online_core_on_cpu_socket', 'amdsmi_free_name_value_pairs', 'amdsmi_freq_ind_t', @@ -2661,6 +2759,7 @@ __all__ = \ 'amdsmi_get_cpu_socket_temperature', 'amdsmi_get_cpucore_handles', 'amdsmi_get_cpusocket_handles', 'amdsmi_get_energy_count', 'amdsmi_get_esmi_err_msg', 'amdsmi_get_fw_info', + 'amdsmi_get_gpu_accelerator_partition_profile', 'amdsmi_get_gpu_activity', 'amdsmi_get_gpu_asic_info', 'amdsmi_get_gpu_available_counters', 'amdsmi_get_gpu_bad_page_info', 'amdsmi_get_gpu_bdf_id', @@ -2674,7 +2773,8 @@ __all__ = \ 'amdsmi_get_gpu_ecc_enabled', 'amdsmi_get_gpu_ecc_status', 'amdsmi_get_gpu_event_notification', 'amdsmi_get_gpu_fan_rpms', 'amdsmi_get_gpu_fan_speed', 'amdsmi_get_gpu_fan_speed_max', - 'amdsmi_get_gpu_id', 'amdsmi_get_gpu_mem_overdrive_level', + 'amdsmi_get_gpu_id', 'amdsmi_get_gpu_kfd_info', + 'amdsmi_get_gpu_mem_overdrive_level', 'amdsmi_get_gpu_memory_partition', 'amdsmi_get_gpu_memory_reserved_pages', 'amdsmi_get_gpu_memory_total', 'amdsmi_get_gpu_memory_usage', @@ -2720,12 +2820,13 @@ __all__ = \ 'amdsmi_init', 'amdsmi_init_flags_t', 'amdsmi_init_gpu_event_notification', 
'amdsmi_io_bw_encoding_t', 'amdsmi_io_link_type_t', 'amdsmi_is_P2P_accessible', - 'amdsmi_is_gpu_power_management_enabled', + 'amdsmi_is_gpu_power_management_enabled', 'amdsmi_kfd_info_t', 'amdsmi_link_id_bw_type_t', 'amdsmi_link_metrics_t', 'amdsmi_link_type_t', 'amdsmi_memory_page_status_t', 'amdsmi_memory_partition_type_t', 'amdsmi_memory_type_t', - 'amdsmi_mm_ip_t', 'amdsmi_name_value_t', 'amdsmi_od_vddc_point_t', - 'amdsmi_od_volt_curve_t', 'amdsmi_od_volt_freq_data_t', + 'amdsmi_mm_ip_t', 'amdsmi_name_value_t', 'amdsmi_nps_caps_t', + 'amdsmi_od_vddc_point_t', 'amdsmi_od_volt_curve_t', + 'amdsmi_od_volt_freq_data_t', 'amdsmi_p2p_capability_t', 'amdsmi_pcie_bandwidth_t', 'amdsmi_pcie_info_t', 'amdsmi_power_cap_info_t', 'amdsmi_power_info_t', 'amdsmi_power_profile_preset_masks_t', @@ -2761,7 +2862,7 @@ __all__ = \ 'amdsmi_temp_range_refresh_rate_t', 'amdsmi_temperature_metric_t', 'amdsmi_temperature_type_t', 'amdsmi_topo_get_link_type', 'amdsmi_topo_get_link_weight', 'amdsmi_topo_get_numa_node_number', - 'amdsmi_utilization_counter_t', + 'amdsmi_topo_get_p2p_status', 'amdsmi_utilization_counter_t', 'amdsmi_utilization_counter_type_t', 'amdsmi_vbios_info_t', 'amdsmi_version_t', 'amdsmi_voltage_metric_t', 'amdsmi_voltage_type_t', 'amdsmi_vram_info_t', @@ -2769,6 +2870,7 @@ __all__ = \ 'amdsmi_vram_vendor_type_t', 'amdsmi_xgmi_info_t', 'amdsmi_xgmi_status_t', 'processor_type_t', 'size_t', 'struct__links', 'struct_amd_metrics_table_header_t', + 'struct_amdsmi_accelerator_partition_profile_t', 'struct_amdsmi_asic_info_t', 'struct_amdsmi_board_info_t', 'struct_amdsmi_clk_info_t', 'struct_amdsmi_counter_value_t', 'struct_amdsmi_ddr_bw_metrics_t', 'struct_amdsmi_dimm_power_t', @@ -2780,11 +2882,12 @@ __all__ = \ 'struct_amdsmi_freq_volt_region_t', 'struct_amdsmi_frequencies_t', 'struct_amdsmi_frequency_range_t', 'struct_amdsmi_fw_info_t', 'struct_amdsmi_gpu_cache_info_t', 'struct_amdsmi_gpu_metrics_t', - 'struct_amdsmi_hsmp_metrics_table_t', + 
'struct_amdsmi_hsmp_metrics_table_t', 'struct_amdsmi_kfd_info_t', 'struct_amdsmi_link_id_bw_type_t', 'struct_amdsmi_link_metrics_t', 'struct_amdsmi_name_value_t', 'struct_amdsmi_od_vddc_point_t', 'struct_amdsmi_od_volt_curve_t', 'struct_amdsmi_od_volt_freq_data_t', + 'struct_amdsmi_p2p_capability_t', 'struct_amdsmi_pcie_bandwidth_t', 'struct_amdsmi_pcie_info_t', 'struct_amdsmi_power_cap_info_t', 'struct_amdsmi_power_info_t', 'struct_amdsmi_power_profile_status_t', @@ -2798,7 +2901,7 @@ __all__ = \ 'struct_amdsmi_vram_info_t', 'struct_amdsmi_vram_usage_t', 'struct_amdsmi_xgmi_info_t', 'struct_cache_', 'struct_engine_usage_', 'struct_fw_info_list_', - 'struct_memory_usage_', 'struct_pcie_metric_', - 'struct_pcie_static_', 'struct_amdsmi_bdf_t', - 'uint32_t', 'uint64_t', 'uint8_t', - 'union_amdsmi_bdf_t'] + 'struct_memory_usage_', 'struct_nps_flags_', + 'struct_pcie_metric_', 'struct_pcie_static_', + 'struct_amdsmi_bdf_t','uint32_t', 'uint64_t', 'uint8_t', + 'union_amdsmi_bdf_t', 'union_amdsmi_nps_caps_t'] diff --git a/py-interface/pyproject.toml.in b/py-interface/pyproject.toml.in index 149fc21a1b..e4d451aa82 100644 --- a/py-interface/pyproject.toml.in +++ b/py-interface/pyproject.toml.in @@ -16,7 +16,7 @@ readme = {file = "amdsmi/README.md", content-type = "text/markdown"} description = "AMDSMI Python LIB - AMD GPU Monitoring Library" requires-python = ">=3.6" dependencies = [ - "PyYAML >= 5.0", + "PyYAML >= 3.12", ] [project.urls] diff --git a/py-interface/setup.cfg.in b/py-interface/setup.cfg.in index 3e8b8c8732..ba56f2ac05 100644 --- a/py-interface/setup.cfg.in +++ b/py-interface/setup.cfg.in @@ -18,7 +18,7 @@ include_package_data = True packages = find: python_requires = >=3.6 install_requires= - PyYAML >= 5.0 + PyYAML >= 3.12 [options.package_data] * = *.so diff --git a/rocm_smi/CMakeLists.txt b/rocm_smi/CMakeLists.txt old mode 100755 new mode 100644 diff --git a/rocm_smi/example/rocm_smi_example.cc b/rocm_smi/example/rocm_smi_example.cc old mode 100755 new 
mode 100644 index fa54728b5e..0aed74aec6 --- a/rocm_smi/example/rocm_smi_example.cc +++ b/rocm_smi/example/rocm_smi_example.cc @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. + * Copyright (c) 2017-2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: @@ -53,6 +53,7 @@ #include #include #include +#include #include "rocm_smi/rocm_smi.h" #include "rocm_smi/rocm_smi_utils.h" @@ -730,30 +731,6 @@ template constexpr float convert_mw_to_w(T mw) { return static_cast(mw / 1000.0); } -template -auto print_error_or_value(rsmi_status_t status_code, const T& metric) { - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - if constexpr (std::is_array_v) { - auto idx = uint16_t(0); - auto str_values = std::string(); - const auto num_elems = static_cast(std::end(metric) - std::begin(metric)); - str_values = ("\n\t\t num of values: " + std::to_string(num_elems) + "\n"); - for (const auto& el : metric) { - str_values += "\t\t [" + std::to_string(idx) + "]: " + std::to_string(el) + "\n"; - ++idx; - } - return str_values; - } - else if constexpr ((std::is_same_v) || - (std::is_same_v) || - (std::is_same_v)) { - return std::to_string(metric); - } - } - else { - return ("\n\t\tStatus: [" + std::to_string(status_code) + "] " + "-> " + amd::smi::getRSMIStatusString(status_code)); - } -}; template std::string print_unsigned_int(T value) { @@ -780,6 +757,7 @@ int main() { uint32_t num_monitor_devs = 0; rsmi_gpu_metrics_t gpu_metrics; std::string val_str; + RSMI_POWER_TYPE power_type = RSMI_INVALID_POWER; rsmi_num_monitor_devices(&num_monitor_devs); @@ -791,13 +769,23 @@ int main() { ret = rsmi_dev_revision_get(i, &val_ui16); CHK_RSMI_RET_I(ret) std::cout << "\t**Dev.Rev.ID: 0x" << std::hex << val_ui16 << "\n"; - ret = amd::smi::rsmi_get_gfx_target_version(i , &val_str); - std::cout << "\t**Target Graphics Version: " << val_str << "\n"; - - char pcie_vendor_name[256]; - ret 
= rsmi_dev_pcie_vendor_name_get(i, pcie_vendor_name, 256); - CHK_RSMI_RET_I(ret) - std::cout << "\t**PCIe vendor name: " << pcie_vendor_name << std::endl; + ret = rsmi_dev_target_graphics_version_get(i, &val_ui64); + std::cout << "\t**Target Graphics Version: " << std::dec + << static_cast(val_ui64) << "\n"; + ret = rsmi_dev_guid_get(i, &val_ui64); + std::cout << "\t**GUID: " << std::dec + << static_cast(val_ui64) << "\n"; + ret = rsmi_dev_node_id_get(i, &val_ui32); + std::cout << "\t**Node ID: " << std::dec + << static_cast(val_ui32) << "\n"; + char vbios_version[256]; + ret = rsmi_dev_vbios_version_get(i, vbios_version, 256); + if (ret == RSMI_STATUS_SUCCESS) { + std::cout << "\t**VBIOS Version: " << vbios_version << "\n"; + } else { + std::cout << "\t**VBIOS Version: " + << amd::smi::getRSMIStatusString(ret, false) << "\n"; + } char current_compute_partition[256]; current_compute_partition[0] = '\0'; @@ -848,8 +836,9 @@ int main() { // std::cout << "\n"; print_test_header("GPU METRICS: Using static struct (Backwards Compatibility) ", i); - print_function_header_with_rsmi_ret(ret, "rsmi_dev_gpu_metrics_info_get(" + std::to_string(i) + ", &gpu_metrics)"); - rsmi_dev_gpu_metrics_info_get(i, &gpu_metrics); + ret = rsmi_dev_gpu_metrics_info_get(i, &gpu_metrics); + print_function_header_with_rsmi_ret(ret, "rsmi_dev_gpu_metrics_info_get(" + + std::to_string(i) + ", &gpu_metrics)"); std::cout << "\t**.common_header.format_revision : " << print_unsigned_int(gpu_metrics.common_header.format_revision) << "\n"; @@ -988,173 +977,58 @@ int main() { for (const auto& dclk : gpu_metrics.current_dclk0s) { std::cout << "\t -> " << std::dec << dclk << "\n"; } - std::cout << " ** Note: Values MAX'ed out (UINTX MAX are unsupported for the version in question) ** " << "\n"; + + std::cout << "\n"; + std::cout << "\t ** -> Checking metrics with constant changes ** " << "\n"; + constexpr uint16_t kMAX_ITER_TEST = 10; + rsmi_gpu_metrics_t gpu_metrics_check; + for (auto idx = uint16_t(1); 
idx <= kMAX_ITER_TEST; ++idx) { + rsmi_dev_gpu_metrics_info_get(i, &gpu_metrics_check); + std::cout << "\t\t -> firmware_timestamp [" << idx + << "/" << kMAX_ITER_TEST << "]: " << gpu_metrics_check.firmware_timestamp << "\n"; + } + + std::cout << "\n"; + for (auto idx = uint16_t(1); idx <= kMAX_ITER_TEST; ++idx) { + rsmi_dev_gpu_metrics_info_get(i, &gpu_metrics_check); + std::cout << "\t\t -> system_clock_counter [" << idx + << "/" << kMAX_ITER_TEST << "]: " << gpu_metrics_check.system_clock_counter << "\n"; + } + + std::cout << "\n\n"; + std::cout << " ** Note: Values MAX'ed out " + "(UINTX MAX are unsupported for the version in question) ** " << "\n"; + std::cout << "\n\n"; print_test_header("GPU METRICS: Using direct APIs (newer)", i); metrics_table_header_t header_values; - GPUMetricTempHbm_t hbm_values; - GPUMetricVcnActivity_t vcn_values; - GPUMetricXgmiReadDataAcc_t xgmi_read_values; - GPUMetricXgmiWriteDataAcc_t xgmi_write_values; - GPUMetricCurrGfxClk_t curr_gfxclk_values; - GPUMetricCurrSocClk_t curr_socclk_values; - GPUMetricCurrVClk0_t curr_vclk0_values; - GPUMetricCurrDClk0_t curr_dclk0_values; ret = rsmi_dev_metrics_header_info_get(i, &header_values); std::cout << "\t[Metrics Header]" << "\n"; - std::cout << "\t -> format_revision : " << print_unsigned_int(header_values.format_revision) << "\n"; - std::cout << "\t -> content_revision : " << print_unsigned_int(header_values.content_revision) << "\n"; + std::cout << "\t -> format_revision : " + << print_unsigned_int(header_values.format_revision) << "\n"; + std::cout << "\t -> content_revision : " + << print_unsigned_int(header_values.content_revision) << "\n"; std::cout << "\t--------------------" << "\n"; - std::cout << "\n"; - std::cout << "\t[Temperature]" << "\n"; - ret = rsmi_dev_metrics_temp_edge_get(i, &val_ui16); - std::cout << "\t -> temp_edge(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_temp_hotspot_get(i, &val_ui16); - std::cout << "\t -> temp_hotspot(): " << 
print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_temp_mem_get(i, &val_ui16); - std::cout << "\t -> temp_mem(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_temp_vrgfx_get(i, &val_ui16); - std::cout << "\t -> temp_vrgfx(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_temp_vrsoc_get(i, &val_ui16); - std::cout << "\t -> temp_vrsoc(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_temp_vrmem_get(i, &val_ui16); - std::cout << "\t -> temp_vrmem(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_temp_hbm_get(i, &hbm_values); - std::cout << "\t -> temp_hbm(): " << print_error_or_value(ret, hbm_values) << "\n"; - - std::cout << "\n"; - std::cout << "\t[Power/Energy]" << "\n"; - ret = rsmi_dev_metrics_curr_socket_power_get(i, &val_ui16); - std::cout << "\t -> current_socket_power(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_energy_acc_get(i, &val_ui64); - std::cout << "\t -> energy_accum(): " << print_error_or_value(ret, val_ui64) << "\n"; - ret = rsmi_dev_metrics_avg_socket_power_get(i, &val_ui16); - std::cout << "\t -> average_socket_power(): " << print_error_or_value(ret, val_ui16) << "\n"; - - std::cout << "\n"; - std::cout << "\t[Utilization]" << "\n"; - ret = rsmi_dev_metrics_avg_gfx_activity_get(i, &val_ui16); - std::cout << "\t -> average_gfx_activity(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_avg_umc_activity_get(i, &val_ui16); - std::cout << "\t -> average_umc_activity(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_avg_mm_activity_get(i, &val_ui16); - std::cout << "\t -> average_mm_activity(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_vcn_activity_get(i, &vcn_values); - std::cout << "\t -> vcn_activity(): " << print_error_or_value(ret, vcn_values) << "\n"; - ret = 
rsmi_dev_metrics_mem_activity_acc_get(i, &val_ui32); - std::cout << "\t -> mem_activity_accum(): " << print_error_or_value(ret, val_ui32) << "\n"; - ret = rsmi_dev_metrics_gfx_activity_acc_get(i, &val_ui32); - std::cout << "\t -> gfx_activity_accum(): " << print_error_or_value(ret, val_ui32) << "\n"; - - std::cout << "\n"; - std::cout << "\t[Average Clock]" << "\n"; - ret = rsmi_dev_metrics_avg_gfx_clock_frequency_get(i, &val_ui16); - std::cout << "\t -> average_gfx_clock_frequency(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_avg_soc_clock_frequency_get(i, &val_ui16); - std::cout << "\t -> average_soc_clock_frequency(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_avg_uclock_frequency_get(i, &val_ui16); - std::cout << "\t -> average_uclock_frequency(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_avg_vclock0_frequency_get(i, &val_ui16); - std::cout << "\t -> average_vclock0_frequency(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_avg_dclock0_frequency_get(i, &val_ui16); - std::cout << "\t -> average_dclock0_frequency(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_avg_vclock1_frequency_get(i, &val_ui16); - std::cout << "\t -> average_vclock1_frequency(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_avg_dclock1_frequency_get(i, &val_ui16); - std::cout << "\t -> average_dclock1_frequency(): " << print_error_or_value(ret, val_ui16) << "\n"; - - std::cout << "\n"; - std::cout << "\t[Current Clock]" << "\n"; - ret = rsmi_dev_metrics_curr_vclk1_get(i, &val_ui16); - std::cout << "\t -> current_vclock1(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_curr_dclk1_get(i, &val_ui16); - std::cout << "\t -> current_dclock1(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_curr_uclk_get(i, &val_ui16); - std::cout << "\t -> current_uclock(): " 
<< print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_curr_dclk0_get(i, &curr_dclk0_values); - std::cout << "\t -> current_dclk0(): " << print_error_or_value(ret, curr_dclk0_values) << "\n"; - ret = rsmi_dev_metrics_curr_gfxclk_get(i, &curr_gfxclk_values); - std::cout << "\t -> current_gfxclk(): " << print_error_or_value(ret, curr_gfxclk_values) << "\n"; - ret = rsmi_dev_metrics_curr_socclk_get(i, &curr_socclk_values); - std::cout << "\t -> current_soc_clock(): " << print_error_or_value(ret, curr_socclk_values) << "\n"; - ret = rsmi_dev_metrics_curr_vclk0_get(i, &curr_vclk0_values); - std::cout << "\t -> current_vclk0(): " << print_error_or_value(ret, curr_vclk0_values) << "\n"; - - std::cout << "\n"; - std::cout << "\t[Throttle]" << "\n"; - ret = rsmi_dev_metrics_indep_throttle_status_get(i, &val_ui64); - std::cout << "\t -> indep_throttle_status(): " << print_error_or_value(ret, val_ui64) << "\n"; - ret = rsmi_dev_metrics_throttle_status_get(i, &val_ui32); - std::cout << "\t -> throttle_status(): " << print_error_or_value(ret, val_ui32) << "\n"; - - std::cout << "\n"; - std::cout << "\t[Gfx Clock Lock]" << "\n"; - ret = rsmi_dev_metrics_gfxclk_lock_status_get(i, &val_ui32); - std::cout << "\t -> gfxclk_lock_status(): " << print_error_or_value(ret, val_ui32) << "\n"; - - std::cout << "\n"; - std::cout << "\t[Current Fan Speed]" << "\n"; - ret = rsmi_dev_metrics_curr_fan_speed_get(i, &val_ui16); - std::cout << "\t -> current_fan_speed(): " << print_error_or_value(ret, val_ui16) << "\n"; - - std::cout << "\n"; - std::cout << "\t[Link/Bandwidth/Speed]" << "\n"; - ret = rsmi_dev_metrics_pcie_link_width_get(i, &val_ui16); - std::cout << "\t -> pcie_link_width(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_pcie_link_speed_get(i, &val_ui16); - std::cout << "\t -> pcie_link_speed(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_pcie_bandwidth_acc_get(i, &val_ui64); - std::cout << "\t -> 
pcie_bandwidth_accum(): " << print_error_or_value(ret, val_ui64) << "\n"; - ret = rsmi_dev_metrics_pcie_bandwidth_inst_get(i, &val_ui64); - std::cout << "\t -> pcie_bandwidth_inst(): " << print_error_or_value(ret, val_ui64) << "\n"; - ret = rsmi_dev_metrics_pcie_l0_recov_count_acc_get(i, &val_ui64); - std::cout << "\t -> pcie_l0_recov_count_accum(): " << print_error_or_value(ret, val_ui64) << "\n"; - ret = rsmi_dev_metrics_pcie_replay_count_acc_get(i, &val_ui64); - std::cout << "\t -> pcie_replay_count_accum(): " << print_error_or_value(ret, val_ui64) << "\n"; - ret = rsmi_dev_metrics_pcie_replay_rover_count_acc_get(i, &val_ui64); - std::cout << "\t -> pcie_replay_rollover_count_accum(): " << print_error_or_value(ret, val_ui64) << "\n"; - ret = rsmi_dev_metrics_xgmi_link_width_get(i, &val_ui16); - std::cout << "\t -> xgmi_link_width(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_xgmi_link_speed_get(i, &val_ui16); - std::cout << "\t -> xgmi_link_speed(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_xgmi_read_data_get(i, &xgmi_read_values); - std::cout << "\t -> xgmi_read_data(): " << print_error_or_value(ret, xgmi_read_values) << "\n"; - ret = rsmi_dev_metrics_xgmi_write_data_get(i, &xgmi_write_values); - std::cout << "\t -> xgmi_write_data(): " << print_error_or_value(ret, xgmi_write_values) << "\n"; - - std::cout << "\n"; - std::cout << "\t[Voltage]" << "\n"; - ret = rsmi_dev_metrics_volt_soc_get(i, &val_ui16); - std::cout << "\t -> voltage_soc(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_volt_gfx_get(i, &val_ui16); - std::cout << "\t -> voltage_gfx(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_volt_mem_get(i, &val_ui16); - std::cout << "\t -> voltage_mem(): " << print_error_or_value(ret, val_ui16) << "\n"; - - std::cout << "\n"; - std::cout << "\t[Timestamp]" << "\n"; - ret = rsmi_dev_metrics_system_clock_counter_get(i, &val_ui64); - 
std::cout << "\t -> system_clock_counter(): " << print_error_or_value(ret, val_ui64) << "\n"; - ret = rsmi_dev_metrics_firmware_timestamp_get(i, &val_ui64); - std::cout << "\t -> firmware_timestamp(): " << print_error_or_value(ret, val_ui64) << "\n"; - std::cout << "\n"; std::cout << "\t[XCD CounterVoltage]" << "\n"; ret = rsmi_dev_metrics_xcd_counter_get(i, &val_ui16); - std::cout << "\t -> xcd_counter(): " << print_error_or_value(ret, val_ui16) << "\n"; + std::cout << "\t -> xcd_counter(): " << val_ui16; std::cout << "\n\n"; - ret = rsmi_dev_perf_level_get(i, &pfl); CHK_AND_PRINT_RSMI_ERR_RET(ret) std::cout << "\t**Performance Level:" << perf_level_string(pfl) << "\n"; ret = rsmi_dev_overdrive_level_get(i, &val_ui32); - CHK_AND_PRINT_RSMI_ERR_RET(ret) - std::cout << "\t**OverDrive Level:" << val_ui32 << "\n"; + std::cout << "\t**OverDrive Level: "; + if (ret == RSMI_STATUS_SUCCESS) { + std::cout << val_ui32 << "\n"; + } else { + CHK_RSMI_NOT_SUPPORTED_OR_UNEXPECTED_DATA_RET(ret) + } print_test_header("GPU Clocks", i); for (int clkType = static_cast(RSMI_CLK_TYPE_SYS); @@ -1271,9 +1145,6 @@ int main() { } for (uint32_t i = 0; i < num_monitor_devs; ++i) { - ret = test_set_overdrive(i); - CHK_AND_PRINT_RSMI_ERR_RET(ret) - ret = test_set_perf_level(i); CHK_AND_PRINT_RSMI_ERR_RET(ret) @@ -1294,6 +1165,9 @@ int main() { ret = test_set_memory_partition(i); CHK_AND_PRINT_RSMI_ERR_RET(ret) + + ret = test_set_overdrive(i); + CHK_RSMI_NOT_SUPPORTED_RET(ret) } return 0; diff --git a/rocm_smi/include/rocm_smi/kfd_ioctl.h b/rocm_smi/include/rocm_smi/kfd_ioctl.h old mode 100755 new mode 100644 diff --git a/rocm_smi/include/rocm_smi/rocm_smi.h b/rocm_smi/include/rocm_smi/rocm_smi.h old mode 100755 new mode 100644 index a9dcaa18ce..3eef23ce24 --- a/rocm_smi/include/rocm_smi/rocm_smi.h +++ b/rocm_smi/include/rocm_smi/rocm_smi.h @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. 
+ * Copyright (c) 2017-2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: @@ -361,7 +361,6 @@ typedef enum { RSMI_EVT_NOTIF_NONE = KFD_SMI_EVENT_NONE, //!< Unused RSMI_EVT_NOTIF_VMFAULT = KFD_SMI_EVENT_VMFAULT, //!< VM page fault RSMI_EVT_NOTIF_FIRST = RSMI_EVT_NOTIF_VMFAULT, - RSMI_EVT_NOTIF_THERMAL_THROTTLE = KFD_SMI_EVENT_THERMAL_THROTTLE, RSMI_EVT_NOTIF_GPU_PRE_RESET = KFD_SMI_EVENT_GPU_PRE_RESET, RSMI_EVT_NOTIF_GPU_POST_RESET = KFD_SMI_EVENT_GPU_POST_RESET, @@ -415,8 +414,6 @@ typedef rsmi_clk_type_t rsmi_clk_type; */ typedef enum { RSMI_COMPUTE_PARTITION_INVALID = 0, - RSMI_COMPUTE_PARTITION_CPX, //!< Core mode (CPX)- Per-chip XCC with - //!< shared memory RSMI_COMPUTE_PARTITION_SPX, //!< Single GPU mode (SPX)- All XCCs work //!< together with shared memory RSMI_COMPUTE_PARTITION_DPX, //!< Dual GPU mode (DPX)- Half XCCs work @@ -425,6 +422,8 @@ typedef enum { //!< work together with shared memory RSMI_COMPUTE_PARTITION_QPX, //!< Quad GPU mode (QPX)- Quarter XCCs //!< work together with shared memory + RSMI_COMPUTE_PARTITION_CPX //!< Core mode (CPX)- Per-chip XCC with + //!< shared memory } rsmi_compute_partition_type_t; /// \cond Ignore in docs. typedef rsmi_compute_partition_type_t rsmi_compute_partition_type; @@ -797,7 +796,6 @@ typedef struct { uint16_t fine_value_count; } rsmi_utilization_counter_t; - /** * @brief Reserved Memory Page Record */ @@ -866,6 +864,17 @@ typedef struct { typedef rsmi_frequencies_t rsmi_frequencies; /// \endcond +/** + * @brief IO Link P2P Capability + */ +typedef struct { + uint8_t is_iolink_coherent; // 1 = true, 0 = false, UINT8_MAX = Not defined. + uint8_t is_iolink_atomics_32bit; + uint8_t is_iolink_atomics_64bit; + uint8_t is_iolink_dma; + uint8_t is_iolink_bi_directional; +} rsmi_p2p_capability_t; + /** * @brief This structure holds information about the possible PCIe * bandwidths. 
Specifically, the possible transfer rates and their @@ -894,7 +903,7 @@ typedef rsmi_pcie_bandwidth_t rsmi_pcie_bandwidth; */ typedef struct { /* Utilization */ - uint16_t average_gfx_activity; + uint16_t average_gfx_activity; //!< Average graphics activity uint16_t average_umc_activity; //!< memory controller uint16_t average_mm_activity; //!< UVD or VCN } rsmi_activity_metric_counter_t; @@ -1027,10 +1036,6 @@ struct metrics_table_header_t { typedef struct metrics_table_header_t metrics_table_header_t; /// \endcond -/** - * @brief The following structure holds the gpu metrics values for a device. - */ - /** * @brief Unit conversion factor for HBM temperatures */ @@ -1087,7 +1092,7 @@ typedef struct { */ struct metrics_table_header_t common_header; - // Temperature + // Temperature (C) uint16_t temperature_edge; uint16_t temperature_hotspot; uint16_t temperature_mem; @@ -1095,19 +1100,19 @@ typedef struct { uint16_t temperature_vrsoc; uint16_t temperature_vrmem; - // Utilization + // Utilization (%) uint16_t average_gfx_activity; uint16_t average_umc_activity; // memory controller uint16_t average_mm_activity; // UVD or VCN - // Power/Energy + // Power (W) /Energy (15.259uJ per 1ns) uint16_t average_socket_power; uint64_t energy_accumulator; // v1 mod. (32->64) // Driver attached timestamp (in ns) uint64_t system_clock_counter; // v1 mod. 
(moved from top of struct) - // Average clocks + // Average clocks (MHz) uint16_t average_gfxclk_frequency; uint16_t average_socclk_frequency; uint16_t average_uclk_frequency; @@ -1116,7 +1121,7 @@ typedef struct { uint16_t average_vclk1_frequency; uint16_t average_dclk1_frequency; - // Current clocks + // Current clocks (MHz) uint16_t current_gfxclk; uint16_t current_socclk; uint16_t current_uclk; @@ -1128,10 +1133,10 @@ typedef struct { // Throttle status uint32_t throttle_status; - // Fans + // Fans (RPM) uint16_t current_fan_speed; - // Link width/speed + // Link width (number of lanes) /speed (0.1 GT/s) uint16_t pcie_link_width; // v1 mod.(8->16) uint16_t pcie_link_speed; // in 0.1 GT/s; v1 mod. (8->16) @@ -1147,7 +1152,7 @@ typedef struct { /* * v1.2 additions */ - // PMFW attached timestamp (10ns resolution) + // PMFW attached timestamp (10ns resolution) uint64_t firmware_timestamp; @@ -1170,19 +1175,19 @@ typedef struct { uint16_t current_socket_power; // Utilization (%) - uint16_t vcn_activity[RSMI_MAX_NUM_VCNS]; // VCN instances activity percent (encode/decode) + uint16_t vcn_activity[RSMI_MAX_NUM_VCNS]; // VCN instances activity percent (encode/decode) // Clock Lock Status. 
Each bit corresponds to clock instance uint32_t gfxclk_lock_status; - // XGMI bus width and bitrate (in Gbps) + // XGMI bus width and bitrate (in GB/s) uint16_t xgmi_link_width; uint16_t xgmi_link_speed; // PCIE accumulated bandwidth (GB/sec) uint64_t pcie_bandwidth_acc; - // PCIE instantaneous bandwidth (GB/sec) + // PCIE instantaneous bandwidth (GB/sec) uint64_t pcie_bandwidth_inst; // PCIE L0 to recovery state transition accumulated count @@ -1436,7 +1441,7 @@ rsmi_status_t rsmi_dev_revision_get(uint32_t dv_ind, uint16_t *revision); * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid * */ -rsmi_status_t rsmi_dev_sku_get(uint32_t dv_ind, char *sku); +rsmi_status_t rsmi_dev_sku_get(uint32_t dv_ind, uint16_t *sku); /** * @brief Get the device vendor id associated with the device with provided @@ -1819,13 +1824,62 @@ rsmi_status_t rsmi_dev_unique_id_get(uint32_t dv_ind, uint64_t *id); * * @param[in] dv_ind a device index * - * @param[inout] revision a pointer to uint32_t to which the XGMI physical id + * @param[inout] id a pointer to uint16_t to which the XGMI physical id * will be written * * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. * */ -rsmi_status_t rsmi_dev_oam_id_get(uint32_t dv_ind, uint16_t *id); +rsmi_status_t rsmi_dev_xgmi_physical_id_get(uint32_t dv_ind, uint16_t *id); + +/** + * @brief Get the GUID, also known as the GPU device id, + * associated with the provided device index indicated by KFD. + * + * @details Given a device index @p dv_ind and a pointer to a uint64_t + * @p guid, this function will write the KFD GPU id value to the + * uint64_t pointed to by @p guid. + * + * @param[in] dv_ind a device index + * + * @param[inout] guid a pointer to uint64_t to which the KFD gpu id will be + * written. If the @p guid parameter is nullptr, this function will return + * ::RSMI_STATUS_INVALID_ARGS. 
If the GPU ID is not supported with + * the device index queried, @p guid will return MAX UINT64 value as an + * argument and ::RSMI_STATUS_NOT_SUPPORTED as a response. + * + * @retval ::RSMI_STATUS_SUCCESS call was successful + * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function with the given arguments + * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid + * + */ +rsmi_status_t rsmi_dev_guid_get(uint32_t dv_ind, uint64_t *guid); + +/** + * @brief Get the node id associated with the provided device index + * indicated by KFD. + * + * @details Given a device index @p dv_ind and a pointer to a uint32_t + * @p node_id, this function will write the KFD node id value to the + * uint32_t pointed to by @p node_id. + * + * @param[in] dv_ind a device index + * + * @param[inout] node_id a pointer to uint32_t to which the KFD node id will be + * written. If the @p node_id parameter is nullptr, this function will return + * ::RSMI_STATUS_INVALID_ARGS. If @p node_id is not supported with + * the device index queried, @p node_id will return MAX UINT64 value as an + * argument and ::RSMI_STATUS_NOT_SUPPORTED as a response. 
+ * + * @retval ::RSMI_STATUS_SUCCESS call was successful + * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function with the given arguments + * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid + * + */ +rsmi_status_t rsmi_dev_node_id_get(uint32_t dv_ind, uint32_t *node_id); + /** @} */ // end of IDQuer @@ -1866,16 +1920,18 @@ rsmi_dev_pci_bandwidth_get(uint32_t dv_ind, rsmi_pcie_bandwidth_t *bandwidth); * * The format of @p bdfid will be as follows: * - * BDFID = ((DOMAIN & 0xffffffff) << 32) | ((BUS & 0xff) << 8) | - * ((DEVICE & 0x1f) <<3 ) | (FUNCTION & 0x7) + * BDFID = ((DOMAIN & 0xFFFFFFFF) << 32) | ((Partition & 0xF) << 28) + * | ((BUS & 0xFF) << 8) | ((DEVICE & 0x1F) <<3 ) + * | (FUNCTION & 0x7) * - * | Name | Field | - * ---------- | ------- | - * | Domain | [64:32] | - * | Reserved | [31:16] | - * | Bus | [15: 8] | - * | Device | [ 7: 3] | - * | Function | [ 2: 0] | + * | Name | Field | KFD property KFD -> PCIe ID (uint64_t) + * -------------- | ------- | ---------------- | ---------------------------- | + * | Domain | [63:32] | "domain" | (DOMAIN & 0xFFFFFFFF) << 32 | + * | Partition id | [31:28] | "location id" | (LOCATION & 0xF0000000) | + * | Reserved | [27:16] | "location id" | N/A | + * | Bus | [15: 8] | "location id" | (LOCATION & 0xFF00) | + * | Device | [ 7: 3] | "location id" | (LOCATION & 0xF8) | + * | Function | [ 2: 0] | "location id" | (LOCATION & 0x7) | * * @param[in] dv_ind a device index * @@ -2022,6 +2078,11 @@ rsmi_status_t rsmi_dev_pci_bandwidth_set(uint32_t dv_ind, uint64_t bw_bitmask); * @p power, this function will write the current average power consumption * (in microwatts) to the uint64_t pointed to by @p power. * + * @deprecated ::rsmi_dev_power_get() is preferred due to providing + * backwards compatibility, which looks at both average and current power + * values. Whereas ::rsmi_dev_power_ave_get only looks for average power + * consumption. 
Newer ASICs will support current power only. + * * @param[in] dv_ind a device index * * @param[in] sensor_ind a 0-based sensor index. Normally, this will be 0. @@ -2090,7 +2151,10 @@ rsmi_dev_current_socket_power_get(uint32_t dv_ind, uint64_t *socket_power); * @param[inout] type a pointer to RSMI_POWER_TYPE object. Returns the type * of power retrieved from the device. Current power is ::RSMI_CURRENT_POWER * and average power is ::RSMI_AVERAGE_POWER. If an error occurs, - * returns an invalid power type ::RSMI_INVALID_POWER. + * returns an invalid power type ::RSMI_INVALID_POWER - for example, when the + * device supports neither average power nor current power. + * If this parameter is nullptr, this function will return + * ::RSMI_STATUS_INVALID_ARGS. * * @retval ::RSMI_STATUS_SUCCESS call was successful * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not @@ -3068,7 +3132,6 @@ rsmi_status_t rsmi_dev_clk_extremum_set(uint32_t dv_ind, rsmi_freq_ind_t level, uint64_t clkvalue, rsmi_clk_type_t clkType); - /** * @brief This function sets the clock frequency information * @@ -3614,6 +3677,29 @@ rsmi_status_t rsmi_dev_firmware_version_get(uint32_t dv_ind, rsmi_fw_block_t block, uint64_t *fw_version); +/** + * @brief Get the target graphics version for a GPU device + * + * @details Given a device index @p dv_ind and a uint64_t pointer + * @p gfx_version, this function will write the graphics version. + * + * @param[in] dv_ind a device index + * + * @param[inout] gfx_version The device graphics version number indicated by + * KFD. If this parameter is nullptr, this function will return + * ::RSMI_STATUS_INVALID_ARGS. If the device does not support this value, + * this function will return ::RSMI_STATUS_NOT_SUPPORTED and a maximum UINT64 value as + * @p gfx_version. 
+ * + * @retval ::RSMI_STATUS_SUCCESS call was successful + * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function with the given arguments + * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid + * + */ +rsmi_status_t rsmi_dev_target_graphics_version_get(uint32_t dv_ind, + uint64_t *gfx_version); + /** @} */ // end of VersQuer /*****************************************************************************/ @@ -4326,6 +4412,37 @@ rsmi_status_t rsmi_is_P2P_accessible(uint32_t dv_ind_src, uint32_t dv_ind_dst, bool *accessible); +/** + * @brief Retrieve connection type and P2P capabilities between 2 GPUs + * + * @platform{gpu_bm_linux} @platform{host} @platform{guest_1vf} @platform{guest_mvf} + * + * @details Given a source device index @p dv_ind_src and + * a destination device index @p dv_ind_dst, a pointer to an ::RSMI_IO_LINK_TYPE @p type, + * and a pointer to an ::rsmi_p2p_capability_t @p cap, this function will write the connection type + * and io link capabilities between the devices + * @p dv_ind_src and @p dv_ind_dst to the memory + * pointed to by @p type and @p cap. + * + * @param[in] dv_ind_src the source device index + * + * @param[in] dv_ind_dst the destination device index + * + * @param[inout] type A pointer to an ::RSMI_IO_LINK_TYPE to which the + * type for the connection should be written. + * + * @param[inout] cap A pointer to an ::rsmi_p2p_capability_t to which the + * io link capabilities should be written. 
+ * + * @retval ::RSMI_STATUS_SUCCESS call was successful + * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid + * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function + */ +rsmi_status_t +rsmi_topo_get_p2p_status(uint32_t dv_ind_src, uint32_t dv_ind_dst, + RSMI_IO_LINK_TYPE *type, rsmi_p2p_capability_t *cap); + /** @} */ // end of HWTopo /*****************************************************************************/ @@ -4414,6 +4531,30 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, */ rsmi_status_t rsmi_dev_compute_partition_reset(uint32_t dv_ind); +/** + * @brief Retrieves the partition_id for a desired device + * + * @details + * Given a device index @p dv_ind and a uint32_t pointer @p partition_id, + * this function will attempt to obtain the device's partition ID. + * Upon successful retrieval, the device's partition ID will be stored + * in the uint32_t pointed to by @p partition_id. If the device does + * not support partitions or is in SPX, a @p partition_id of 0 shall + * be returned. + * + * @param[in] dv_ind a device index + * + * @param[inout] partition_id a pointer to a uint32_t variable + * to which the device's partition ID will be written. 
+ * + * @retval ::RSMI_STATUS_SUCCESS call was successful + * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid + * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function + * + */ +rsmi_status_t rsmi_dev_partition_id_get(uint32_t dv_ind, uint32_t *partition_id); + /** @} */ // end of ComputePartition /*****************************************************************************/ @@ -4855,995 +4996,6 @@ rsmi_status_t rsmi_event_notification_stop(uint32_t dv_ind); * @{ */ -/** - * Metric multi-valued counter types - */ -typedef uint16_t GPUMetricTempHbm_t[RSMI_NUM_HBM_INSTANCES]; -typedef uint16_t GPUMetricVcnActivity_t[RSMI_MAX_NUM_VCNS]; -typedef uint16_t GPUMetricJpegActivity_t[RSMI_MAX_NUM_JPEG_ENGS]; -typedef uint64_t GPUMetricXgmiReadDataAcc_t[RSMI_MAX_NUM_XGMI_LINKS]; -typedef uint64_t GPUMetricXgmiWriteDataAcc_t[RSMI_MAX_NUM_XGMI_LINKS]; -typedef uint16_t GPUMetricCurrGfxClk_t[RSMI_MAX_NUM_GFX_CLKS]; -typedef uint16_t GPUMetricCurrSocClk_t[RSMI_MAX_NUM_CLKS]; -typedef uint16_t GPUMetricCurrVClk0_t[RSMI_MAX_NUM_CLKS]; -typedef uint16_t GPUMetricCurrDClk0_t[RSMI_MAX_NUM_CLKS]; - - -/****** - * Metric single-valued counter types - */ - -/** - * @brief Get the 'temp_hotspot' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'temp_hotspot' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] hotspot_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_temp_hotspot_get(uint32_t dv_ind, uint16_t* hotspot_value); - -/** - * @brief Get the 'temp_mem' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'temp_mem' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] mem_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_temp_mem_get(uint32_t dv_ind, uint16_t* mem_value); - -/** - * @brief Get the 'temp_vrsoc' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'temp_vrsoc' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] vrsoc_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_temp_vrsoc_get(uint32_t dv_ind, uint16_t* vrsoc_value); - -/** - * @brief Get the 'curr_socket_power' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'socket_power' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] socket_power_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_curr_socket_power_get(uint32_t dv_ind, uint16_t* socket_power_value); - -/** - * @brief Get the 'avg_gfx_activity' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'gfx_activity' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] gfx_activity_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_avg_gfx_activity_get(uint32_t dv_ind, uint16_t* gfx_activity_value); - -/** - * @brief Get the 'avg_umc_activity' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'umc_activity' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] umc_activity_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_avg_umc_activity_get(uint32_t dv_ind, uint16_t* umc_activity_value); - -/** - * @brief Get the 'energy_acc' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint64_t in which - * the 'energy_acc' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] energy_acc_value a pointer to uint64_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_energy_acc_get(uint32_t dv_ind, uint64_t* energy_acc_value); - -/** - * @brief Get the 'system_clock_counter' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint64_t in which - * the 'system_clock_counter' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] system_clock_counter_value a pointer to uint64_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_system_clock_counter_get(uint32_t dv_ind, uint64_t* system_clock_counter_value); - -/** - * @brief Get the 'firmware_timestamp' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint64_t in which - * the 'firmware_timestamp' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] firmware_timestamp_value a pointer to uint64_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_firmware_timestamp_get(uint32_t dv_ind, uint64_t* firmware_timestamp_value); - -/** - * @brief Get the 'throttle_status' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint32_t in which - * the 'throttle_status' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] throttle_status_value a pointer to uint32_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_throttle_status_get(uint32_t dv_ind, uint32_t* throttle_status_value); - -/** - * @brief Get the 'pcie_link_width' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'pcie_link_width' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] pcie_link_width_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_pcie_link_width_get(uint32_t dv_ind, uint16_t* pcie_link_width_value); - -/** - * @brief Get the 'pcie_link_speed' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'pcie_link_speed' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] pcie_link_speed_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_pcie_link_speed_get(uint32_t dv_ind, uint16_t* pcie_link_speed_value); - -/** - * @brief Get the 'xgmi_link_width' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'xgmi_link_width' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] xgmi_link_width_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_xgmi_link_width_get(uint32_t dv_ind, uint16_t* xgmi_link_width_value); - -/** - * @brief Get the 'xgmi_link_speed' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'xgmi_link_speed' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] xgmi_link_speed_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_xgmi_link_speed_get(uint32_t dv_ind, uint16_t* xgmi_link_speed_value); - -/** - * @brief Get the 'gfxclk_lock_status' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint32_t in which - * the 'gfxclk_lock_status' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] gfxclk_lock_status_value a pointer to uint32_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_gfxclk_lock_status_get(uint32_t dv_ind, uint32_t* gfxclk_lock_status_value); - -/** - * @brief Get the 'gfx_activity_acc' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint32_t in which - * the 'gfx_activity_acc' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] gfx_activity_acc_value a pointer to uint32_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_gfx_activity_acc_get(uint32_t dv_ind, uint32_t* gfx_activity_acc_value); - -/** - * @brief Get the 'mem_activity_acc' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint32_t in which - * the 'mem_activity_acc' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] mem_activity_acc_value a pointer to uint32_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_mem_activity_acc_get(uint32_t dv_ind, uint32_t* mem_activity_acc_value); - -/** - * @brief Get the 'pcie_bandwidth_acc' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint64_t in which - * the 'pcie_bandwidth_acc' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] pcie_bandwidth_acc_value a pointer to uint64_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_pcie_bandwidth_acc_get(uint32_t dv_ind, uint64_t* pcie_bandwidth_acc_value); - -/** - * @brief Get the 'pcie_bandwidth_inst' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint64_t in which - * the 'pcie_bandwidth_inst' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] pcie_bandwidth_inst_value a pointer to uint64_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_pcie_bandwidth_inst_get(uint32_t dv_ind, uint64_t* pcie_bandwidth_inst_value); - -/** - * @brief Get the 'pcie_l0_recov_count_acc' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint64_t in which - * the 'pcie_l0_recov_count_acc' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] pcie_count_acc_value a pointer to uint64_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_pcie_l0_recov_count_acc_get(uint32_t dv_ind, uint64_t* pcie_count_acc_value); - -/** - * @brief Get the 'pcie_replay_count_acc' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint64_t in which - * the 'pcie_replay_count_acc' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] pcie_count_acc_value a pointer to uint64_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_pcie_replay_count_acc_get(uint32_t dv_ind, uint64_t* pcie_count_acc_value); - -/** - * @brief Get the 'pcie_replay_rover_count_acc' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint64_t in which - * the 'pcie_replay_rover_count_acc' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] pcie_count_acc_value a pointer to uint64_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_pcie_replay_rover_count_acc_get(uint32_t dv_ind, uint64_t* pcie_count_acc_value); - -/** - * @brief Get the 'curr_uclk' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'curr_uclk' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] uclk_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_curr_uclk_get(uint32_t dv_ind, uint16_t* uclk_value); - - -/****** - * Metric multi-valued counter types - */ - -/** - * @brief Get the 'temp_hbm' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'temp_hbm' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] temp_hbm_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - This is a multi-valued counter holding a 4 (RSMI_NUM_HBM_INSTANCES) - * element array (GPUMetricTempHbm_t) - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_temp_hbm_get(uint32_t dv_ind, GPUMetricTempHbm_t* temp_hbm_value); - -/** - * @brief Get the 'vcn_activity' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'vcn_activity' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] vcn_activity_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - This is a multi-valued counter holding a 4 (RSMI_MAX_NUM_VCNS) - * element array (GPUMetricVcnActivity_t) - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_vcn_activity_get(uint32_t dv_ind, GPUMetricVcnActivity_t* vcn_activity_value); - -/** - * @brief Get the 'xgmi_read_data' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint64_t in which - * the 'xgmi_read_data' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] xgmi_read_data_acc_value a pointer to uint64_t to which the device gpu - * metric unit will be stored - * - This is a multi-valued counter holding an 8 (RSMI_MAX_NUM_XGMI_LINKS) - * element array (GPUMetricXgmiReadDataAcc_t) - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_xgmi_read_data_get(uint32_t dv_ind, GPUMetricXgmiReadDataAcc_t* xgmi_read_data_acc_value); - -/** - * @brief Get the 'xgmi_write_data' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint64_t in which - * the 'xgmi_write_data' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] xgmi_write_data_acc_value a pointer to uint64_t to which the device gpu - * metric unit will be stored - * - This is a multi-valued counter holding an 8 (RSMI_MAX_NUM_XGMI_LINKS) - * element array (GPUMetricXgmiWriteDataAcc_t) - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_xgmi_write_data_get(uint32_t dv_ind, GPUMetricXgmiWriteDataAcc_t* xgmi_write_data_acc_value); - -/** - * @brief Get the 'curr_gfxclk' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint64_t in which - * the 'curr_gfxclk' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] current_gfxclk_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - This is a multi-valued counter holding an 8 (RSMI_MAX_NUM_GFX_CLKS) - * element array (GPUMetricCurrGfxClk_t) - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_curr_gfxclk_get(uint32_t dv_ind, GPUMetricCurrGfxClk_t* current_gfxclk_value); - -/** - * @brief Get the 'curr_socclk' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'curr_socclk' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] current_socclk_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - This is a multi-valued counter holding a 4 (RSMI_MAX_NUM_CLKS) - * element array (GPUMetricCurrSocClk_t) - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_curr_socclk_get(uint32_t dv_ind, GPUMetricCurrSocClk_t* current_socclk_value); - -/** - * @brief Get the 'curr_vclk0' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'curr_vclk0' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] current_vclk_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - This is a multi-valued counter holding a 4 (RSMI_MAX_NUM_CLKS) - * element array (GPUMetricCurrVClk0_t) - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_curr_vclk0_get(uint32_t dv_ind, GPUMetricCurrVClk0_t* current_vclk_value); - -/** - * @brief Get the 'curr_dclk0' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'curr_dclk0' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] current_dclk_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - This is a multi-valued counter holding a 4 (RSMI_MAX_NUM_CLKS) - * element array (GPUMetricCurrDClk0_t) - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_curr_dclk0_get(uint32_t dv_ind, GPUMetricCurrDClk0_t* current_dclk_value); - -/** - * @brief Get the 'temp_edge' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'temp_edge' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] edge_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_temp_edge_get(uint32_t dv_ind, uint16_t* edge_value); - -/** - * @brief Get the 'temp_vrgfx' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'temp_vrgfx' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] vrgfx_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_temp_vrgfx_get(uint32_t dv_ind, uint16_t* vrgfx_value); - -/** - * @brief Get the 'temp_vrmem' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'temp_vrmem' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] vrmem_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_temp_vrmem_get(uint32_t dv_ind, uint16_t* vrmem_value); - -/** - * @brief Get the 'avg_mm_activity' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'avg_mm_activity' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] mm_activity_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_avg_mm_activity_get(uint32_t dv_ind, uint16_t* mm_activity_value); - -/** - * @brief Get the 'curr_vclk1' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'curr_vclk1' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] current_vclk_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_curr_vclk1_get(uint32_t dv_ind, uint16_t* current_vclk_value); - -/** - * @brief Get the 'curr_dclk1' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'curr_dclk1' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] current_dclk_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_curr_dclk1_get(uint32_t dv_ind, uint16_t* current_dclk_value); - -/** - * @brief Get the 'indep_throttle_status' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint64_t in which - * the 'indep_throttle_status' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] throttle_status_value a pointer to uint64_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_indep_throttle_status_get(uint32_t dv_ind, uint64_t* throttle_status_value); - -/** - * @brief Get the 'avg_socket_power' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'avg_socket_power' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] socket_power_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_avg_socket_power_get(uint32_t dv_ind, uint16_t* socket_power_value); - -/** - * @brief Get the 'curr_fan_speed' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'curr_fan_speed' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] fan_speed_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_curr_fan_speed_get(uint32_t dv_ind, uint16_t* fan_speed_value); - -/** - * @brief Get the 'avg_gfx_clock_frequency' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'avg_gfx_clock_frequency' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] clock_frequency_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_avg_gfx_clock_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value); - -/** - * @brief Get the 'avg_soc_clock_frequency' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'avg_soc_clock_frequency' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] clock_frequency_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_avg_soc_clock_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value); - -/** - * @brief Get the 'avg_uclock_frequency' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'avg_uclock_frequency' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] clock_frequency_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_avg_uclock_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value); - -/** - * @brief Get the 'avg_vclock0_frequency' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'avg_vclock0_frequency' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] clock_frequency_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_avg_vclock0_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value); - -/** - * @brief Get the 'avg_dclock0_frequency' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'avg_dclock0_frequency' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] clock_frequency_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_avg_dclock0_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value); - -/** - * @brief Get the 'avg_vclock1_frequency' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'avg_vclock1_frequency' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] clock_frequency_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_avg_vclock1_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value); - -/** - * @brief Get the 'avg_dclock1_frequency' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'avg_dclock1_frequency' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] clock_frequency_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_avg_dclock1_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value); - -/** - * @brief Get the 'volt_soc' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'volt_soc' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] voltage_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_volt_soc_get(uint32_t dv_ind, uint16_t* voltage_value); - -/** - * @brief Get the 'volt_gfx' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'volt_gfx' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] voltage_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_volt_gfx_get(uint32_t dv_ind, uint16_t* voltage_value); - -/** - * @brief Get the 'volt_mem' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'volt_mem' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] voltage_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_volt_mem_get(uint32_t dv_ind, uint16_t* voltage_value); - /** * @brief Get the 'metrics_header_info' from the GPU metrics associated with the device * @@ -5896,6 +5048,7 @@ rsmi_dev_metrics_xcd_counter_get(uint32_t dv_ind, uint16_t* xcd_counter_value); rsmi_status_t rsmi_dev_metrics_log_get(uint32_t dv_ind); +/** @} */ // end of DevMetricsHeaderInfoGet #ifdef __cplusplus } diff --git a/rocm_smi/include/rocm_smi/rocm_smi_binary_parser.h b/rocm_smi/include/rocm_smi/rocm_smi_binary_parser.h index 6c48f5e7b5..4fa3aaa7f4 100644 --- a/rocm_smi/include/rocm_smi/rocm_smi_binary_parser.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_binary_parser.h @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. + * Copyright (c) 2017-2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/rocm_smi/include/rocm_smi/rocm_smi_common.h b/rocm_smi/include/rocm_smi/rocm_smi_common.h old mode 100755 new mode 100644 index f29e427789..601e2255f9 --- a/rocm_smi/include/rocm_smi/rocm_smi_common.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_common.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2018-2023, Advanced Micro Devices, Inc. + * Copyright (c) 2018-2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/rocm_smi/include/rocm_smi/rocm_smi_counters.h b/rocm_smi/include/rocm_smi/rocm_smi_counters.h old mode 100755 new mode 100644 index 091c89d90d..e3ada60097 --- a/rocm_smi/include/rocm_smi/rocm_smi_counters.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_counters.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2019, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. 
* All rights reserved. * * Developed by: diff --git a/rocm_smi/include/rocm_smi/rocm_smi_device.h b/rocm_smi/include/rocm_smi/rocm_smi_device.h old mode 100755 new mode 100644 index 426a9ad017..72fbdd8a96 --- a/rocm_smi/include/rocm_smi/rocm_smi_device.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_device.h @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. + * Copyright (c) 2017-2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/rocm_smi/include/rocm_smi/rocm_smi_exception.h b/rocm_smi/include/rocm_smi/rocm_smi_exception.h old mode 100755 new mode 100644 index 7c898fb958..847bed37db --- a/rocm_smi/include/rocm_smi/rocm_smi_exception.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_exception.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2018, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/rocm_smi/include/rocm_smi/rocm_smi_gpu_metrics.h b/rocm_smi/include/rocm_smi/rocm_smi_gpu_metrics.h index b6cccdc6a2..70067b10ae 100644 --- a/rocm_smi/include/rocm_smi/rocm_smi_gpu_metrics.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_gpu_metrics.h @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. + * Copyright (c) 2017-2024, Advanced Micro Devices, Inc. * All rights reserved. 
* * Developed by: diff --git a/rocm_smi/include/rocm_smi/rocm_smi_io_link.h b/rocm_smi/include/rocm_smi/rocm_smi_io_link.h old mode 100755 new mode 100644 index 191d5c96f2..e7bc35ebc2 --- a/rocm_smi/include/rocm_smi/rocm_smi_io_link.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_io_link.h @@ -85,7 +85,8 @@ typedef enum _LINK_DIRECTORY_TYPE { class IOLink { public: explicit IOLink(uint32_t node_indx, uint32_t link_indx, LINK_DIRECTORY_TYPE link_dir_type) : - node_indx_(node_indx), link_indx_(link_indx), link_dir_type_(link_dir_type) {} + node_indx_(node_indx), link_indx_(link_indx), link_dir_type_(link_dir_type), + link_cap_{UINT8_MAX, UINT8_MAX, UINT8_MAX, UINT8_MAX, UINT8_MAX} {} ~IOLink(); int Initialize(); @@ -96,23 +97,28 @@ class IOLink { IO_LINK_TYPE type(void) const {return type_;} uint32_t node_from(void) const {return node_from_;} uint32_t node_to(void) const {return node_to_;} + uint32_t flag(void) const {return flags_;} uint64_t weight(void) const {return weight_;} LINK_DIRECTORY_TYPE get_directory_type(void) const {return link_dir_type_;} uint64_t min_bandwidth(void) const {return min_bandwidth_;} uint64_t max_bandwidth(void) const {return max_bandwidth_;} + const rsmi_p2p_capability_t& get_link_capability(void) const {return link_cap_;} - + protected: + virtual int UpdateP2pCapability(void); private: uint32_t node_indx_; uint32_t link_indx_; IO_LINK_TYPE type_; uint32_t node_from_; uint32_t node_to_; + uint32_t flags_; uint64_t weight_; uint64_t min_bandwidth_; uint64_t max_bandwidth_; std::map properties_; LINK_DIRECTORY_TYPE link_dir_type_; + rsmi_p2p_capability_t link_cap_; }; int diff --git a/rocm_smi/include/rocm_smi/rocm_smi_kfd.h b/rocm_smi/include/rocm_smi/rocm_smi_kfd.h old mode 100755 new mode 100644 index 81a76400ce..f0d981b593 --- a/rocm_smi/include/rocm_smi/rocm_smi_kfd.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_kfd.h @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2019, Advanced Micro 
Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: @@ -94,6 +94,11 @@ class KFDNode { int32_t get_simd_per_cu(uint64_t* simd_per_cu) const; int32_t get_simd_count(uint64_t* simd_count) const; + // Get gpu_id (AKA GUID) version from kfd + int get_gpu_id(uint64_t *gpu_id); + // Get node id from kfd + int get_node_id(uint32_t *node_id); + private: uint32_t node_indx_; uint32_t amdgpu_dev_index_; diff --git a/rocm_smi/include/rocm_smi/rocm_smi_logger.h b/rocm_smi/include/rocm_smi/rocm_smi_logger.h index f83240fbf4..d42d795a28 100644 --- a/rocm_smi/include/rocm_smi/rocm_smi_logger.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_logger.h @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/rocm_smi/include/rocm_smi/rocm_smi_main.h b/rocm_smi/include/rocm_smi/rocm_smi_main.h old mode 100755 new mode 100644 index 0a66ea227c..73ee8d8bbf --- a/rocm_smi/include/rocm_smi/rocm_smi_main.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_main.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. + * Copyright (c) 2017-2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/rocm_smi/include/rocm_smi/rocm_smi_monitor.h b/rocm_smi/include/rocm_smi/rocm_smi_monitor.h old mode 100755 new mode 100644 index ad284646b3..1dbfbf6e81 --- a/rocm_smi/include/rocm_smi/rocm_smi_monitor.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_monitor.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. + * Copyright (c) 2017-2024, Advanced Micro Devices, Inc. * All rights reserved. 
* * Developed by: diff --git a/rocm_smi/include/rocm_smi/rocm_smi_power_mon.h b/rocm_smi/include/rocm_smi/rocm_smi_power_mon.h old mode 100755 new mode 100644 index 71e4c0865e..122eb19a73 --- a/rocm_smi/include/rocm_smi/rocm_smi_power_mon.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_power_mon.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2017, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/rocm_smi/include/rocm_smi/rocm_smi_properties.h b/rocm_smi/include/rocm_smi/rocm_smi_properties.h index 67d285cbbc..e260f82006 100644 --- a/rocm_smi/include/rocm_smi/rocm_smi_properties.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_properties.h @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. + * Copyright (c) 2017-2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/rocm_smi/include/rocm_smi/rocm_smi_utils.h b/rocm_smi/include/rocm_smi/rocm_smi_utils.h old mode 100755 new mode 100644 index 36261d89e6..ed560ce8ed --- a/rocm_smi/include/rocm_smi/rocm_smi_utils.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_utils.h @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2018-2023, Advanced Micro Devices, Inc. + * Copyright (c) 2018-2024, Advanced Micro Devices, Inc. * All rights reserved. 
* * Developed by: @@ -48,8 +48,11 @@ #include #include #include +#include #include +#include #include +#include #include #include #include @@ -594,6 +597,7 @@ class TagTextContents_t } } } + }; using TextFileTagContents_t = TagTextContents_t: <|*>" @@ -783,7 +783,7 @@ rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block, } ss << __PRETTY_FUNCTION__ << " | ======= end =======" - << ", reporting " << amd::smi::getRSMIStatusString(ret);; + << ", reporting " << amd::smi::getRSMIStatusString(ret); LOG_TRACE(ss); return ret; CATCH @@ -806,16 +806,30 @@ rsmi_dev_pci_id_get(uint32_t dv_ind, uint64_t *bdfid) { kfd_node->get_property_value("domain", &domain); - // Replace the 16 bit domain originally set like this: - // BDFID = (( & 0xffff) << 32) | (( & 0xff) << 8) | - // ((device& 0x1f) <<3 ) | (function & 0x7) - // with this: - // BDFID = (( & 0xffffffff) << 32) | (( & 0xff) << 8) | - // ((device& 0x1f) <<3 ) | (function & 0x7) - + /** + * Add domain to full pci_id: + * BDFID = ((DOMAIN & 0xFFFFFFFF) << 32) | ((PARTITION_ID & 0xF) << 28) | + * ((BUS & 0xFF) << 8) | ((DEVICE & 0x1F) <<3 ) | (FUNCTION & 0x7) + * + * bits [63:32] = domain + * bits [31:28] or bits [2:0] = partition id + * bits [27:16] = reserved + * bits [15:8] = Bus + * bits [7:3] = Device + * bits [2:0] = Function (partition id maybe in bits [2:0]) <-- Fallback for non SPX modes + */ assert((domain & 0xFFFFFFFF00000000) == 0); - (*bdfid) &= 0xFFFF; // Clear out the old 16 bit domain - *bdfid |= (domain & 0xFFFFFFFF) << 32; + (*bdfid) &= 0xFFFFFFFF; // keep bottom 32 bits of pci_id + *bdfid |= (domain & 0xFFFFFFFF) << 32; // Add domain to top of pci_id + uint64_t pci_id = *bdfid; + uint32_t node = UINT32_MAX; + rsmi_dev_node_id_get(dv_ind, &node); + ss << __PRETTY_FUNCTION__ << " | kfd node = " + << std::to_string(node) << "\n" + << " returning pci_id = " + << std::to_string(pci_id) << " (" + << amd::smi::print_int_as_hex(pci_id) << ")"; + LOG_INFO(ss); ss << __PRETTY_FUNCTION__ << " | ======= end 
=======" << ", reporting RSMI_STATUS_SUCCESS"; @@ -957,7 +971,7 @@ rsmi_dev_id_get(uint32_t dv_ind, uint16_t *id) { } rsmi_status_t -rsmi_dev_oam_id_get(uint32_t dv_ind, uint16_t *id) { +rsmi_dev_xgmi_physical_id_get(uint32_t dv_ind, uint16_t *id) { std::ostringstream ss; rsmi_status_t ret; ss << __PRETTY_FUNCTION__ << "| ======= start ======="; @@ -1561,6 +1575,7 @@ rsmi_status_t rsmi_dev_clk_extremum_set(uint32_t dv_ind, rsmi_freq_ind_t level, if (ret != RSMI_STATUS_SUCCESS) { return ret; } + // For clock frequency setting, enter a new value by writing a string that // contains "s/m index clock" to the file. The index should be 0 if to set // minimum clock. And 1 if to set maximum clock. E.g., "s 0 500" will update @@ -1585,7 +1600,6 @@ rsmi_status_t rsmi_dev_clk_extremum_set(uint32_t dv_ind, rsmi_freq_ind_t level, CATCH } - rsmi_status_t rsmi_dev_clk_range_set(uint32_t dv_ind, uint64_t minclkvalue, uint64_t maxclkvalue, rsmi_clk_type_t clkType) { @@ -2067,7 +2081,8 @@ rsmi_status_t rsmi_dev_process_isolation_get(uint32_t dv_ind, // the enforce_isolation sysfs is in this format // Get the partition_id. For SPX, the partition_id will be 0. 
- int partition_id = dev->get_partition_id(); + uint32_t partition_id = 0; + rsmi_dev_partition_id_get(dv_ind, &partition_id); DEVICE_MUTEX @@ -2126,7 +2141,8 @@ rsmi_status_t rsmi_dev_process_isolation_set(uint32_t dv_ind, // To set the values,need to specify the setting for all of the partitions // For two partition // echo "1 0" | sudo tee  /sys/class/drm/cardX/device/enforce_isolation - int partition_id = dev->get_partition_id(); + uint32_t partition_id = 0; + rsmi_dev_partition_id_get(dv_ind, &partition_id); std::string str_val; rsmi_status_t ret = get_dev_value_line(amd::smi::kDevProcessIsolation, dv_ind, &str_val); if (ret == RSMI_STATUS_FILE_ERROR) { @@ -5285,9 +5301,87 @@ rsmi_is_P2P_accessible(uint32_t dv_ind_src, uint32_t dv_ind_dst, CATCH } -static rsmi_status_t -get_compute_partition(uint32_t dv_ind, std::string &compute_partition) { +rsmi_status_t +rsmi_topo_get_p2p_status(uint32_t dv_ind_src, uint32_t dv_ind_dst, + RSMI_IO_LINK_TYPE *type, rsmi_p2p_capability_t *cap) { TRY + + uint32_t dv_ind = dv_ind_src; + GET_DEV_AND_KFDNODE_FROM_INDX + DEVICE_MUTEX + + if (type == nullptr || cap == nullptr) { + return RSMI_STATUS_INVALID_ARGS; + } + + // If source device is same as destination, return invalid args + if (dv_ind_src == dv_ind_dst) { + return RSMI_STATUS_INVALID_ARGS; + } + + uint32_t node_ind_src, node_ind_dst; + // Fetch the source and destination node index + if (smi.get_node_index(dv_ind_src, &node_ind_src) || + smi.get_node_index(dv_ind_dst, &node_ind_dst)) { + return RSMI_STATUS_INVALID_ARGS; + } + + bool node_is_find = false; + std::map> io_link_map_tmp; + std::map>::iterator it; + // Iterate over P2P links + if (DiscoverP2PLinksPerNode(node_ind_src, &io_link_map_tmp) == 0) { + for (it = io_link_map_tmp.begin(); it != io_link_map_tmp.end(); it++) { + if (it->first == node_ind_dst) { + node_is_find = true; + break; + } + } + io_link_map_tmp.clear(); + } else { + return RSMI_STATUS_FILE_ERROR; + } + + if (!node_is_find) { + // Iterate over IO 
links + if (DiscoverIOLinksPerNode(node_ind_src, &io_link_map_tmp) == 0) { + for (it = io_link_map_tmp.begin(); it != io_link_map_tmp.end(); it++) { + if (it->first == node_ind_dst) { + node_is_find = true; + break; + } + } + io_link_map_tmp.clear(); + } else { + return RSMI_STATUS_FILE_ERROR; + } + } + + if (node_is_find) { + amd::smi::IO_LINK_TYPE io_link_type = it->second->type(); + if (io_link_type == amd::smi::IOLINK_TYPE_PCIEXPRESS) { + *type = RSMI_IOLINK_TYPE_PCIEXPRESS; + } else if (io_link_type == amd::smi::IOLINK_TYPE_XGMI) { + *type = RSMI_IOLINK_TYPE_XGMI; + } else { + // Unexpected IO Link type read + return RSMI_STATUS_NOT_SUPPORTED; + } + *cap = it->second->get_link_capability(); + return RSMI_STATUS_SUCCESS; + } + + return RSMI_STATUS_NOT_SUPPORTED; + + CATCH +} + +static rsmi_status_t get_compute_partition(uint32_t dv_ind, + std::string &compute_partition) { + TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind; + LOG_TRACE(ss); CHK_SUPPORT_NAME_ONLY(compute_partition.c_str()) std::string compute_partition_str; @@ -5311,6 +5405,8 @@ get_compute_partition(uint32_t dv_ind, std::string &compute_partition) { return RSMI_STATUS_UNEXPECTED_DATA; } compute_partition = compute_partition_str; + ss << __PRETTY_FUNCTION__ << " | ======= END =======, " << dv_ind; + LOG_TRACE(ss); return RSMI_STATUS_SUCCESS; CATCH } @@ -5320,7 +5416,7 @@ rsmi_dev_compute_partition_get(uint32_t dv_ind, char *compute_partition, uint32_t len) { TRY std::ostringstream ss; - ss << __PRETTY_FUNCTION__ << "| ======= start =======, dv_ind = " + ss << __PRETTY_FUNCTION__ << " | ======= start =======, dv_ind = " << dv_ind; LOG_TRACE(ss); if ((len == 0) || (compute_partition == nullptr)) { @@ -5356,7 +5452,7 @@ rsmi_dev_compute_partition_get(uint32_t dv_ind, char *compute_partition, return ret; } - std::size_t length = returning_compute_partition.copy(compute_partition, len); + std::size_t length = 
returning_compute_partition.copy(compute_partition, len-1); compute_partition[length]='\0'; if (len < (returning_compute_partition.size() + 1)) { @@ -5390,20 +5486,47 @@ static rsmi_status_t is_available_compute_partition(uint32_t dv_ind, std::string new_compute_partition) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind; + LOG_TRACE(ss); DEVICE_MUTEX std::string availableComputePartitions; rsmi_status_t ret = get_dev_value_line(amd::smi::kDevAvailableComputePartition, dv_ind, &availableComputePartitions); if (ret != RSMI_STATUS_SUCCESS) { + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | FAIL " + << " | Device #: " << dv_ind + << " | Type: " + << amd::smi::Device::get_type_string(amd::smi::kDevAvailableComputePartition) + << " | Data: could not retrieve requested data" + << " | Returning = " + << getRSMIStatusString(ret) << " |"; + LOG_ERROR(ss); return ret; } bool isComputePartitionAvailable = amd::smi::containsString(availableComputePartitions, new_compute_partition); - return (isComputePartitionAvailable) ? RSMI_STATUS_SUCCESS : - RSMI_STATUS_SETTING_UNAVAILABLE; + + ret = ((isComputePartitionAvailable) ? RSMI_STATUS_SUCCESS : + RSMI_STATUS_SETTING_UNAVAILABLE); + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Device #: " << dv_ind + << " | Type: " + << amd::smi::Device::get_type_string(amd::smi::kDevAvailableComputePartition) + << " | Data: available_partitions = " << availableComputePartitions + << " | Data: isComputePartitionAvailable = " + << (isComputePartitionAvailable ? 
"True" : "False") + << " | Returning = " + << getRSMIStatusString(ret) << " |"; + LOG_INFO(ss); + return ret; CATCH } @@ -5412,16 +5535,14 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, rsmi_compute_partition_type_t compute_partition) { TRY std::ostringstream ss; - ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind; LOG_TRACE(ss); REQUIRE_ROOT_ACCESS if (!amd::smi::is_sudo_user()) { return RSMI_STATUS_PERMISSION; } - DEVICE_MUTEX - std::string newComputePartitionStr - = mapRSMIToStringComputePartitionTypes.at(compute_partition); - std::string currentComputePartition; + std::string currentComputePartition = ""; + std::string newComputePartitionStr = ""; switch (compute_partition) { case RSMI_COMPUTE_PARTITION_CPX: @@ -5429,9 +5550,13 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, case RSMI_COMPUTE_PARTITION_DPX: case RSMI_COMPUTE_PARTITION_TPX: case RSMI_COMPUTE_PARTITION_QPX: + newComputePartitionStr = + mapRSMIToStringComputePartitionTypes.at(compute_partition); break; case RSMI_COMPUTE_PARTITION_INVALID: default: + newComputePartitionStr = + mapRSMIToStringComputePartitionTypes.at(RSMI_COMPUTE_PARTITION_INVALID); ss << __PRETTY_FUNCTION__ << " | ======= end ======= " << " | Fail " @@ -5508,8 +5633,8 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, << "| sizeof string = " << std::dec << sizeof(newComputePartitionStr); LOG_DEBUG(ss); - GET_DEV_FROM_INDX + DEVICE_MUTEX int ret = dev->writeDevInfo(amd::smi::kDevComputePartition, newComputePartitionStr); rsmi_status_t returnResponse = amd::smi::ErrnoToRsmiStatus(ret); @@ -5524,7 +5649,6 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, << getRSMIStatusString(returnResponse) << " |"; LOG_TRACE(ss); - // TODO(charpoag): investigate providing GPU busy state occurred with return returnResponse; CATCH } @@ -5532,6 +5656,9 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, static rsmi_status_t get_memory_partition(uint32_t dv_ind, 
std::string &memory_partition) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind; + LOG_TRACE(ss); CHK_SUPPORT_NAME_ONLY(memory_partition.c_str()) std::string val_str; @@ -5555,6 +5682,8 @@ static rsmi_status_t get_memory_partition(uint32_t dv_ind, return RSMI_STATUS_UNEXPECTED_DATA; } memory_partition = val_str; + ss << __PRETTY_FUNCTION__ << " | ======= END =======, " << dv_ind; + LOG_TRACE(ss); return RSMI_STATUS_SUCCESS; CATCH } @@ -5564,7 +5693,7 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind, rsmi_memory_partition_type_t memory_partition) { TRY std::ostringstream ss; - ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind; LOG_TRACE(ss); REQUIRE_ROOT_ACCESS DEVICE_MUTEX @@ -5699,7 +5828,7 @@ rsmi_dev_memory_partition_get(uint32_t dv_ind, char *memory_partition, uint32_t len) { TRY std::ostringstream ss; - ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind; LOG_TRACE(ss); if ((len == 0) || (memory_partition == nullptr)) { ss << __PRETTY_FUNCTION__ @@ -5769,7 +5898,7 @@ rsmi_dev_memory_partition_get(uint32_t dv_ind, char *memory_partition, rsmi_status_t rsmi_dev_compute_partition_reset(uint32_t dv_ind) { TRY std::ostringstream ss; - ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind; LOG_TRACE(ss); REQUIRE_ROOT_ACCESS DEVICE_MUTEX @@ -5808,7 +5937,7 @@ rsmi_status_t rsmi_dev_compute_partition_reset(uint32_t dv_ind) { rsmi_status_t rsmi_dev_memory_partition_reset(uint32_t dv_ind) { TRY std::ostringstream ss; - ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + ss << __PRETTY_FUNCTION__ << "| ======= start =======, " << dv_ind; LOG_TRACE(ss); REQUIRE_ROOT_ACCESS DEVICE_MUTEX @@ -5844,6 +5973,168 @@ rsmi_status_t rsmi_dev_memory_partition_reset(uint32_t dv_ind) { CATCH } +rsmi_status_t 
+rsmi_dev_partition_id_get(uint32_t dv_ind, uint32_t *partition_id) { + TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start =======, " << dv_ind; + LOG_TRACE(ss); + if (partition_id == nullptr) { + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | FAIL" + << " | Device #: " << dv_ind + << " | Type: partition_id" + << " | Data: nullptr" + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_INVALID_ARGS) << " |"; + LOG_ERROR(ss); + return RSMI_STATUS_INVALID_ARGS; + } + DEVICE_MUTEX + std::string strCompPartition = "UNKNOWN"; + const uint32_t PARTITION_LEN = 10; + char compute_partition[PARTITION_LEN]; + rsmi_status_t ret = rsmi_dev_compute_partition_get(dv_ind, compute_partition, PARTITION_LEN); + if (ret == RSMI_STATUS_SUCCESS) { + strCompPartition = compute_partition; + } + uint64_t pci_id = UINT64_MAX; + *partition_id = UINT32_MAX; + ret = rsmi_dev_pci_id_get(dv_ind, &pci_id); + if (ret == RSMI_STATUS_SUCCESS) { + *partition_id = static_cast((pci_id >> 28) & 0xf); + } + + /** + * Fall back is required due to driver changes within KFD. + * Some devices may report bits [31:28] or [2:0]. + * With the newly added rsmi_dev_partition_id_get(..), + * we provided this fallback to properly retrieve the partition ID. We + * plan to eventually remove partition ID from the function portion of the + * BDF (Bus Device Function). See below for PCI ID description. 
+ * + * bits [63:32] = domain + * bits [31:28] or bits [2:0] = partition id + * bits [27:16] = reserved + * bits [15:8] = Bus + * bits [7:3] = Device + * bits [2:0] = Function (partition id maybe in bits [2:0]) <-- Fallback for non SPX modes + */ + if (*partition_id != UINT32_MAX && *partition_id == 0 && + (strCompPartition == "DPX" || strCompPartition == "TPX" + || strCompPartition == "CPX" || strCompPartition == "QPX")) { + *partition_id = static_cast(pci_id & 0x7); + } + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success" + << " | Device #: " << dv_ind + << " | Type: partition_id" + << " | Data: " << *partition_id + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_SUCCESS) << " |"; + LOG_INFO(ss); + return ret; + CATCH +} + +rsmi_status_t rsmi_dev_target_graphics_version_get(uint32_t dv_ind, + uint64_t *gfx_version) { + TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start =======" + << " | Device #: " << dv_ind; + LOG_TRACE(ss); + rsmi_status_t ret = RSMI_STATUS_NOT_SUPPORTED; + std::string version = ""; + const uint64_t undefined_gfx_version = std::numeric_limits::max(); + if (gfx_version == nullptr) { + ret = RSMI_STATUS_INVALID_ARGS; + } else { + *gfx_version = undefined_gfx_version; + ret = amd::smi::rsmi_get_gfx_target_version(dv_ind , &version); + } + if (ret == RSMI_STATUS_SUCCESS) { + version = amd::smi::removeString(version, "gfx"); + *gfx_version = std::stoull(version); + } + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Returning: " << getRSMIStatusString(ret, false) + << " | Device #: " << dv_ind + << " | Type: Target_graphics_version" + << " | Data: " + << ((gfx_version == nullptr) ? 
"nullptr" : + amd::smi::print_unsigned_hex_and_int(*gfx_version)); + LOG_TRACE(ss); + return ret; + CATCH +} + +rsmi_status_t rsmi_dev_guid_get(uint32_t dv_ind, uint64_t *guid) { + TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start =======" + << " | Device #: " << dv_ind; + LOG_TRACE(ss); + GET_DEV_AND_KFDNODE_FROM_INDX + uint64_t kgd_gpu_id = 0; + rsmi_status_t resp = RSMI_STATUS_NOT_SUPPORTED; + int ret = kfd_node->KFDNode::get_gpu_id(&kgd_gpu_id); + resp = amd::smi::ErrnoToRsmiStatus(ret); + + if (guid == nullptr) { + resp = RSMI_STATUS_INVALID_ARGS; + } else { + *guid = kgd_gpu_id; + } + + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Returning: " << getRSMIStatusString(resp, false) + << " | Device #: " << dv_ind + << " | Type: GUID (gpu_id)" + << " | Data: " << ((guid == nullptr) ? "nullptr" : + amd::smi::print_unsigned_hex_and_int(*guid)); + LOG_INFO(ss); + return resp; + CATCH +} + +rsmi_status_t rsmi_dev_node_id_get(uint32_t dv_ind, uint32_t *node_id) { + TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start =======" + << " | Device #: " << dv_ind; + LOG_TRACE(ss); + GET_DEV_AND_KFDNODE_FROM_INDX + uint32_t kfd_node_id = std::numeric_limits::max(); + rsmi_status_t resp = RSMI_STATUS_NOT_SUPPORTED; + int ret = kfd_node->KFDNode::get_node_id(&kfd_node_id); + resp = amd::smi::ErrnoToRsmiStatus(ret); + + if (node_id == nullptr) { + resp = RSMI_STATUS_INVALID_ARGS; + } else { + *node_id = kfd_node_id; + if (kfd_node_id == std::numeric_limits::max()) { + resp = RSMI_STATUS_NOT_SUPPORTED; + } + } + + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Returning: " << getRSMIStatusString(resp, false) + << " | Device #: " << dv_ind + << " | Type: node_id" + << " | Data: " << ((node_id == nullptr) ? 
"nullptr" : + amd::smi::print_unsigned_hex_and_int(*node_id)); + LOG_INFO(ss); + return resp; + CATCH +} + enum iterator_handle_type { FUNC_ITER = 0, VARIANT_ITER, @@ -6304,1455 +6595,6 @@ rsmi_status_t rsmi_event_notification_stop(uint32_t dv_ind) { CATCH } -// -// NOTE: APIs related to new 'GPU Metrics' related work are added here -// so they can be used/tested. -// -rsmi_status_t -rsmi_dev_metrics_temp_edge_get(uint32_t dv_ind, uint16_t* edge_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(edge_value != nullptr); - if (edge_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTempEdge); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *edge_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_temp_hotspot_get(uint32_t dv_ind, uint16_t* hotspot_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(hotspot_value != nullptr); - if (hotspot_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTempHotspot); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *hotspot_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - 
LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_temp_mem_get(uint32_t dv_ind, uint16_t* mem_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(mem_value != nullptr); - if (mem_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTempMem); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *mem_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_temp_vrgfx_get(uint32_t dv_ind, uint16_t* vrgfx_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(vrgfx_value != nullptr); - if (vrgfx_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTempVrGfx); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *vrgfx_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_temp_vrsoc_get(uint32_t dv_ind, uint16_t* vrsoc_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(vrsoc_value != nullptr); - if (vrsoc_value 
== nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTempVrSoc); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *vrsoc_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_temp_vrmem_get(uint32_t dv_ind, uint16_t* vrmem_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(vrmem_value != nullptr); - if (vrmem_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTempVrMem); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *vrmem_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_curr_socket_power_get(uint32_t dv_ind, uint16_t* socket_power_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(socket_power_value != nullptr); - if (socket_power_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrSocketPower); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *socket_power_value); - ostrstream << __PRETTY_FUNCTION__ - << " | 
======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_avg_socket_power_get(uint32_t dv_ind, uint16_t* socket_power_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(socket_power_value != nullptr); - if (socket_power_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricAvgSocketPower); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *socket_power_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_avg_gfx_activity_get(uint32_t dv_ind, uint16_t* gfx_activity_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(gfx_activity_value != nullptr); - if (gfx_activity_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricAvgGfxActivity); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *gfx_activity_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); 
- - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_avg_umc_activity_get(uint32_t dv_ind, uint16_t* umc_activity_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(umc_activity_value != nullptr); - if (umc_activity_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricAvgUmcActivity); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *umc_activity_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_avg_mm_activity_get(uint32_t dv_ind, uint16_t* mm_activity_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(mm_activity_value != nullptr); - if (mm_activity_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricAvgMmActivity); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *mm_activity_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_energy_acc_get(uint32_t dv_ind, uint64_t* energy_acc_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - 
LOG_TRACE(ostrstream); - - assert(energy_acc_value != nullptr); - if (energy_acc_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricEnergyAccumulator); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *energy_acc_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_system_clock_counter_get(uint32_t dv_ind, uint64_t* system_clock_counter_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(system_clock_counter_value != nullptr); - if (system_clock_counter_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTSClockCounter); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *system_clock_counter_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_firmware_timestamp_get(uint32_t dv_ind, uint64_t* firmware_timestamp_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(firmware_timestamp_value != nullptr); - if (firmware_timestamp_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto 
gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTSFirmware); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *firmware_timestamp_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_indep_throttle_status_get(uint32_t dv_ind, uint64_t* throttle_status_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(throttle_status_value != nullptr); - if (throttle_status_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricIndepThrottleStatus); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *throttle_status_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_throttle_status_get(uint32_t dv_ind, uint32_t* throttle_status_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(throttle_status_value != nullptr); - if (throttle_status_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricThrottleStatus); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *throttle_status_value); - ostrstream << __PRETTY_FUNCTION__ - << " | 
======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_curr_fan_speed_get(uint32_t dv_ind, uint16_t* fan_speed_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(fan_speed_value != nullptr); - if (fan_speed_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrFanSpeed); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *fan_speed_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_pcie_link_width_get(uint32_t dv_ind, uint16_t* pcie_link_width_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(pcie_link_width_value != nullptr); - if (pcie_link_width_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricPcieLinkWidth); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *pcie_link_width_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - 
return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_pcie_link_speed_get(uint32_t dv_ind, uint16_t* pcie_link_speed_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(pcie_link_speed_value != nullptr); - if (pcie_link_speed_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricPcieLinkSpeed); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *pcie_link_speed_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_xgmi_link_width_get(uint32_t dv_ind, uint16_t* xgmi_link_width_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(xgmi_link_width_value != nullptr); - if (xgmi_link_width_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricXgmiLinkWidth); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *xgmi_link_width_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_xgmi_link_speed_get(uint32_t dv_ind, uint16_t* xgmi_link_speed_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| 
======= start ======="; - LOG_TRACE(ostrstream); - - assert(xgmi_link_speed_value != nullptr); - if (xgmi_link_speed_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricXgmiLinkSpeed); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *xgmi_link_speed_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_gfxclk_lock_status_get(uint32_t dv_ind, uint32_t* gfxclk_lock_status_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(gfxclk_lock_status_value != nullptr); - if (gfxclk_lock_status_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricGfxClkLockStatus); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *gfxclk_lock_status_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_gfx_activity_acc_get(uint32_t dv_ind, uint32_t* gfx_activity_acc_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(gfx_activity_acc_value != nullptr); - if (gfx_activity_acc_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const 
auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricGfxActivityAccumulator); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *gfx_activity_acc_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_mem_activity_acc_get(uint32_t dv_ind, uint32_t* mem_activity_acc_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(mem_activity_acc_value != nullptr); - if (mem_activity_acc_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricMemActivityAccumulator); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *mem_activity_acc_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_pcie_bandwidth_acc_get(uint32_t dv_ind, uint64_t* pcie_bandwidth_acc_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(pcie_bandwidth_acc_value != nullptr); - if (pcie_bandwidth_acc_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricPcieBandwidthAccumulator); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *pcie_bandwidth_acc_value); - 
ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_pcie_bandwidth_inst_get(uint32_t dv_ind, uint64_t* pcie_bandwidth_inst_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(pcie_bandwidth_inst_value != nullptr); - if (pcie_bandwidth_inst_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricPcieBandwidthInst); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *pcie_bandwidth_inst_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_pcie_l0_recov_count_acc_get(uint32_t dv_ind, uint64_t* pcie_count_acc_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(pcie_count_acc_value != nullptr); - if (pcie_count_acc_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricPcieL0RecovCountAccumulator); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *pcie_count_acc_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | 
Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_pcie_replay_count_acc_get(uint32_t dv_ind, uint64_t* pcie_count_acc_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(pcie_count_acc_value != nullptr); - if (pcie_count_acc_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricPcieReplayCountAccumulator); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *pcie_count_acc_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_pcie_replay_rover_count_acc_get(uint32_t dv_ind, uint64_t* pcie_count_acc_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(pcie_count_acc_value != nullptr); - if (pcie_count_acc_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricPcieReplayRollOverCountAccumulator); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *pcie_count_acc_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t 
-rsmi_dev_metrics_curr_uclk_get(uint32_t dv_ind, uint16_t* uclk_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(uclk_value != nullptr); - if (uclk_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrUClock); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *uclk_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_temp_hbm_get(uint32_t dv_ind, GPUMetricTempHbm_t* temp_hbm_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(temp_hbm_value != nullptr); - if (temp_hbm_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTempHbm); - amd::smi::GPUMetricTempHbmTbl_t tmp_hbl_tbl{}; - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_hbl_tbl); - const auto max_num_elems = - static_cast(std::end(*temp_hbm_value) - std::begin(*temp_hbm_value)); - const auto copy_size = - static_cast((max_num_elems < tmp_hbl_tbl.size()) ? 
max_num_elems : tmp_hbl_tbl.size()); - ostrstream << __PRETTY_FUNCTION__ - << "\n | ======= end ======= " - << "\n | End Result " - << "\n | Device #: " << dv_ind - << "\n | Metric Type: " << static_cast(gpu_metric_unit) - << "\n | Metric Size: " << tmp_hbl_tbl.size() - << "\n | Max num of elements: " << max_num_elems - << "\n | Copy size: " << copy_size - << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - std::memset(temp_hbm_value, 0, sizeof(*temp_hbm_value)); - std::copy_n(std::begin(tmp_hbl_tbl), copy_size, *temp_hbm_value); - } - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_vcn_activity_get(uint32_t dv_ind, GPUMetricVcnActivity_t* vcn_activity_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(vcn_activity_value != nullptr); - if (vcn_activity_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricVcnActivity); - amd::smi::GPUMetricVcnActivityTbl_t tmp_vcn_tbl{}; - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_vcn_tbl); - const auto max_num_elems = - static_cast(std::end(*vcn_activity_value) - std::begin(*vcn_activity_value)); - const auto copy_size = - static_cast((max_num_elems < tmp_vcn_tbl.size()) ? 
max_num_elems : tmp_vcn_tbl.size()); - ostrstream << __PRETTY_FUNCTION__ - << "\n | ======= end ======= " - << "\n | End Result " - << "\n | Device #: " << dv_ind - << "\n | Metric Type: " << static_cast(gpu_metric_unit) - << "\n | Metric Size: " << tmp_vcn_tbl.size() - << "\n | Max num of elements: " << max_num_elems - << "\n | Copy size: " << copy_size - << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - std::memset(vcn_activity_value, 0, sizeof(*vcn_activity_value)); - std::copy_n(std::begin(tmp_vcn_tbl), copy_size, *vcn_activity_value); - } - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_xgmi_read_data_get(uint32_t dv_ind, GPUMetricXgmiReadDataAcc_t* xgmi_read_data_acc_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(xgmi_read_data_acc_value != nullptr); - if (xgmi_read_data_acc_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricXgmiReadDataAccumulator); - amd::smi::GPUMetricXgmiAccTbl_t tmp_xgmi_acc_tbl{}; - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_xgmi_acc_tbl); - const auto max_num_elems = - static_cast(std::end(*xgmi_read_data_acc_value) - std::begin(*xgmi_read_data_acc_value)); - const auto copy_size = - static_cast((max_num_elems < tmp_xgmi_acc_tbl.size()) ? 
max_num_elems : tmp_xgmi_acc_tbl.size()); - ostrstream << __PRETTY_FUNCTION__ - << "\n | ======= end ======= " - << "\n | End Result " - << "\n | Device #: " << dv_ind - << "\n | Metric Type: " << static_cast(gpu_metric_unit) - << "\n | Metric Size: " << tmp_xgmi_acc_tbl.size() - << "\n | Max num of elements: " << max_num_elems - << "\n | Copy size: " << copy_size - << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - std::memset(xgmi_read_data_acc_value, 0, sizeof(*xgmi_read_data_acc_value)); - std::copy_n(std::begin(tmp_xgmi_acc_tbl), copy_size, *xgmi_read_data_acc_value); - } - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_xgmi_write_data_get(uint32_t dv_ind, GPUMetricXgmiWriteDataAcc_t* xgmi_write_data_acc_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(xgmi_write_data_acc_value != nullptr); - if (xgmi_write_data_acc_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricXgmiWriteDataAccumulator); - amd::smi::GPUMetricXgmiAccTbl_t tmp_xgmi_acc_tbl{}; - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_xgmi_acc_tbl); - const auto max_num_elems = - static_cast(std::end(*xgmi_write_data_acc_value) - std::begin(*xgmi_write_data_acc_value)); - const auto copy_size = - static_cast((max_num_elems < tmp_xgmi_acc_tbl.size()) ? 
max_num_elems : tmp_xgmi_acc_tbl.size()); - ostrstream << __PRETTY_FUNCTION__ - << "\n | ======= end ======= " - << "\n | End Result " - << "\n | Device #: " << dv_ind - << "\n | Metric Type: " << static_cast(gpu_metric_unit) - << "\n | Metric Size: " << tmp_xgmi_acc_tbl.size() - << "\n | Max num of elements: " << max_num_elems - << "\n | Copy size: " << copy_size - << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - std::memset(xgmi_write_data_acc_value, 0, sizeof(*xgmi_write_data_acc_value)); - std::copy_n(std::begin(tmp_xgmi_acc_tbl), copy_size, *xgmi_write_data_acc_value); - } - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_curr_gfxclk_get(uint32_t dv_ind, GPUMetricCurrGfxClk_t* current_gfxclk_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(current_gfxclk_value != nullptr); - if (current_gfxclk_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrGfxClock); - amd::smi::GPUMetricCurrGfxClkTbl_t tmp_curr_gfxclk_tbl{}; - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_curr_gfxclk_tbl); - const auto max_num_elems = - static_cast(std::end(*current_gfxclk_value) - std::begin(*current_gfxclk_value)); - const auto copy_size = - static_cast((max_num_elems < tmp_curr_gfxclk_tbl.size()) ? 
max_num_elems : tmp_curr_gfxclk_tbl.size()); - ostrstream << __PRETTY_FUNCTION__ - << "\n | ======= end ======= " - << "\n | End Result " - << "\n | Device #: " << dv_ind - << "\n | Metric Type: " << static_cast(gpu_metric_unit) - << "\n | Metric Size: " << tmp_curr_gfxclk_tbl.size() - << "\n | Max num of elements: " << max_num_elems - << "\n | Copy size: " << copy_size - << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - std::memset(current_gfxclk_value, 0, sizeof(*current_gfxclk_value)); - std::copy_n(std::begin(tmp_curr_gfxclk_tbl), copy_size, *current_gfxclk_value); - } - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_curr_socclk_get(uint32_t dv_ind, GPUMetricCurrSocClk_t* current_socclk_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(current_socclk_value != nullptr); - if (current_socclk_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrSocClock); - amd::smi::GPUMetricCurrSocClkTbl_t tmp_curr_socclk_tbl{}; - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_curr_socclk_tbl); - const auto max_num_elems = - static_cast(std::end(*current_socclk_value) - std::begin(*current_socclk_value)); - const auto copy_size = - static_cast((max_num_elems < tmp_curr_socclk_tbl.size()) ? 
max_num_elems : tmp_curr_socclk_tbl.size()); - ostrstream << __PRETTY_FUNCTION__ - << "\n | ======= end ======= " - << "\n | End Result " - << "\n | Device #: " << dv_ind - << "\n | Metric Type: " << static_cast(gpu_metric_unit) - << "\n | Metric Size: " << tmp_curr_socclk_tbl.size() - << "\n | Max num of elements: " << max_num_elems - << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - std::memset(current_socclk_value, 0, sizeof(*current_socclk_value)); - std::copy_n(std::begin(tmp_curr_socclk_tbl), copy_size, *current_socclk_value); - } - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_curr_vclk0_get(uint32_t dv_ind, GPUMetricCurrVClk0_t* current_vclk_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(current_vclk_value != nullptr); - if (current_vclk_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrVClock0); - amd::smi::GPUMetricCurrVClkTbl_t tmp_curr_vclk0_tbl{}; - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_curr_vclk0_tbl); - const auto max_num_elems = - static_cast(std::end(*current_vclk_value) - std::begin(*current_vclk_value)); - const auto copy_size = - static_cast((max_num_elems < tmp_curr_vclk0_tbl.size()) ? 
max_num_elems : tmp_curr_vclk0_tbl.size()); - ostrstream << __PRETTY_FUNCTION__ - << "\n | ======= end ======= " - << "\n | End Result " - << "\n | Device #: " << dv_ind - << "\n | Metric Type: " << static_cast(gpu_metric_unit) - << "\n | Metric Size: " << tmp_curr_vclk0_tbl.size() - << "\n | Max num of elements: " << max_num_elems - << "\n | Copy size: " << copy_size - << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - std::memset(current_vclk_value, 0, sizeof(*current_vclk_value)); - std::copy_n(std::begin(tmp_curr_vclk0_tbl), copy_size, *current_vclk_value); - } - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_curr_vclk1_get(uint32_t dv_ind, uint16_t* current_vclk_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(current_vclk_value != nullptr); - if (current_vclk_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrVClock1); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *current_vclk_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_curr_dclk0_get(uint32_t dv_ind, GPUMetricCurrDClk0_t* current_dclk_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(current_dclk_value != nullptr); - if (current_dclk_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto 
gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrDClock0); - amd::smi::GPUMetricCurrDClkTbl_t tmp_curr_dclk0_tbl; - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_curr_dclk0_tbl); - const auto max_num_elems = - static_cast(std::end(*current_dclk_value) - std::begin(*current_dclk_value)); - const auto copy_size = - static_cast((max_num_elems < tmp_curr_dclk0_tbl.size()) ? max_num_elems : tmp_curr_dclk0_tbl.size()); - ostrstream << __PRETTY_FUNCTION__ - << "\n | ======= end ======= " - << "\n | End Result " - << "\n | Device #: " << dv_ind - << "\n | Metric Type: " << static_cast(gpu_metric_unit) - << "\n | Metric Size: " << tmp_curr_dclk0_tbl.size() - << "\n | Max num of elements: " << max_num_elems - << "\n | Copy size: " << copy_size - << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - std::memset(current_dclk_value, 0, sizeof(*current_dclk_value)); - std::copy_n(std::begin(tmp_curr_dclk0_tbl), copy_size, *current_dclk_value); - } - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_curr_dclk1_get(uint32_t dv_ind, uint16_t* current_dclk_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(current_dclk_value != nullptr); - if (current_dclk_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrDClock1); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *current_dclk_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - 
CATCH -} - -rsmi_status_t -rsmi_dev_metrics_avg_gfx_clock_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(clock_frequency_value != nullptr); - if (clock_frequency_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricAvgGfxClockFrequency); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *clock_frequency_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_avg_soc_clock_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(clock_frequency_value != nullptr); - if (clock_frequency_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricAvgSocClockFrequency); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *clock_frequency_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_avg_uclock_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << 
__PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(clock_frequency_value != nullptr); - if (clock_frequency_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricAvgUClockFrequency); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *clock_frequency_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_avg_vclock0_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(clock_frequency_value != nullptr); - if (clock_frequency_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricAvgVClock0Frequency); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *clock_frequency_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_avg_dclock0_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(clock_frequency_value != nullptr); - if (clock_frequency_value == nullptr) { - return 
rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricAvgDClock0Frequency); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *clock_frequency_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_avg_vclock1_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(clock_frequency_value != nullptr); - if (clock_frequency_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricAvgVClock1Frequency); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *clock_frequency_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_avg_dclock1_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(clock_frequency_value != nullptr); - if (clock_frequency_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricAvgDClock1Frequency); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, 
gpu_metric_unit, *clock_frequency_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_volt_soc_get(uint32_t dv_ind, uint16_t* voltage_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(voltage_value != nullptr); - if (voltage_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricVoltageSoc); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *voltage_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_volt_gfx_get(uint32_t dv_ind, uint16_t* voltage_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(voltage_value != nullptr); - if (voltage_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricVoltageGfx); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *voltage_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - 
LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_volt_mem_get(uint32_t dv_ind, uint16_t* voltage_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(voltage_value != nullptr); - if (voltage_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricVoltageMem); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *voltage_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - rsmi_status_t rsmi_dev_metrics_header_info_get(uint32_t dv_ind, metrics_table_header_t* header_value) { @@ -7796,10 +6638,13 @@ rsmi_dev_metrics_xcd_counter_get(uint32_t dv_ind, uint16_t* xcd_counter_value) } auto xcd_counter = uint16_t(0); - GPUMetricCurrGfxClk_t curr_gfxclk_table{}; - auto status_code = rsmi_dev_metrics_curr_gfxclk_get(dv_ind, &curr_gfxclk_table); + rsmi_gpu_metrics_t gpu_metrics; + auto status_code = rsmi_dev_gpu_metrics_info_get(dv_ind, &gpu_metrics); if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - for (const auto& gfxclk : curr_gfxclk_table) { + for (const auto& gfxclk : gpu_metrics.current_gfxclks) { + if (gfxclk == UINT16_MAX) { + break; + } if ((gfxclk != 0) && (gfxclk != UINT16_MAX)) { xcd_counter++; } @@ -7841,10 +6686,6 @@ rsmi_dev_metrics_log_get(uint32_t dv_ind) CATCH } -// -// End of: new GPU Metrics related work. -// - // UNDOCUMENTED FUNCTIONS // This functions are not declared in rocm_smi.h. 
They are either not fully diff --git a/rocm_smi/src/rocm_smi64Config.in b/rocm_smi/src/rocm_smi64Config.in old mode 100755 new mode 100644 index a3b26311d5..c593bc9fcc --- a/rocm_smi/src/rocm_smi64Config.in +++ b/rocm_smi/src/rocm_smi64Config.in @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2017, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/rocm_smi/src/rocm_smi_binary_parser.cc b/rocm_smi/src/rocm_smi_binary_parser.cc old mode 100755 new mode 100644 index 805be841c2..be4569bb2a --- a/rocm_smi/src/rocm_smi_binary_parser.cc +++ b/rocm_smi/src/rocm_smi_binary_parser.cc @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. + * Copyright (c) 2017-2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/rocm_smi/src/rocm_smi_counters.cc b/rocm_smi/src/rocm_smi_counters.cc old mode 100755 new mode 100644 index a08819568e..2f7e60c13d --- a/rocm_smi/src/rocm_smi_counters.cc +++ b/rocm_smi/src/rocm_smi_counters.cc @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2019, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/rocm_smi/src/rocm_smi_device.cc b/rocm_smi/src/rocm_smi_device.cc old mode 100755 new mode 100644 index e0ebe8a055..c32c81f156 --- a/rocm_smi/src/rocm_smi_device.cc +++ b/rocm_smi/src/rocm_smi_device.cc @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. + * Copyright (c) 2017-2024, Advanced Micro Devices, Inc. * All rights reserved. 
* * Developed by: @@ -490,7 +490,7 @@ static const std::map kDevFuncDependsMap = { // Functions with only mandatory dependencies {"rsmi_dev_vram_vendor_get", {{kDevVramVendorFName}, {}}}, {"rsmi_dev_id_get", {{kDevDevIDFName}, {}}}, - {"rsmi_dev_oam_id_get", {{kDevXGMIPhysicalIDFName}, {}}}, + {"rsmi_dev_xgmi_physical_id_get", {{kDevXGMIPhysicalIDFName}, {}}}, {"rsmi_dev_revision_get", {{kDevDevRevIDFName}, {}}}, {"rsmi_dev_vendor_id_get", {{kDevVendorIDFName}, {}}}, {"rsmi_dev_name_get", {{kDevVendorIDFName, diff --git a/rocm_smi/src/rocm_smi_gpu_metrics.cc b/rocm_smi/src/rocm_smi_gpu_metrics.cc old mode 100755 new mode 100644 index f6d7f80e5f..3bc078216d --- a/rocm_smi/src/rocm_smi_gpu_metrics.cc +++ b/rocm_smi/src/rocm_smi_gpu_metrics.cc @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. + * Copyright (c) 2017-2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: @@ -453,7 +453,7 @@ AMDGpuDynamicMetricTblValues_t format_metric_row(const T& metric, const std::str value = (metric); } - auto amdgpu_dynamic_metric_value = [&]() { + auto amdgpu_dynamic_metric_value = [&, data_type=data_type]() { AMDGpuDynamicMetricsValue_t amdgpu_dynamic_metric_value_init{}; amdgpu_dynamic_metric_value_init.m_value = value; amdgpu_dynamic_metric_value_init.m_info = (value_title + " : " + std::to_string(idx)); diff --git a/rocm_smi/src/rocm_smi_io_link.cc b/rocm_smi/src/rocm_smi_io_link.cc old mode 100755 new mode 100644 index da3795fafa..93b4ae238a --- a/rocm_smi/src/rocm_smi_io_link.cc +++ b/rocm_smi/src/rocm_smi_io_link.cc @@ -57,6 +57,15 @@ #include "rocm_smi/rocm_smi_utils.h" #include "rocm_smi/rocm_smi_io_link.h" + +#define CRAT_IOLINK_FLAGS_ENABLED (1 << 0) +#define CRAT_IOLINK_FLAGS_NON_COHERENT (1 << 1) +#define CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT (1 << 2) +#define CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT (1 << 3) +#define CRAT_IOLINK_FLAGS_NO_PEER_TO_PEER_DMA (1 << 4) 
+#define CRAT_IOLINK_FLAGS_BI_DIRECTIONAL (1 << 31) +#define CRAT_IOLINK_FLAGS_RESERVED_MASK 0x7fffffe0 + namespace amd { namespace smi { @@ -76,7 +85,7 @@ static const char *kIOLinkPropMIN_BANDWIDTHStr = "min_bandwidth"; static const char *kIOLinkPropMAX_BANDWIDTHStr = "max_bandwidth"; // static const char *kIOLinkPropRECOMMENDED_TRANSFER_SIZEStr = // "recommended_transfer_size"; -// static const char *kIOLinkPropFLAGSStr = "flags"; +static const char *kIOLinkPropFLAGSStr = "flags"; static bool is_number(const std::string &s) { return !s.empty() && std::all_of(s.begin(), s.end(), ::isdigit); @@ -380,6 +389,12 @@ IOLink::Initialize(void) { ret = get_property_value(kIOLinkPropWEIGHTStr, &weight_); if (ret) {return ret;} + ret = get_property_value(kIOLinkPropFLAGSStr, reinterpret_cast(&flags_)); + if (ret) {return ret;} + + ret = UpdateP2pCapability(); + if (ret) {return ret;} + ret = get_property_value(kIOLinkPropMIN_BANDWIDTHStr, &min_bandwidth_); if (ret) {return ret;} @@ -401,5 +416,31 @@ IOLink::get_property_value(std::string property, uint64_t *value) { return 0; } +int IOLink::UpdateP2pCapability(void) { + const uint8_t cap_true = 1; + const uint8_t cap_false = 0; + + if (!(flags_ & CRAT_IOLINK_FLAGS_ENABLED)) { + return 0; + } + + link_cap_.is_iolink_coherent = + (flags_ & CRAT_IOLINK_FLAGS_NON_COHERENT) ? cap_false : cap_true; + + link_cap_.is_iolink_atomics_32bit = + (flags_ & CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT) ? cap_false : cap_true; + + link_cap_.is_iolink_atomics_64bit = + (flags_ & CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT) ? cap_false : cap_true; + + link_cap_.is_iolink_bi_directional = + (flags_ & CRAT_IOLINK_FLAGS_BI_DIRECTIONAL) ? cap_true : cap_false; + + link_cap_.is_iolink_dma = + (flags_ & CRAT_IOLINK_FLAGS_NO_PEER_TO_PEER_DMA) ? 
cap_false : cap_true; + + return 0; +} + } // namespace smi } // namespace amd diff --git a/rocm_smi/src/rocm_smi_kfd.cc b/rocm_smi/src/rocm_smi_kfd.cc old mode 100755 new mode 100644 index b319a1fcd4..df421a3d9c --- a/rocm_smi/src/rocm_smi_kfd.cc +++ b/rocm_smi/src/rocm_smi_kfd.cc @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2019, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: @@ -434,6 +434,13 @@ int GetProcessGPUs(uint32_t pid, std::unordered_set *gpu_set) { return 0; } +static int CheckValidProcessInfoData(const std::string& s, int sysfs_ret){ + if(sysfs_ret==0 && !is_number(s)){ + return EINVAL; + } + return sysfs_ret; +} + int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc, std::unordered_set *gpu_set) { assert(proc != nullptr); @@ -483,30 +490,31 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc, vram_str_path += std::to_string(gpu_id); err = ReadSysfsStr(vram_str_path, &tmp); - if (err) { - return err; - } + auto sysfs_data_errcode = CheckValidProcessInfoData(tmp, err); - if (!is_number(tmp)) { - return EINVAL; + // Report all errors, except ENOENT (2), which should be ignored + // and the proc->vram_usage should be unmodified + if (!(sysfs_data_errcode == 0 || sysfs_data_errcode == ENOENT)){ + return sysfs_data_errcode; + } + // Do not store any invalid values + else if (sysfs_data_errcode == 0) { + proc->vram_usage += std::stoull(tmp); } - - proc->vram_usage += std::stoull(tmp); std::string sdma_str_path = proc_str_path; sdma_str_path += "/sdma_"; sdma_str_path += std::to_string(gpu_id); err = ReadSysfsStr(sdma_str_path, &tmp); - if (err) { - return err; - } + sysfs_data_errcode = CheckValidProcessInfoData(tmp, err); - if (!is_number(tmp)) { - return EINVAL; + if (!(sysfs_data_errcode == 0 || sysfs_data_errcode == ENOENT)){ + return sysfs_data_errcode; + } + else if (sysfs_data_errcode == 0) { + 
proc->sdma_usage += std::stoull(tmp); } - - proc->sdma_usage += std::stoull(tmp); // Build the path and read from Sysfs file, info that // encodes Compute Unit usage by a process of interest @@ -516,17 +524,20 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc, cu_occupancy_path += "/cu_occupancy"; err = ReadSysfsStr(cu_occupancy_path, &tmp); - if (err == 0) { - if (!is_number(tmp)) { - return EINVAL; - } + sysfs_data_errcode = CheckValidProcessInfoData(tmp, err); + + if (!(sysfs_data_errcode == 0 || sysfs_data_errcode == ENOENT)){ + return sysfs_data_errcode; + } + else if(sysfs_data_errcode==0){ // Update CU usage by the process proc->cu_occupancy += std::stoi(tmp); - // Collect count of compute units cu_count += kfd_node_map[gpu_id]->cu_count(); - } else { - //Some GFX revisions do not provide cu_occupancy debugfs method + } + else { + // Some GFX revisions do not provide cu_occupancy debugfs method + // which may cause ENOENT proc->cu_occupancy = CU_OCCUPANCY_INVALID; cu_count = 0; } @@ -1067,18 +1078,18 @@ int KFDNode::get_gfx_target_version(uint64_t *gfx_target_version) { *gfx_target_version = gfx_version; ss << __PRETTY_FUNCTION__ << " | File: " << properties_path - << " | Successfully read node #" << std::to_string(this->node_indx_) + << " | Read node: " << std::to_string(this->node_indx_) << " for gfx_target_version" - << " | Data (gfx_target_version) *gfx_target_version = " + << " | Data (*gfx_target_version): " << std::to_string(*gfx_target_version) - << " | return = " << std::to_string(ret) + << " | Return: " + << getRSMIStatusString(amd::smi::ErrnoToRsmiStatus(ret), false) << " | "; LOG_DEBUG(ss); return ret; } -int32_t KFDNode::get_simd_per_cu(uint64_t* simd_per_cu) const -{ +int32_t KFDNode::get_simd_per_cu(uint64_t* simd_per_cu) const { const std::string properties_path("/sys/class/kfd/kfd/topology/nodes/" + std::to_string(this->node_indx_) + "/properties"); @@ -1090,8 +1101,7 @@ int32_t KFDNode::get_simd_per_cu(uint64_t* 
simd_per_cu) const return ret; } -int32_t KFDNode::get_simd_count(uint64_t* simd_count) const -{ +int32_t KFDNode::get_simd_count(uint64_t* simd_count) const { const std::string properties_path("/sys/class/kfd/kfd/topology/nodes/" + std::to_string(this->node_indx_) + "/properties"); @@ -1103,6 +1113,62 @@ int32_t KFDNode::get_simd_count(uint64_t* simd_count) const return ret; } +// Public interface for device +// /sys/class/kfd/kfd/topology/nodes/*/gpu_id +int KFDNode::get_gpu_id(uint64_t *gpu_id) { + std::ostringstream ss; + std::string gpuid_path = "/sys/class/kfd/kfd/topology/nodes/" + + std::to_string(this->node_indx_) + "/gpu_id"; + const uint64_t undefined_gpu_id = std::numeric_limits::max(); + std::string gpu_id_string = ""; + *gpu_id = undefined_gpu_id; + int ret = ReadSysfsStr(gpuid_path, &gpu_id_string); + if (ret != 0 || gpu_id_string.empty()) { + ss << __PRETTY_FUNCTION__ + << " | File: " << gpuid_path + << " | Data (*gpu_id): empty or nullptr" + << " | Issue: Could not read node #" << std::to_string(this->node_indx_) + << ". KFD node was an unsupported node or value read was empty." 
+ << " | Return: " + << getRSMIStatusString(amd::smi::ErrnoToRsmiStatus(ret), false) + << " | "; + LOG_ERROR(ss); + return ret; + } + *gpu_id = std::stoull(gpu_id_string); + if (*gpu_id == 0) { // CPU node - return not supported + *gpu_id = undefined_gpu_id; + ret = ENOENT; // map to RSMI_STATUS_NOT_SUPPORTED + } + ss << __PRETTY_FUNCTION__ + << " | File: " << gpuid_path + << " | Read node #: " << std::to_string(this->node_indx_) + << " | Data (*gpu_id): " << std::to_string(*gpu_id) + << " | Return: " + << getRSMIStatusString(amd::smi::ErrnoToRsmiStatus(ret), false) + << " | "; + LOG_DEBUG(ss); + return ret; +} + +// Public interface for device +// /sys/class/kfd/kfd/topology/nodes/ +int KFDNode::get_node_id(uint32_t *node_id) { + std::ostringstream ss; + int ret = 0; + std::string nodeid_path = "/sys/class/kfd/kfd/topology/nodes/" + + std::to_string(this->node_indx_); + ss << __PRETTY_FUNCTION__ + << " | File: " << nodeid_path + << " | Read node #: " << std::to_string(this->node_indx_) + << " | Data (*node_id): " << std::to_string(*node_id) + << " | Return: " + << getRSMIStatusString(amd::smi::ErrnoToRsmiStatus(ret), false) + << " | "; + *node_id = this->node_indx_; + LOG_DEBUG(ss); + return ret; +} } // namespace smi } // namespace amd diff --git a/rocm_smi/src/rocm_smi_logger.cc b/rocm_smi/src/rocm_smi_logger.cc index 593d4ff3e5..d4951769c1 100644 --- a/rocm_smi/src/rocm_smi_logger.cc +++ b/rocm_smi/src/rocm_smi_logger.cc @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. 
* * Developed by: diff --git a/rocm_smi/src/rocm_smi_main.cc b/rocm_smi/src/rocm_smi_main.cc old mode 100755 new mode 100644 index 25c32b1f94..42b778f233 --- a/rocm_smi/src/rocm_smi_main.cc +++ b/rocm_smi/src/rocm_smi_main.cc @@ -2,7 +2,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. + * Copyright (c) 2017-2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: @@ -235,15 +235,7 @@ RocmSMI::Initialize(uint64_t flags) { int i_ret; std::ostringstream ss; - LOG_ALWAYS("=============== ROCM SMI initialize ================"); - ROCmLogging::Logger::getInstance()->enableAllLogLevels(); - // Leaving below to allow developers to check current log settings - // std::string logSettings = Logger::getInstance()->getLogSettings(); - // std::cout << "Current log settings:\n" << logSettings << std::endl; - if (ROCmLogging::Logger::getInstance()->isLoggerEnabled()) { - logSystemDetails(); - } assert(ref_count_ == 1); if (ref_count_ != 1) { @@ -259,6 +251,15 @@ RocmSMI::Initialize(uint64_t flags) { // To help debug env variable issues // debugRSMIEnvVarInfo(); + if (ROCmLogging::Logger::getInstance()->isLoggerEnabled()) { + ROCmLogging::Logger::getInstance()->enableAllLogLevels(); + LOG_ALWAYS("=============== ROCM SMI initialize ================"); + logSystemDetails(); + } + // Leaving below to allow developers to check current log settings + // std::string logSettings = ROCmLogging::Logger::getInstance()->getLogSettings(); + // std::cout << "Current log settings:\n" << logSettings << std::endl; + while (!std::string(kAMDMonitorTypes[i]).empty()) { amd_monitor_types_.insert(kAMDMonitorTypes[i]); ++i; @@ -283,6 +284,7 @@ RocmSMI::Initialize(uint64_t flags) { << " | [before] device->path() = " << device->path() << "\n | bdfid = " << bdfid << "\n | device->bdfid() = " << device->bdfid() + << " (" << print_int_as_hex(device->bdfid()) << ")" << "\n | (xgmi node) setting to setting " << 
"device->set_bdfid(device->bdfid())"; LOG_TRACE(ss); @@ -293,6 +295,7 @@ RocmSMI::Initialize(uint64_t flags) { << " | [before] device->path() = " << device->path() << "\n | bdfid = " << bdfid << "\n | device->bdfid() = " << device->bdfid() + << " (" << print_int_as_hex(device->bdfid()) << ")" << "\n | (legacy/pcie card) setting device->set_bdfid(bdfid)"; LOG_TRACE(ss); device->set_bdfid(bdfid); @@ -301,6 +304,7 @@ RocmSMI::Initialize(uint64_t flags) { << " | [after] device->path() = " << device->path() << "\n | bdfid = " << bdfid << "\n | device->bdfid() = " << device->bdfid() + << " (" << print_int_as_hex(device->bdfid()) << ")" << "\n | final update: device->bdfid() holds correct device bdf"; LOG_TRACE(ss); } @@ -312,8 +316,11 @@ RocmSMI::Initialize(uint64_t flags) { for (uint32_t dv_ind = 0; dv_ind < devices_.size(); ++dv_ind) { dev = devices_[dv_ind]; uint64_t bdfid = dev->bdfid(); + bdfid = bdfid & 0xFFFFFFFF0FFFFFFF; // clear out partition id in bdf + // NOTE: partition_id is not part of bdf (but is part of pci_id) + // which is why it is removed in sorting dv_to_id.push_back({bdfid, dev}); - } + } ss << __PRETTY_FUNCTION__ << " Sort index based on BDF."; LOG_DEBUG(ss); @@ -734,7 +741,7 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { continue; sscanf(&dentry->d_name[strlen(kDeviceNamePrefix)], "%d", &cardId); if (cardId > max_cardId) - max_cardId = cardId; + max_cardId = cardId; count++; } dentry = readdir(drm_dir); @@ -748,23 +755,47 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { uint64_t s_gpu_id = 0; uint64_t s_unique_id = 0; uint64_t s_location_id = 0; + uint64_t s_bdf = 0; + uint64_t s_domain = 0; + uint8_t s_bus = 0; + uint8_t s_device = 0; + uint8_t s_function = 0; + uint8_t s_partition_id = 0; + uint64_t padding = 0; // padding added in case new changes in future }; // allSystemNodes[key = unique_id] => {node_id, gpu_id, unique_id, - // location_id} + // location_id, bdf, domain, bus, device, + // partition_id} std::multimap allSystemNodes; 
uint32_t node_id = 0;
+  static const int BYTE = 8;
   while (true) {
-    uint64_t gpu_id = 0, unique_id = 0, location_id = 0;
+    uint64_t gpu_id = 0, unique_id = 0, location_id = 0, domain = 0;
     int ret_gpu_id = get_gpu_id(node_id, &gpu_id);
     int ret_unique_id = read_node_properties(node_id, "unique_id", &unique_id);
     int ret_loc_id =
       read_node_properties(node_id, "location_id", &location_id);
-    if (ret_gpu_id == 0 || ret_unique_id == 0 || ret_loc_id == 0) {
+    int ret_domain =
+      read_node_properties(node_id, "domain", &domain);
+    if (ret_gpu_id == 0 &&
+      ret_unique_id == 0 && ret_loc_id == 0 && ret_domain == 0) {
+      // Only build a node when gpu_id and every KFD property read above
+      // (unique_id, location_id, domain) exists; 0 as a value is okay
       systemNode myNode;
       myNode.s_node_id = node_id;
       myNode.s_gpu_id = gpu_id;
       myNode.s_unique_id = unique_id;
       myNode.s_location_id = location_id;
+      myNode.s_domain = domain & 0xFFFFFFFF;
+      myNode.s_bdf = (myNode.s_domain << 32) | (myNode.s_location_id);
+      myNode.s_location_id = myNode.s_bdf;
+      myNode.s_bdf |= ((domain & 0xFFFFFFFF) << 32);
+      myNode.s_location_id = myNode.s_bdf;
+      myNode.s_domain = myNode.s_location_id >> 32;
+      myNode.s_bus = ((myNode.s_location_id >> 8) & 0xFF);
+      myNode.s_device = ((myNode.s_location_id >> 3) & 0x1F);
+      myNode.s_function = myNode.s_location_id & 0x7;
+      myNode.s_partition_id = ((myNode.s_location_id >> 28) & 0xF);
       if (gpu_id != 0) {  // only add gpu nodes, 0 = CPU
         allSystemNodes.emplace(unique_id, myNode);
       }
@@ -780,6 +811,12 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
        << "; gpu_id = " << std::to_string(i.second.s_gpu_id)
        << "; unique_id = " << std::to_string(i.second.s_unique_id)
        << "; location_id = " << std::to_string(i.second.s_location_id)
+       << "; bdf = " << print_int_as_hex(i.second.s_bdf)
+       << "; domain = " << print_int_as_hex(i.second.s_domain, true, 2*BYTE)
+       << "; bus = " << print_int_as_hex(i.second.s_bus, true, BYTE)
+       << "; device = " << print_int_as_hex(i.second.s_device, true, BYTE)
+       << "; function = " << 
std::to_string(i.second.s_function)
+       << "; partition_id = " << std::to_string(i.second.s_partition_id)
        << "], ";
   }
   ss << "}";
@@ -817,13 +854,67 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
       rsmi_status_t ret_unique_id =
         rsmi_dev_unique_id_get(cardAdded, &device_uuid);
       auto temp_numb_nodes = allSystemNodes.count(device_uuid);
-      auto it = allSystemNodes.lower_bound(device_uuid);
-      if (it != allSystemNodes.end() && doesDeviceSupportPartitions && temp_numb_nodes > 1
+      auto i = allSystemNodes.lower_bound(device_uuid);
+      const bool hasNode = (temp_numb_nodes > 0);  // else i is end() or another device's node
+      if (doesDeviceSupportPartitions && temp_numb_nodes > 1
           && ret_unique_id == RSMI_STATUS_SUCCESS) {
-        auto primaryBdfId = it->second.s_location_id;
+        auto primaryBdfId = i->second.s_location_id;  // safe: count > 1 here
         // helps identify xgmi nodes (secondary nodes) easier
+        ss << __PRETTY_FUNCTION__ << " | secondary node add ; "
+           << " BDF = " << std::to_string(primaryBdfId)
+           << " (" << print_int_as_hex(primaryBdfId) << ")";
+        LOG_DEBUG(ss);
+        if (doesDeviceSupportPartitions && strCompPartition != "SPX"
+            && i->second.s_partition_id == 0) {
+          i->second.s_partition_id = i->second.s_function;
+          ss << __PRETTY_FUNCTION__ << " | (secondary node add) fall back - "
+             << "detected !SPX && partition_id == 0"
+             << "; function = " << std::to_string(i->second.s_function)
+             << "; partition_id = " << std::to_string(i->second.s_partition_id);
+          LOG_DEBUG(ss);
+        }
+        ss << __PRETTY_FUNCTION__
+           << " | (secondary node add) B4 AddToDeviceList() -->"
+           << "\n[node_id = " << std::to_string(i->second.s_node_id)
+           << "; gpu_id = " << std::to_string(i->second.s_gpu_id)
+           << "; unique_id = " << std::to_string(i->second.s_unique_id)
+           << "; location_id = " << std::to_string(i->second.s_location_id)
+           << "; bdf = " << print_int_as_hex(i->second.s_bdf)
+           << "; domain = " << print_int_as_hex(i->second.s_domain, true, 2*BYTE)
+           << "; bus = " << print_int_as_hex(i->second.s_bus, true, BYTE)
+           << "; device = " << print_int_as_hex(i->second.s_device, true, BYTE)
+           << "; function = " << std::to_string(i->second.s_function)
+           << "; partition_id = " << std::to_string(i->second.s_partition_id)
+           << "], ";
+        LOG_DEBUG(ss);
         AddToDeviceList(d_name, primaryBdfId);
       } else {
+        ss << __PRETTY_FUNCTION__ << " | primary node add ; "
+           << " BDF = " << std::to_string(UINT64_MAX);
+        if (hasNode && doesDeviceSupportPartitions && strCompPartition != "SPX"
+            && i->second.s_partition_id == 0) {
+          i->second.s_partition_id = i->second.s_function;
+          ss << __PRETTY_FUNCTION__ << " | (primary node add) fall back - "
+             << "detected !SPX && partition_id == 0"
+             << "; function = " << std::to_string(i->second.s_function)
+             << "; partition_id = " << std::to_string(i->second.s_partition_id);
+        }
+        LOG_DEBUG(ss);
+        if (hasNode) {  // never dump through an end() or foreign iterator
+          ss << __PRETTY_FUNCTION__ << " | (primary node add) After AddToDeviceList() -->"
+             << "\n[node_id = " << std::to_string(i->second.s_node_id)
+             << "; gpu_id = " << std::to_string(i->second.s_gpu_id)
+             << "; unique_id = " << std::to_string(i->second.s_unique_id)
+             << "; location_id = " << std::to_string(i->second.s_location_id)
+             << "; bdf = " << print_int_as_hex(i->second.s_bdf)
+             << "; domain = " << print_int_as_hex(i->second.s_domain, true, 2*BYTE)
+             << "; bus = " << print_int_as_hex(i->second.s_bus, true, BYTE)
+             << "; device = " << print_int_as_hex(i->second.s_device, true, BYTE)
+             << "; function = " << std::to_string(i->second.s_function)
+             << "; partition_id = " << std::to_string(i->second.s_partition_id)
+             << "], ";
+          LOG_DEBUG(ss);
+        }
         AddToDeviceList(d_name, UINT64_MAX);
       }
 
@@ -834,6 +925,12 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
          << "; gpu_id = " << std::to_string(i.second.s_gpu_id)
          << "; unique_id = " << std::to_string(i.second.s_unique_id)
          << "; location_id = " << std::to_string(i.second.s_location_id)
+         << "; bdf = " << print_int_as_hex(i.second.s_bdf)
+         << "; domain = " << print_int_as_hex(i.second.s_domain, true, 2*BYTE)
+         << "; bus = " << print_int_as_hex(i.second.s_bus, true, BYTE)
+         << "; device = " << 
print_int_as_hex(i.second.s_device, true, BYTE) + << "; function = " << std::to_string(i.second.s_function) + << "; partition_id = " << std::to_string(i.second.s_partition_id) << "], "; } ss << "}"; @@ -909,6 +1006,7 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { auto removalGpuId = it->second.s_gpu_id; auto removalUniqueId = it->second.s_unique_id; auto removalLocId = it->second.s_location_id; + auto removaldomain = it->second.s_domain; auto nodesErased = 1; primary_location_id = removalLocId; allSystemNodes.erase(it++); @@ -919,6 +1017,7 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { << "; gpu_id = " << std::to_string(removalGpuId) << "; unique_id = " << std::to_string(removalUniqueId) << "; location_id = " << std::to_string(removalLocId) + << "; removaldomain = " << std::to_string(removaldomain) << "]"; LOG_DEBUG(ss); } @@ -926,15 +1025,34 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { break; } auto myBdfId = it->second.s_location_id; - AddToDeviceList(secNode, myBdfId); + ss << __PRETTY_FUNCTION__ << " | secondary node add #2; " + << " BDF = " << std::to_string(myBdfId) + << " (" << print_int_as_hex(myBdfId) << ")"; + LOG_DEBUG(ss); + if (doesDeviceSupportPartitions && strCompPartition != "SPX" + && it->second.s_partition_id == 0) { + it->second.s_partition_id = it->second.s_function; + ss << __PRETTY_FUNCTION__ << " | (secondary node add #2) fall back - " + << "detected !SPX && partition_id == 0" + << "; function = " << std::to_string(it->second.s_function) + << "; partition_id = " << std::to_string(it->second.s_partition_id); + LOG_DEBUG(ss); + } ss << __PRETTY_FUNCTION__ - << "\nSECONDARY --> After adding new node; ERASING -> [node_id = " - << std::to_string(it->second.s_node_id) + << " | (secondary node add #2) B4 AddToDeviceList() -->" + << "\n[node_id = " << std::to_string(it->second.s_node_id) << "; gpu_id = " << std::to_string(it->second.s_gpu_id) << "; unique_id = " << std::to_string(it->second.s_unique_id) << "; location_id = " << 
std::to_string(it->second.s_location_id) - << "]"; + << "; bdf = " << print_int_as_hex(it->second.s_bdf) + << "; domain = " << print_int_as_hex(it->second.s_domain, true, 2*BYTE) + << "; bus = " << print_int_as_hex(it->second.s_bus, true, BYTE) + << "; device = " << print_int_as_hex(it->second.s_device, true, BYTE) + << "; function = " << std::to_string(it->second.s_function) + << "; partition_id = " << std::to_string(it->second.s_partition_id) + << "], "; LOG_DEBUG(ss); + AddToDeviceList(secNode, myBdfId); allSystemNodes.erase(it++); numb_nodes--; cardAdded++; diff --git a/rocm_smi/src/rocm_smi_monitor.cc b/rocm_smi/src/rocm_smi_monitor.cc old mode 100755 new mode 100644 index d7d9f4d6dc..40d7e8e4ac --- a/rocm_smi/src/rocm_smi_monitor.cc +++ b/rocm_smi/src/rocm_smi_monitor.cc @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. + * Copyright (c) 2017-2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/rocm_smi/src/rocm_smi_power_mon.cc b/rocm_smi/src/rocm_smi_power_mon.cc old mode 100755 new mode 100644 index 454851651b..92317720d5 --- a/rocm_smi/src/rocm_smi_power_mon.cc +++ b/rocm_smi/src/rocm_smi_power_mon.cc @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2017, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/rocm_smi/src/rocm_smi_utils.cc b/rocm_smi/src/rocm_smi_utils.cc old mode 100755 new mode 100644 index 32f0209654..7c9ece9be1 --- a/rocm_smi/src/rocm_smi_utils.cc +++ b/rocm_smi/src/rocm_smi_utils.cc @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2018-2023, Advanced Micro Devices, Inc. + * Copyright (c) 2018-2024, Advanced Micro Devices, Inc. * All rights reserved. 
*
  * Developed by:
@@ -1113,6 +1113,7 @@ static std::string print_pnt(rsmi_od_vddc_point_t *pt) {
   ss << "\t\t** Voltage: " << pt->voltage << " mV\n";
   return ss.str();
 }
+
 static std::string pt_vddc_curve(rsmi_od_volt_curve *c) {
   std::ostringstream ss;
   if (c == nullptr) {
@@ -1182,16 +1183,31 @@ bool is_sudo_user() {
   return isRunningWithSudo;
 }
 
-rsmi_status_t rsmi_get_gfx_target_version(uint32_t dv_ind,
-  std::string *gfx_version) {
+// Writes "gfx" + digits derived from KFD's gfx_target_version, or "Unknown"
+rsmi_status_t rsmi_get_gfx_target_version(uint32_t dv_ind, std::string *gfx_version) {
   std::ostringstream ss;
   uint64_t kfd_gfx_version = 0;
   GET_DEV_AND_KFDNODE_FROM_INDX
 
   int ret = kfd_node->get_gfx_target_version(&kfd_gfx_version);
+  uint64_t orig_target_version = 0;
+  uint64_t major = 0;
+  uint64_t minor = 0;
+  uint64_t rev = 0;
   if (ret == 0) {
+    orig_target_version = kfd_gfx_version;  // already numeric, no string round trip
+    // separate out parts -> put back into normal graphics version format
+    major = static_cast<uint64_t>((orig_target_version / 10000) * 100);
+    minor = static_cast<uint64_t>((orig_target_version % 10000 / 100) * 10);
+    if (minor == 0 && major < 1000) major *= 10;  // widen 3-digit majors only; 110000 stays gfx1100
+    rev = static_cast<uint64_t>(orig_target_version % 100);
-    ss << "gfx" << kfd_gfx_version;
-    *gfx_version = ss.str();
+    *gfx_version = "gfx" + std::to_string(major + minor + rev);
+    ss << __PRETTY_FUNCTION__
+       << " | " << std::dec << "kfd_target_version = " << orig_target_version
+       << "; major = " << major << "; minor = " << minor << "; rev = "
+       << rev << "\nReporting rsmi_get_gfx_target_version = " << *gfx_version
+       << "\n";
+    LOG_INFO(ss);
     return RSMI_STATUS_SUCCESS;
   } else {
     *gfx_version = "Unknown";
diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc
index c2650c8dcf..7b7eda3a2d 100644
--- a/src/amd_smi/amd_smi.cc
+++ b/src/amd_smi/amd_smi.cc
@@ -3,7 +3,7 @@
  * The University of Illinois/NCSA
  * Open Source License (NCSA)
  *
- * Copyright (c) 2023, Advanced Micro Devices, Inc.
+ * Copyright (c) 2024, Advanced Micro Devices, Inc.
* All rights reserved. * * Developed by: @@ -753,21 +753,61 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i // default to 0xffff as not supported info->oam_id = std::numeric_limits::max(); uint16_t tmp_oam_id = 0; - status = rsmi_wrapper(rsmi_dev_oam_id_get, processor_handle, &(tmp_oam_id)); + status = rsmi_wrapper(rsmi_dev_xgmi_physical_id_get, processor_handle, &(tmp_oam_id)); info->oam_id = tmp_oam_id; // default to 0xffffffff as not supported info->num_of_compute_units = std::numeric_limits::max(); auto tmp_num_of_compute_units = uint32_t(0); status = rsmi_wrapper(amd::smi::rsmi_dev_number_of_computes_get, processor_handle, - &tmp_num_of_compute_units); + &(tmp_num_of_compute_units)); if (status == amdsmi_status_t::AMDSMI_STATUS_SUCCESS) { info->num_of_compute_units = tmp_num_of_compute_units; } + // default to 0xffffffffffffffff as not supported + info->target_graphics_version = std::numeric_limits::max(); + auto tmp_target_gfx_version = uint64_t(0); + status = rsmi_wrapper(rsmi_dev_target_graphics_version_get, processor_handle, + &(tmp_target_gfx_version)); + if (status == amdsmi_status_t::AMDSMI_STATUS_SUCCESS) { + info->target_graphics_version = tmp_target_gfx_version; + } + return AMDSMI_STATUS_SUCCESS; } +amdsmi_status_t amdsmi_get_gpu_kfd_info(amdsmi_processor_handle processor_handle, + amdsmi_kfd_info_t *info) { + AMDSMI_CHECK_INIT(); + + if (info == nullptr) { + return AMDSMI_STATUS_INVAL; + } + + amdsmi_status_t status; + // default to 0xffffffffffffffff as not supported + info->kfd_id = std::numeric_limits::max(); + auto tmp_kfd_id = uint64_t(0); + status = rsmi_wrapper(rsmi_dev_guid_get, processor_handle, &(tmp_kfd_id)); + if (status != AMDSMI_STATUS_SUCCESS) { + return status; + } else { + info->kfd_id = tmp_kfd_id; + } + + // default to 0xffffffff as not supported + info->node_id = std::numeric_limits::max(); + auto tmp_node_id = uint32_t(0); + status = rsmi_wrapper(rsmi_dev_node_id_get, processor_handle, 
&(tmp_node_id)); + if (status != AMDSMI_STATUS_SUCCESS) { + return status; + } else { + info->node_id = tmp_node_id; + } + + return AMDSMI_STATUS_SUCCESS; +} amdsmi_status_t amdsmi_get_gpu_subsystem_id(amdsmi_processor_handle processor_handle, uint16_t *id) { @@ -1053,6 +1093,26 @@ amdsmi_is_P2P_accessible(amdsmi_processor_handle processor_handle_src, return amd::smi::rsmi_to_amdsmi_status(rstatus); } +amdsmi_status_t +amdsmi_topo_get_p2p_status(amdsmi_processor_handle processor_handle_src, + amdsmi_processor_handle processor_handle_dst, + amdsmi_io_link_type_t *type, amdsmi_p2p_capability_t *cap) { + AMDSMI_CHECK_INIT(); + + amd::smi::AMDSmiGPUDevice* src_device = nullptr; + amd::smi::AMDSmiGPUDevice* dst_device = nullptr; + amdsmi_status_t r = get_gpu_device_from_handle(processor_handle_src, &src_device); + if (r != AMDSMI_STATUS_SUCCESS) + return r; + r = get_gpu_device_from_handle(processor_handle_dst, &dst_device); + if (r != AMDSMI_STATUS_SUCCESS) + return r; + auto rstatus = rsmi_topo_get_p2p_status(src_device->get_gpu_id(), dst_device->get_gpu_id(), + reinterpret_cast(type), + reinterpret_cast(cap)); + return amd::smi::rsmi_to_amdsmi_status(rstatus); +} + // Compute Partition functions amdsmi_status_t amdsmi_get_gpu_compute_partition(amdsmi_processor_handle processor_handle, @@ -1099,6 +1159,24 @@ amdsmi_reset_gpu_memory_partition(amdsmi_processor_handle processor_handle) { return rsmi_wrapper(rsmi_dev_memory_partition_reset, processor_handle); } +amdsmi_status_t +amdsmi_get_gpu_accelerator_partition_profile(amdsmi_processor_handle processor_handle, + amdsmi_accelerator_partition_profile_t *profile, + uint32_t *partition_id) { + AMDSMI_CHECK_INIT(); + // TODO: also fill out profile later + // default to 0xffffffff if not supported + *partition_id = std::numeric_limits::max(); + auto tmp_partition_id = uint32_t(0); + + amdsmi_status_t status = rsmi_wrapper(rsmi_dev_partition_id_get, processor_handle, &tmp_partition_id); + if (status == 
amdsmi_status_t::AMDSMI_STATUS_SUCCESS){ + *partition_id = tmp_partition_id; + } + + return status; +} + // TODO(bliu) : other xgmi related information amdsmi_status_t amdsmi_get_xgmi_info(amdsmi_processor_handle processor_handle, amdsmi_xgmi_info_t *info) { @@ -1234,8 +1312,8 @@ void amdsmi_free_name_value_pairs(void *p) { amdsmi_status_t amdsmi_get_power_cap_info(amdsmi_processor_handle processor_handle, - uint32_t sensor_ind, - amdsmi_power_cap_info_t *info) { + uint32_t sensor_ind, + amdsmi_power_cap_info_t *info) { AMDSMI_CHECK_INIT(); if (info == nullptr) diff --git a/src/amd_smi/amd_smi_common.cc b/src/amd_smi/amd_smi_common.cc index 2751d8efb9..b745afdc41 100644 --- a/src/amd_smi/amd_smi_common.cc +++ b/src/amd_smi/amd_smi_common.cc @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/src/amd_smi/amd_smi_drm.cc b/src/amd_smi/amd_smi_drm.cc index a17ed40843..35dbd11484 100644 --- a/src/amd_smi/amd_smi_drm.cc +++ b/src/amd_smi/amd_smi_drm.cc @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. 
*
  * Developed by:
@@ -52,6 +52,8 @@
 #include "amd_smi/impl/amd_smi_common.h"
 #include "rocm_smi/rocm_smi.h"
 #include "rocm_smi/rocm_smi_main.h"
+#include "rocm_smi/rocm_smi_utils.h"
+#include "rocm_smi/rocm_smi_logger.h"
 
 namespace amd {
 namespace smi {
@@ -173,10 +175,35 @@ amdsmi_status_t AMDSmiDrm::init() {
         }
         has_valid_fds = true;
 
-        bdf.function_number = device->businfo.pci->func;
-        bdf.device_number = device->businfo.pci->dev;
-        bdf.bus_number = device->businfo.pci->bus;
-        bdf.domain_number = device->businfo.pci->domain;
+        std::ostringstream ss;
+        uint64_t bdf_rocm = 0;
+        if (rsmi_dev_pci_id_get(i, &bdf_rocm) != RSMI_STATUS_SUCCESS) {
+          // rocm_smi gave no PCI id: pack libdrm's bus info into the same
+          // domain/bus/device/function layout decoded below, instead of
+          // silently recording BDF 0000:00:00.0 for this device.
+          bdf_rocm =
+              (static_cast<uint64_t>(device->businfo.pci->domain) << 32) |
+              (static_cast<uint64_t>(device->businfo.pci->bus) << 8) |
+              (static_cast<uint64_t>(device->businfo.pci->dev & 0x1F) << 3) |
+              static_cast<uint64_t>(device->businfo.pci->func & 0x7);
+        }
+        ss << __PRETTY_FUNCTION__ << " | "
+           << "bdf_rocm | Received bdf: "
+           << "\nWhole BDF: " << amd::smi::print_unsigned_hex_and_int(bdf_rocm)
+           << "\nDomain = "
+           << amd::smi::print_unsigned_hex_and_int((bdf_rocm & 0xFFFFFFFF00000000) >> 32)
+           << "; \nBus# = " << amd::smi::print_unsigned_hex_and_int((bdf_rocm & 0xFF00) >> 8)
+           << "; \nDevice# = "<< amd::smi::print_unsigned_hex_and_int((bdf_rocm & 0xF8) >> 3)
+           << "; \nFunction# = " << amd::smi::print_unsigned_hex_and_int((bdf_rocm & 0x7));
+        LOG_INFO(ss);
+        bdf.function_number = ((bdf_rocm & 0x7));
+        bdf.device_number = ((bdf_rocm & 0xF8) >> 3);
+        bdf.bus_number = ((bdf_rocm & 0xFF00) >> 8);
+        bdf.domain_number = ((bdf_rocm & 0xFFFFFFFF00000000) >> 32);
+        ss << __PRETTY_FUNCTION__ << " | " << "Received bdf: Domain = " << bdf.domain_number
+           << "; Bus# = " << bdf.bus_number << "; Device# = "<< bdf.device_number
+           << "; Function# = " << bdf.function_number;
+        LOG_INFO(ss);
 
         vendor_id = device->deviceinfo.pci->vendor_id;
 
@@ -309,6 +336,14 @@ amdsmi_status_t AMDSmiDrm::get_drm_fd_by_index(uint32_t gpu_index, uint32_t *fd_
 amdsmi_status_t AMDSmiDrm::get_bdf_by_index(uint32_t gpu_index, amdsmi_bdf_t *bdf_info) const {
     if (gpu_index + 1 > drm_bdfs_.size()) return AMDSMI_STATUS_NOT_SUPPORTED;
     *bdf_info = drm_bdfs_[gpu_index];
+    std::ostringstream ss;
+    ss << __PRETTY_FUNCTION__ << " | gpu_index = " << gpu_index
+       << "; 
\nreceived bdf: Domain = " << bdf_info->domain_number + << "; \nBus# = " << bdf_info->bus_number + << "; \nDevice# = " << bdf_info->device_number + << "; \nFunction# = " << bdf_info->function_number + << "\nReturning = AMDSMI_STATUS_SUCCESS"; + LOG_INFO(ss); return AMDSMI_STATUS_SUCCESS; } diff --git a/src/amd_smi/amd_smi_gpu_device.cc b/src/amd_smi/amd_smi_gpu_device.cc index 650227d9fd..0c648baeb9 100644 --- a/src/amd_smi/amd_smi_gpu_device.cc +++ b/src/amd_smi/amd_smi_gpu_device.cc @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/src/amd_smi/amd_smi_lib_loader.cc b/src/amd_smi/amd_smi_lib_loader.cc index ad3dba1fd0..fc56640390 100644 --- a/src/amd_smi/amd_smi_lib_loader.cc +++ b/src/amd_smi/amd_smi_lib_loader.cc @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/src/amd_smi/amd_smi_socket.cc b/src/amd_smi/amd_smi_socket.cc index 5f7029fc16..e21363d8fe 100644 --- a/src/amd_smi/amd_smi_socket.cc +++ b/src/amd_smi/amd_smi_socket.cc @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/src/amd_smi/amd_smi_system.cc b/src/amd_smi/amd_smi_system.cc index 3a1149b810..6ba65fc48f 100644 --- a/src/amd_smi/amd_smi_system.cc +++ b/src/amd_smi/amd_smi_system.cc @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. 
* * Developed by: diff --git a/src/amd_smi/amd_smi_utils.cc b/src/amd_smi/amd_smi_utils.cc index 25e1a52235..018a358234 100644 --- a/src/amd_smi/amd_smi_utils.cc +++ b/src/amd_smi/amd_smi_utils.cc @@ -1,4 +1,4 @@ -/* * Copyright (C) 2023 Advanced Micro Devices. All rights reserved. +/* * Copyright (C) 2024 Advanced Micro Devices. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy of * this software and associated documentation files (the "Software"), to deal in diff --git a/src/amd_smi/amd_smi_uuid.cc b/src/amd_smi/amd_smi_uuid.cc index d75844eb25..507b193ddb 100644 --- a/src/amd_smi/amd_smi_uuid.cc +++ b/src/amd_smi/amd_smi_uuid.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/src/amd_smi/fdinfo.cc b/src/amd_smi/fdinfo.cc index 29fd3f2c03..297ff39c8a 100644 --- a/src/amd_smi/fdinfo.cc +++ b/src/amd_smi/fdinfo.cc @@ -1,4 +1,4 @@ -/* * Copyright (C) 2023 Advanced Micro Devices. All rights reserved. +/* * Copyright (C) 2024 Advanced Micro Devices. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy of * this software and associated documentation files (the "Software"), to deal in diff --git a/src/amd_smiConfig.in b/src/amd_smiConfig.in old mode 100755 new mode 100644 index 28fa592fb2..ceb68cace5 --- a/src/amd_smiConfig.in +++ b/src/amd_smiConfig.in @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. 
* * Developed by: diff --git a/src/rocm_smi_properties.cc b/src/rocm_smi_properties.cc index d73f974286..affd255c30 100644 --- a/src/rocm_smi_properties.cc +++ b/src/rocm_smi_properties.cc @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. + * Copyright (c) 2017-2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/api_support_read.cc b/tests/amd_smi_test/functional/api_support_read.cc old mode 100755 new mode 100644 index 78cf468af1..34bffe145e --- a/tests/amd_smi_test/functional/api_support_read.cc +++ b/tests/amd_smi_test/functional/api_support_read.cc @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/api_support_read.h b/tests/amd_smi_test/functional/api_support_read.h old mode 100755 new mode 100644 index 6bdf3a89b6..f16b0e1ee5 --- a/tests/amd_smi_test/functional/api_support_read.h +++ b/tests/amd_smi_test/functional/api_support_read.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/err_cnt_read.cc b/tests/amd_smi_test/functional/err_cnt_read.cc old mode 100755 new mode 100644 index fa8b3676af..1e298816ba --- a/tests/amd_smi_test/functional/err_cnt_read.cc +++ b/tests/amd_smi_test/functional/err_cnt_read.cc @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. 
* * Developed by: diff --git a/tests/amd_smi_test/functional/err_cnt_read.h b/tests/amd_smi_test/functional/err_cnt_read.h old mode 100755 new mode 100644 index 701fde4e31..f7de2e8f5a --- a/tests/amd_smi_test/functional/err_cnt_read.h +++ b/tests/amd_smi_test/functional/err_cnt_read.h @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/evt_notif_read_write.cc b/tests/amd_smi_test/functional/evt_notif_read_write.cc old mode 100755 new mode 100644 index 47a2ae12eb..64e5276a81 --- a/tests/amd_smi_test/functional/evt_notif_read_write.cc +++ b/tests/amd_smi_test/functional/evt_notif_read_write.cc @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/evt_notif_read_write.h b/tests/amd_smi_test/functional/evt_notif_read_write.h old mode 100755 new mode 100644 index b1e0271007..16154f6304 --- a/tests/amd_smi_test/functional/evt_notif_read_write.h +++ b/tests/amd_smi_test/functional/evt_notif_read_write.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/fan_read.cc b/tests/amd_smi_test/functional/fan_read.cc old mode 100755 new mode 100644 index 78e257e895..16d685013b --- a/tests/amd_smi_test/functional/fan_read.cc +++ b/tests/amd_smi_test/functional/fan_read.cc @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2019, Advanced Micro Devices, Inc. 
+ * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/fan_read.h b/tests/amd_smi_test/functional/fan_read.h old mode 100755 new mode 100644 index ac5156f6dc..619b40af54 --- a/tests/amd_smi_test/functional/fan_read.h +++ b/tests/amd_smi_test/functional/fan_read.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2019, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/fan_read_write.cc b/tests/amd_smi_test/functional/fan_read_write.cc old mode 100755 new mode 100644 index e3d7c962ad..6beaeb00a4 --- a/tests/amd_smi_test/functional/fan_read_write.cc +++ b/tests/amd_smi_test/functional/fan_read_write.cc @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2019, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/fan_read_write.h b/tests/amd_smi_test/functional/fan_read_write.h old mode 100755 new mode 100644 index 07fe17f8a5..126681c53d --- a/tests/amd_smi_test/functional/fan_read_write.h +++ b/tests/amd_smi_test/functional/fan_read_write.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2019, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. 
* * Developed by: diff --git a/tests/amd_smi_test/functional/frequencies_read.cc b/tests/amd_smi_test/functional/frequencies_read.cc old mode 100755 new mode 100644 index 92975be224..e6f94479be --- a/tests/amd_smi_test/functional/frequencies_read.cc +++ b/tests/amd_smi_test/functional/frequencies_read.cc @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/frequencies_read.h b/tests/amd_smi_test/functional/frequencies_read.h old mode 100755 new mode 100644 index 7d58403ef6..4fcad59bab --- a/tests/amd_smi_test/functional/frequencies_read.h +++ b/tests/amd_smi_test/functional/frequencies_read.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/frequencies_read_write.cc b/tests/amd_smi_test/functional/frequencies_read_write.cc old mode 100755 new mode 100644 index bea412a366..0d21b4f47d --- a/tests/amd_smi_test/functional/frequencies_read_write.cc +++ b/tests/amd_smi_test/functional/frequencies_read_write.cc @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. 
* * Developed by: diff --git a/tests/amd_smi_test/functional/frequencies_read_write.h b/tests/amd_smi_test/functional/frequencies_read_write.h old mode 100755 new mode 100644 index 0757a35cd5..25c11a1926 --- a/tests/amd_smi_test/functional/frequencies_read_write.h +++ b/tests/amd_smi_test/functional/frequencies_read_write.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/gpu_busy_read.cc b/tests/amd_smi_test/functional/gpu_busy_read.cc old mode 100755 new mode 100644 index 2d20c6eead..0dc9eb566e --- a/tests/amd_smi_test/functional/gpu_busy_read.cc +++ b/tests/amd_smi_test/functional/gpu_busy_read.cc @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/gpu_busy_read.h b/tests/amd_smi_test/functional/gpu_busy_read.h old mode 100755 new mode 100644 index 8d734d2012..6624607aac --- a/tests/amd_smi_test/functional/gpu_busy_read.h +++ b/tests/amd_smi_test/functional/gpu_busy_read.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/gpu_metrics_read.cc b/tests/amd_smi_test/functional/gpu_metrics_read.cc index ea86c2982f..f19d6a2768 100644 --- a/tests/amd_smi_test/functional/gpu_metrics_read.cc +++ b/tests/amd_smi_test/functional/gpu_metrics_read.cc @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. 
+ * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/gpu_metrics_read.h b/tests/amd_smi_test/functional/gpu_metrics_read.h index de75bf1d96..85b6a8740a 100644 --- a/tests/amd_smi_test/functional/gpu_metrics_read.h +++ b/tests/amd_smi_test/functional/gpu_metrics_read.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/hw_topology_read.cc b/tests/amd_smi_test/functional/hw_topology_read.cc old mode 100755 new mode 100644 index 7f1e095758..56860aa7c0 --- a/tests/amd_smi_test/functional/hw_topology_read.cc +++ b/tests/amd_smi_test/functional/hw_topology_read.cc @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. 
* * Developed by: @@ -60,6 +60,7 @@ typedef struct { uint64_t hops; uint64_t weight; bool accessible; + amdsmi_p2p_capability_t cap; } gpu_link_t; TestHWTopologyRead::TestHWTopologyRead() : TestBase() { @@ -136,9 +137,11 @@ void TestHWTopologyRead::Run(void) { gpu_links[dv_ind_src][dv_ind_dst].hops = 0; gpu_links[dv_ind_src][dv_ind_dst].weight = 0; gpu_links[dv_ind_src][dv_ind_dst].accessible = true; + gpu_links[dv_ind_src][dv_ind_dst].cap = + {UINT8_MAX, UINT8_MAX, UINT8_MAX, UINT8_MAX, UINT8_MAX}; } else { amdsmi_io_link_type_t type; - err = amdsmi_topo_get_link_type(processor_handles_[dv_ind_src], + err = amdsmi_topo_get_link_type(processor_handles_[dv_ind_src], processor_handles_[dv_ind_dst], &gpu_links[dv_ind_src][dv_ind_dst].hops, &type); if (err != AMDSMI_STATUS_SUCCESS) { @@ -170,6 +173,34 @@ void TestHWTopologyRead::Run(void) { } } } + err = amdsmi_topo_get_p2p_status(processor_handles_[dv_ind_src], + processor_handles_[dv_ind_dst], + &type, &gpu_links[dv_ind_src][dv_ind_dst].cap); + if (err != AMDSMI_STATUS_SUCCESS) { + if (err == AMDSMI_STATUS_NOT_SUPPORTED) { + IF_VERB(STANDARD) { + std::cout << + "\t**Link Type. read: Not supported on this machine" + << std::endl; + return; + } + } else { + CHK_ERR_ASRT(err) + } + } else { + switch (type) { + case AMDSMI_IOLINK_TYPE_PCIEXPRESS: + case AMDSMI_IOLINK_TYPE_XGMI: + // Do nothing, the type is printed by the previous test for amdsmi_topo_get_link_type + break; + default: + gpu_links[dv_ind_src][dv_ind_dst].type = "XXXX"; + IF_VERB(STANDARD) { + std::cout << "\t**Invalid IO LINK type. 
type=" << type << + std::endl; + } + } + } err = amdsmi_topo_get_link_weight(processor_handles_[dv_ind_src], processor_handles_[dv_ind_dst], &gpu_links[dv_ind_src][dv_ind_dst].weight); @@ -286,6 +317,7 @@ void TestHWTopologyRead::Run(void) { std::cout << std::endl; } std::cout << std::endl; + std::cout << "**Access between two GPUs**" << std::endl; std::cout << " "; for (i = 0; i < num_devices; ++i) { @@ -303,4 +335,125 @@ void TestHWTopologyRead::Run(void) { std::cout << std::endl; } std::cout << std::endl; + + std::cout << "**Cache coherency between two GPUs**" << std::endl; + std::cout << " "; + for (i = 0; i < num_devices; ++i) { + tmp = "GPU" + std::to_string(i); + std::cout << std::setw(12) << std::left << tmp; + } + std::cout << std::endl; + for (i = 0; i < num_devices; i++) { + tmp = "GPU" + std::to_string(i); + std::cout << std::setw(6) << std::left << tmp; + for (j = 0; j < num_devices; j++) { + if (i == j) { + std::cout << std::setw(12) << std::left << "X"; + continue; + } + + if (gpu_links[i][j].cap.is_iolink_coherent == UINT8_MAX) { + std::cout << std::setw(12) << std::left << "N/A"; + continue; + } + + std::cout << std::setw(12) << std::left + << (gpu_links[i][j].cap.is_iolink_coherent ? 
"C" : "NC"); + } + std::cout << std::endl; + } + std::cout << std::endl; + + std::cout << "**Atomics between two GPUs**" << std::endl; + std::cout << " "; + for (i = 0; i < num_devices; ++i) { + tmp = "GPU" + std::to_string(i); + std::cout << std::setw(12) << std::left << tmp; + } + std::cout << std::endl; + for (i = 0; i < num_devices; i++) { + tmp = "GPU" + std::to_string(i); + std::cout << std::setw(6) << std::left << tmp; + for (j = 0; j < num_devices; j++) { + if (i == j) { + std::cout << std::setw(12) << std::left << "X"; + continue; + } + + if (gpu_links[i][j].cap.is_iolink_atomics_64bit == UINT8_MAX || + gpu_links[i][j].cap.is_iolink_atomics_32bit == UINT8_MAX) { + std::cout << std::setw(12) << std::left << "N/A"; + continue; + } + + tmp = gpu_links[i][j].cap.is_iolink_atomics_64bit ? "64" : ""; + if (gpu_links[i][j].cap.is_iolink_atomics_32bit) { + if (!tmp.empty()) { + tmp += ","; + } + tmp += "32"; + } + std::cout << std::setw(12) << std::left << (tmp.empty() ? "N/A" : tmp); + } + std::cout << std::endl; + } + std::cout << std::endl; + + std::cout << "**DMA between two GPUs**" << std::endl; + std::cout << " "; + for (i = 0; i < num_devices; ++i) { + tmp = "GPU" + std::to_string(i); + std::cout << std::setw(12) << std::left << tmp; + } + std::cout << std::endl; + for (i = 0; i < num_devices; i++) { + tmp = "GPU" + std::to_string(i); + std::cout << std::setw(6) << std::left << tmp; + for (j = 0; j < num_devices; j++) { + if (i == j) { + std::cout << std::setw(12) << std::left << "X"; + continue; + } + + if (gpu_links[i][j].cap.is_iolink_dma == UINT8_MAX) { + std::cout << std::setw(12) << std::left << "N/A"; + continue; + } + + std::cout << std::boolalpha; + std::cout << std::setw(12) << std::left + << static_cast(gpu_links[i][j].cap.is_iolink_dma); + } + std::cout << std::endl; + } + std::cout << std::endl; + + std::cout << "**BI-Directional between two GPUs**" << std::endl; + std::cout << " "; + for (i = 0; i < num_devices; ++i) { + tmp = "GPU" + 
std::to_string(i); + std::cout << std::setw(12) << std::left << tmp; + } + std::cout << std::endl; + for (i = 0; i < num_devices; i++) { + tmp = "GPU" + std::to_string(i); + std::cout << std::setw(6) << std::left << tmp; + for (j = 0; j < num_devices; j++) { + if (i == j) { + std::cout << std::setw(12) << std::left << "X"; + continue; + } + + if (gpu_links[i][j].cap.is_iolink_dma == UINT8_MAX) { + std::cout << std::setw(12) << std::left << "N/A"; + continue; + } + + std::cout << std::boolalpha; + std::cout << std::setw(12) << std::left + << static_cast(gpu_links[i][j].cap.is_iolink_bi_directional); + } + std::cout << std::endl; + } + std::cout << std::endl; } diff --git a/tests/amd_smi_test/functional/hw_topology_read.h b/tests/amd_smi_test/functional/hw_topology_read.h old mode 100755 new mode 100644 index 5e04a3b950..4938e45904 --- a/tests/amd_smi_test/functional/hw_topology_read.h +++ b/tests/amd_smi_test/functional/hw_topology_read.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/id_info_read.cc b/tests/amd_smi_test/functional/id_info_read.cc old mode 100755 new mode 100644 index 322da175c3..de92faf5cd --- a/tests/amd_smi_test/functional/id_info_read.cc +++ b/tests/amd_smi_test/functional/id_info_read.cc @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. 
* * Developed by: @@ -123,8 +123,8 @@ void TestIdInfoRead::Run(void) { } // vendor_id, unique_id - amdsmi_asic_info_t asci_info; - err = amdsmi_get_gpu_asic_info(processor_handles_[0], &asci_info); + amdsmi_asic_info_t asic_info; + err = amdsmi_get_gpu_asic_info(processor_handles_[0], &asic_info); CHK_ERR_ASRT(err) // device name, brand, serial_number @@ -215,7 +215,7 @@ void TestIdInfoRead::Run(void) { IF_VERB(STANDARD) { std::cout << "\t**Sub-system Vendor ID: 0x" << std::hex << - asci_info.subvendor_id << std::endl; + asic_info.subvendor_id << std::endl; } err = amdsmi_get_gpu_vendor_name(processor_handles_[i], buffer, kBufferLen); diff --git a/tests/amd_smi_test/functional/id_info_read.h b/tests/amd_smi_test/functional/id_info_read.h old mode 100755 new mode 100644 index 4fcff497e7..1d7f3b5eab --- a/tests/amd_smi_test/functional/id_info_read.h +++ b/tests/amd_smi_test/functional/id_info_read.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/init_shutdown_refcount.cc b/tests/amd_smi_test/functional/init_shutdown_refcount.cc old mode 100755 new mode 100644 index 222486be6a..59f9839ae9 --- a/tests/amd_smi_test/functional/init_shutdown_refcount.cc +++ b/tests/amd_smi_test/functional/init_shutdown_refcount.cc @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. 
* * Developed by: diff --git a/tests/amd_smi_test/functional/init_shutdown_refcount.h b/tests/amd_smi_test/functional/init_shutdown_refcount.h old mode 100755 new mode 100644 index 9c5a7183ee..7507235c75 --- a/tests/amd_smi_test/functional/init_shutdown_refcount.h +++ b/tests/amd_smi_test/functional/init_shutdown_refcount.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/mem_page_info_read.cc b/tests/amd_smi_test/functional/mem_page_info_read.cc old mode 100755 new mode 100644 index 3d4dd866e5..f4605f8d58 --- a/tests/amd_smi_test/functional/mem_page_info_read.cc +++ b/tests/amd_smi_test/functional/mem_page_info_read.cc @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/mem_page_info_read.h b/tests/amd_smi_test/functional/mem_page_info_read.h old mode 100755 new mode 100644 index e17e127b15..c5121f00e3 --- a/tests/amd_smi_test/functional/mem_page_info_read.h +++ b/tests/amd_smi_test/functional/mem_page_info_read.h @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. 
* * Developed by: diff --git a/tests/amd_smi_test/functional/mem_util_read.cc b/tests/amd_smi_test/functional/mem_util_read.cc old mode 100755 new mode 100644 index 3a467d272a..1513b97be6 --- a/tests/amd_smi_test/functional/mem_util_read.cc +++ b/tests/amd_smi_test/functional/mem_util_read.cc @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/mem_util_read.h b/tests/amd_smi_test/functional/mem_util_read.h old mode 100755 new mode 100644 index f9ea38ad9a..86becb48aa --- a/tests/amd_smi_test/functional/mem_util_read.h +++ b/tests/amd_smi_test/functional/mem_util_read.h @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/metrics_counter_read.cc b/tests/amd_smi_test/functional/metrics_counter_read.cc index 5e6be28a0e..f12f4b1c87 100644 --- a/tests/amd_smi_test/functional/metrics_counter_read.cc +++ b/tests/amd_smi_test/functional/metrics_counter_read.cc @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/metrics_counter_read.h b/tests/amd_smi_test/functional/metrics_counter_read.h index 3c59d58e6c..bd507fdaa4 100644 --- a/tests/amd_smi_test/functional/metrics_counter_read.h +++ b/tests/amd_smi_test/functional/metrics_counter_read.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. 
* All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/mutual_exclusion.cc b/tests/amd_smi_test/functional/mutual_exclusion.cc old mode 100755 new mode 100644 index b9a0fd8afa..f7bef7ce5d --- a/tests/amd_smi_test/functional/mutual_exclusion.cc +++ b/tests/amd_smi_test/functional/mutual_exclusion.cc @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: @@ -200,8 +200,8 @@ void TestMutualExclusion::Run(void) { ret = amdsmi_get_gpu_id(processor_handles_[0], &dmy_ui16); // vendor_id, unique_id - amdsmi_asic_info_t asci_info; - ret = amdsmi_get_gpu_asic_info(processor_handles_[0], &asci_info); + amdsmi_asic_info_t asic_info; + ret = amdsmi_get_gpu_asic_info(processor_handles_[0], &asic_info); CHECK_RET(ret, AMDSMI_STATUS_BUSY); // device name, brand, serial_number diff --git a/tests/amd_smi_test/functional/mutual_exclusion.h b/tests/amd_smi_test/functional/mutual_exclusion.h old mode 100755 new mode 100644 index c650876ca8..0e04770439 --- a/tests/amd_smi_test/functional/mutual_exclusion.h +++ b/tests/amd_smi_test/functional/mutual_exclusion.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/overdrive_read.cc b/tests/amd_smi_test/functional/overdrive_read.cc old mode 100755 new mode 100644 index 57e362a502..e12fc033d7 --- a/tests/amd_smi_test/functional/overdrive_read.cc +++ b/tests/amd_smi_test/functional/overdrive_read.cc @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. 
* * Developed by: diff --git a/tests/amd_smi_test/functional/overdrive_read.h b/tests/amd_smi_test/functional/overdrive_read.h old mode 100755 new mode 100644 index acc6bb4973..1ffb87469e --- a/tests/amd_smi_test/functional/overdrive_read.h +++ b/tests/amd_smi_test/functional/overdrive_read.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/overdrive_read_write.cc b/tests/amd_smi_test/functional/overdrive_read_write.cc old mode 100755 new mode 100644 index 23837d293d..dd877ea0b5 --- a/tests/amd_smi_test/functional/overdrive_read_write.cc +++ b/tests/amd_smi_test/functional/overdrive_read_write.cc @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/overdrive_read_write.h b/tests/amd_smi_test/functional/overdrive_read_write.h old mode 100755 new mode 100644 index af46e4374a..15782159cc --- a/tests/amd_smi_test/functional/overdrive_read_write.h +++ b/tests/amd_smi_test/functional/overdrive_read_write.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/pci_read_write.cc b/tests/amd_smi_test/functional/pci_read_write.cc old mode 100755 new mode 100644 index 5d6d43d0f2..f0ed27adbe --- a/tests/amd_smi_test/functional/pci_read_write.cc +++ b/tests/amd_smi_test/functional/pci_read_write.cc @@ -2,7 +2,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. 
+ * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/pci_read_write.h b/tests/amd_smi_test/functional/pci_read_write.h old mode 100755 new mode 100644 index c34dd0832b..6c31e1bfd1 --- a/tests/amd_smi_test/functional/pci_read_write.h +++ b/tests/amd_smi_test/functional/pci_read_write.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/perf_cntr_read_write.cc b/tests/amd_smi_test/functional/perf_cntr_read_write.cc old mode 100755 new mode 100644 index 8a4901be1c..a00ded6315 --- a/tests/amd_smi_test/functional/perf_cntr_read_write.cc +++ b/tests/amd_smi_test/functional/perf_cntr_read_write.cc @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/perf_cntr_read_write.h b/tests/amd_smi_test/functional/perf_cntr_read_write.h old mode 100755 new mode 100644 index bea0988d28..3c9592a967 --- a/tests/amd_smi_test/functional/perf_cntr_read_write.h +++ b/tests/amd_smi_test/functional/perf_cntr_read_write.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. 
* * Developed by: diff --git a/tests/amd_smi_test/functional/perf_determinism.h b/tests/amd_smi_test/functional/perf_determinism.h index 1d7cb5dbff..65e8762850 100644 --- a/tests/amd_smi_test/functional/perf_determinism.h +++ b/tests/amd_smi_test/functional/perf_determinism.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/perf_level_read.cc b/tests/amd_smi_test/functional/perf_level_read.cc old mode 100755 new mode 100644 index 414b4422ea..6fac8b3229 --- a/tests/amd_smi_test/functional/perf_level_read.cc +++ b/tests/amd_smi_test/functional/perf_level_read.cc @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/perf_level_read.h b/tests/amd_smi_test/functional/perf_level_read.h old mode 100755 new mode 100644 index 1a686cf228..255829e841 --- a/tests/amd_smi_test/functional/perf_level_read.h +++ b/tests/amd_smi_test/functional/perf_level_read.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/perf_level_read_write.cc b/tests/amd_smi_test/functional/perf_level_read_write.cc old mode 100755 new mode 100644 index fde494dbc3..2d0f0ada39 --- a/tests/amd_smi_test/functional/perf_level_read_write.cc +++ b/tests/amd_smi_test/functional/perf_level_read_write.cc @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. 
+ * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/perf_level_read_write.h b/tests/amd_smi_test/functional/perf_level_read_write.h old mode 100755 new mode 100644 index 9fa74f19d1..2b8e73dcf2 --- a/tests/amd_smi_test/functional/perf_level_read_write.h +++ b/tests/amd_smi_test/functional/perf_level_read_write.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/power_cap_read_write.cc b/tests/amd_smi_test/functional/power_cap_read_write.cc old mode 100755 new mode 100644 index 5e1a065a61..7166d0ad9d --- a/tests/amd_smi_test/functional/power_cap_read_write.cc +++ b/tests/amd_smi_test/functional/power_cap_read_write.cc @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/power_cap_read_write.h b/tests/amd_smi_test/functional/power_cap_read_write.h old mode 100755 new mode 100644 index c913cec99a..0e0c92149c --- a/tests/amd_smi_test/functional/power_cap_read_write.h +++ b/tests/amd_smi_test/functional/power_cap_read_write.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. 
* * Developed by: diff --git a/tests/amd_smi_test/functional/power_read.cc b/tests/amd_smi_test/functional/power_read.cc old mode 100755 new mode 100644 index 98f2b1ffd0..6db23773a9 --- a/tests/amd_smi_test/functional/power_read.cc +++ b/tests/amd_smi_test/functional/power_read.cc @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2019-2023, Advanced Micro Devices, Inc. + * Copyright (c) 2019-2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/power_read.h b/tests/amd_smi_test/functional/power_read.h old mode 100755 new mode 100644 index 49d3a4cb43..4050171437 --- a/tests/amd_smi_test/functional/power_read.h +++ b/tests/amd_smi_test/functional/power_read.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2019-2023, Advanced Micro Devices, Inc. + * Copyright (c) 2019-2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/power_read_write.cc b/tests/amd_smi_test/functional/power_read_write.cc old mode 100755 new mode 100644 index 24e496c14b..fc93a96359 --- a/tests/amd_smi_test/functional/power_read_write.cc +++ b/tests/amd_smi_test/functional/power_read_write.cc @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/power_read_write.h b/tests/amd_smi_test/functional/power_read_write.h old mode 100755 new mode 100644 index 4edb21cfb5..6b7bc43697 --- a/tests/amd_smi_test/functional/power_read_write.h +++ b/tests/amd_smi_test/functional/power_read_write.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. 
+ * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/process_info_read.cc b/tests/amd_smi_test/functional/process_info_read.cc old mode 100755 new mode 100644 index d88bbe498a..220b683222 --- a/tests/amd_smi_test/functional/process_info_read.cc +++ b/tests/amd_smi_test/functional/process_info_read.cc @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/process_info_read.h b/tests/amd_smi_test/functional/process_info_read.h old mode 100755 new mode 100644 index 6b78411d7a..3cc808cdb2 --- a/tests/amd_smi_test/functional/process_info_read.h +++ b/tests/amd_smi_test/functional/process_info_read.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/sys_info_read.cc b/tests/amd_smi_test/functional/sys_info_read.cc old mode 100755 new mode 100644 index aa337bc210..35d40e0818 --- a/tests/amd_smi_test/functional/sys_info_read.cc +++ b/tests/amd_smi_test/functional/sys_info_read.cc @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: @@ -48,6 +48,7 @@ #include #include +#include #include #include "amd_smi/amdsmi.h" @@ -58,7 +59,9 @@ TestSysInfoRead::TestSysInfoRead() : TestBase() { set_title("AMDSMI System Info Read Test"); set_description("This test verifies that system information such as the " - "BDFID, AMDSMI version, VBIOS version, etc. 
can be read properly."); + "BDFID, AMDSMI version, VBIOS version, " + "vendor_id, unique_id, target_gfx_version, kfd_id, node_id, etc. " + "can be read properly."); } TestSysInfoRead::~TestSysInfoRead(void) { @@ -150,22 +153,27 @@ void TestSysInfoRead::Run(void) { ASSERT_EQ(err, AMDSMI_STATUS_INVAL); - // vendor_id, unique_id - amdsmi_asic_info_t asci_info; - err = amdsmi_get_gpu_asic_info(processor_handles_[0], &asci_info); + // vendor_id, unique_id, target_gfx_version + amdsmi_asic_info_t asic_info = {}; + err = amdsmi_get_gpu_asic_info(processor_handles_[i], &asic_info); if (err == AMDSMI_STATUS_NOT_SUPPORTED) { std::cout << "\t**amdsmi_dev_unique_id() is not supported" " on this machine" << std::endl; + EXPECT_EQ(asic_info.target_graphics_version, std::numeric_limits::max()); // Verify api support checking functionality is working err = amdsmi_get_gpu_asic_info(processor_handles_[i], nullptr); ASSERT_EQ(err, AMDSMI_STATUS_NOT_SUPPORTED); } else { if (err == AMDSMI_STATUS_SUCCESS) { IF_VERB(STANDARD) { - std:: cout << "\t**GPU PCIe Vendor : " - << asci_info.vendor_name << std::endl; + std:: cout << "\t**GPU PCIe Vendor : " + << asic_info.vendor_name << std::endl; + std::cout << "\t**Target GFX version: " << std::dec + << asic_info.target_graphics_version << "\n"; } + EXPECT_EQ(err, AMDSMI_STATUS_SUCCESS); + EXPECT_NE(asic_info.target_graphics_version, std::numeric_limits::max()); // Verify api support checking functionality is working err = amdsmi_get_gpu_asic_info(processor_handles_[i], nullptr); ASSERT_EQ(err, AMDSMI_STATUS_INVAL); @@ -175,6 +183,27 @@ void TestSysInfoRead::Run(void) { } } + // kfd_id, node_id + amdsmi_kfd_info_t kfd_info = {}; + err = amdsmi_get_gpu_kfd_info(processor_handles_[i], &kfd_info); + if (err != AMDSMI_STATUS_SUCCESS) { + EXPECT_EQ(kfd_info.kfd_id, std::numeric_limits::max()); + EXPECT_EQ(kfd_info.node_id, std::numeric_limits::max()); + } else { + IF_VERB(STANDARD) { + std::cout << "\t**KFD ID: " << std::dec + << kfd_info.kfd_id << 
"\n"; + std::cout << "\t**Node ID: " << std::dec + << kfd_info.node_id << "\n"; + } + EXPECT_EQ(err, AMDSMI_STATUS_SUCCESS); + EXPECT_NE(kfd_info.kfd_id, std::numeric_limits::max()); + EXPECT_NE(kfd_info.node_id, std::numeric_limits::max()); + } + // Verify api support checking functionality is working + err = amdsmi_get_gpu_kfd_info(processor_handles_[i], nullptr); + ASSERT_EQ(err, AMDSMI_STATUS_INVAL); + err = amdsmi_get_lib_version(&ver); CHK_ERR_ASRT(err) diff --git a/tests/amd_smi_test/functional/sys_info_read.h b/tests/amd_smi_test/functional/sys_info_read.h old mode 100755 new mode 100644 index d241378476..c6e75a1e21 --- a/tests/amd_smi_test/functional/sys_info_read.h +++ b/tests/amd_smi_test/functional/sys_info_read.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/temp_read.cc b/tests/amd_smi_test/functional/temp_read.cc old mode 100755 new mode 100644 index 5169047767..4ed866ecca --- a/tests/amd_smi_test/functional/temp_read.cc +++ b/tests/amd_smi_test/functional/temp_read.cc @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. 
* * Developed by: @@ -137,8 +137,7 @@ void TestTempRead::Run(void) { ASSERT_EQ(err, AMDSMI_STATUS_INVAL); IF_VERB(STANDARD) { - std::cout << "\t**" << label << ": " << val_i64/1000 << - "C" << std::endl; + std::cout << "\t**" << label << ": " << val_i64 << "C" << std::endl; } }; for (type = AMDSMI_TEMPERATURE_TYPE_FIRST; type <= AMDSMI_TEMPERATURE_TYPE__MAX; ++type) { diff --git a/tests/amd_smi_test/functional/temp_read.h b/tests/amd_smi_test/functional/temp_read.h old mode 100755 new mode 100644 index eaca3b3b05..e9df5b21fb --- a/tests/amd_smi_test/functional/temp_read.h +++ b/tests/amd_smi_test/functional/temp_read.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/version_read.cc b/tests/amd_smi_test/functional/version_read.cc old mode 100755 new mode 100644 index 1a11263185..5b97e3acf2 --- a/tests/amd_smi_test/functional/version_read.cc +++ b/tests/amd_smi_test/functional/version_read.cc @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/version_read.h b/tests/amd_smi_test/functional/version_read.h old mode 100755 new mode 100644 index 5b8bbdc988..d9dcc5b421 --- a/tests/amd_smi_test/functional/version_read.h +++ b/tests/amd_smi_test/functional/version_read.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. 
* * Developed by: diff --git a/tests/amd_smi_test/functional/volt_freq_curv_read.cc b/tests/amd_smi_test/functional/volt_freq_curv_read.cc old mode 100755 new mode 100644 index 3d061698b2..9b875bf3ec --- a/tests/amd_smi_test/functional/volt_freq_curv_read.cc +++ b/tests/amd_smi_test/functional/volt_freq_curv_read.cc @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/volt_freq_curv_read.h b/tests/amd_smi_test/functional/volt_freq_curv_read.h old mode 100755 new mode 100644 index ee9de3a1dd..5626f7e313 --- a/tests/amd_smi_test/functional/volt_freq_curv_read.h +++ b/tests/amd_smi_test/functional/volt_freq_curv_read.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/volt_read.cc b/tests/amd_smi_test/functional/volt_read.cc index c28070dfa7..003ac394c9 100644 --- a/tests/amd_smi_test/functional/volt_read.cc +++ b/tests/amd_smi_test/functional/volt_read.cc @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/volt_read.h b/tests/amd_smi_test/functional/volt_read.h index 135dfccd56..0020bc0f90 100644 --- a/tests/amd_smi_test/functional/volt_read.h +++ b/tests/amd_smi_test/functional/volt_read.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. 
* * Developed by: diff --git a/tests/amd_smi_test/functional/xgmi_read_write.cc b/tests/amd_smi_test/functional/xgmi_read_write.cc old mode 100755 new mode 100644 index 4685ece760..48b7042a68 --- a/tests/amd_smi_test/functional/xgmi_read_write.cc +++ b/tests/amd_smi_test/functional/xgmi_read_write.cc @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/functional/xgmi_read_write.h b/tests/amd_smi_test/functional/xgmi_read_write.h old mode 100755 new mode 100644 index 87353d8298..300fe49d32 --- a/tests/amd_smi_test/functional/xgmi_read_write.h +++ b/tests/amd_smi_test/functional/xgmi_read_write.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/main.cc b/tests/amd_smi_test/main.cc index cd13497a92..91842153c0 100644 --- a/tests/amd_smi_test/main.cc +++ b/tests/amd_smi_test/main.cc @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2023, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/test_base.cc b/tests/amd_smi_test/test_base.cc index a242711e5e..968b970124 100644 --- a/tests/amd_smi_test/test_base.cc +++ b/tests/amd_smi_test/test_base.cc @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2017, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. 
* * Developed by: diff --git a/tests/amd_smi_test/test_base.h b/tests/amd_smi_test/test_base.h index ffd6a55116..514be909dc 100644 --- a/tests/amd_smi_test/test_base.h +++ b/tests/amd_smi_test/test_base.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2018, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/test_common.cc b/tests/amd_smi_test/test_common.cc index 9335626893..0776da9eec 100644 --- a/tests/amd_smi_test/test_common.cc +++ b/tests/amd_smi_test/test_common.cc @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2017, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/test_common.h b/tests/amd_smi_test/test_common.h index ba00fad2ce..767cb60323 100644 --- a/tests/amd_smi_test/test_common.h +++ b/tests/amd_smi_test/test_common.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2018, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/tests/amd_smi_test/test_utils.cc b/tests/amd_smi_test/test_utils.cc index b1f461ffdc..c3be08ccb6 100644 --- a/tests/amd_smi_test/test_utils.cc +++ b/tests/amd_smi_test/test_utils.cc @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2019, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. 
* * Developed by: diff --git a/tests/amd_smi_test/test_utils.h b/tests/amd_smi_test/test_utils.h index b85841d808..5fe2e57512 100644 --- a/tests/amd_smi_test/test_utils.h +++ b/tests/amd_smi_test/test_utils.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2019, Advanced Micro Devices, Inc. + * Copyright (c) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: diff --git a/pytest/CMakeLists.txt b/tests/python_unittest/CMakeLists.txt similarity index 89% rename from pytest/CMakeLists.txt rename to tests/python_unittest/CMakeLists.txt index 693d3b9f57..9dcedabcd8 100644 --- a/pytest/CMakeLists.txt +++ b/tests/python_unittest/CMakeLists.txt @@ -18,8 +18,8 @@ message("--------CPACK_COMPONENT_INCLUDE_TOPLEVEL_DIRECTORY: " ${CPACK_COMPONENT # copy python test files into shared directory install( DIRECTORY ./ - DESTINATION ${SHARE_INSTALL_PREFIX}/tests/pytest/ - COMPONENT dev + DESTINATION ${SHARE_INSTALL_PREFIX}/tests/python_unittest/ + COMPONENT ${TESTS_COMPONENT} USE_SOURCE_PERMISSIONS FILES_MATCHING PATTERN "*.py" diff --git a/pytest/README.md b/tests/python_unittest/README.md similarity index 90% rename from pytest/README.md rename to tests/python_unittest/README.md index aaee9d0852..26e9fb7dbd 100644 --- a/pytest/README.md +++ b/tests/python_unittest/README.md @@ -13,72 +13,35 @@ Follow our install/build guides to ensure the Python API is installed correctly ## How to Run ### Basic How To The 2 tests are in this PATH: -```/opt/rocm/share/amd_smi/tests/pytest/integration_test.py``` -```/opt/rocm/share/amd_smi/tests/pytest/unit_tests.py``` +```/opt/rocm/share/amd_smi/tests/python_unittest/integration_test.py``` +```/opt/rocm/share/amd_smi/tests/python_unittest/unit_tests.py``` - -The recommended method to run the tests: -Pytest verbose -```python3 -m pytest -p no:cacheprovider /opt/rocm/share/amd_smi/tests/pytest/integration_test.py -s -v``` -```python3 -m pytest -p no:cacheprovider 
/opt/rocm/share/amd_smi/tests/pytest/unit_tests.py -s -v``` - -Pytest only (not verbose) -```python3 -m pytest -p no:cacheprovider /opt/rocm/share/amd_smi/tests/pytest/integration_test.py -v``` -```python3 -m pytest -p no:cacheprovider /opt/rocm/share/amd_smi/tests/pytest/unit_tests.py -v``` +The recommended method to run the tests: Unittest verbose -```/opt/rocm/share/amd_smi/tests/pytest/unit_tests.py -v``` -```/opt/rocm/share/amd_smi/tests/pytest/integration_test.py -v``` +```/opt/rocm/share/amd_smi/tests/python_unittest/unit_tests.py -v``` +```/opt/rocm/share/amd_smi/tests/python_unittest/integration_test.py -v``` Unittest only (not verbose) -```/opt/rocm/share/amd_smi/tests/pytest/unit_tests.py -b -v``` -```/opt/rocm/share/amd_smi/tests/pytest/integration_test.py -b -v``` +```/opt/rocm/share/amd_smi/tests/python_unittest/unit_tests.py -b -v``` +```/opt/rocm/share/amd_smi/tests/python_unittest/integration_test.py -b -v``` See sections below for more detailed options with examples. -### Unittest or Pytest Run -The Unittest Run calls the tests directly, assuming pytest is correctly installed in the PATH. -This is more straightforward and intuitive but with less control for options. For example, the cache provider will always be used. - -```/opt/rocm/share/amd_smi/tests/pytest/*``` - -options: - - -h, --help show this help message and exit - - -v, --verbose Verbose output - - -q, --quiet Quiet output - - -b, --buffer Buffer stdout and stderr during tests - - -k "TESTNAME" Only run tests which match the given substring - -The Pytest Run could be more reliable and consistent, especially if pytest is not in the PATH. -This offers more options and flexibility, such as the option to disable the cache provider, ensuring completely independent runs. 
- -```python3 -m pytest -p no:cacheprovider /opt/rocm/share/amd_smi/tests/pytest/*``` - -options: - - -h, --help show this help message and exit - - --co Collect and list tests - - -p no:cacheprovider Disable cache provider - - -v, --verbose Verbose output - - -q, --quiet Quiet output - - -s, --capture=no Disables output capturing, stdout output - - -k "TESTNAME" Only run tests which match the given substring - -The complete list of options can be accessed here [Pytest command-line flags](https://docs.pytest.org/en/latest/reference/reference.html#command-line-flags). - ## Unittest Run Options ### Unittest Run: Verbose on Helpful to see print outs of Python. -```/opt/rocm/share/amd_smi/tests/pytest/integration_test.py -v``` +```/opt/rocm/share/amd_smi/tests/python_unittest/integration_test.py -v``` -```/opt/rocm/share/amd_smi/tests/pytest/unit_tests.py -v``` +```/opt/rocm/share/amd_smi/tests/python_unittest/unit_tests.py -v``` ex.
Click for example: Unittest run: verbose on ~~~shell -/opt/rocm/share/amd_smi/tests/pytest/integration_test.py -v +/opt/rocm/share/amd_smi/tests/python_unittest/integration_test.py -v test_init (__main__.TestAmdSmiInit) ... ok test_bad_page_info (__main__.TestAmdSmiPythonInterface) ... ###Test amdsmi_get_gpu_bad_page_info @@ -403,16 +366,16 @@ OK ### Unittest Run: Verbose on + Filter (or exclude) a test -```/opt/rocm/share/amd_smi/tests/pytest/integration_test.py -k "test_walkthrough" -v``` +```/opt/rocm/share/amd_smi/tests/python_unittest/integration_test.py -k "test_walkthrough" -v``` -```/opt/rocm/share/amd_smi/tests/pytest/integration_test.py -k "not test_walkthrough" -v``` +```/opt/rocm/share/amd_smi/tests/python_unittest/integration_test.py -k "not test_walkthrough" -v``` ex.
Click for example: Unittest Run: Verbose on + Filter (or exclude) a Test ~~~shell -> /opt/rocm/share/amd_smi/tests/pytest/integration_test.py -k "test_bdf_device_id" -v +> /opt/rocm/share/amd_smi/tests/python_unittest/integration_test.py -k "test_bdf_device_id" -v test_bdf_device_id (__main__.TestAmdSmiPythonInterface) ... ###Test Processor 0, bdf: 0000:08:00.0 ###Test amdsmi_get_gpu_vbios_info @@ -453,16 +416,16 @@ OK Runs all tests. Silence print statements to stdout. Lists tests results. This is also the best way to list all tests available. -```/opt/rocm/share/amd_smi/tests/pytest/integration_test.py -b -v``` +```/opt/rocm/share/amd_smi/tests/python_unittest/integration_test.py -b -v``` -```/opt/rocm/share/amd_smi/tests/pytest/unit_tests.py -b -v``` +```/opt/rocm/share/amd_smi/tests/python_unittest/unit_tests.py -b -v``` ex.
Click for example: Unittest Run: Silence stdout (print statements) and run all tests ~~~shell -/opt/rocm/share/amd_smi/tests/pytest/unit_tests.py -b -v +/opt/rocm/share/amd_smi/tests/python_unittest/unit_tests.py -b -v test_check_res (__main__.TestAmdSmiPythonBDF) ... ok test_format_bdf (__main__.TestAmdSmiPythonBDF) ... ok test_parse_bdf (__main__.TestAmdSmiPythonBDF) ... ok @@ -477,16 +440,16 @@ OK ## Pytest Run Options ### Pytest: List tests -```python3 -m pytest -p no:cacheprovider /opt/rocm/share/amd_smi/tests/pytest/integration_test.py --co``` +```python3 -m pytest -p no:cacheprovider /opt/rocm/share/amd_smi/tests/python_unittest/integration_test.py --co``` -```python3 -m pytest -p no:cacheprovider /opt/rocm/share/amd_smi/tests/pytest/unit_tests.py --co``` +```python3 -m pytest -p no:cacheprovider /opt/rocm/share/amd_smi/tests/python_unittest/unit_tests.py --co``` ex.
Click for example: Pytest: List tests ~~~shell -python3 -m pytest -p no:cacheprovider /opt/rocm/share/amd_smi/tests/pytest/integration_test.py --co +python3 -m pytest -p no:cacheprovider /opt/rocm/share/amd_smi/tests/python_unittest/integration_test.py --co ===================================================== test session starts ===================================================== platform linux -- Python 3.8.10, pytest-8.2.2, pluggy-1.5.0 rootdir: /opt/rocm/share/amd_smi @@ -511,53 +474,53 @@ collected 6 items
### Pytest Run: Verbose on -```python3 -m pytest -p no:cacheprovider /opt/rocm/share/amd_smi/tests/pytest/integration_test.py -v``` +```python3 -m pytest -p no:cacheprovider /opt/rocm/share/amd_smi/tests/python_unittest/integration_test.py -v``` -```python3 -m pytest -p no:cacheprovider /opt/rocm/share/amd_smi/tests/pytest/unit_tests.py -v``` +```python3 -m pytest -p no:cacheprovider /opt/rocm/share/amd_smi/tests/python_unittest/unit_tests.py -v``` ex.
Click for example: Pytest Run: verbose on ~~~shell - python3 -m pytest -p no:cacheprovider /opt/rocm/share/amd_smi/tests/pytest/unit_tests.py -v + python3 -m pytest -p no:cacheprovider /opt/rocm/share/amd_smi/tests/python_unittest/unit_tests.py -v ===================================================== test session starts ===================================================== platform linux -- Python 3.8.10, pytest-8.2.2, pluggy-1.5.0 -- /usr/bin/python3 rootdir: /opt/rocm/share/amd_smi configfile: pyproject.toml collected 3 items -../../opt/rocm/share/amd_smi/tests/pytest/unit_tests.py::TestAmdSmiPythonBDF::test_check_res PASSED [ 33%] -../../opt/rocm/share/amd_smi/tests/pytest/unit_tests.py::TestAmdSmiPythonBDF::test_format_bdf PASSED [ 66%] -../../opt/rocm/share/amd_smi/tests/pytest/unit_tests.py::TestAmdSmiPythonBDF::test_parse_bdf PASSED [100%] +../../opt/rocm/share/amd_smi/tests/python_unittest/unit_tests.py::TestAmdSmiPythonBDF::test_check_res PASSED [ 33%] +../../opt/rocm/share/amd_smi/tests/python_unittest/unit_tests.py::TestAmdSmiPythonBDF::test_format_bdf PASSED [ 66%] +../../opt/rocm/share/amd_smi/tests/python_unittest/unit_tests.py::TestAmdSmiPythonBDF::test_parse_bdf PASSED [100%] ====================================================== 3 passed in 0.04s ====================================================== ~~~
### Pytest Run: Verbose on + stdout (print statements) -```python3 -m pytest -p no:cacheprovider /opt/rocm/share/amd_smi/tests/pytest/integration_test.py -s -v``` +```python3 -m pytest -p no:cacheprovider /opt/rocm/share/amd_smi/tests/python_unittest/integration_test.py -s -v``` -```python3 -m pytest -p no:cacheprovider /opt/rocm/share/amd_smi/tests/pytest/unit_tests.py -s -v``` +```python3 -m pytest -p no:cacheprovider /opt/rocm/share/amd_smi/tests/python_unittest/unit_tests.py -s -v``` ex.
Click for example: Pytest Run: verbose on + stdout (print statements) ~~~shell -python3 -m pytest -p no:cacheprovider /opt/rocm/share/amd_smi/tests/pytest/integration_test.py -s -v +python3 -m pytest -p no:cacheprovider /opt/rocm/share/amd_smi/tests/python_unittest/integration_test.py -s -v ===================================================== test session starts ===================================================== platform linux -- Python 3.8.10, pytest-8.2.2, pluggy-1.5.0 -- /usr/bin/python3 rootdir: /opt/rocm/share/amd_smi configfile: pyproject.toml collected 6 items -../../opt/rocm/share/amd_smi/tests/pytest/integration_test.py::TestAmdSmiInit::test_init PASSED -../../opt/rocm/share/amd_smi/tests/pytest/integration_test.py::TestAmdSmiPythonInterface::test_bad_page_info ###Test amdsmi_get_gpu_bad_page_info +../../opt/rocm/share/amd_smi/tests/python_unittest/integration_test.py::TestAmdSmiInit::test_init PASSED +../../opt/rocm/share/amd_smi/tests/python_unittest/integration_test.py::TestAmdSmiPythonInterface::test_bad_page_info ###Test amdsmi_get_gpu_bad_page_info **** [ERROR] | Test: test_bad_page_info | Caught AmdSmiLibraryException PASSED -../../opt/rocm/share/amd_smi/tests/pytest/integration_test.py::TestAmdSmiPythonInterface::test_bdf_device_id ###Test Processor 0, bdf: 0000:08:00.0 +../../opt/rocm/share/amd_smi/tests/python_unittest/integration_test.py::TestAmdSmiPythonInterface::test_bdf_device_id ###Test Processor 0, bdf: 0000:08:00.0 ###Test amdsmi_get_gpu_vbios_info @@ -584,13 +547,13 @@ PASSED uuid is: 1fff73a3-0000-1000-8075-223e5e64eac1 PASSED -../../opt/rocm/share/amd_smi/tests/pytest/integration_test.py::TestAmdSmiPythonInterface::test_ecc ###Test Processor 0, bdf: 0000:08:00.0 +../../opt/rocm/share/amd_smi/tests/python_unittest/integration_test.py::TestAmdSmiPythonInterface::test_ecc ###Test Processor 0, bdf: 0000:08:00.0 ###Test amdsmi_get_gpu_ras_feature_info **** [ERROR] | Test: test_ecc | Caught AmdSmiLibraryException PASSED 
-../../opt/rocm/share/amd_smi/tests/pytest/integration_test.py::TestAmdSmiPythonInterface::test_gpu_performance ###Test Processor 0, bdf: 0000:08:00.0 +../../opt/rocm/share/amd_smi/tests/python_unittest/integration_test.py::TestAmdSmiPythonInterface::test_gpu_performance ###Test Processor 0, bdf: 0000:08:00.0 ###Test amdsmi_get_gpu_activity engine_usage['gfx_activity'] is: 1 % @@ -725,7 +688,7 @@ PASSED pcie_info['pcie_metric']['pcie_nak_sent_count'] is: N/A pcie_info['pcie_metric']['pcie_nak_received_count'] is: N/A PASSED -../../opt/rocm/share/amd_smi/tests/pytest/integration_test.py::TestAmdSmiPythonInterface::test_walkthrough ###Test amdsmi_get_processor_handles() +../../opt/rocm/share/amd_smi/tests/python_unittest/integration_test.py::TestAmdSmiPythonInterface::test_walkthrough ###Test amdsmi_get_processor_handles() ###Test amdsmi_get_gpu_device_bdf() | START walk_through | processor i = 0 ###Test Processor 0, bdf: 0000:08:00.0 @@ -871,27 +834,27 @@ PASSED ### Pytest Run: Verbose on + Filter (or exclude) a Test Use [Pytest: List tests](###-Pytest:-List-tests) then either exclude (with "not") or only run the specified test. -```python3 -m pytest -p no:cacheprovider /opt/rocm/share/amd_smi/tests/pytest/integration_test.py -k "test_gpu_performance" -v``` +```python3 -m pytest -p no:cacheprovider /opt/rocm/share/amd_smi/tests/python_unittest/integration_test.py -k "test_gpu_performance" -v``` -```python3 -m pytest -p no:cacheprovider /opt/rocm/share/amd_smi/tests/pytest/integration_test.py -k "not test_gpu_performance" -v``` +```python3 -m pytest -p no:cacheprovider /opt/rocm/share/amd_smi/tests/python_unittest/integration_test.py -k "not test_gpu_performance" -v``` ex.
Click for example: Pytest Run: Verbose on + Filter (or exclude) a Test ~~~shell -python3 -m pytest -p no:cacheprovider /opt/rocm/share/amd_smi/tests/pytest/integration_test.py -k "not test_gpu_performance" -v +python3 -m pytest -p no:cacheprovider /opt/rocm/share/amd_smi/tests/python_unittest/integration_test.py -k "not test_gpu_performance" -v ===================================================== test session starts ===================================================== platform linux -- Python 3.8.10, pytest-8.2.2, pluggy-1.5.0 -- /usr/bin/python3 rootdir: /opt/rocm/share/amd_smi configfile: pyproject.toml collected 6 items / 1 deselected / 5 selected -../../opt/rocm/share/amd_smi/tests/pytest/integration_test.py::TestAmdSmiInit::test_init PASSED [ 20%] -../../opt/rocm/share/amd_smi/tests/pytest/integration_test.py::TestAmdSmiPythonInterface::test_bad_page_info PASSED [ 40%] -../../opt/rocm/share/amd_smi/tests/pytest/integration_test.py::TestAmdSmiPythonInterface::test_bdf_device_id PASSED [ 60%] -../../opt/rocm/share/amd_smi/tests/pytest/integration_test.py::TestAmdSmiPythonInterface::test_ecc PASSED [ 80%] -../../opt/rocm/share/amd_smi/tests/pytest/integration_test.py::TestAmdSmiPythonInterface::test_walkthrough PASSED [100%] +../../opt/rocm/share/amd_smi/tests/python_unittest/integration_test.py::TestAmdSmiInit::test_init PASSED [ 20%] +../../opt/rocm/share/amd_smi/tests/python_unittest/integration_test.py::TestAmdSmiPythonInterface::test_bad_page_info PASSED [ 40%] +../../opt/rocm/share/amd_smi/tests/python_unittest/integration_test.py::TestAmdSmiPythonInterface::test_bdf_device_id PASSED [ 60%] +../../opt/rocm/share/amd_smi/tests/python_unittest/integration_test.py::TestAmdSmiPythonInterface::test_ecc PASSED [ 80%] +../../opt/rocm/share/amd_smi/tests/python_unittest/integration_test.py::TestAmdSmiPythonInterface::test_walkthrough PASSED [100%] =============================================== 5 passed, 1 deselected in 0.09s 
=============================================== ~~~ @@ -902,14 +865,14 @@ collected 6 items / 1 deselected / 5 selected Please refer to Python's UnitTest documentation for better overview of commands to run. ```shell -python3 /opt/rocm/share/amd_smi/tests/pytest/unit_tests.py -v +python3 /opt/rocm/share/amd_smi/tests/python_unittest/unit_tests.py -v test_check_res (tests.amd_smi_test.py-test.unit_tests.TestAmdSmiPythonBDF) ... ok test_format_bdf (tests.amd_smi_test.py-test.unit_tests.TestAmdSmiPythonBDF) ... ok test_parse_bdf (tests.amd_smi_test.py-test.unit_tests.TestAmdSmiPythonBDF) ... ok ``` ```shell -python3 /opt/rocm/share/amd_smi/tests/pytest/integration_test.py -v +python3 /opt/rocm/share/amd_smi/tests/python_unittest/integration_test.py -v test_init (__main__.TestAmdSmiInit) ... ok test_bad_page_info (__main__.TestAmdSmiPythonInterface) ... ###Test amdsmi_get_gpu_bad_page_info @@ -1229,7 +1192,7 @@ OK ``` ```shell -(Tue Jul-7 12:07:47am)-(CPU 0.3%:0:Net 18)-(charpoag@mlsetools2:/opt/rocm/share/amd_smi/tests/pytest)-(44K:3) +(Tue Jul-7 12:07:47am)-(CPU 0.3%:0:Net 18)-(charpoag@mlsetools2:/opt/rocm/share/amd_smi/tests/python_unittest)-(44K:3) > python3 -m pytest -s -ra -vvv -p no:cacheprovider ==================================== test session starts ===================================== platform linux -- Python 3.8.10, pytest-8.2.2, pluggy-1.5.0 -- /usr/bin/python3 @@ -1553,7 +1516,7 @@ PASSED ``` ```shell -$ python3 /opt/rocm/share/amd_smi/tests/pytest/integration_test.py -k "*test_init" -vvv +$ python3 /opt/rocm/share/amd_smi/tests/python_unittest/integration_test.py -k "*test_init" -vvv test_init (__main__.TestAmdSmiInit) ... 
ok ---------------------------------------------------------------------- @@ -1564,7 +1527,7 @@ OK ``` ```shell -(Tue Jul-7 12:10:10am)-(CPU 0.3%:0:Net 16)-(charpoag@mlsetools2:/opt/rocm/share/amd_smi/tests/pytest)-(44K:3) +(Tue Jul-7 12:10:10am)-(CPU 0.3%:0:Net 16)-(charpoag@mlsetools2:/opt/rocm/share/amd_smi/tests/python_unittest)-(44K:3) > python3 -m pytest -ra -vvv -p no:cacheprovider ==================================== test session starts ===================================== platform linux -- Python 3.8.10, pytest-8.2.2, pluggy-1.5.0 -- /usr/bin/python3 diff --git a/pytest/__init__.py b/tests/python_unittest/__init__.py similarity index 100% rename from pytest/__init__.py rename to tests/python_unittest/__init__.py diff --git a/pytest/integration_test.py b/tests/python_unittest/integration_test.py similarity index 98% rename from pytest/integration_test.py rename to tests/python_unittest/integration_test.py index 71de1f7114..7c829e87bc 100755 --- a/pytest/integration_test.py +++ b/tests/python_unittest/integration_test.py @@ -509,6 +509,14 @@ def walk_through(self): asic_info['asic_serial'])) print(" asic_info['oam_id'] is: {}\n".format( asic_info['oam_id'])) + print(" asic_info['target_graphics_version'] is: {}\n".format( + asic_info['target_graphics_version'])) + print("\n###Test amdsmi_get_gpu_kfd_info \n") + kfd_info = amdsmi.amdsmi_get_gpu_kfd_info(processors[i]) + print(" kfd_info['kfd_id'] is: {}\n".format( + kfd_info['kfd_id'])) + print(" kfd_info['node_id'] is: {}\n".format( + kfd_info['node_id'])) print("###Test amdsmi_get_power_cap_info \n") power_info = amdsmi.amdsmi_get_power_cap_info(processors[i]) print(" power_info['dpm_cap'] is: {}".format( diff --git a/pytest/unit_tests.py b/tests/python_unittest/unit_tests.py similarity index 100% rename from pytest/unit_tests.py rename to tests/python_unittest/unit_tests.py diff --git a/third_party/shared_mutex/shared_mutex.cc b/third_party/shared_mutex/shared_mutex.cc old mode 100755 new mode 100644 
index 54b0584887..97426cbd85 --- a/third_party/shared_mutex/shared_mutex.cc +++ b/third_party/shared_mutex/shared_mutex.cc @@ -21,6 +21,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ + #include "shared_mutex.h" // NOLINT(build/include) #include // errno, ENOENT #include // O_RDWR, O_CREATE diff --git a/third_party/shared_mutex/shared_mutex.h b/third_party/shared_mutex/shared_mutex.h old mode 100755 new mode 100644 index d04d1d0984..0d83c11826 --- a/third_party/shared_mutex/shared_mutex.h +++ b/third_party/shared_mutex/shared_mutex.h @@ -1,5 +1,5 @@ /* -Modifications Copyright © 2019 – 2020 Advanced Micro Devices, Inc. All Rights +Modifications Copyright 2019 - 2024 Advanced Micro Devices, Inc. All Rights Reserved. Copyright (c) 2018 Oleg Yamnikov diff --git a/tools/amdsmi_quick_start.py b/tools/amdsmi_quick_start.py index 2108e29f91..4cec5be4a1 100644 --- a/tools/amdsmi_quick_start.py +++ b/tools/amdsmi_quick_start.py @@ -23,15 +23,13 @@ # This is not meant to serve best practices for development. 
# Run this post install with python3 -i quick_start.py - -from amdsmi import * -from pathlib import Path - import atexit import logging import signal import sys +from amdsmi import * +from pathlib import Path # Make exit & quit work without parens because it's annoying type(exit).__repr__ = sys.exit @@ -45,6 +43,8 @@ signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGTERM, signal_handler) atexit.register(amdsmi_shut_down) -devices = amdsmi_get_processor_handles() +gpus = amdsmi_get_processor_handles() +cpus = amdsmi_get_cpusocket_handles() -print(f"devices variable populated with:{devices}") +print(f"gpus variable populated with:{gpus}") +print(f"cpus variable populated with:{cpus}") diff --git a/tools/generator.py b/tools/generator.py index d9b6459d6e..dd32cc03dc 100644 --- a/tools/generator.py +++ b/tools/generator.py @@ -1,5 +1,5 @@ # -# Copyright (C) 2023 Advanced Micro Devices. All rights reserved. +# Copyright (C) 2024 Advanced Micro Devices. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy of # this software and associated documentation files (the "Software"), to deal in @@ -31,7 +31,7 @@ from ctypeslib.clang2py import main as clangToPy HEADER = \ """ # -# Copyright (C) 2023 Advanced Micro Devices. All rights reserved. +# Copyright (C) 2024 Advanced Micro Devices. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy of # this software and associated documentation files (the "Software"), to deal in @@ -70,12 +70,23 @@ def parseArgument(): return args['output'], args['input'], args['library'], args['extra_args'] -def replace_line(full_path_file_name, string_to_repalce, new_string): +def replace_line(full_path_file_name, string_to_replace, new_string): + """ + Replaces a specific string in a file with a new string. + + Args: + full_path_file_name (str): The full path of the file to modify. + string_to_replace (str): The string to be replaced. 
+ new_string (str): The new string to replace the old string with. + + Returns: + None + """ fh, abs_path = tempfile.mkstemp() with os.fdopen(fh, 'w') as new_file: with open(full_path_file_name, 'r+', encoding='UTF-8') as old_file: for line in old_file: - new_file.write(line.replace(string_to_repalce, new_string)) + new_file.write(line.replace(string_to_replace, new_string)) shutil.copymode(full_path_file_name, abs_path) os.remove(full_path_file_name) @@ -111,7 +122,7 @@ def main(): os_platform = platform.system() if os_platform == "Windows": - clang_include_dir += "\include" + clang_include_dir += "\\include" if "Program Files(x86)" in clang_include_dir: clang_include_dir = clang_include_dir.replace("Program Files(x86)", "Progra~2") elif "Program Files" in clang_include_dir: @@ -179,8 +190,21 @@ except OSError as error: struct_amdsmi_bdf_t_line = "'struct_amdsmi_bdf_t'," replace_line(output_file, struct_anon_all_line, struct_amdsmi_bdf_t_line) - struct_anon_all_line = f"amdsmi.h:{line_number}:3)', " - replace_line(output_file, struct_anon_all_line, "") + struct_anon_all_line = ", 'struct_struct" + replace_line(output_file, struct_anon_all_line, ",") + + struct_anon_all_line = "(anonymous at " + struct_amdsmi_bdf_t_line = "'struct_amdsmi_bdf_t'," + replace_line(output_file, struct_anon_all_line, struct_amdsmi_bdf_t_line) + + struct_anon_all_line_to_remove = f"amdsmi.h:{line_number}:3)', " + replace_line(output_file, struct_anon_all_line_to_remove, "") + + # Custom handling to ensure amdsmi_get_utilization_count doesn't multiply the struct by 0 + print(f"Replacing amdsmi_get_utilization_count line in {output_file}") + utilization_count_line_bad = "amdsmi_get_utilization_count.argtypes = [amdsmi_processor_handle, struct_amdsmi_utilization_counter_t * 0, uint32_t, ctypes.POINTER(ctypes.c_uint64)]" + utilization_count_line_good = "amdsmi_get_utilization_count.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_utilization_counter_t), uint32_t, 
ctypes.POINTER(ctypes.c_uint64)]" + replace_line(output_file, utilization_count_line_bad, utilization_count_line_good) if __name__ == "__main__": main()