diff --git a/projects/amdsmi/.azuredevops/rocm-ci.yml b/projects/amdsmi/.azuredevops/rocm-ci.yml new file mode 100644 index 0000000000..7b4d6732c9 --- /dev/null +++ b/projects/amdsmi/.azuredevops/rocm-ci.yml @@ -0,0 +1,42 @@ +resources: + repositories: + - repository: pipelines_repo + type: github + endpoint: ROCm + name: ROCm/ROCm + +variables: +- group: common +- template: /.azuredevops/variables-global.yml@pipelines_repo + +trigger: + batch: true + branches: + include: + - amd-staging + - amd-mainline + paths: + exclude: + - .github + - docs + - '.*.y*ml' + - '*.md' + - LICENSE + +pr: + autoCancel: true + branches: + include: + - amd-staging + - amd-mainline + paths: + exclude: + - .github + - docs + - '.*.y*ml' + - '*.md' + - LICENSE + drafts: false + +jobs: + - template: ${{ variables.CI_COMPONENT_PATH }}/amdsmi.yml@pipelines_repo diff --git a/projects/amdsmi/.clang-format b/projects/amdsmi/.clang-format new file mode 100644 index 0000000000..0bca7eb76f --- /dev/null +++ b/projects/amdsmi/.clang-format @@ -0,0 +1,4 @@ +--- +Language: Cpp +BasedOnStyle: Google +ColumnLimit: 100 diff --git a/projects/amdsmi/.clang-tidy b/projects/amdsmi/.clang-tidy new file mode 100644 index 0000000000..45d113dba5 --- /dev/null +++ b/projects/amdsmi/.clang-tidy @@ -0,0 +1,33 @@ +Checks: + bugprone*, + clang-analyzer*, + google*, + misc*, + modernize*, + -abseil*, + -bugprone-easily-swappable-parameters, + -bugprone-reserved-identifier, + -clang-analyzer-security.insecureAPI.strcpy, + -clang-diagnostic-sign-conversion, + -clang-diagnostic-unused-parameter, + -cppcoreguidelines*, + -cppcoreguidelines-pro*, + -google-readability*, + -google-runtime-int, + -misc-const-correctness, + -misc-include-cleaner, + -misc-non-copyable-objects, + -misc-unused-parameters, + -misc-use-anonymous-namespace, + -misc-use-internal-linkage, + -modernize-avoid-c-arrays, + -modernize-macro-to-enum, + -modernize-redundant-void-arg, + -modernize-use-auto, + -modernize-use-nodiscard, + 
-modernize-use-noexcept, + -modernize-use-nullptr, + -modernize-use-trailing-return-type, + -modernize-use-using, + -performance*, + -readability*, diff --git a/projects/amdsmi/.clangd b/projects/amdsmi/.clangd new file mode 100644 index 0000000000..a4a37ac229 --- /dev/null +++ b/projects/amdsmi/.clangd @@ -0,0 +1,42 @@ +CompileFlags: + Remove: -W* + Add: [-Wall, -Wno-c++20-designator, -pedantic, -Wno-sign-conversion] + Compiler: clang++ + +# list here: https://clang.llvm.org/extra/clang-tidy/checks/list.html +Diagnostics: + UnusedIncludes: Strict + # rules below are copied into .clang-tidy using ./.update-clang-tidy.sh + # please keep the rules sorted alphabetically + ClangTidy: + Add: [ + bugprone*, + clang-analyzer*, + google*, + misc*, + modernize*, + ] + Remove: [ + abseil*, + bugprone-easily-swappable-parameters, + bugprone-reserved-identifier, + cppcoreguidelines*, + cppcoreguidelines-pro*, + google-readability*, + google-runtime-int, + misc-const-correctness, + misc-include-cleaner, + misc-non-copyable-objects, + misc-unused-parameters, + misc-use-anonymous-namespace, + modernize-avoid-c-arrays, + modernize-redundant-void-arg, + modernize-use-auto, + modernize-use-nodiscard, + modernize-use-noexcept, + modernize-use-nullptr, + modernize-use-trailing-return-type, + modernize-use-using, + performance*, + readability*, + ] diff --git a/projects/amdsmi/.cmake-format b/projects/amdsmi/.cmake-format new file mode 100644 index 0000000000..8c0b3659ef --- /dev/null +++ b/projects/amdsmi/.cmake-format @@ -0,0 +1,253 @@ +# ---------------------------------- +# Options affecting listfile parsing +# ---------------------------------- +with section("parse"): + + # Specify structure for custom cmake functions + additional_commands = { + 'parse_version': { + 'kwargs': { + 'VERSION_STRING': '*' + } + }, + 'get_version_from_tag': { + 'kwargs': { + 'DEFAULT_VERSION_STRING': '*', + 'VERSION_PREFIX': '*', + 'GIT': '*' + } + } + } + + # Override configurations per-command where 
available + override_spec = {} + + # Specify variable tags. + vartags = [] + + # Specify property tags. + proptags = [] + +# ----------------------------- +# Options affecting formatting. +# ----------------------------- +with section("format"): + + # Disable formatting entirely, making cmake-format a no-op + #disable = True + + # How wide to allow formatted cmake files + line_width = 120 + + # How many spaces to tab for indent + tab_size = 4 + + # If true, lines are indented using tab characters (utf-8 0x09) instead of + # <tab_size> space characters (utf-8 0x20). In cases where the layout would + # require a fractional tab character, the behavior of the fractional + # indentation is governed by <fractional_tab_policy> + use_tabchars = False + + # If <use_tabchars> is True, then the value of this variable indicates how + # fractional indentions are handled during whitespace replacement. If set to + # 'use-space', fractional indentation is left as spaces (utf-8 0x20). If set + # to `round-up` fractional indentation is replaced with a single tab character + # (utf-8 0x09) effectively shifting the column to the next tabstop + fractional_tab_policy = 'use-space' + + # If an argument group contains more than this many sub-groups (parg or kwarg + # groups) then force it to a vertical layout. + max_subgroups_hwrap = 3 + + # If a positional argument group contains more than this many arguments, then + # force it to a vertical layout. + max_pargs_hwrap = 6 + + # If a cmdline positional group consumes more than this many lines without + # nesting, then invalidate the layout (and nest) + max_rows_cmdline = 2 + + # If true, separate flow control names from their parentheses with a space + separate_ctrl_name_with_space = False + + # If true, separate function names from parentheses with a space + separate_fn_name_with_space = False + + # If a statement is wrapped to more than one line, then dangle the closing + # parenthesis on its own line. 
+ dangle_parens = False + + # If the trailing parenthesis must be 'dangled' on its own line, then align it + # to this reference: `prefix`: the start of the statement, `prefix-indent`: + # the start of the statement, plus one indentation level, `child`: align to + # the column of the arguments + dangle_align = 'prefix' + + # If the statement spelling length (including space and parenthesis) is + # smaller than this amount, then force reject nested layouts. + min_prefix_chars = 4 + + # If the statement spelling length (including space and parenthesis) is larger + # than the tab width by more than this amount, then force reject un-nested + # layouts. + max_prefix_chars = 10 + + # If a candidate layout is wrapped horizontally but it exceeds this many + # lines, then reject the layout. + max_lines_hwrap = 2 + + # What style line endings to use in the output. + line_ending = 'unix' + + # Format command names consistently as 'lower' or 'upper' case + command_case = 'canonical' + + # Format keywords consistently as 'lower' or 'upper' case + keyword_case = 'unchanged' + + # A list of command names which should always be wrapped + always_wrap = ['install'] + + # If true, the argument lists which are known to be sortable will be sorted + # lexicographically + enable_sort = True + + # If true, the parsers may infer whether or not an argument list is sortable + # (without annotation). + autosort = False + + # By default, if cmake-format cannot successfully fit everything into the + # desired linewidth it will apply the last, most aggressive attempt that it + # made. If this flag is True, however, cmake-format will print an error, exit + # with non-zero status code, and write-out nothing + require_valid_layout = False + + # A dictionary mapping layout nodes to a list of wrap decisions. See the + # documentation for more information. + layout_passes = {} + +# ------------------------------------------------ +# Options affecting comment reflow and formatting. 
+# ------------------------------------------------ +with section("markup"): + + # What character to use for bulleted lists + bullet_char = '*' + + # What character to use as punctuation after numerals in an enumerated list + enum_char = '.' + + # If comment markup is enabled, don't reflow the first comment block in each + # listfile. Use this to preserve formatting of your copyright/license + # statements. + first_comment_is_literal = False + + # If comment markup is enabled, don't reflow any comment block which matches + # this (regex) pattern. Default is `None` (disabled). + literal_comment_pattern = None + + # Regular expression to match preformat fences in comments default= + # ``r'^\s*([`~]{3}[`~]*)(.*)$'`` + fence_pattern = '^\\s*([`~]{3}[`~]*)(.*)$' + + # Regular expression to match rulers in comments default= + # ``r'^\s*[^\w\s]{3}.*[^\w\s]{3}$'`` + ruler_pattern = '^\\s*[^\\w\\s]{3}.*[^\\w\\s]{3}$' + + # If a comment line starts with this pattern then it is explicitly a + # trailing comment for the preceding argument. Default is '#<' + explicit_trailing_pattern = '#<' + + # If a comment line starts with at least this many consecutive hash + # characters, then don't lstrip() them off. 
This allows for lazy hash rulers + # where the first hash char is not separated by space + hashruler_min_length = 10 + + # If true, then insert a space between the first hash char and remaining hash + # chars in a hash ruler, and normalize its length to fill the column + canonicalize_hashrulers = True + + # enable comment markup parsing and reflow + enable_markup = False + +# ---------------------------- +# Options affecting the linter +# ---------------------------- +with section("lint"): + + # a list of lint codes to disable + disabled_codes = ['C0307', 'C0301', 'C0305'] + + # regular expression pattern describing valid function names + function_pattern = '[0-9a-z_]+' + + # regular expression pattern describing valid macro names + macro_pattern = '[0-9A-Z_]+' + + # regular expression pattern describing valid names for variables with global + # (cache) scope + global_var_pattern = '[A-Z][0-9A-Z_]+' + + # regular expression pattern describing valid names for variables with global + # scope (but internal semantic) + internal_var_pattern = '_[A-Z][0-9A-Z_]+' + + # regular expression pattern describing valid names for variables with local + # scope + local_var_pattern = '[a-z][a-z0-9_]+' + + # regular expression pattern describing valid names for private directory + # variables + private_var_pattern = '_[0-9a-z_]+' + + # regular expression pattern describing valid names for public directory + # variables + public_var_pattern = '[A-Z][0-9A-Z_]+' + + # regular expression pattern describing valid names for function/macro + # arguments and loop variables. + argument_var_pattern = '[a-z][a-z0-9_]+' + + # regular expression pattern describing valid names for keywords used in + # functions or macros + keyword_pattern = '[A-Z][0-9A-Z_]+' + + # In the heuristic for C0201, how many conditionals to match within a loop + # before considering the loop a parser. 
+ max_conditionals_custom_parser = 2 + + # Require at least this many newlines between statements + min_statement_spacing = 1 + + # Require no more than this many newlines between statements + max_statement_spacing = 2 + max_returns = 6 + max_branches = 12 + max_arguments = 5 + max_localvars = 15 + max_statements = 50 + +# ------------------------------- +# Options affecting file encoding +# ------------------------------- +with section("encode"): + + # If true, emit the unicode byte-order mark (BOM) at the start of the file + emit_byteorder_mark = False + + # Specify the encoding of the input file. Defaults to utf-8 + input_encoding = 'utf-8' + + # Specify the encoding of the output file. Defaults to utf-8. Note that cmake + # only claims to support utf-8 so be careful when using anything else + output_encoding = 'utf-8' + +# ------------------------------------- +# Miscellaneous configurations options. +# ------------------------------------- +with section("misc"): + + # A dictionary containing any per-command configuration overrides. Currently + # only `command_case` is supported. 
+ per_command = {} + diff --git a/projects/amdsmi/.editorconfig b/projects/amdsmi/.editorconfig new file mode 100644 index 0000000000..64cc5d6985 --- /dev/null +++ b/projects/amdsmi/.editorconfig @@ -0,0 +1,16 @@ +# EditorConfig standardizes spacing in all editors: https://EditorConfig.org +# Please get a plugin for your editor to match the formatting + +# top-most EditorConfig file +root = true + +[*.py] +indent_style = space + +# Matches multiple files with brace expansion notation +# Set default charset +[*.{c,cc,cpp,h,hh,hpp}] +charset = utf-8 +indent_style = space +indent_size = 2 +max_line_length = 100 diff --git a/projects/amdsmi/.github/CODEOWNERS b/projects/amdsmi/.github/CODEOWNERS new file mode 100644 index 0000000000..d160084c6c --- /dev/null +++ b/projects/amdsmi/.github/CODEOWNERS @@ -0,0 +1,6 @@ +* @maisarif_amdeng @shuzhliu_amdeng @dgalants_amdeng @charpoag_amdeng @daolivei_amdeng @marifamd @bill-shuzhou-liu @dmitrii-galantsev @charis-poag-amd @oliveiradan @gabrpham_amdeng + +docs/* @ROCm/rocm-documentation +*.md @ROCm/rocm-documentation +*.rst @ROCm/rocm-documentation + diff --git a/projects/amdsmi/.github/CONTRIBUTING.md b/projects/amdsmi/.github/CONTRIBUTING.md new file mode 100644 index 0000000000..3b2c2f4f31 --- /dev/null +++ b/projects/amdsmi/.github/CONTRIBUTING.md @@ -0,0 +1,84 @@ +# Contributing to AMD SMI # + +We welcome contributions to AMD SMI. +Please follow these details to help ensure your contributions will be successfully accepted. + +## Issue Discussion ## + +Please use the GitHub Issues tab to notify us of issues. + +* Use your best judgement for issue creation. If your issue is already listed, upvote the issue and + comment or post to provide additional details, such as how you reproduced this issue. +* If you're not sure if your issue is the same, err on the side of caution and file your issue. + You can add a comment to include the issue number (and link) for the similar issue. 
If we evaluate + your issue as being the same as the existing issue, we'll close the duplicate. +* If your issue doesn't exist, use the issue template to file a new issue. + * When filing an issue, be sure to provide as much information as possible, including script output so + we can collect information about your configuration. This helps reduce the time required to + reproduce your issue. + * Check your issue regularly, as we may require additional information to successfully reproduce the + issue. +* You may also open an issue to ask questions to the maintainers about whether a proposed change + meets the acceptance criteria, or to discuss an idea pertaining to the library. + +## Acceptance Criteria ## + +The goal of the AMD SMI project is to provide a simple CLI interface and a library +for interacting with AMD GPUs. + +## Coding Style ## + +Please refer to `.clang-format`. It is suggested you use the `pre-commit` tool. +It mostly follows Google C++ formatting with a 100-character line limit. + +## Pull Request Guidelines ## + +When you create a pull request, you should target the default branch. Our +current default branch is the **amd-staging** branch, which serves as our +integration branch. + +### Deliverables ### + +For each new file in the repository, +please include the licensing header + + /* + * ============================================================================= + * Copyright (c) 2019-2025 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ + +### Process ### + +* Reviewers are listed in the CODEOWNERS file +* Code format guidelines + +AMD SMI uses the clang-format tool for formatting code in source files. +The formatting style is captured in .clang-format which is located at +the root of AMD SMI. These are different options to follow: + + 1. Using pre-commit and docker - `pre-commit run` + 1. Using only clang-format - `clang-format -i \` + +## References ## + +1. [pre-commit](https://github.com/pre-commit/pre-commit) +1. 
[clang-format](https://clang.llvm.org/docs/ClangFormat.html) diff --git a/projects/amdsmi/.github/dependabot.yml b/projects/amdsmi/.github/dependabot.yml new file mode 100644 index 0000000000..1cb3cd99b9 --- /dev/null +++ b/projects/amdsmi/.github/dependabot.yml @@ -0,0 +1,18 @@ +# To get started with Dependabot version updates, you'll need to specify which +# package ecosystems to update and where the package manifests are located. +# Please see the documentation for all configuration options: +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates + +version: 2 +updates: + - package-ecosystem: "pip" # See documentation for possible values + directory: "/docs/sphinx" # Location of package manifests + open-pull-requests-limit: 10 + schedule: + interval: "weekly" + target-branch: "amd-staging" + labels: + - "documentation" + - "dependencies" + reviewers: + - "petepark_amdeng" diff --git a/projects/amdsmi/.github/palamida.yml b/projects/amdsmi/.github/palamida.yml new file mode 100644 index 0000000000..47bd57a5ab --- /dev/null +++ b/projects/amdsmi/.github/palamida.yml @@ -0,0 +1,5 @@ +disabled: false +scmId: gh-emu-rocm +branchesToScan: + - amd-staging + - amd-mainline \ No newline at end of file diff --git a/projects/amdsmi/.github/workflows/abi-compliance-check.yml b/projects/amdsmi/.github/workflows/abi-compliance-check.yml new file mode 100644 index 0000000000..f36551fb6c --- /dev/null +++ b/projects/amdsmi/.github/workflows/abi-compliance-check.yml @@ -0,0 +1,314 @@ +name: ABI Compliance Check + +on: + pull_request: + branches: + - amd-staging + - release/rocm-rel-* + paths: + - 'include/amd_smi/amdsmi.h' + push: + branches: + - amd-staging + - release/rocm-rel-* + paths: + - 'include/amd_smi/amdsmi.h' + workflow_dispatch: + +permissions: + contents: read + pull-requests: write + +jobs: + major_abi_check: + name: Major ABI Compliance Check + runs-on: AMD-ROCm-Internal-dev1 + steps: + - name: Setup Environment + 
run: | + sudo rm -rf $GITHUB_WORKSPACE/* || true + sudo rm -rf $GITHUB_WORKSPACE/.[!.]* || true + sudo apt-get update -qq + sudo apt-get install -y -qq perl build-essential git universal-ctags + git clone https://github.com/lvc/abi-compliance-checker.git + cd abi-compliance-checker + sudo make install + abi-compliance-checker --version + + - name: Checkout current code (new version) + uses: actions/checkout@v4 + with: + fetch-depth: 0 + ref: ${{ github.event.pull_request.head.sha || github.sha }} + + - name: Fetch base branch for PR + if: github.event_name == 'pull_request' + run: | + echo "Fetching base branch: ${{ github.base_ref }}" + git fetch origin ${{ github.base_ref }}:${{ github.base_ref }} + git branch -a + + - name: Prepare amdsmi.h files for comparison + id: prepare_files + run: | + echo "Preparing amdsmi.h files..." + echo "abi_exit_code=1" > $GITHUB_WORKSPACE/major_abi_status.txt + + if [ -f include/amd_smi/amdsmi.h ]; then + cp include/amd_smi/amdsmi.h amdsmi_new.h + echo "Copied current amdsmi.h to amdsmi_new.h" + else + echo "::error::New amdsmi.h (include/amd_smi/amdsmi.h) not found in current checkout." + touch amdsmi_new.h + exit 0 + fi + + OLD_VERSION_REF="" + V1_NAME_SUFFIX="" + if [[ "${{ github.event_name }}" == "pull_request" ]]; then + OLD_VERSION_REF="${{ github.base_ref }}" + V1_NAME_SUFFIX="base_${{ github.base_ref }}" + echo "Event is Pull Request. Old version source is base branch: ${OLD_VERSION_REF}" + elif [[ "${{ github.event_name }}" == "push" ]]; then + if [[ "${{ github.event.before }}" != "0000000000000000000000000000000000000000" ]]; then + OLD_VERSION_REF="${{ github.event.before }}" + V1_NAME_SUFFIX="before_$(echo ${{ github.event.before }} | cut -c1-7)" + echo "Event is Push. Old version source is commit before push: ${OLD_VERSION_REF}" + else + echo "Push event is for a new branch or forced push. Cannot determine 'old' version." + touch amdsmi_old.h + echo "Created dummy amdsmi_old.h. 
Assuming no ABI breakage as no baseline." + echo "abi_exit_code=0" > $GITHUB_WORKSPACE/major_abi_status.txt + echo "skip_check=true" >> $GITHUB_OUTPUT + exit 0 + fi + else + echo "::warning::Unsupported event type: ${{ github.event_name }}. Cannot determine old version." + touch amdsmi_old.h + echo "abi_exit_code=0" > $GITHUB_WORKSPACE/major_abi_status.txt + echo "skip_check=true" >> $GITHUB_OUTPUT + exit 0 + fi + + echo "Fetching amdsmi.h from ref: $OLD_VERSION_REF (as amdsmi_old.h)" + git show $OLD_VERSION_REF:include/amd_smi/amdsmi.h > amdsmi_old.h 2>/dev/null + if [ $? -ne 0 ] || [ ! -s amdsmi_old.h ]; then + echo "::warning::Failed to fetch 'include/amd_smi/amdsmi.h' from ref '$OLD_VERSION_REF' or file is empty/missing." + echo "Proceeding with an empty amdsmi_old.h. This may result in all symbols reported as 'added'." + echo -n "" > amdsmi_old.h + if [ ! -s amdsmi_new.h ]; then + echo "abi_exit_code=0" > $GITHUB_WORKSPACE/major_abi_status.txt + fi + else + echo "Successfully fetched amdsmi.h from $OLD_VERSION_REF to amdsmi_old.h" + fi + echo "v1_name_suffix=${V1_NAME_SUFFIX}" >> $GITHUB_OUTPUT + echo "skip_check=false" >> $GITHUB_OUTPUT + + - name: Run Major ABI Compliance Check + if: steps.prepare_files.outputs.skip_check == 'false' + run: | + V1_NAME_SUFFIX_CLEAN=$(echo "${{ steps.prepare_files.outputs.v1_name_suffix }}" | tr '/' '-') + V2_NAME_CLEAN=$(echo "${{ github.ref_name || github.head_ref }}" | tr '/' '-') + + echo "Comparing $V1_NAME_SUFFIX_CLEAN (old) with $V2_NAME_CLEAN (new) for Major ABI Check" + abi-compliance-checker -lib amdsmi -old amdsmi_old.h -new amdsmi_new.h -v1 "$V1_NAME_SUFFIX_CLEAN" -v2 "$V2_NAME_CLEAN" -report-path major-abi-report.html && echo "abi_exit_code=0" > $GITHUB_WORKSPACE/major_abi_status.txt + continue-on-error: true + + - name: Display ABI Check Logs (Major) + if: always() && steps.prepare_files.outputs.skip_check == 'false' + run: | + echo "Displaying Major ABI compliance check logs (if any)" + find logs -type f -name 
"*.txt" -exec echo "--- {} ---" \; -exec cat {} \; || echo "No .txt logs found in logs/ directory." + + - name: Label PR on Major ABI Breakage + if: always() && github.event_name == 'pull_request' + run: | + source $GITHUB_WORKSPACE/major_abi_status.txt + if [ "$abi_exit_code" -ne 0 ]; then + echo "Major ABI check failed, adding 'MAJOR ABI BREAKAGE' label to PR #${{ github.event.pull_request.number }}" + gh pr edit ${{ github.event.pull_request.number }} --add-label "MAJOR ABI BREAKAGE" + fi + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Upload Major ABI Report + if: always() + uses: actions/upload-artifact@v4 + with: + name: major-abi-report + path: major-abi-report.html + if-no-files-found: ignore + + - name: Report Major ABI Check Results + if: always() + run: | + echo "Checking Major ABI check exit code..." + source $GITHUB_WORKSPACE/major_abi_status.txt + echo "Major ABI check exit code: $abi_exit_code" + if [ "$abi_exit_code" -ne 0 ]; then + echo "::warning::⚠️ MAJOR ABI BREAKAGE FOUND ⚠️ CHECK \"Run Major ABI Compliance Check\" LOGS OR THE major-abi-report ARTIFACT FOR DETAILS." + else + echo "✅ Major ABI check succeeded." 
+ fi + + minor_abi_check: + name: Minor ABI Compliance Check + runs-on: AMD-ROCm-Internal-dev1 + steps: + - name: Setup Environment + run: | + sudo rm -rf $GITHUB_WORKSPACE/* || true + sudo rm -rf $GITHUB_WORKSPACE/.[!.]* || true + sudo apt-get update -qq + sudo apt-get install -y -qq perl build-essential git universal-ctags + git clone https://github.com/lvc/abi-compliance-checker.git + cd abi-compliance-checker + sudo make install + abi-compliance-checker --version + + - name: Checkout current code (new version) + uses: actions/checkout@v4 + with: + fetch-depth: 0 + ref: ${{ github.event.pull_request.head.sha || github.sha }} + + - name: Fetch base branch for PR + if: github.event_name == 'pull_request' + run: | + echo "Fetching base branch: ${{ github.base_ref }}" + git fetch origin ${{ github.base_ref }}:${{ github.base_ref }} + git branch -a + + - name: Prepare amdsmi.h files for comparison + id: prepare_files_minor + run: | + echo "Preparing amdsmi.h files for Minor check..." + echo "abi_exit_code=1" > $GITHUB_WORKSPACE/minor_abi_status.txt + + if [ -f include/amd_smi/amdsmi.h ]; then + cp include/amd_smi/amdsmi.h amdsmi_new.h + echo "Copied current amdsmi.h to amdsmi_new.h for Minor check" + else + echo "::error::New amdsmi.h (include/amd_smi/amdsmi.h) not found in current checkout for Minor check." + touch amdsmi_new.h + exit 0 + fi + + OLD_VERSION_REF_MINOR="" + V1_NAME_SUFFIX_MINOR="" + if [[ "${{ github.event_name }}" == "pull_request" ]]; then + OLD_VERSION_REF_MINOR="${{ github.base_ref }}" + V1_NAME_SUFFIX_MINOR="base_${{ github.base_ref }}" + elif [[ "${{ github.event_name }}" == "push" ]]; then + if [[ "${{ github.event.before }}" != "0000000000000000000000000000000000000000" ]]; then + OLD_VERSION_REF_MINOR="${{ github.event.before }}" + V1_NAME_SUFFIX_MINOR="before_$(echo ${{ github.event.before }} | cut -c1-7)" + else + echo "Push event is for a new branch (Minor check). Assuming no ABI changes as no baseline." 
+ touch amdsmi_old.h + echo "abi_exit_code=0" > $GITHUB_WORKSPACE/minor_abi_status.txt + echo "skip_check_minor=true" >> $GITHUB_OUTPUT + exit 0 + fi + else + echo "::warning::Unsupported event type for Minor ABI check: ${{ github.event_name }}." + touch amdsmi_old.h + echo "abi_exit_code=0" > $GITHUB_WORKSPACE/minor_abi_status.txt + echo "skip_check_minor=true" >> $GITHUB_OUTPUT + exit 0 + fi + + echo "Fetching amdsmi.h from ref: $OLD_VERSION_REF_MINOR (as amdsmi_old.h) for Minor check" + git show $OLD_VERSION_REF_MINOR:include/amd_smi/amdsmi.h > amdsmi_old.h 2>/dev/null + if [ $? -ne 0 ] || [ ! -s amdsmi_old.h ]; then + echo "::warning::Failed to fetch 'include/amd_smi/amdsmi.h' from ref '$OLD_VERSION_REF_MINOR' or file is empty/missing for Minor check." + echo "Proceeding with an empty amdsmi_old.h for Minor check." + echo -n "" > amdsmi_old.h + if [ ! -s amdsmi_new.h ]; then + echo "abi_exit_code=0" > $GITHUB_WORKSPACE/minor_abi_status.txt + fi + else + echo "Successfully fetched amdsmi.h from $OLD_VERSION_REF_MINOR to amdsmi_old.h for Minor check" + fi + echo "v1_name_suffix_minor=${V1_NAME_SUFFIX_MINOR}" >> $GITHUB_OUTPUT + echo "skip_check_minor=false" >> $GITHUB_OUTPUT + + - name: Run Minor ABI Compliance Check (Strict) + if: steps.prepare_files_minor.outputs.skip_check_minor == 'false' + run: | + V1_NAME_SUFFIX_CLEAN=$(echo "${{ steps.prepare_files_minor.outputs.v1_name_suffix_minor }}" | tr '/' '-') + V2_NAME_CLEAN=$(echo "${{ github.ref_name || github.head_ref }}" | tr '/' '-') + COMPARE_MSG="$V1_NAME_SUFFIX_CLEAN vs $V2_NAME_CLEAN" + + echo "Comparing $COMPARE_MSG for Minor ABI Check (Strict)" + + abi-compliance-checker -lib amdsmi -old amdsmi_old.h -new amdsmi_new.h -v1 "$V1_NAME_SUFFIX_CLEAN" -v2 "$V2_NAME_CLEAN" -report-path minor-abi-report.html -strict || { + ACC_EXIT_CODE=$? + echo "abi-compliance-checker -strict failed with exit code $ACC_EXIT_CODE." 
+ echo "abi_exit_code=$ACC_EXIT_CODE" > $GITHUB_WORKSPACE/minor_abi_status.txt + } + + current_abi_status=$(cat $GITHUB_WORKSPACE/minor_abi_status.txt) + current_exit_code=${current_abi_status#*=} + + if [ "$current_exit_code" -eq 0 ] && [ -f minor-abi-report.html ]; then + echo "ACC strict check passed. Parsing HTML report for any changes..." + CHANGED=0 + if grep -q "Added Symbols.*[1-9]" minor-abi-report.html; then CHANGED=1; echo "::warning::STRICT ABI: Found added symbols"; fi + if grep -q "Removed Symbols.*[1-9]" minor-abi-report.html; then CHANGED=1; echo "::warning::STRICT ABI: Found removed symbols"; fi + if grep -q "Problems with.*Data Types.*[1-9]" minor-abi-report.html; then CHANGED=1; echo "::warning::STRICT ABI: Found problems with data types"; fi + if grep -q "Problems with.*Symbols.*[1-9]" minor-abi-report.html; then CHANGED=1; echo "::warning::STRICT ABI: Found problems with symbols"; fi + if grep -q "Problems with.*Constants.*[1-9]" minor-abi-report.html; then CHANGED=1; echo "::warning::STRICT ABI: Found problems with constants"; fi + + if [ "$CHANGED" -eq 1 ]; then + echo "::error::STRICT ABI CHECK FAILED: Found changes in ABI report comparing $COMPARE_MSG" + echo "abi_exit_code=1" > $GITHUB_WORKSPACE/minor_abi_status.txt + else + echo "No strict ABI changes found in HTML report." + echo "abi_exit_code=0" > $GITHUB_WORKSPACE/minor_abi_status.txt + fi + elif [ ! -f minor-abi-report.html ] && [ "$current_exit_code" -eq 0 ]; then + echo "::warning::Minor ABI report (minor-abi-report.html) not found, but ACC reported success. Assuming no changes." + echo "abi_exit_code=0" > $GITHUB_WORKSPACE/minor_abi_status.txt + elif [ "$current_exit_code" -ne 0 ]; then + echo "ACC strict check already indicated failure (exit code $current_exit_code). HTML parsing for further changes skipped or confirmed failure." 
+ fi + continue-on-error: true + + - name: Display ABI Check Logs (Minor) + if: always() && steps.prepare_files_minor.outputs.skip_check_minor == 'false' + run: | + echo "Displaying Minor ABI compliance check logs (if any)" + find logs -type f -name "*.txt" -exec echo "--- {} ---" \; -exec cat {} \; || echo "No .txt logs found in logs/ directory." + + - name: Label PR on Minor ABI Breakage + if: always() && github.event_name == 'pull_request' + run: | + source $GITHUB_WORKSPACE/minor_abi_status.txt + if [ "$abi_exit_code" -ne 0 ]; then + echo "Minor ABI check failed, adding 'MINOR ABI BREAKAGE' label to PR #${{ github.event.pull_request.number }}" + gh pr edit ${{ github.event.pull_request.number }} --add-label "MINOR ABI BREAKAGE" + fi + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Upload Minor ABI Report + if: always() + uses: actions/upload-artifact@v4 + with: + name: minor-abi-report + path: minor-abi-report.html + if-no-files-found: ignore + + - name: Report Minor ABI Check Results + if: always() + run: | + echo "Checking Minor ABI check exit code..." + source $GITHUB_WORKSPACE/minor_abi_status.txt + echo "Minor ABI check exit code: $abi_exit_code" + if [ "$abi_exit_code" -ne 0 ]; then + echo "::warning::⚠️ MINOR ABI CHANGES FOUND (STRICT CHECK) ⚠️ CHECK \"Run Minor ABI Compliance Check (Strict)\" LOGS OR THE minor-abi-report ARTIFACT FOR DETAILS." + else + echo "✅ Minor ABI check (Strict) succeeded or found no changes." 
+ fi \ No newline at end of file diff --git a/projects/amdsmi/.github/workflows/amdsmi-build.yml b/projects/amdsmi/.github/workflows/amdsmi-build.yml new file mode 100644 index 0000000000..ef8c20bdb2 --- /dev/null +++ b/projects/amdsmi/.github/workflows/amdsmi-build.yml @@ -0,0 +1,836 @@ +name: AMDSMI CI + +on: + pull_request: + branches: [amd-staging, amd-mainline, release/rocm-rel-*] + push: + branches: [amd-staging, amd-mainline, release/rocm-rel-*] + workflow_dispatch: + +permissions: + contents: read +env: + DEBIAN_FRONTEND: noninteractive + DEBCONF_NONINTERACTIVE_SEEN: true + BUILD_TYPE: Release + ROCM_DIR: /opt/rocm + +jobs: + debian-buildinstall: + name: Build + runs-on: + - self-hosted + - ${{ vars.RUNNER_TYPE }} + continue-on-error: true + strategy: + max-parallel: 10 + matrix: + os: [Ubuntu20, Ubuntu22, Debian10] + container: + image: ${{ vars[format('{0}_DOCKER_IMAGE', matrix.os)] }} + options: --rm --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size=64G --cap-add=SYS_MODULE -v /lib/modules:/lib/modules + + steps: + - uses: actions/checkout@v4 + + - name: Update repositories for Debian10 + if: matrix.os == 'Debian10' + run: | + set -e + echo 'Updating repositories for Debian10 (archived)' + cat > /etc/apt/sources.list << EOF + deb http://archive.debian.org/debian buster main + deb http://archive.debian.org/debian-security buster/updates main + EOF + echo 'Acquire::Check-Valid-Until "false";' > /etc/apt/apt.conf.d/99-disable-check-valid-until + apt update + + - name: Build AMDSMI + run: | + set -eo pipefail + echo 'Building on ${{ matrix.os }}' + BUILD_FOLDER=$GITHUB_WORKSPACE/build + RETRIES=3 + + for i in $(seq 1 $RETRIES); do + echo "Build attempt $i for ${{ matrix.os }}..."
+ rm -rf $BUILD_FOLDER + mkdir -p $BUILD_FOLDER + cd $BUILD_FOLDER + + # Capture build output to parse warnings; pipefail (set above) keeps a cmake/make failure from being hidden by tee's exit status + if cmake $GITHUB_WORKSPACE -DBUILD_TESTS=ON -DENABLE_ESMI_LIB=ON 2>&1 | tee cmake.log && \ + make -j $(nproc) 2>&1 | tee make.log && \ + make package 2>&1 | tee package.log; then + + # Parse and report warnings as GitHub annotations (grep exits 1 when nothing matches, hence || true) + echo "::group::Build Warnings" + grep -i "warning" cmake.log make.log package.log | while read -r line; do + echo "::warning::$line" + done || true + echo "::endgroup::" + + echo "Build successful on attempt $i" + break + else + echo "Build failed on attempt $i" + if [ $i -eq $RETRIES ]; then + echo "All $RETRIES build attempts failed. Exiting." + exit 1 + fi + sleep $((2 * i)) + fi + done + echo "Build completed on ${{ matrix.os }}" + + - name: Install AMDSMI + run: | + cd $GITHUB_WORKSPACE/build + if [ "${{ matrix.os }}" != "Debian10" ]; then + apt update + fi + + RETRIES=3 + for i in $(seq 1 $RETRIES); do + echo "Installation attempt $i for ${{ matrix.os }}..." + if apt install -y ./amd-smi-lib*99999-local_amd64.deb; then + echo "Installation successful on attempt $i" + ln -s /opt/rocm/bin/amd-smi /usr/local/bin + + # Verify Installation + echo 'Verifying installation:' + amd-smi version + python3 -m pip list | grep amd + python3 -m pip list | grep pip + python3 -m pip list | grep setuptools + echo 'Completed installation on ${{ matrix.os }}' + break + else + echo "Installation failed on attempt $i" + if [ $i -eq $RETRIES ]; then + echo "All $RETRIES installation attempts failed. Exiting." 
+ exit 1 + fi + sleep $((2 * i)) + fi + done + echo "Installation completed on ${{ matrix.os }}" + + - name: Uninstall + if: always() + run: | + set -e + echo 'Uninstalling on ${{ matrix.os }}' + apt remove -y amd-smi-lib || true + rm -f /usr/local/bin/amd-smi + if [ -d /opt/rocm/share/amd_smi ]; then + echo '/opt/rocm/share/amd_smi exists. Removing.'
+ rm -rf /opt/rocm/share/amd_smi + fi + echo 'Uninstall done on ${{ matrix.os }}' + + debian-test: + name: Tests + needs: debian-buildinstall + runs-on: + - self-hosted + - ${{ vars.RUNNER_TYPE }} + continue-on-error: true + strategy: + max-parallel: 10 + matrix: + os: [Ubuntu20, Ubuntu22, Debian10] + container: + image: ${{ vars[format('{0}_DOCKER_IMAGE', matrix.os)] }} + options: --rm --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size=64G --cap-add=SYS_MODULE -v /lib/modules:/lib/modules + + steps: + - uses: actions/checkout@v4 + + - name: Update repositories for Debian10 + if: matrix.os == 'Debian10' + run: | + set -e + echo 'Updating repositories for Debian10 (archived)' + cat > /etc/apt/sources.list << EOF + deb http://archive.debian.org/debian buster main + deb http://archive.debian.org/debian-security buster/updates main + EOF + echo 'Acquire::Check-Valid-Until "false";' > /etc/apt/apt.conf.d/99-disable-check-valid-until + apt update + + - name: Build and Install for Test + run: | + set -e + echo 'Building for test on ${{ matrix.os }}' + BUILD_FOLDER=$GITHUB_WORKSPACE/build + RETRIES=3 + + for i in $(seq 1 $RETRIES); do + echo "Build attempt $i for ${{ matrix.os }} test..." + rm -rf $BUILD_FOLDER + mkdir -p $BUILD_FOLDER + cd $BUILD_FOLDER + + if cmake $GITHUB_WORKSPACE -DBUILD_TESTS=ON -DENABLE_ESMI_LIB=ON && \ + make -j $(nproc) && \ + make package; then + echo "Build successful on attempt $i" + break + else + echo "Build failed on attempt $i" + if [ $i -eq $RETRIES ]; then + echo "All $RETRIES build attempts failed. Exiting." + exit 1 + fi + sleep $((2 * i)) + fi + done + + echo 'Installing for test on ${{ matrix.os }}' + for i in $(seq 1 $RETRIES); do + echo "Installation attempt $i for test on ${{ matrix.os }}..." 
+ if apt install -y $BUILD_FOLDER/amd-smi-lib*99999-local_amd64.deb; then + echo "Installation successful on attempt $i" + ln -s /opt/rocm/bin/amd-smi /usr/local/bin + echo 'Install done for test on ${{ matrix.os }}' + break + else + echo "Installation failed on attempt $i" + if [ $i -eq $RETRIES ]; then + echo "All $RETRIES installation attempts failed. Exiting." + exit 1 + fi + sleep $((2 * i)) + fi + done + + - name: AMDSMI Command Tests + shell: bash + run: | + set -e + echo "Running AMDSMI commands on ${{ matrix.os }}" + mkdir -p /tmp/test-results-${{ matrix.os }} + commands=( + "amd-smi version" + "amd-smi list" + "amd-smi static" + "amd-smi firmware" + "amd-smi ucode" + "amd-smi bad-pages" + "amd-smi metric" + "amd-smi process" + "amd-smi topology" + "amd-smi monitor" + "amd-smi dmon" + "amd-smi xgmi" + "amd-smi partition" + ) + for cmd in "${commands[@]}"; do + debug_cmd="$cmd --loglevel debug" + echo "Running: $debug_cmd" + if ! eval "$debug_cmd" > /tmp/test-results-${{ matrix.os }}/$(echo $cmd | tr ' ' '_').log 2>&1; then + echo "Command '$debug_cmd' failed." + cat /tmp/test-results-${{ matrix.os }}/$(echo $cmd | tr ' ' '_').log + exit 1 + else + echo "$debug_cmd passed." + fi + done + echo "AMDSMI commands done on ${{ matrix.os }}" + + - name: Upload AMDSMI Command Test Results + if: always() + uses: actions/upload-artifact@v4 + with: + name: amdsmi-command-tests-${{ matrix.os }} + path: /tmp/test-results-${{ matrix.os }} + + - name: Run AMDSMI, Python, and Example Tests + shell: bash + run: | + set -e + echo 'Running other tests on ${{ matrix.os }}' + + # AMDSMI Tests + echo 'Running AMDSMI tests' + cd /opt/rocm/share/amd_smi/tests + source amdsmitst.exclude + + AMDSMI_RETRIES=3 + for attempt in $(seq 1 $AMDSMI_RETRIES); do + echo "AMDSMI test attempt $attempt for ${{ matrix.os }}..." 
+ if ./amdsmitst --gtest_filter="-$(echo ${BLACKLIST_ALL_ASICS})" > /tmp/test-results-${{ matrix.os }}/amdsmi_tests.log 2>&1; then + echo "AMDSMI tests passed on attempt $attempt" + echo "=============== TEST OUTPUT ===============" + cat /tmp/test-results-${{ matrix.os }}/amdsmi_tests.log | grep -E "\[==========\]|\[ PASSED \]|\[ SKIPPED \]|\[ FAILED \]" + echo "==============================================" + echo "AMDSMI tests done" + break + else + TEST_EXIT_CODE=$? + echo "AMDSMI tests failed on attempt $attempt with exit code $TEST_EXIT_CODE" + if [ $attempt -eq $AMDSMI_RETRIES ]; then + echo "All $AMDSMI_RETRIES AMDSMI test attempts failed. Final failure." + echo "=============== TEST OUTPUT ===============" + cat /tmp/test-results-${{ matrix.os }}/amdsmi_tests.log | grep -E "\[==========\]|\[ PASSED \]|\[ SKIPPED \]|\[ FAILED \]" + echo "==============================================" + echo "AMDSMI tests failed" + exit $TEST_EXIT_CODE + else + echo "Retrying AMDSMI tests in $((2 * attempt)) seconds..." + sleep $((2 * attempt)) + fi + fi + done + + # Python Tests + echo 'Running Python tests' + cd /opt/rocm/share/amd_smi/tests/python_unittest + echo "Running integration tests..." + if ! ./integration_test.py -v > /tmp/test-results-${{ matrix.os }}/integration_test_output.txt 2>&1; then + echo "Integration tests failed!" + echo "=============== INTEGRATION TEST OUTPUT ===============" + tail -100 /tmp/test-results-${{ matrix.os }}/integration_test_output.txt + echo "=======================================================" + exit 1 + else + echo "Integration tests passed" + fi + + echo "Running unit tests..." + if ! ./unit_tests.py -v > /tmp/test-results-${{ matrix.os }}/unit_test_output.txt 2>&1; then + echo "Unit tests failed!" 
+ echo "=============== UNIT TEST OUTPUT ===============" + tail -100 /tmp/test-results-${{ matrix.os }}/unit_test_output.txt + echo "================================================" + exit 1 + else + echo "Unit tests passed" + fi + + echo "Python tests done" + + # Example Tests + echo 'Running Example tests' + cd $GITHUB_WORKSPACE/example + rm -rf build + cmake -B build -DENABLE_ESMI_LIB=OFF + make -C build -j $(nproc) + cd build + ./amd_smi_drm_ex > /tmp/test-results-${{ matrix.os }}/amd_smi_drm_ex.log 2>&1 || echo 'amd_smi_drm_ex failed' + ./amd_smi_nodrm_ex > /tmp/test-results-${{ matrix.os }}/amd_smi_nodrm_ex.log 2>&1 || echo 'amd_smi_nodrm_ex failed' + echo "Example tests done" + + - name: AMDSMI Test Results + if: always() + run: | + echo "Displaying AMDSMI test results for ${{ matrix.os }}" + cat /tmp/test-results-${{ matrix.os }}/amdsmi_tests.log || echo "No AMDSMI test results found for ${{ matrix.os }}" + + - name: Integration Test Results + if: always() + run: | + echo "Displaying Integration test results for ${{ matrix.os }}" + cat /tmp/test-results-${{ matrix.os }}/integration_test_output.txt || echo "No integration test results found for ${{ matrix.os }}" + + - name: Unit Test Results + if: always() + run: | + echo "Displaying Unit Test Results for ${{ matrix.os }}" + cat /tmp/test-results-${{ matrix.os }}/unit_test_output.txt || echo "No unit test results found for ${{ matrix.os }}" + + - name: Example DRM Test Results + if: always() + run: | + echo "Displaying Example DRM test results for ${{ matrix.os }}" + cat /tmp/test-results-${{ matrix.os }}/amd_smi_drm_ex.log || echo "No DRM example test results found for ${{ matrix.os }}" + + - name: Example NoDRM Test Results + if: always() + run: | + echo "Displaying Example NoDRM test results for ${{ matrix.os }}" + cat /tmp/test-results-${{ matrix.os }}/amd_smi_nodrm_ex.log || echo "No NoDRM example test results found for ${{ matrix.os }}" + + rpm-buildinstall: + name: Build + runs-on: + - self-hosted + 
- ${{ vars.RUNNER_TYPE }} + continue-on-error: true + strategy: + max-parallel: 10 + matrix: + os: + - SLES + - RHEL8 + - RHEL9 + - RHEL10 + - AzureLinux3 + - AlmaLinux8 + container: + image: ${{ vars[format('{0}_DOCKER_IMAGE', matrix.os)] }} + options: --rm --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size=64G --cap-add=SYS_MODULE -v /lib/modules:/lib/modules + + steps: + - uses: actions/checkout@v4 + + - name: Set PkgMgr + run: | + set -e + case "${{ matrix.os }}" in + SLES) + echo "PACKAGE_MANAGER=zypper" >> $GITHUB_ENV + ;; + RHEL8|RHEL9|RHEL10|AlmaLinux8|AzureLinux3) + echo "PACKAGE_MANAGER=dnf" >> $GITHUB_ENV + ;; + esac + + - name: Add more_itertools + if: matrix.os == 'AzureLinux3' + run: | + set -e + echo 'Installing more_itertools on ${{ matrix.os }}' + python3 -m pip install more_itertools + + - name: Build AMDSMI(RHEL10 & AlmaLinux8) + if: matrix.os == 'RHEL10' || matrix.os == 'AlmaLinux8' + run: | + set -e + echo 'Building on ${{ matrix.os }} with retries and QA_RPATHS' + BUILD_FOLDER=$GITHUB_WORKSPACE/build + RETRIES=5 + + # Set QA_RPATHS to ignore empty (0x0010) and invalid (0x0002) RPATHs + export QA_RPATHS=$((0x0010 | 0x0002)) + + for i in $(seq 1 $RETRIES); do + echo "Build attempt $i for ${{ matrix.os }} ..." + rm -rf $BUILD_FOLDER + mkdir -p $BUILD_FOLDER + cd $BUILD_FOLDER + + if cmake $GITHUB_WORKSPACE -DBUILD_TESTS=ON -DENABLE_ESMI_LIB=ON && \ + make -j $(nproc) && \ + make package; then + echo "Build successful on attempt $i" + break + else + echo "Build failed on attempt $i" + if [ $i -eq $RETRIES ]; then + echo "All $RETRIES build attempts failed. Exiting." 
+ exit 1 + fi + sleep $((2 * i)) + fi + done + echo "Build completed on ${{ matrix.os }}" + + - name: Build AMDSMI + if: matrix.os != 'RHEL10' && matrix.os != 'AlmaLinux8' + run: | + set -eo pipefail + echo 'Building on ${{ matrix.os }}' + BUILD_FOLDER=$GITHUB_WORKSPACE/build + RETRIES=3 + + for i in $(seq 1 $RETRIES); do + echo "Build attempt $i for ${{ matrix.os }}..." + rm -rf $BUILD_FOLDER + mkdir -p $BUILD_FOLDER + cd $BUILD_FOLDER + + # Capture build output to parse warnings; pipefail (set above) keeps a cmake/make failure from being hidden by tee's exit status + if cmake $GITHUB_WORKSPACE -DBUILD_TESTS=ON -DENABLE_ESMI_LIB=ON 2>&1 | tee cmake.log && \ + make -j $(nproc) 2>&1 | tee make.log && \ + make package 2>&1 | tee package.log; then + + # Parse and report warnings as GitHub annotations (grep exits 1 when nothing matches, hence || true) + echo "::group::Build Warnings" + grep -i "warning" cmake.log make.log package.log | while read -r line; do + echo "::warning::$line" + done || true + echo "::endgroup::" + + echo "Build successful on attempt $i" + break + else + echo "Build failed on attempt $i" + if [ $i -eq $RETRIES ]; then + echo "All $RETRIES build attempts failed. Exiting." + exit 1 + fi + sleep $((2 * i)) + fi + done + echo "Build completed on ${{ matrix.os }}" + + - name: Install AMDSMI(RHEL10 & AlmaLinux8) + if: matrix.os == 'RHEL10' || matrix.os == 'AlmaLinux8' + run: | + cd $GITHUB_WORKSPACE/build + dnf install python3-setuptools python3-wheel -y + + RETRIES=3 + for i in $(seq 1 $RETRIES); do + echo "${{ matrix.os }}: Installation attempt $i..." 
+ if timeout 10m dnf install -y --skip-broken --disablerepo=* ./amd-smi-lib-*99999-local*.rpm; then + echo "Installation successful on attempt $i" + ln -s /opt/rocm/bin/amd-smi /usr/local/bin + + echo 'Verifying installation:' + amd-smi version + python3 -m pip list | grep amd + python3 -m pip list | grep pip + python3 -m pip list | grep setuptools + echo 'Completed installation on ${{ matrix.os }}' + break + else + echo "Installation failed on attempt $i" + if [ $i -eq $RETRIES ]; then + echo "All $RETRIES installation attempts failed. Exiting."
+ exit 1 + fi + sleep $((2 * i)) + fi + done + + - name: Install AMDSMI + if: matrix.os != 'RHEL10' && matrix.os != 'AlmaLinux8' + run: | + cd $GITHUB_WORKSPACE/build + case ${{ env.PACKAGE_MANAGER }} in + zypper) + timeout 10m zypper --no-refresh --no-gpg-checks install -y ./amd-smi-lib-*99999-local*.rpm + ;; + dnf) + dnf install python3-setuptools python3-wheel -y + RETRIES=3 + for i in $(seq 1 $RETRIES); do + echo "Attempt $i: Installing AMDSMI package..." + if timeout 10m dnf install -y --skip-broken --disablerepo=* ./amd-smi-lib-*99999-local*.rpm; then + echo "AMDSMI package installed successfully." + break + else + echo "Installation failed on attempt $i. Retrying..." + if [ $i -eq $RETRIES ]; then + echo "All $RETRIES attempts failed. Exiting." + exit 1 + fi + sleep 10 + fi + done + ;; + esac + ln -s /opt/rocm/bin/amd-smi /usr/local/bin + + # Verify Installation + echo 'Verifying installation:' + amd-smi version + python3 -m pip list | grep amd + python3 -m pip list | grep pip + python3 -m pip list | grep setuptools + echo 'Completed installation on ${{ matrix.os }}' + + - name: Uninstall + if: always() + run: | + set -e + echo 'Uninstalling on ${{ matrix.os }}' + case ${{ matrix.os }} in + SLES) + zypper remove -y amd-smi-lib || true + ;; + RHEL8|RHEL9|RHEL10|AlmaLinux8|AzureLinux3) + dnf remove -y amd-smi-lib || true + ;; + esac + rm -f /usr/local/bin/amd-smi + if [ -d /opt/rocm/share/amd_smi ]; then + echo '/opt/rocm/share/amd_smi exists. Removing.' 
+ rm -rf /opt/rocm/share/amd_smi + fi + echo 'Uninstall done on ${{ matrix.os }}' + + rpm-test: + name: Tests + needs: [rpm-buildinstall, debian-test] # debian-test is needed to complete before rpm-test starts (see comment about driver reloads) + runs-on: + - self-hosted + - ${{ vars.RUNNER_TYPE }} + continue-on-error: true + strategy: + max-parallel: 10 + matrix: + os: + - SLES + - RHEL8 + - RHEL9 + - RHEL10 + - AzureLinux3 + - AlmaLinux8 + container: + image: ${{ vars[format('{0}_DOCKER_IMAGE', matrix.os)] }} + options: --rm --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size=64G --cap-add=SYS_MODULE -v /lib/modules:/lib/modules + + steps: + - uses: actions/checkout@v4 + + - name: Set PkgMgr + run: | + set -e + case "${{ matrix.os }}" in + SLES) + echo "PACKAGE_MANAGER=zypper" >> $GITHUB_ENV + ;; + RHEL8|RHEL9|RHEL10|AlmaLinux8|AzureLinux3) + echo "PACKAGE_MANAGER=dnf" >> $GITHUB_ENV + ;; + esac + + - name: Add more_itertools + if: matrix.os == 'AzureLinux3' + run: | + set -e + echo 'Installing more_itertools on ${{ matrix.os }}' + python3 -m pip install more_itertools + + - name: Build and Install for Tests (RHEL10 & AlmaLinux8) + if: matrix.os == 'RHEL10' || matrix.os == 'AlmaLinux8' + run: | + set -e + echo 'Building for test on RHEL10/AlmaLinux8 with retries and QA_RPATHS' + BUILD_FOLDER=$GITHUB_WORKSPACE/build + RETRIES=5 + + # Set QA_RPATHS to ignore empty (0x0010) and invalid (0x0002) RPATHs + export QA_RPATHS=$((0x0010 | 0x0002)) + + for i in $(seq 1 $RETRIES); do + echo "Build attempt $i for RHEL10/AlmaLinux8 test..." 
+ rm -rf $BUILD_FOLDER + mkdir -p $BUILD_FOLDER + cd $BUILD_FOLDER + + if cmake $GITHUB_WORKSPACE -DBUILD_TESTS=ON -DENABLE_ESMI_LIB=ON && \ + make -j $(nproc) && \ + make package; then + echo "Build successful on attempt $i" + break + else + echo "Build failed on attempt $i" + if [ $i -eq $RETRIES ]; then + echo "All $RETRIES build attempts failed. Exiting."
+ exit 1 + fi + sleep $((2 * i)) + fi + done + + echo 'Installing for test on RHEL10/AlmaLinux8' + dnf install python3-setuptools python3-wheel -y + + for i in $(seq 1 $RETRIES); do + echo "RHEL10/AlmaLinux8: Installation attempt $i for test..." + if timeout 10m dnf install -y --skip-broken --disablerepo=* $BUILD_FOLDER/amd-smi-lib-*99999-local*.rpm; then + echo "Installation successful on attempt $i" + ln -s /opt/rocm/bin/amd-smi /usr/local/bin + echo 'Install done for test on RHEL10/AlmaLinux8' + break + else + echo "Installation failed on attempt $i" + if [ $i -eq $RETRIES ]; then + echo "All $RETRIES installation attempts failed. Exiting." + exit 1 + fi + sleep $((2 * i)) + fi + done + + - name: Build and Install for Tests + if: matrix.os != 'RHEL10' && matrix.os != 'AlmaLinux8' + run: | + set -e + echo 'Building for test on ${{ matrix.os }}' + BUILD_FOLDER=$GITHUB_WORKSPACE/build + rm -rf $BUILD_FOLDER + mkdir -p $BUILD_FOLDER + cd $BUILD_FOLDER + cmake $GITHUB_WORKSPACE -DBUILD_TESTS=ON -DENABLE_ESMI_LIB=ON + make -j $(nproc) + make package + + echo 'Installing for test on ${{ matrix.os }}' + case ${{ env.PACKAGE_MANAGER }} in + zypper) + timeout 10m zypper --no-refresh --no-gpg-checks install -y $BUILD_FOLDER/amd-smi-lib-*99999-local*.rpm + ;; + dnf) + dnf install python3-setuptools python3-wheel -y + RETRIES=3 + for i in $(seq 1 $RETRIES); do + echo "Attempt $i: Installing..." + if timeout 10m dnf install -y --skip-broken --disablerepo=* $BUILD_FOLDER/amd-smi-lib-*99999-local*.rpm; then + echo "Install successful." + break + else + echo "Attempt $i failed. Retrying..." + if [ $i -eq $RETRIES ]; then + echo "All attempts failed." 
+ exit 1 + fi + sleep 10 + fi + done + ;; + esac + ln -s /opt/rocm/bin/amd-smi /usr/local/bin + echo 'Install done for test on ${{ matrix.os }}' + + - name: AMDSMI Command Tests + shell: bash + run: | + set -e + echo "Running AMDSMI commands on ${{ matrix.os }}" + mkdir -p /tmp/test-results-${{ matrix.os }} + commands=( + "amd-smi version" + "amd-smi list" + "amd-smi static" + "amd-smi firmware" + "amd-smi ucode" + "amd-smi bad-pages" + "amd-smi metric" + "amd-smi process" + "amd-smi topology" + "amd-smi monitor" + "amd-smi dmon" + "amd-smi xgmi" + "amd-smi partition" + ) + for cmd in "${commands[@]}"; do + debug_cmd="$cmd --loglevel debug" + echo "Running: $debug_cmd" + if ! eval "$debug_cmd" > /tmp/test-results-${{ matrix.os }}/$(echo $cmd | tr ' ' '_').log 2>&1; then + echo "Command '$debug_cmd' failed." + cat /tmp/test-results-${{ matrix.os }}/$(echo $cmd | tr ' ' '_').log + exit 1 + else + echo "$debug_cmd passed." + fi + done + echo "AMDSMI commands done on ${{ matrix.os }}" + + - name: Upload AMDSMI Command Test Results + if: always() + uses: actions/upload-artifact@v4 + with: + name: amdsmi-command-tests-${{ matrix.os }} + path: /tmp/test-results-${{ matrix.os }} + + - name: Run AMDSMI, Python, and Example Tests + shell: bash + run: | + set -e + echo 'Running other tests on ${{ matrix.os }}' + + # AMDSMI Tests + echo 'Running AMDSMI tests' + cd /opt/rocm/share/amd_smi/tests + source amdsmitst.exclude + + AMDSMI_RETRIES=3 + for attempt in $(seq 1 $AMDSMI_RETRIES); do + echo "AMDSMI test attempt $attempt for ${{ matrix.os }}..." 
+ if ./amdsmitst --gtest_filter="-$(echo ${BLACKLIST_ALL_ASICS})" > /tmp/test-results-${{ matrix.os }}/amdsmi_tests.log 2>&1; then + echo "AMDSMI tests passed on attempt $attempt" + echo "=============== TEST OUTPUT ===============" + cat /tmp/test-results-${{ matrix.os }}/amdsmi_tests.log | grep -E "\[==========\]|\[ PASSED \]|\[ SKIPPED \]|\[ FAILED \]" + echo "==============================================" + echo "AMDSMI tests done" + break + else + TEST_EXIT_CODE=$? + echo "AMDSMI tests failed on attempt $attempt with exit code $TEST_EXIT_CODE" + if [ $attempt -eq $AMDSMI_RETRIES ]; then + echo "All $AMDSMI_RETRIES AMDSMI test attempts failed. Final failure." + echo "=============== TEST OUTPUT ===============" + cat /tmp/test-results-${{ matrix.os }}/amdsmi_tests.log | grep -E "\[==========\]|\[ PASSED \]|\[ SKIPPED \]|\[ FAILED \]" + echo "==============================================" + echo "AMDSMI tests failed" + exit $TEST_EXIT_CODE + else + echo "Retrying AMDSMI tests in $((2 * attempt)) seconds..." + sleep $((2 * attempt)) + fi + fi + done + + # Python Tests + echo 'Running Python tests' + cd /opt/rocm/share/amd_smi/tests/python_unittest + echo "Running integration tests..." + if ! ./integration_test.py -v > /tmp/test-results-${{ matrix.os }}/integration_test_output.txt 2>&1; then + echo "Integration tests failed!" + echo "=============== INTEGRATION TEST OUTPUT ===============" + tail -100 /tmp/test-results-${{ matrix.os }}/integration_test_output.txt + echo "=======================================================" + exit 1 + else + echo "Integration tests passed" + fi + + echo "Running unit tests..." + if ! ./unit_tests.py -v > /tmp/test-results-${{ matrix.os }}/unit_test_output.txt 2>&1; then + echo "Unit tests failed!" 
+ echo "=============== UNIT TEST OUTPUT ===============" + tail -100 /tmp/test-results-${{ matrix.os }}/unit_test_output.txt + echo "================================================" + exit 1 + else + echo "Unit tests passed" + fi + + echo "Python tests done" + + # Example Tests + echo 'Running Example tests' + cd $GITHUB_WORKSPACE/example + rm -rf build + cmake -B build -DENABLE_ESMI_LIB=OFF + make -C build -j $(nproc) + cd build + ./amd_smi_drm_ex > /tmp/test-results-${{ matrix.os }}/amd_smi_drm_ex.log 2>&1 || echo 'amd_smi_drm_ex failed' + ./amd_smi_nodrm_ex > /tmp/test-results-${{ matrix.os }}/amd_smi_nodrm_ex.log 2>&1 || echo 'amd_smi_nodrm_ex failed' + echo "Example tests done" + + - name: AMDSMI Test Results + if: always() + run: | + echo "Displaying AMDSMI test results for ${{ matrix.os }}" + cat /tmp/test-results-${{ matrix.os }}/amdsmi_tests.log || echo "No AMDSMI test results found for ${{ matrix.os }}" + + - name: Integration Test Results + if: always() + run: | + echo "Displaying Integration test results for ${{ matrix.os }}" + cat /tmp/test-results-${{ matrix.os }}/integration_test_output.txt || echo "No integration test results found for ${{ matrix.os }}" + + - name: Unit Test Results + if: always() + run: | + echo "Displaying Unit Test Results for ${{ matrix.os }}" + cat /tmp/test-results-${{ matrix.os }}/unit_test_output.txt || echo "No unit test results found for ${{ matrix.os }}" + + - name: Example DRM Test Results + if: always() + run: | + echo "Displaying Example DRM test results for ${{ matrix.os }}" + cat /tmp/test-results-${{ matrix.os }}/amd_smi_drm_ex.log || echo "No DRM example test results found for ${{ matrix.os }}" + + - name: Example NoDRM Test Results + if: always() + run: | + echo "Displaying Example NoDRM test results for ${{ matrix.os }}" + cat /tmp/test-results-${{ matrix.os }}/amd_smi_nodrm_ex.log || echo "No NoDRM example test results found for ${{ matrix.os }}" diff --git a/projects/amdsmi/.github/workflows/auto-label.yml 
b/projects/amdsmi/.github/workflows/auto-label.yml new file mode 100644 index 0000000000..3e4d3852c0 --- /dev/null +++ b/projects/amdsmi/.github/workflows/auto-label.yml @@ -0,0 +1,319 @@ +name: Auto Label PRs + +on: + pull_request: + types: [opened, synchronize, reopened, closed] + workflow_run: + workflows: ["ABI Compliance Check"] + types: [completed] + +jobs: + apply-labels: + runs-on: AMD-ROCm-Internal-dev1 + permissions: + pull-requests: write + actions: read + contents: read + steps: + - name: Add/Remove labels based on branch names and ABI results + uses: actions/github-script@v6 + with: + script: | + const pr = context.payload.pull_request; + let prNumber, headSha, baseBranch, headBranch; + + // Handle different event types + if (context.eventName === 'pull_request') { + prNumber = pr.number; + headSha = pr.head.sha; + baseBranch = pr.base.ref; + headBranch = pr.head.ref; + } else if (context.eventName === 'workflow_run') { + // Find the associated PR for workflow_run events + const workflowRun = context.payload.workflow_run; + console.log(`Workflow run completed: ${workflowRun.name} with conclusion: ${workflowRun.conclusion}`); + + if (workflowRun.event !== 'pull_request') { + console.log('Workflow run was not triggered by a pull request, skipping'); + return; + } + + const prs = await github.rest.pulls.list({ + owner: context.repo.owner, + repo: context.repo.repo, + state: 'open', + head: `${context.repo.owner}:${workflowRun.head_branch}` + }); + + const associatedPr = prs.data.find(p => p.head.sha === workflowRun.head_sha); + + if (!associatedPr) { + console.log('No associated PR found for this workflow run'); + return; + } + + prNumber = associatedPr.number; + headSha = associatedPr.head.sha; + baseBranch = associatedPr.base.ref; + headBranch = associatedPr.head.ref; + } else { + console.log('Unsupported event type'); + return; + } + + let labelsApplied = false; + + // Debug information + console.log(`Processing PR #${prNumber}: Head: ${headBranch}, 
Base: ${baseBranch}`); + + // Get current PR data to check existing labels + const { data: currentPr } = await github.rest.pulls.get({ + owner: context.repo.owner, + repo: context.repo.repo, + pull_number: prNumber + }); + const existingLabels = currentPr.labels.map(label => label.name); + + // Condition 1: PR targeting amd-mainline + if (baseBranch === 'amd-mainline' && context.eventName === 'pull_request') { + const labelToAdd = 'Merge amd-mainline'; + try { + if (!existingLabels.includes(labelToAdd)) { + await github.rest.issues.addLabels({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: prNumber, + labels: [labelToAdd] + }); + console.log(`Added label "${labelToAdd}" to PR #${prNumber}`); + labelsApplied = true; + } + } catch (error) { + console.error(`Error adding label "${labelToAdd}": ${error.message}`); + } + } + + // Condition 2: Cherry-pick based on head branch name or release target + if (context.eventName === 'pull_request') { + const isCherryPickHead = /cherry.*pick/i.test(headBranch); + const isReleaseTargetBase = baseBranch.startsWith('release/'); + + if (isCherryPickHead || isReleaseTargetBase) { + const labelToAdd = 'cherry-pick'; + try { + if (!existingLabels.includes(labelToAdd)) { + await github.rest.issues.addLabels({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: prNumber, + labels: [labelToAdd] + }); + console.log(`Added label "${labelToAdd}" to PR #${prNumber}`); + labelsApplied = true; + } else { + console.log(`Label "${labelToAdd}" already exists on PR #${prNumber}`); + } + } catch (error) { + console.error(`Error adding label "${labelToAdd}": ${error.message}`); + } + } + } + + // ABI BREAKAGE LOGIC: Check on both workflow_run AND pull_request events + let shouldCheckABI = false; + let hasMajorAbiBreakage = false; + let hasMinorAbiBreakage = false; + + if (context.eventName === 'workflow_run') { + // Handle workflow_run events (existing logic) + const workflowRun = 
context.payload.workflow_run; + + if (workflowRun.name === 'ABI Compliance Check') { + shouldCheckABI = true; + console.log(`ABI Compliance Check completed with conclusion: ${workflowRun.conclusion}`); + + try { + const { data: jobs } = await github.rest.actions.listJobsForWorkflowRun({ + owner: context.repo.owner, + repo: context.repo.repo, + run_id: workflowRun.id + }); + + // Check job conclusions for ABI breakage + for (const job of jobs.jobs) { + console.log(`Job: ${job.name}, Conclusion: ${job.conclusion}`); + + if (job.name.includes('Major ABI') && job.conclusion === 'failure') { + hasMajorAbiBreakage = true; + console.log('Major ABI breakage detected from job failure'); + } + + if (job.name.includes('Minor ABI') && job.conclusion === 'failure') { + hasMinorAbiBreakage = true; + console.log('Minor ABI breakage detected from job failure'); + } + } + + // If workflow succeeded, no ABI breakage + if (workflowRun.conclusion === 'success') { + console.log('ABI Compliance Check succeeded - no ABI breakage'); + hasMajorAbiBreakage = false; + hasMinorAbiBreakage = false; + } + + } catch (error) { + console.log(`Could not fetch job details: ${error.message}`); + return; + } + } + } else if (context.eventName === 'pull_request') { + // NEW: Check if amdsmi.h has been reverted on PR events + const hasAbiLabels = existingLabels.includes('MAJOR ABI BREAKAGE') || existingLabels.includes('MINOR ABI BREAKAGE'); + + if (hasAbiLabels) { + console.log('PR has ABI labels, checking if amdsmi.h changes were reverted...'); + shouldCheckABI = true; + + try { + // Get the diff for amdsmi.h between base and head + const { data: comparison } = await github.rest.repos.compareCommits({ + owner: context.repo.owner, + repo: context.repo.repo, + base: currentPr.base.sha, + head: currentPr.head.sha + }); + + // Check if amdsmi.h has any changes + const amdsmiFile = comparison.files?.find(file => file.filename === 'include/amd_smi/amdsmi.h'); + + if (!amdsmiFile) { + console.log('No changes 
to amdsmi.h found in this PR - removing ABI labels'); + hasMajorAbiBreakage = false; + hasMinorAbiBreakage = false; + } else if (amdsmiFile.changes === 0) { + console.log('amdsmi.h file exists but has no changes - removing ABI labels'); + hasMajorAbiBreakage = false; + hasMinorAbiBreakage = false; + } else { + console.log(`amdsmi.h has ${amdsmiFile.changes} changes - keeping existing ABI labels`); + // Keep existing labels since we can't determine ABI status without running the check + hasMajorAbiBreakage = existingLabels.includes('MAJOR ABI BREAKAGE'); + hasMinorAbiBreakage = existingLabels.includes('MINOR ABI BREAKAGE'); + } + + } catch (error) { + console.log(`Error checking file changes: ${error.message}`); + // If we can't check, preserve existing labels + hasMajorAbiBreakage = existingLabels.includes('MAJOR ABI BREAKAGE'); + hasMinorAbiBreakage = existingLabels.includes('MINOR ABI BREAKAGE'); + } + } + } + + // Manage ABI breakage labels (only if we determined ABI status) + if (shouldCheckABI) { + const abiLabels = { + 'MAJOR ABI BREAKAGE': hasMajorAbiBreakage, + 'MINOR ABI BREAKAGE': hasMinorAbiBreakage + }; + + const wasMajorAbiBreakage = existingLabels.includes('MAJOR ABI BREAKAGE'); + const wasMinorAbiBreakage = existingLabels.includes('MINOR ABI BREAKAGE'); + + for (const [labelName, shouldHaveLabel] of Object.entries(abiLabels)) { + const hasLabel = existingLabels.includes(labelName); + + if (shouldHaveLabel && !hasLabel) { + // Add label + try { + await github.rest.issues.addLabels({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: prNumber, + labels: [labelName] + }); + console.log(`✅ Added label "${labelName}" to PR #${prNumber}`); + labelsApplied = true; + } catch (error) { + console.error(`❌ Error adding label "${labelName}": ${error.message}`); + } + } else if (!shouldHaveLabel && hasLabel) { + // Remove label + try { + await github.rest.issues.removeLabel({ + owner: context.repo.owner, + repo: context.repo.repo, + 
issue_number: prNumber, + name: labelName + }); + console.log(`🗑️ Removed label "${labelName}" from PR #${prNumber}`); + labelsApplied = true; + } catch (error) { + console.error(`❌ Error removing label "${labelName}": ${error.message}`); + } + } + } + + // Add comments when ABI issues are detected or resolved + if (context.eventName === 'workflow_run') { + // Only add comments for workflow_run events (actual ABI check results) + if (hasMajorAbiBreakage && !wasMajorAbiBreakage) { + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: prNumber, + body: '⚠️ **MAJOR ABI BREAKAGE detected** in the latest ABI compliance check. Please review the ABI compliance report and fix any breaking changes.' + }); + } + + if (hasMinorAbiBreakage && !wasMinorAbiBreakage) { + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: prNumber, + body: '⚠️ **MINOR ABI BREAKAGE detected** in the latest ABI compliance check. Please review the ABI compliance report for details.' + }); + } + + if (!hasMajorAbiBreakage && wasMajorAbiBreakage) { + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: prNumber, + body: '✅ **MAJOR ABI BREAKAGE resolved** - ABI compliance check is now passing!' + }); + } + + if (!hasMinorAbiBreakage && wasMinorAbiBreakage) { + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: prNumber, + body: '✅ **MINOR ABI BREAKAGE resolved** - ABI compliance check is now passing!' 
+ }); + } + } else if (context.eventName === 'pull_request') { + // Add comment when labels are removed due to file reversion + if (!hasMajorAbiBreakage && wasMajorAbiBreakage) { + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: prNumber, + body: '✅ **MAJOR ABI BREAKAGE resolved** - `amdsmi.h` changes have been reverted.' + }); + } + + if (!hasMinorAbiBreakage && wasMinorAbiBreakage) { + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: prNumber, + body: '✅ **MINOR ABI BREAKAGE resolved** - `amdsmi.h` changes have been reverted.' + }); + } + } + } + + if (!labelsApplied && context.eventName === 'pull_request') { + console.log(`PR #${prNumber} did not match criteria for automatic labeling by this workflow.`); + } \ No newline at end of file diff --git a/projects/amdsmi/.github/workflows/cmake_format.yml b/projects/amdsmi/.github/workflows/cmake_format.yml new file mode 100644 index 0000000000..d51277e339 --- /dev/null +++ b/projects/amdsmi/.github/workflows/cmake_format.yml @@ -0,0 +1,99 @@ +# caution: most of this file was written using Claude 3.7 Sonnet +name: CMake Format Check + +on: + push: + branches: [ amd-staging ] + paths: + - '**/*.cmake' + - '**/CMakeLists.txt' + - '**/*.cmake.in' + pull_request: + branches: [ amd-staging ] + paths: + - '**/*.cmake' + - '**/CMakeLists.txt' + - '**/*.cmake.in' + workflow_dispatch: # Allows manual triggering + +defaults: + run: + shell: bash + +jobs: + check-cmake-format: + name: Check CMake files formatting + runs-on: self-hosted + container: catthehacker/ubuntu:act-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 # Full history for better diff context + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + cache: 'pip' + + - name: Install cmake-format + run: | + python -m pip install --upgrade pip + pip 
install cmake-format==0.6.13 + + - name: Check CMake formatting + id: check-format + run: | + echo "::group::Finding CMake files" + FILES=$(find . -type f \( -name "CMakeLists.txt" -o -name "*.cmake" -o -name "*.cmake.in" \) \ + -not -path "*/esmi_ib_library/*" \ + -not -path "*/\.*" \ + -not -path "*/build/*") + echo "Found $(echo "$FILES" | wc -l) CMake files to check" + echo "::endgroup::" + + # Create an array to store failed files + declare -a failed_files + + # Check if files are formatted correctly + for file in $FILES; do + echo "Checking $file..." + if ! cmake-format --check "$file"; then + failed_files+=("$file") + echo "::error file=$file::File needs formatting" + fi + done + + # Generate report and exit with error if any files failed + if [ ${#failed_files[@]} -ne 0 ]; then + echo "Failed files: ${failed_files[*]}" + echo "FAILED_FILES=${failed_files[*]}" >> $GITHUB_ENV + exit 1 + else + echo "All CMake files are formatted correctly!" + fi + + - name: Generate diff for failed files + if: failure() && env.FAILED_FILES != '' + run: | + echo "## CMake Format Check Failed" >> $GITHUB_STEP_SUMMARY + echo "The following files need formatting:" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + + for file in ${FAILED_FILES}; do + echo "### $file" >> $GITHUB_STEP_SUMMARY + done + + cat << 'EOF' >> $GITHUB_STEP_SUMMARY + ### How to fix + Run this command locally to fix formatting issues: + ```bash + # Install cmake-format + pip install cmake-format==0.6.13 + + # Format files + cmake-format -i + ``` + EOF diff --git a/projects/amdsmi/.github/workflows/codeql.yml b/projects/amdsmi/.github/workflows/codeql.yml new file mode 100644 index 0000000000..e0399dd8e1 --- /dev/null +++ b/projects/amdsmi/.github/workflows/codeql.yml @@ -0,0 +1,92 @@ +name: "CodeQL Advanced" + +on: + pull_request: + branches: + - amd-staging + push: + branches: + - amd-staging + schedule: + - cron: '34 18 * * 5' + +jobs: + analyze: + name: Analyze (${{ matrix.language }}) + # Runner 
size impacts CodeQL analysis time. To learn more, please see: + # - https://gh.io/recommended-hardware-resources-for-running-codeql + # - https://gh.io/supported-runners-and-hardware-resources + # - https://gh.io/using-larger-runners (GitHub.com only) + # Consider using larger runners or machines with greater resources for possible analysis time improvements. + runs-on: ${{ 'ubuntu-latest' }} + permissions: + # required for all workflows + security-events: write + + # required to fetch internal or private CodeQL packs + packages: read + + # only required for workflows in private repositories + actions: read + contents: read + + strategy: + fail-fast: false + matrix: + include: + - language: c-cpp + build-mode: manual + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12.6' + + - name: Install CMake + run: python3 -m pip install cmake + + - name: Install Virtualenv + run: python3 -m pip install virtualenv + + - name: Install g++ + run: sudo apt-get install -y g++ + + - name: Install libdrm + run: sudo apt-get install -y libdrm-dev + + - name: Install DOxygen + run: sudo apt-get install -y doxygen + + - name: Install LaTeX + run: sudo apt-get install -y texlive + + - name: Clean old ROCm directories + run: | + sudo rm -rf /opt/rocm + sudo rm -rf /opt/rocm-* + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: ${{ matrix.language }} + build-mode: ${{ matrix.build-mode }} + queries: security-extended + + - name: Create build directory + run: mkdir -p build + + - name: Build AMD SMI Library + run: | + cd build + cmake .. 
+ make -j $(nproc) + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 + with: + category: "/language:${{matrix.language}}" diff --git a/projects/amdsmi/.github/workflows/generate-docs.yml b/projects/amdsmi/.github/workflows/generate-docs.yml new file mode 100644 index 0000000000..9e2ee544b4 --- /dev/null +++ b/projects/amdsmi/.github/workflows/generate-docs.yml @@ -0,0 +1,83 @@ +name: Generate Documentation + +on: + pull_request: + branches: [amd-staging, amd-mainline, release/rocm-rel-*] + push: + branches: [amd-staging, amd-mainline, release/rocm-rel-*] + workflow_dispatch: + +permissions: + contents: read + +env: + DEBIAN_FRONTEND: noninteractive + DEBCONF_NONINTERACTIVE_SEEN: true + BUILD_TYPE: Release + +jobs: + generate-docs: + name: Generate Documentation + runs-on: AMD-ROCm-Internal-dev1 + steps: + - name: Checkout Repository + uses: actions/checkout@v4 + + - name: Get branch name for artifact naming + id: get_branch_info + run: | + BRANCH_NAME="" + if [[ "${{ github.event_name }}" == "pull_request" ]]; then + BRANCH_NAME="${{ github.head_ref }}" + else + BRANCH_NAME="${{ github.ref_name }}" + fi + SANITIZED_NAME=$(echo "$BRANCH_NAME" | sed -e 's|/|-|g' -e 's|[^a-zA-Z0-9._-]||g' -e 's|^-*||' -e 's|-*$||') + if [[ -z "$SANITIZED_NAME" ]]; then + SANITIZED_NAME="docs-$(date +%s)" + fi + echo "sanitized_name=${SANITIZED_NAME}" >> $GITHUB_OUTPUT + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.12' + + - name: Install System Dependencies + run: | + sudo apt-get update + sudo apt-get install -y doxygen graphviz + + - name: Set Up Python Environment + run: | + python3 -m pip install --upgrade pip + python3 -m pip install -r docs/sphinx/requirements.txt + + - name: Build Documentation + run: | + if [ ! -e "docs/.git" ]; then + if [ -d ".git" ]; then + ln -s ../.git docs/.git + fi + fi + cd docs + python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . 
_build/html + + - name: Upload Documentation + uses: actions/upload-artifact@v4 + with: + name: documentation-${{ steps.get_branch_info.outputs.sanitized_name }} + path: docs/_build/html/ + + - name: Generate Job Summary + run: | + cat >> $GITHUB_STEP_SUMMARY << 'EOF' + # 📚 Documentation Generated Successfully! + + ## 🚀 Quick Start + + 1. **📥 Download** the artifact `documentation-${{ steps.get_branch_info.outputs.sanitized_name }}` + 2. **📂 Extract** the ZIP file + 3. **🖱️ Double-click** `index.html` + 4. **✅ Done!** Documentation opens with full formatting in your browser + EOF diff --git a/projects/amdsmi/.github/workflows/ghemu2amdgerrit.yml b/projects/amdsmi/.github/workflows/ghemu2amdgerrit.yml new file mode 100644 index 0000000000..cd1cbe28dd --- /dev/null +++ b/projects/amdsmi/.github/workflows/ghemu2amdgerrit.yml @@ -0,0 +1,83 @@ +name: GitHub to Gerrit Mirror + +run-name: "Mirror to Gerrit: ${{ github.event.ref || inputs.branch }} ${{ github.event.after }}" + +on: + workflow_dispatch: + inputs: + branch: + description: 'Branch to mirror (amd-staging or amd-mainline)' + required: true + default: 'amd-staging' + type: choice + options: + - amd-staging + - amd-mainline + pull_request: + branches: + - amd-staging + - amd-mainline + types: [closed] + +env: + GERRIT_SERVER: "gerrit-git.amd.com" + GERRIT_PROJECT: "SYS-MGMT/ec/amd-smi" + GERRIT_USER: "z1_runner" + GERRIT_PORT: "29418" + +jobs: + Setup: + if: github.event_name == 'workflow_dispatch' || github.event.pull_request.merged == true + runs-on: banff-sc-cx43-29 + steps: + + - name: Fix workspace permissions + run: | + sudo chown -R $(id -u):$(id -g) ${{ github.workspace }} + sudo chmod -R u+rwX ${{ github.workspace }} + + - name: Check out repository code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Create SSH key + run: | + mkdir -p ~/.ssh + chmod 700 ~/.ssh + touch ~/.ssh/known_hosts + touch ~/.ssh/id_rsa + chmod 600 ~/.ssh/id_rsa + printf "%s" "${{ secrets.SSH_PRIVATE_KEY }}" > 
~/.ssh/id_rsa + ssh-keyscan -p ${{ env.GERRIT_PORT }} ${{ env.GERRIT_SERVER }} >> ~/.ssh/known_hosts + + - name: Debug SSH setup + run: | + ls -la ~/.ssh + ssh -p ${{ env.GERRIT_PORT }} ${{ env.GERRIT_USER }}@${{ env.GERRIT_SERVER }} || true + + - name: Set Gerrit remote + run: | + cd ${{ github.workspace }} + if git remote | grep -q "gerrit" + then + git remote set-url gerrit ssh://${{ env.GERRIT_USER }}@${{ env.GERRIT_SERVER }}:${{ env.GERRIT_PORT }}/${{ env.GERRIT_PROJECT }} + else + git remote add gerrit ssh://${{ env.GERRIT_USER }}@${{ env.GERRIT_SERVER }}:${{ env.GERRIT_PORT }}/${{ env.GERRIT_PROJECT }} + fi + + - name: Set committer identity for Gerrit + run: | + git config user.name "z1_runner" + git config user.email "z1_runner@amd.com" + + - name: Fetch selected branch + run: | + BRANCH="${{ github.event.pull_request.base.ref || inputs.branch }}" + git fetch origin ${BRANCH}:refs/remotes/origin/${BRANCH} + git checkout ${BRANCH} + + - name: Mirror selected branch to Gerrit + run: | + BRANCH="${{ github.event.pull_request.base.ref || inputs.branch }}" + git push gerrit refs/heads/${BRANCH}:refs/heads/${BRANCH} \ No newline at end of file diff --git a/projects/amdsmi/.github/workflows/kws_caller.yml b/projects/amdsmi/.github/workflows/kws_caller.yml new file mode 100644 index 0000000000..c0f4f26807 --- /dev/null +++ b/projects/amdsmi/.github/workflows/kws_caller.yml @@ -0,0 +1,15 @@ +name: Rocm Validation Suite KWS +on: + push: + branches: [amd-staging, amd-mainline] + pull_request: + types: [opened, synchronize, reopened] + workflow_dispatch: +jobs: + kws: + if: ${{ github.event_name == 'pull_request' }} + uses: AMD-ROCm-Internal/rocm_ci_infra/.github/workflows/kws.yml@mainline + secrets: inherit + with: + pr_number: ${{github.event.pull_request.number}} + base_branch: ${{github.base_ref}} diff --git a/projects/amdsmi/.github/workflows/rocm_ci_caller.yml b/projects/amdsmi/.github/workflows/rocm_ci_caller.yml new file mode 100644 index 
0000000000..9643cdf732 --- /dev/null +++ b/projects/amdsmi/.github/workflows/rocm_ci_caller.yml @@ -0,0 +1,25 @@ +name: ROCm CI Caller +on: + pull_request: + branches: [amd-staging, release/rocm-rel-*, amd-mainline] + types: [opened, reopened, synchronize] + push: + branches: [amd-mainline] + workflow_dispatch: + issue_comment: + types: [created] + +jobs: + call-workflow: + if: github.event_name != 'issue_comment' ||(github.event_name == 'issue_comment' && github.event.issue.pull_request && (startsWith(github.event.comment.body, '!verify') || startsWith(github.event.comment.body, '!verify release') || startsWith(github.event.comment.body, '!verify retest'))) + uses: AMD-ROCm-Internal/rocm_ci_infra/.github/workflows/rocm_ci.yml@mainline + secrets: inherit + with: + input_sha: ${{github.event_name == 'pull_request' && github.event.pull_request.head.sha || (github.event_name == 'push' && github.sha) || (github.event_name == 'issue_comment' && github.event.issue.pull_request.head.sha) || github.sha}} + input_pr_num: ${{github.event_name == 'pull_request' && github.event.pull_request.number || (github.event_name == 'issue_comment' && github.event.issue.number) || 0}} + input_pr_url: ${{github.event_name == 'pull_request' && github.event.pull_request.html_url || (github.event_name == 'issue_comment' && github.event.issue.pull_request.html_url) || ''}} + input_pr_title: ${{github.event_name == 'pull_request' && github.event.pull_request.title || (github.event_name == 'issue_comment' && github.event.issue.pull_request.title) || ''}} + repository_name: ${{ github.repository }} + base_ref: ${{github.event_name == 'pull_request' && github.event.pull_request.base.ref || (github.event_name == 'issue_comment' && github.event.issue.pull_request.base.ref) || github.ref}} + trigger_event_type: ${{ github.event_name }} + comment_text: ${{ github.event_name == 'issue_comment' && github.event.comment.body || '' }} diff --git a/projects/amdsmi/.gitignore b/projects/amdsmi/.gitignore 
new file mode 100644 index 0000000000..af09e3a112 --- /dev/null +++ b/projects/amdsmi/.gitignore @@ -0,0 +1,44 @@ +# NOTE! Please use 'git ls-files -i --exclude-standard' +# command after changing this file, to see if there are +# any tracked files which get ignored after the change. + +# VisualStudioCode +.vscode/ + +# build directories generated by cmake +build/ +cmake/build/ +.cache/ + +# build artifacts +oam/include/oam/oamConfig.h +python_smi_tools/rsmiBindings.py +include/amd_smi/amd_smi64Config.h +rocm_smi/include/rocm_smi/rocm_smi64Config.h +docs/*.pdf +goamdsmi_shim/include/goamdsmi_shimConfig.h +goamdsmi_shim/include/goamdsmi_shim64Config.h + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*.egg-* + +# documentation artifacts +_toc.yml +_build/ +_doxygen/ +docBin/ + +# Simulated SYSFS - for early development or debug +device/ + +# misc +esmi_ib_library/ + +# do NOT ignore these files +!.clang-format +!.clang-tidy +!.clangd +!.cmake-format +!.pre-commit-config.yaml diff --git a/projects/amdsmi/.pre-commit-config.yaml b/projects/amdsmi/.pre-commit-config.yaml new file mode 100644 index 0000000000..b206f091d4 --- /dev/null +++ b/projects/amdsmi/.pre-commit-config.yaml @@ -0,0 +1,34 @@ +# - How to use: +# python3 -m pip install pre-commit +# pre-commit install --install-hooks +# Upon a new commit - the hooks should automagically run +# +# - How to skip: +# git commit --no-verify +# or +# SKIP=clang-format-docker git commit +# SKIP=cpplint-docker git commit + +fail_fast: false +repos: + # For portability I decided to use Docker containers + - repo: https://github.com/dmitrii-galantsev/pre-commit-docker-cpplint + rev: 0.0.3 + hooks: + - id: clang-format-docker + - id: cpplint-docker + - repo: https://github.com/cheshirekow/cmake-format-precommit + rev: v0.6.13 + hooks: + - id: cmake-format + # Below is a local way of running formatters and linters + # NOTE: clang-tidy is not used in the above tests + # - repo:
https://github.com/pocc/pre-commit-hooks + # rev: v1.3.5 + # hooks: + # - id: clang-format + # args: [--no-diff, -i] + # - id: clang-tidy + # args: [-p=build, --quiet] + # - id: cpplint + # args: [--verbose=5] diff --git a/projects/amdsmi/.readthedocs.yaml b/projects/amdsmi/.readthedocs.yaml new file mode 100644 index 0000000000..935cbdb7ee --- /dev/null +++ b/projects/amdsmi/.readthedocs.yaml @@ -0,0 +1,18 @@ +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +version: 2 + +build: + os: ubuntu-24.04 + tools: + python: "3.12" + +sphinx: + configuration: docs/conf.py + +formats: [htmlzip, pdf] + +python: + install: + - requirements: docs/sphinx/requirements.txt diff --git a/projects/amdsmi/CHANGELOG.md b/projects/amdsmi/CHANGELOG.md new file mode 100644 index 0000000000..a085d079aa --- /dev/null +++ b/projects/amdsmi/CHANGELOG.md @@ -0,0 +1,3661 @@ +# Changelog for AMD SMI Library + +Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/projects/amdsmi](https://rocm.docs.amd.com/projects/amdsmi/en/latest/). + +***All information listed below is for reference and subject to change.*** + +## amd_smi_lib for ROCm 7.2.0 + +### Added + +- **Added the following C API's to amdsmi_interface.py**. + - amdsmi_get_cpu_handle() + - amdsmi_get_esmi_err_msg() + - amdsmi_get_gpu_event_notification() + - amdsmi_get_processor_count_from_handles() + - amdsmi_get_processor_handles_by_type() + - amdsmi_gpu_validate_ras_eeprom() + - amdsmi_init_gpu_event_notification() + - amdsmi_set_gpu_event_notification_mask() + - amdsmi_stop_gpu_event_notification() + - amdsmi_get_gpu_busy_percent() + +- **Added additional return value to API amdsmi_get_xgmi_plpd()**. + - The entry `policies` is added to the end of the dictionary to match API definition. + - The entry `plpds` is marked for deprecation as it has the same information as `policies`. + +- **Added pcie levels to `amd-smi static --bus` command**. 
+ - The static --bus option has been updated to include the range of pcie levels that one may set a device to. + - Levels are a 2-tuple composed of the PCIE speed and bandwidth. + + ```console + $ amd-smi static --bus + GPU: 0 + BUS: + BDF: 0000:43:00.0 + MAX_PCIE_WIDTH: 16 + MAX_PCIE_SPEED: 16 GT/s + PCIE_LEVELS: + 0: (2.5 GT/s, 1) + 1: (5.0 GT/s, 4) + 2: (16.0 GT/s, 16) + PCIE_INTERFACE_VERSION: Gen 4 + SLOT_TYPE: CEM + ``` + +- **Added evicted_time metric for kfd processes**. + - Time that queues are evicted on a GPU in milliseconds + - Added to CLI in `amd-smi monitor -q` and `amd-smi process` + - Added to C API and Python API: + - amdsmi_get_gpu_process_list() + - amdsmi_get_gpu_compute_process_info() + - amdsmi_get_gpu_compute_process_info_by_pid() + +- **Added new VRAM types to `amdsmi_vram_type_t`**. + - `amd-smi static --vram` & `amdsmi_get_gpu_vram_info()` now support the following types: + - DDR5, LPDDR4, LPDDR5, and HBM3E + +- **Added support for PPT1 power limit information**. + - Support has been added for querying and setting the PPT (Package Power Tracking) limits + - There are two PPT limits, PPT0 has lower limit and tracks a filtered version of the input power and PPT1 has higher limit but tracks the raw input power. This is to catch spikes in the raw data. + - New API added: + - amdsmi_get_supported_power_cap(): Returns which power cap types are supported on the device (PPT0, PPT1). This will allow users to know which power cap types they can get/set. + - Original APIs remain the same but now can get/set both PPT0 and PPT1 limits (on supported hardware): + - amdsmi_get_power_cap_info() + - amdsmi_set_power_cap() + - See the Changed section for changes made to the `set` and `static` commands regarding support for PPT1. + +### Changed + +- **The `amd-smi` command now shows hsmp rather than amd_hsmp**. 
+ - The hsmp driver version can be shown without the amdgpu version using `amd-smi version -c` + + ```console + $ amd-smi version + AMDSMI Tool: 24.7.1+b446d6c-dirty | AMDSMI Library version: 24.7.2.0 | ROCm version: N/A | amdgpu version: 6.10.10 | hsmp version: 2.2 + + $ amd-smi version -c + AMDSMI Tool: 24.7.1+b446d6c-dirty | AMDSMI Library version: 24.7.2.0 | ROCm version: N/A | hsmp version: 2.2 + ... + ``` + +- **`amd-smi set --power-cap` now requires specification of the power cap type**. + - Command now takes the form: `amd-smi set --power-cap <power-cap-type> <new-cap>` + - Acceptable power cap types are "ppt0" and "ppt1" + + ```console + $ sudo amd-smi set --power-cap ppt1 1150 + GPU: 0 + POWERCAP: Successfully set ppt1 power cap to 1150W + ... + ``` + +- **`amd-smi reset --power-cap` will attempt to reset both power caps**. + - When using the reset command, both PPT0 and PPT1 power caps will be reset to their default values. If a device only has PPT0, then only PPT0 will be reset. + Ex. + ```console + $ sudo amd-smi reset --power-cap ppt1 1150 + GPU: 0 + POWERCAP: + PPT0: Successfully reset power cap to 203W + PPT1: [AMDSMI_STATUS_NOT_SUPPORTED] Unable to reset to default power cap + ... + ``` + +- **`amd-smi static --limit` now has a PPT1 section when PPT1 is available**. + - The static --limit command has been updated to include PPT1 power limit information when available on the device. + ```console + $ amd-smi static --limit + GPU: 0 + LIMIT: + PPT0: + MAX_POWER_LIMIT: 1000 + MIN_POWER_LIMIT: 0 + SOCKET_POWER_LIMIT: 1000 + PPT1: + MAX_POWER_LIMIT: 1300 + MIN_POWER_LIMIT: 1100 + SOCKET_POWER_LIMIT: 1250 + SLOWDOWN_EDGE_TEMPERATURE: N/A + ... + ``` + - JSON and CSV formats are updated to reflect this change as well. + Ex.
+ ```console + $ amd-smi static --limit --json + { + "gpu_data": [ + { + "gpu": 0, + "limit": { + "ppt0": { + "max_power_limit": { + "value": 203, + "unit": "W" + }, + "min_power_limit": { + "value": 0, + "unit": "W" + }, + "socket_power_limit": { + "value": 100, + "unit": "W" + } + }, + "ppt1": { + "max_power_limit": "N/A", + "min_power_limit": "N/A", + "socket_power_limit": "N/A" + }, + ... + } + }, + ... + ``` + + ```console + $ amd-smi static --limit --csv + gpu,ppt0_max_power_limit,ppt0_min_power_limit,ppt0_socket_power_limit,ppt1_max_power_limit,ppt1_min_power_limit,ppt1_socket_power_limit,slowdown_edge_temperature,slowdown_hotspot_temperature,slowdown_vram_temperature,shutdown_edge_temperature,shutdown_hotspot_temperature,shutdown_vram_temperature + 0,203,0,100,N/A,N/A,N/A,100,110,100,105,115,105 + 1,213,0,100,N/A,N/A,N/A,109,110,100,114,115,105 + ``` + +### Removed + +- N/A + +### Optimized + +- N/A + +### Resolved Issues + +- **Fixed an issue where amdsmi_get_gpu_od_volt_info() returned a reference to a python object**. + - The returned dictionary was changed to return values in all fields + +### Upcoming Changes + +- N/A + +### Known Issues + +- N/A + +## amd_smi_lib for ROCm 7.1.0 + +### Added + +- **Added `GPU LINK PORT STATUS` table to `amd-smi xgmi` command**. + - The `amd-smi xgmi -s` or `amd-smi xgmi --source-status` will show `GPU LINK PORT STATUS` table. + +- **Added `amdsmi_get_gpu_revision()` to Python API** + - This function retrieves the GPU revision ID. Available in `amdsmi_interface.py` as `amdsmi_get_gpu_revision()`. + +- **Added gpuboard and baseboard temperatures to `amd-smi metric` command**. + - The metric command has been updated with various gpuboard and baseboard temperatures in degrees Celsius. Users can access these + values through the `-G/--gpuboard` or `-b/--baseboard` options or obtain all of them as normal using the `amd-smi metric` command without + any options. 
If the hardware does not support gpuboard or baseboard temperatures, then the values will be hidden from the default `metric` view. + + ```console + $ amd-smi metric -b + GPU: 0 + BASEBOARD: + TEMPERATURE: + FIRST: 78 + UBB_FRONT: 55 + UBB_BACK: 49 + UBB_OAM7: 86 + UBB_IBC: 94 + UBB_UFPGA: 49 + UBB_OAM1: 78 + OAM_0_1_HSC: 54 + OAM_2_3_HSC: 32 + OAM_4_5_HSC: 14 + OAM_6_7_HSC: 85 + UBB_FPGA_0V72_VR: 43 + UBB_FPGA_3V3_VR: 41 + RETIMER_0_1_2_3_1V2_VR: 64 + RETIMER_4_5_6_7_1V2_VR: 56 + RETIMER_0_1_0V9_VR: 74 + RETIMER_4_5_0V9_VR: 34 + RETIMER_2_3_0V9_VR: 85 + RETIMER_6_7_0V9_VR: 92 + OAM_0_1_2_3_3V3_VR: 29 + OAM_4_5_6_7_3V3_VR: 13 + IBC_HSC: 41 + IBC: 43 + + $ amd-smi metric -G + GPU: 0 + GPUBOARD: + TEMPERATURE: + NODE_RETIMER_X: 43 + NODE_OAM_X_IBC: 24 + NODE_OAM_X_IBC_2: 56 + NODE_OAM_X_VDD18_VR: 34 + NODE_OAM_X_04_HBM_B_VR: 53 + NODE_OAM_X_04_HBM_D_VR: 47 + VR_FIRST: 58 + VDDCR_VDD1: 78 + VDDCR_VDD2: 35 + VDDCR_VDD3: 73 + VDDCR_SOC_A: 12 + VDDCR_SOC_C: 57 + VDDCR_SOCIO_A: 39 + VDDCR_SOCIO_C: 75 + VDD_085_HBM: 64 + VDDCR_11_HBM_B: 92 + VDDCR_11_HBM_D: 87 + VDD_USR: 46 + VDDIO_11_E32: 98 + + $ amd-smi metric + GPU: 0 + USAGE: + GFX_ACTIVITY: 0 % + UMC_ACTIVITY: 0 % + ... + POWER: + SOCKET_POWER: 140 W + GFX_VOLTAGE: N/A + ... + CLOCK: + GFX_0: + CLK: 132 MHz + MIN_CLK: 500 MHz + ... + TEMPERATURE: + EDGE: N/A + HOTSPOT: 37 °C + ... + PCIE: + WIDTH: 16 + SPEED: 32 GT/s + ... + GPUBOARD: + TEMPERATURE: + NODE_RETIMER_X: 43 + NODE_OAM_X_IBC: 24 + ... + BASEBOARD: + TEMPERATURE: + UBB_FPGA: 78 + UBB_FRONT: 55 + ... + ECC: + TOTAL_CORRECTABLE_COUNT: 0 + TOTAL_UNCORRECTABLE_COUNT: 0 + ... + ECC_BLOCKS: + UMC: + CORRECTABLE_COUNT: 0 + UNCORRECTABLE_COUNT: 0 + ... + FAN: + SPEED: N/A + MAX: N/A + ... + VOLTAGE_CURVE: + POINT_0_FREQUENCY: N/A + POINT_0_VOLTAGE: N/A + ... 
+ OVERDRIVE: N/A + MEM_OVERDRIVE: N/A + PERF_LEVEL: AMDSMI_DEV_PERF_LEVEL_AUTO + XGMI_ERR: N/A + VOLTAGE: + VDDBOARD: N/A + ENERGY: + TOTAL_ENERGY_CONSUMPTION: 14292727.274 J + MEM_USAGE: + TOTAL_VRAM: 196592 MB + USED_VRAM: 283 MB + ... + THROTTLE: + ACCUMULATION_COUNTER: 100936627 + PROCHOT_ACCUMULATED: 0 + ... + ``` + +### Changed + +- **Changed struct amdsmi_topology_nearest_t member processor_list**. + - Member size changed, processor_list[AMDSMI_MAX_DEVICES * AMDSMI_MAX_NUM_XCP] + +- **Changed `amd-smi reset --profile` behavior so that it would not also reset the performance level**. + - These settings are completely independent now so there is no longer any need to reset them together. Therefore the reset behavior for performance level has been removed from resetting the profile. Users can still reset the performance level as they normally would using `amd-smi reset --perf-determinism`. + +- **Setting power cap is now available in Linux Guest**. + - Users can now use `amd-smi set --power-cap` as usual but now in Linux Guest systems. + +- **Changed `amd-smi static --vbios` to `amd-smi static --ifwi`**. + - VBIOS naming is replaced with IFWI (Integrated Firmware Image) for improved clarity and consistency. + - Mi300+ series devices now use a new version format with enhanced build information. + - Legacy command `amd-smi static --vbios` remains functional for backward compatibility, but displays updated IFWI heading. + - The Python, C & Rust API for `amdsmi_get_gpu_vbios_version()` will now have a new field called `boot_firmware` which will return the legacy vbios version number which is also known as the Unified BootLoader Version (UBL version) + + **Legacy format (Non IFWI systems):** + + ```shell + $ amd-smi static --ifwi + GPU: 0 + IFWI: + NAME: XXXXXXXXXXXXXXXXXX + BUILD_DATE: 2020/10/29 13:30 + PART_NUMBER: 113-XXXXXXXX-111 + VERSION: 000.000.000.000.000000 (Legacy format) + ... 
+ ``` + + **New format (Mi300+ series and IFWI systems):** + + ```shell + $ amd-smi static --ifwi + GPU: 0 + IFWI: + NAME: XXXXXXXXXXXXXXXXXX + BUILD_DATE: 2020/10/29 13:30 + PART_NUMBER: 113-XXXXXXXX-111 + VERSION: 00111111 (New format) + ... + ``` + +### Removed + +- N/A + +### Optimized + +- **Optimized the way `amd-smi process` validates which processes are running on a GPU**. + +- **Changed sourcing of BDF from drm to kfd**. + - Users without sudo privileges were unable to see the BDF due to logic errors. + +### Resolved Issues + +- **Fixed a CPER record count mismatch issue when using the `amd-smi ras --cper --file-limit`**. + - Fixed deletion calculation to use files_to_delete = len(folder_files) - file_limit for exact file count management + +- **Fixed event monitoring segfaults causing RDC to crash**. + - Adds mutex locking around access to device event notification file pointer + +- **Fixed an issue where using `amd-smi ras --folder ` was forcing the created folder's name to be lowercase**. + - This fix also allows all string input options to be case insensitive. + +- **Fixed certain output in `amd-smi monitor` when GPUs are partitioned**. + - Fixes `amd-smi monitor` commands such as: `amd-smi monitor -Vqt`, `amd-smi monitor -g 0 -Vqt -w 1`, `amd-smi monitor -Vqt --file /tmp/test1`, etc. Such commands now display as normal in partitioned GPU scenarios. + + +### Upcoming Changes + +- N/A + +### Known Issues + +- N/A + +## amd_smi_lib for ROCm 7.0.2 + +### Added + +- **Add bad_page_threshold_exceeded to `amd-smi static --ras`**. + - Added bad_page_threshold_exceeded field to `amd-smi static --ras`, which compares retired pages count against bad page threshold. This field displays True if retired pages exceed the threshold, False if within threshold, or N/A if threshold data is unavailable. Users should note that sudo is required to have the bad_page_threshold_exceeded field populated.
+ + ```shell + $ sudo amd-smi static --ras -g 0 + GPU: 0 + RAS: + EEPROM_VERSION: 0x30000 + BAD_PAGE_THRESHOLD: 128 + BAD_PAGE_THRESHOLD_EXCEEDED: False + PARITY_SCHEMA: DISABLED + SINGLE_BIT_SCHEMA: DISABLED + DOUBLE_BIT_SCHEMA: DISABLED + POISON_SCHEMA: ENABLED + ... + ``` + +### Changed + +- N/A + +### Removed + +- **Removed gpuboard and baseboard temperatures enums in amdsmi Python Library**. + - AmdSmiTemperatureType had issues with referencing the right attribute, so we removed the following duplicate enums: + - `AmdSmiTemperatureType.GPUBOARD_NODE_FIRST` + - `AmdSmiTemperatureType.GPUBOARD_VR_FIRST` + - `AmdSmiTemperatureType.BASEBOARD_FIRST` + +### Optimized + +- **Implemented reference counting to manage init and shutdown processes**. + - This allows multiple initializations and shutdowns of amdsmi. + +### Resolved issues + +- **Fixed `attribute error` in `amd-smi monitor` on Linux Guest systems where violations argument caused CLI to break**. + +- **Added KFD Fallback for process detection**. + - Some processes were not being detected by AMD SMI despite making use of KFD resources. This fix ensures that all KFD processes will be detected. + +- **Multiple CPER issues were fixed**. + - Fixed issue where we were unable to query for additional CPERs after 20 were generated on a single device. + - Fixed issue where RAS HBM CRC read was failing due to incorrect AFID value. + - Fixed issue where RAS injections were not always producing related CPERs. + +### Upcoming changes + +- N/A + +### Known issues + +- N/A + +## amd_smi_lib for ROCm 7.0.0 + +### Added + +- **Added restarting (reloading) AMD GPU driver to both CLI and API calls** + - Refer to [Separated driver reload from `amdsmi_set_gpu_memory_partition()` / `amdsmi_set_gpu_memory_partition_mode()` and CLI (`sudo amd-smi set -M `)](#separate-driver-reload-anchor) section for more details. + +- **Added the Default command**. + - A default view has been added. 
The default view provides a snapshot of commonly requested information such as bdf, current partition mode, version information, and more. Users can access that information by simply typing `amd-smi` with no additional commands or arguments. Users may also obtain this information through alternate output formats such as json or csv by using the default command with the respective output format: `amd-smi default --json` or `amd-smi default --csv`. + +```console +$ amd-smi ++------------------------------------------------------------------------------+ +| AMD-SMI 26.0.0+eaa54ecc amdgpu version: 6.12.12 ROCm version: 7.0.0 | +| Platform: Linux Baremetal | +|-------------------------------------+----------------------------------------| +| BDF GPU-Name | Mem-Uti Temp UEC Power-Usage | +| GPU HIP-ID OAM-ID Partition-Mode | GFX-Uti Fan Mem-Usage | +|=====================================+========================================| +| 0000:0c:00.0 AMD Instinct MI300X | 13 % 60 °C 0 734/750 W | +| 0 0 2 SPX/NPS1 | 98 % N/A 4976/196592 MB | +|-------------------------------------+----------------------------------------| +| 0000:22:00.0 AMD Instinct MI300X | 10 % 60 °C 0 652/750 W | +| 1 1 1 SPX/NPS1 | 83 % N/A 4976/196592 MB | +|-------------------------------------+----------------------------------------| +| 0000:38:00.0 AMD Instinct MI300X | 5 % 55 °C 0 376/750 W | +| 2 2 0 SPX/NPS1 | 34 % N/A 4976/196592 MB | +|-------------------------------------+----------------------------------------| +| 0000:5c:00.0 AMD Instinct MI300X | 2 % 57 °C 0 234/750 W | +| 3 3 3 SPX/NPS1 | 12 % N/A 4976/196592 MB | +|-------------------------------------+----------------------------------------| +| 0000:9f:00.0 AMD Instinct MI300X | 1 % 57 °C 0 219/750 W | +| 4 4 7 SPX/NPS1 | 11 % N/A 4976/196592 MB | +|-------------------------------------+----------------------------------------| +| 0000:af:00.0 AMD Instinct MI300X | 3 % 61 °C 0 295/750 W | +| 5 5 5 SPX/NPS1 | 23 % N/A 4976/196592 MB |
+|-------------------------------------+----------------------------------------| +| 0000:bf:00.0 AMD Instinct MI300X | 5 % 58 °C 0 367/750 W | +| 6 6 4 SPX/NPS1 | 36 % N/A 4976/196592 MB | +|-------------------------------------+----------------------------------------| +| 0000:df:00.0 AMD Instinct MI300X | 6 % 62 °C 0 434/750 W | +| 7 7 6 SPX/NPS1 | 47 % N/A 4976/196592 MB | ++-------------------------------------+----------------------------------------+ ++------------------------------------------------------------------------------+ +| Processes: | +| GPU PID Process Name GTT_MEM VRAM_MEM MEM_USAGE CU % | +|==============================================================================| +| 0 2427396 rvs 2.0 MB 2.0 GB 2.5 GB 0.0 % | +| 1 2427396 rvs 2.0 MB 2.2 GB 2.6 GB 0.0 % | +| 2 2427396 rvs 2.0 MB 2.3 GB 2.7 GB 0.0 % | +| 3 2427396 rvs 2.0 MB 2.3 GB 2.7 GB 0.0 % | +| 4 2427396 rvs 2.0 MB 2.1 GB 2.5 GB 0.0 % | +| 5 2427396 rvs 2.0 MB 2.0 GB 2.2 GB 0.0 % | +| 6 2427396 rvs 2.0 MB 2.1 GB 2.4 GB 0.0 % | +| 7 2427396 rvs 2.0 MB 2.1 GB 2.5 GB 0.0 % | ++------------------------------------------------------------------------------+ +``` + +- **Added support for GPU metrics 1.8**. + - Added new fields for `amdsmi_gpu_xcp_metrics_t` including: + - Adding the following metrics to allow new calculations for violation status: + - Per XCP metrics `gfx_below_host_limit_ppt_acc[XCP][MAX_XCC]` - GFX Clock Host limit Package Power Tracking violation counts + - Per XCP metrics `gfx_below_host_limit_thm_acc[XCP][MAX_XCC]` - GFX Clock Host limit Thermal (TVIOL) violation counts + - Per XCP metrics `gfx_low_utilization_acc[XCP][MAX_XCC]` - violation counts for how long low utilization caused the GPU to be below application clocks. + - Per XCP metrics `gfx_below_host_limit_total_acc[XCP][MAX_XCC]`- violation counts for how long GPU was held below application clocks by any limiter (see above new violation metrics). + - Increasing available JPEG engines to 40.
+ Current ASICs may not support all 40. These will be indicated as `UINT16_MAX` or `N/A` in CLI. + +- **Added bad page threshold count**. + - Added `amdsmi_get_gpu_bad_page_threshold` to Python API and CLI; root/sudo permissions required to display the count. + +- **Updated `amdsmi_get_gpu_asic_info` in `amdsmi.h`**. + - Added `subsystem_id` structure member. + +- **Added cpu model name for RDC**. + - Added new C and Python API `amdsmi_get_cpu_model_name` + - Not sourced from esmi library. + +- **Added `amdsmi_get_cpu_affinity_with_scope()`**. + +- **Added `socket power` to `amdsmi_get_power_info`** + - Previously the C API had the value in the `amdsmi_power_info` structure, but was unused + - Now we populate the value in both C & Python APIs + - The value is representative of the socket's power agnostic of the GPU version. + +### Changed + + +- **Separated driver reload from `amdsmi_set_gpu_memory_partition()` / `amdsmi_set_gpu_memory_partition_mode()` and CLI (`sudo amd-smi set -M `)** + - Providing new API (`amdsmi_gpu_driver_reload()`) and CLI (`sudo amd-smi reset -r` or `sudo amd-smi reset --reload-driver`) once user is ready to reload driver. We understand + the automatic reload could be at an inconvenient time. This is why we now provide this + functionality in separate API/CLI commands to use when the time is right. + - It is important to understand, the memory (NPS) partition change requires: + 1) Memory partition change request (`amdsmi_set_gpu_memory_partition()` / `amdsmi_set_gpu_memory_partition_mode()`) or CLI (`sudo amd-smi set -M `) + 2) Driver reload (`amdsmi_gpu_driver_reload()` / `sudo amd-smi reset -r` or `sudo amd-smi reset --reload-driver`) \[\*\] + ***Driver reload requires all GPU activity on all devices to be stopped.*** + +- **Modified `amd-smi` CLI `monitor` and `metric` for violations**. + - Disabled `amd-smi monitor --violation` on guests. + - Modified `amd-smi metric -T/--throttle` to alias to `amd-smi metric -v/--violation`.
+ +- **Updated `amdsmi_get_clock_info` in `amdsmi_interface.py`**. + - The `clk_deep_sleep` field now returns the sleep integer value. + +- **The `amd-smi topology` command has been enabled for Guest environments**. + - `amd-smi topology` is now available in Guest environments. This includes full functionality so users can use the command just as they would in Bare Metal environments. + +- **Expanded Violation Status tracking for GPU metrics 1.8**. + - The driver will no longer be supporting existing single-value GFX Clk Below Host Limit fields (`acc_gfx_clk_below_host_limit`, `per_gfx_clk_below_host_limit`, `active_gfx_clk_below_host_limit`), they are now changed in favor of new per-XCP/XCC arrays. + - Added new fields to `amdsmi_violation_status_t` and related interfaces for enhanced violation breakdown: + - Per-XCP/XCC accumulators and status for: + - GFX Clock Below Host Limit (Power, Thermal, and Total) + - Low Utilization + - Added 2D arrays to track per-XCP/XCC accumulators, percentage, and active status: + - `acc_gfx_clk_below_host_limit_pwr`, `acc_gfx_clk_below_host_limit_thm`, `acc_gfx_clk_below_host_limit_total` + - `per_gfx_clk_below_host_limit_pwr`, `per_gfx_clk_below_host_limit_thm`, `per_gfx_clk_below_host_limit_total` + - `active_gfx_clk_below_host_limit_pwr`, `active_gfx_clk_below_host_limit_thm`, `active_gfx_clk_below_host_limit_total` + - `acc_low_utilization`, `per_low_utilization`, `active_low_utilization` + - Python API and CLI now report these expanded fields. 
+ - Example outputs: + + ```console + $ amd-smi monitor -V + GPU XCP PVIOL TVIOL TVIOL_ACTIVE PHOT_TVIOL VR_TVIOL HBM_TVIOL GFX_CLKVIOL GFXCLK_PVIOL GFXCLK_TVIOL GFXCLK_TOTALVIOL LOW_UTILVIOL + 0 0 0 % 0 % False 0 % 0 % 0 % N/A [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %] [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %] [100 %, 100 %, 100 %, 100 %, 100 %, 100 %, 100 %, 100 %] [100 %, 100 %, 100 %, 100 %, 100 %, 100 %, 100 %, 100 %] + 1 0 0 % 0 % False 0 % 0 % 0 % N/A [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %] [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %] [100 %, 100 %, 100 %, 100 %, 100 %, 100 %, 100 %, 100 %] [100 %, 100 %, 100 %, 100 %, 100 %, 100 %, 100 %, 100 %] + ... + ``` + + ```console + $ sudo amd-smi set -C DPX > /dev/null + + $ amd-smi monitor -V + GPU XCP PVIOL TVIOL TVIOL_ACTIVE PHOT_TVIOL VR_TVIOL HBM_TVIOL GFX_CLKVIOL GFXCLK_PVIOL GFXCLK_TVIOL GFXCLK_TOTALVIOL LOW_UTILVIOL + 0 0 0 % 0 % False 0 % 0 % 0 % N/A [0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A] [0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A] [100 %, 100 %, 100 %, 100 %, N/A, N/A, N/A, N/A] [100 %, 100 %, 100 %, 100 %, N/A, N/A, N/A, N/A] + 0 1 N/A N/A N/A N/A N/A N/A N/A [0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A] [0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A] [100 %, 100 %, 100 %, 100 %, N/A, N/A, N/A, N/A] [100 %, 100 %, 100 %, 100 %, N/A, N/A, N/A, N/A] + 1 1 N/A N/A N/A N/A N/A N/A N/A N/A N/A N/A N/A + 2 0 0 % 0 % False 0 % 0 % 0 % N/A [0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A] [0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A] [100 %, 100 %, 100 %, 100 %, N/A, N/A, N/A, N/A] [100 %, 100 %, 100 %, 100 %, N/A, N/A, N/A, N/A] + 2 1 N/A N/A N/A N/A N/A N/A N/A [0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A] [0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A] [100 %, 100 %, 100 %, 100 %, N/A, N/A, N/A, N/A] [100 %, 100 %, 100 %, 100 %, N/A, N/A, N/A, N/A] + ... 
+ ``` + + ```console + $ amd-smi metric -v -g 0 + GPU: 0 + THROTTLE: + ACCUMULATION_COUNTER: 8213780 + PROCHOT_ACCUMULATED: 0 + PPT_ACCUMULATED: 2 + SOCKET_THERMAL_ACCUMULATED: 0 + VR_THERMAL_ACCUMULATED: 0 + HBM_THERMAL_ACCUMULATED: 0 + GFX_CLK_BELOW_HOST_LIMIT_ACCUMULATED: N/A + GFX_CLK_BELOW_HOST_LIMIT_POWER_ACCUMULATED: + XCP_0: [0, 0, 0, 0, N/A, N/A, N/A, N/A] + XCP_1: [0, 0, 0, 0, N/A, N/A, N/A, N/A] + GFX_CLK_BELOW_HOST_LIMIT_THERMAL_ACCUMULATED: + XCP_0: [0, 0, 0, 0, N/A, N/A, N/A, N/A] + XCP_1: [0, 0, 0, 0, N/A, N/A, N/A, N/A] + TOTAL_GFX_CLK_BELOW_HOST_LIMIT_ACCUMULATED: + XCP_0: [8213744, 8213743, 8213742, 8213743, N/A, N/A, N/A, N/A] + XCP_1: [8213744, 8213743, 8213744, 8213744, N/A, N/A, N/A, N/A] + LOW_UTILIZATION_ACCUMULATED: + XCP_0: [8213744, 8213743, 8213742, 8213743, N/A, N/A, N/A, N/A] + XCP_1: [8213744, 8213743, 8213744, 8213744, N/A, N/A, N/A, N/A] + PROCHOT_VIOLATION_STATUS: NOT ACTIVE + PPT_VIOLATION_STATUS: NOT ACTIVE + SOCKET_THERMAL_VIOLATION_STATUS: NOT ACTIVE + VR_THERMAL_VIOLATION_STATUS: NOT ACTIVE + HBM_THERMAL_VIOLATION_STATUS: NOT ACTIVE + GFX_CLK_BELOW_HOST_LIMIT_VIOLATION_STATUS: N/A + GFX_CLK_BELOW_HOST_LIMIT_POWER_VIOLATION_STATUS: + XCP_0: [NOT ACTIVE, NOT ACTIVE, NOT ACTIVE, NOT ACTIVE, N/A, N/A, N/A, N/A] + XCP_1: [NOT ACTIVE, NOT ACTIVE, NOT ACTIVE, NOT ACTIVE, N/A, N/A, N/A, N/A] + GFX_CLK_BELOW_HOST_LIMIT_THERMAL_VIOLATION_STATUS: + XCP_0: [NOT ACTIVE, NOT ACTIVE, NOT ACTIVE, NOT ACTIVE, N/A, N/A, N/A, N/A] + XCP_1: [NOT ACTIVE, NOT ACTIVE, NOT ACTIVE, NOT ACTIVE, N/A, N/A, N/A, N/A] + TOTAL_GFX_CLK_BELOW_HOST_LIMIT_VIOLATION_STATUS: + XCP_0: [ACTIVE, ACTIVE, ACTIVE, ACTIVE, N/A, N/A, N/A, N/A] + XCP_1: [ACTIVE, ACTIVE, ACTIVE, ACTIVE, N/A, N/A, N/A, N/A] + LOW_UTILIZATION_VIOLATION_STATUS: + XCP_0: [ACTIVE, ACTIVE, ACTIVE, ACTIVE, N/A, N/A, N/A, N/A] + XCP_1: [ACTIVE, ACTIVE, ACTIVE, ACTIVE, N/A, N/A, N/A, N/A] + PROCHOT_VIOLATION_ACTIVITY: 0 % + PPT_VIOLATION_ACTIVITY: 0 % + SOCKET_THERMAL_VIOLATION_ACTIVITY: 0 % + 
VR_THERMAL_VIOLATION_ACTIVITY: 0 % + HBM_THERMAL_VIOLATION_ACTIVITY: 0 % + GFX_CLK_BELOW_HOST_LIMIT_VIOLATION_ACTIVITY: N/A + GFX_CLK_BELOW_HOST_LIMIT_POWER_VIOLATION_ACTIVITY: + XCP_0: [0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A] + XCP_1: [0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A] + GFX_CLK_BELOW_HOST_LIMIT_THERMAL_VIOLATION_ACTIVITY: + XCP_0: [0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A] + XCP_1: [0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A] + TOTAL_GFX_CLK_BELOW_HOST_LIMIT_VIOLATION_ACTIVITY: + XCP_0: [100 %, 100 %, 100 %, 100 %, N/A, N/A, N/A, N/A] + XCP_1: [100 %, 100 %, 100 %, 100 %, N/A, N/A, N/A, N/A] + LOW_UTILIZATION_VIOLATION_ACTIVITY: + XCP_0: [100 %, 100 %, 100 %, 100 %, N/A, N/A, N/A, N/A] + XCP_1: [100 %, 100 %, 100 %, 100 %, N/A, N/A, N/A, N/A] + ``` + +- **The char arrays in the following structures have been changed**. + - `amdsmi_vbios_info_t` member `build_date` changed from `AMDSMI_MAX_DATE_LENGTH` to `AMDSMI_MAX_STRING_LENGTH`. + - `amdsmi_dpm_policy_entry_t` member `policy_description` changed from `AMDSMI_MAX_NAME` to `AMDSMI_MAX_STRING_LENGTH`. + - `amdsmi_name_value_t` member `name` changed from `AMDSMI_MAX_NAME` to `AMDSMI_MAX_STRING_LENGTH`. + +- **Added new event notification types to `amdsmi_evt_notification_type_t`**. + The following values were added to the `amdsmi_evt_notification_type_t` enum: + - `AMDSMI_EVT_NOTIF_EVENT_MIGRATE_START` + - `AMDSMI_EVT_NOTIF_EVENT_MIGRATE_END` + - `AMDSMI_EVT_NOTIF_EVENT_PAGE_FAULT_START` + - `AMDSMI_EVT_NOTIF_EVENT_PAGE_FAULT_END` + - `AMDSMI_EVT_NOTIF_EVENT_QUEUE_EVICTION` + - `AMDSMI_EVT_NOTIF_EVENT_QUEUE_RESTORE` + - `AMDSMI_EVT_NOTIF_EVENT_UNMAP_FROM_GPU` + - `AMDSMI_EVT_NOTIF_PROCESS_START` + - `AMDSMI_EVT_NOTIF_PROCESS_END` + +- **Added Power Cap to `amd-smi monitor`**. + - `amd-smi monitor -p` will display the power cap along with power. + + ```console + $ amd-smi monitor -p + GPU POWER PWR_CAP + 0 148 W 750 W + 1 156 W 750 W + 2 153 W 750 W + ... + ``` + +- **Updated `amdsmi_bdf_t` in `amdsmi.h`**. 
+ - The `amdsmi_bdf_t` union was changed to have an identical unnamed struct for backwards compatibility + +- **Updated `amdsmi_get_temp_metric` and `amdsmi_temperature_type_t` with new values**. + - New values have been added to `amdsmi_temperature_type_t` representing various baseboard and gpuboard temperature measures. + - `amdsmi_get_temp_metric` API has also been updated to be able to take in and return the respective values for the new + temperature types. + +- **Modified error responses for `amd-smi set` and `amd-smi reset` to display AMD SMI's error codes** + - Error responses now include the explicit AMDSMI status code in square brackets (e.g., `[AMDSMI_STATUS_NOT_SUPPORTED]`) before the error message for each GPU, providing clear context on the type of failure. + - This change is intended to help provide more context on the failure and why the failure occurred. + - **How to interpret error codes:** + - If you see `[AMDSMI_STATUS_NOT_SUPPORTED]`, the device does not support the requested operation and no action is taken. + - If you see `[AMDSMI_STATUS_INVAL]`, user provided invalid parameters. + - If you see `[AMDSMI_STATUS_BUSY]`, device is busy and cannot process this request + - For other codes, refer to our documentation for details. [Link to `enum amdsmi_status_t` documentation.](https://rocm.docs.amd.com/projects/amdsmi/en/amd-staging/doxygen/docBin/html/amdsmi_8h.html#ab05c37a8d1e512898eef2d25fb9fe06b) + - Example scenarios: + - **Navi System:** + Attempting to change partitions on a Navi system will result in a "not supported" response, since Navi does not support partitions. + + ```console + $ sudo amd-smi set -M NPS2 + + ******WARNING****** + + After changing memory (NPS) partition modes, users MUST restart + (reload) the AMD GPU driver. This command NO LONGER AUTOMATICALLY + reloads the driver, see `amd-smi reset -h` and + `sudo amd-smi reset -r` for more information.
+ + This change is intended to allow users the ability to control when is + the best time to restart the AMD GPU driver, as it may not be desired + to restart the AMD GPU driver immediately after changing the + memory (NPS) partition mode. + + Please use `sudo amd-smi reset -r` AFTER successfully + changing the memory (NPS) partition mode. A successful driver reload + is REQUIRED in order to complete updating ALL GPUs in the hive to + the requested partition mode. + + ******REMINDER****** + In order to reload the AMD GPU driver, users MUST quit all GPU + workloads across all devices. + + Do you accept these terms? [Y/N] y + + GPU: 0 + MEMORY_PARTITION: [AMDSMI_STATUS_NOT_SUPPORTED] Unable to set memory partition to NPS2 + + GPU: 1 + MEMORY_PARTITION: [AMDSMI_STATUS_NOT_SUPPORTED] Unable to set memory partition to NPS2 + ``` + + - **MI3x System in DPX Mode:** + Restricting the power limit on a MI3x device in DPX mode will show "not supported" for logical devices, as only the primary device can accept the change. + + ```console + $ sudo amd-smi set --power-cap 700 + GPU: 0 + POWERCAP: Successfully set power cap to 700W + + GPU: 1 + POWERCAP: [AMDSMI_STATUS_NOT_SUPPORTED] Unable to set power cap to 700W + + GPU: 2 + POWERCAP: Successfully set power cap to 700W + + GPU: 3 + POWERCAP: [AMDSMI_STATUS_NOT_SUPPORTED] Unable to set power cap to 700W + ... + ``` + +### Removed + +- **Removed unnecessary API, `amdsmi_free_name_value_pairs(),` from amdsmi.h** + - This API is only used internally to free up memory from the python interface and does not need to be + exposed to the User. 
+ +- **Removed unused definitions** + - `AMDSMI_MAX_NAME` + - `AMDSMI_256_LENGTH` + - `AMDSMI_MAX_DATE_LENGTH` + - `MAX_AMDSMI_NAME_LENGTH` + - `AMDSMI_LIB_VERSION_YEAR` + - `AMDSMI_DEFAULT_VARIANT` + - `AMDSMI_MAX_NUM_POWER_PROFILES` + - `AMDSMI_MAX_DRIVER_VERSION_LENGTH` + +- **Removed unused member `year` in struct `amdsmi_version_t`** + +- **Removed `amdsmi_io_link_type_t` and replaced with `amdsmi_link_type_t`**. + - `amdsmi_io_link_type_t` is no longer needed as `amdsmi_link_type_t` is sufficient. + - Mapping from `amdsmi_io_link_type_t` to `amdsmi_link_type_t` is as follows: + + ```console + AMDSMI_IOLINK_TYPE_UNDEFINED == AMDSMI_LINK_TYPE_INTERNAL + AMDSMI_IOLINK_TYPE_PCIEXPRESS == AMDSMI_LINK_TYPE_PCIE + AMDSMI_IOLINK_TYPE_XGMI == AMDSMI_LINK_TYPE_XGMI + ``` + + - `amdsmi_link_type_t` enum has changed, primarily the ordering of the PCI and XGMI types: + + ```C++ + typedef enum { + AMDSMI_LINK_TYPE_INTERNAL = 0, + AMDSMI_LINK_TYPE_PCIE = 1, + AMDSMI_LINK_TYPE_XGMI = 2, + AMDSMI_LINK_TYPE_NOT_APPLICABLE = 3, + AMDSMI_LINK_TYPE_UNKNOWN = 4 + } amdsmi_link_type_t; + ``` + + - Please note that this change will also affect `amdsmi_link_metrics_t`, where the link_type field changes from `amdsmi_io_link_type_t` to `amdsmi_link_type_t`: + + ```C++ + typedef struct { + uint32_t num_links; //!< number of links + struct _links { + amdsmi_bdf_t bdf; //!< bdf of the destination gpu + uint32_t bit_rate; //!< current link speed in Gb/s + uint32_t max_bandwidth; //!< max bandwidth of the link in Gb/s + amdsmi_link_type_t link_type; //!< type of the link + uint64_t read; //!< total data received for each link in KB + uint64_t write; //!< total data transfered for each link in KB + uint64_t reserved[2]; + } links[AMDSMI_MAX_NUM_XGMI_PHYSICAL_LINK]; + uint64_t reserved[7]; + } amdsmi_link_metrics_t; + ``` + +- **Removed `amdsmi_get_power_info_v2()`**. + - The amdsmi_get_power_info() has been unified and the v2 function is no longer needed/used. 
+ +- **Removed `AMDSMI_EVT_NOTIF_RING_HANG` event notification type in `amdsmi_evt_notification_type_t`**. + +- **The `amdsmi_get_gpu_vram_info` now provides vendor names as a string**. + - `amdsmi_vram_vendor_type_t` enum structure was removed + - `amdsmi_vram_info_t` member named `amdsmi_vram_vendor_type_t` was changed to a character string + - `amdsmi_get_gpu_vram_info` now no longer requires decoding the vendor name as an enum + +- **Removed backwards compatibility `amdsmi_get_gpu_metrics_info()`'s `jpeg_activity` or `vcn_activity` fields: use `xcp_stats.jpeg_busy` or `xcp_stats.vcn_busy`** + - Backwards compatibility is removed for `jpeg_activity` and `vcn_activity` fields, if the `jpeg_busy` or `vcn_busy` field is available. + - *Reasons for this change:* + - Providing both `vcn_activity`/`jpeg_activity` and XCP (partition) stats `vcn_busy`/`jpeg_busy` caused confusion for users about which field to use. By removing backward compatibility, it is easier to identify the relevant field. + - The `jpeg_busy` field increased in size (for supported ASICs), making backward compatibility unable to fully copy the structure into `jpeg_activity`. 
+ + See below for comparison of updated CLI outputs: + + Original output: + + ```console + $ amd-smi metric --usage + GPU: 0 + USAGE: + GFX_ACTIVITY: 0 % + UMC_ACTIVITY: 0 % + MM_ACTIVITY: N/A + VCN_ACTIVITY: [0 %, N/A, N/A, N/A] + JPEG_ACTIVITY: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + GFX_BUSY_INST: + XCP_0: [0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_1: [0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_2: [0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_3: [0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_4: [0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_5: [0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_6: [0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_7: [0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + JPEG_BUSY: + XCP_0: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_1: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_2: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_3: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_4: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_5: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, 
N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_6: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_7: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + VCN_BUSY: + XCP_0: [0 %, N/A, N/A, N/A] + XCP_1: [0 %, N/A, N/A, N/A] + XCP_2: [0 %, N/A, N/A, N/A] + XCP_3: [0 %, N/A, N/A, N/A] + XCP_4: [0 %, N/A, N/A, N/A] + XCP_5: [0 %, N/A, N/A, N/A] + XCP_6: [0 %, N/A, N/A, N/A] + XCP_7: [0 %, N/A, N/A, N/A] + ``` + + New output: + + ```console + $ amd-smi metric --usage + GPU: 0 + USAGE: + GFX_ACTIVITY: 0 % + UMC_ACTIVITY: 0 % + MM_ACTIVITY: N/A + VCN_ACTIVITY: [N/A, N/A, N/A, N/A] + JPEG_ACTIVITY: [N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + GFX_BUSY_INST: + XCP_0: [0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_1: [0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_2: [0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_3: [0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_4: [0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_5: [0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_6: [0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_7: [0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + JPEG_BUSY: + XCP_0: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_1: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_2: [0 %, 0 %, 0 
%, 0 %, 0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_3: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_4: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_5: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_6: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_7: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + VCN_BUSY: + XCP_0: [0 %, N/A, N/A, N/A] + XCP_1: [0 %, N/A, N/A, N/A] + XCP_2: [0 %, N/A, N/A, N/A] + XCP_3: [0 %, N/A, N/A, N/A] + XCP_4: [0 %, N/A, N/A, N/A] + XCP_5: [0 %, N/A, N/A, N/A] + XCP_6: [0 %, N/A, N/A, N/A] + XCP_7: [0 %, N/A, N/A, N/A] + ``` + +### Optimized + +- **Reduced amd-smi's CLI's API calls needed to be called before reading or (re)setting GPU features**. + - Now when users call any amd-smi CLI command, we have reduced the APIs needed to be called. Previously, + when a user would read a GPU's status, (for example) we would poll for other information helpful for our sets/reset + CLI calls. This change will increase overall run-time performance of the CLI tool. 
+ +- **Removed partition information from the default `amd-smi static` CLI command**. + - Users can still retrieve the same data by calling `amd-smi`, `amd-smi static -p`, or `amd-smi partition -c -m`/`sudo amd-smi partition -a`. + ***Reason for this change***: + Reading current_compute_partition may momentarily wake the GPU up. This is due to reading XCD registers, which is expected behavior. Changing partitions is not a trivial operation, `current_compute_partition` SYSFS controls this action. + +- **Optimized CLI command `amd-smi topology` in partition mode**. + - Reduced the number of `amdsmi_topo_get_p2p_status` API calls to one fourth. + +### Resolved issues + +- **Removed duplicated GPU IDs when receiving events using the `amd-smi event` command**. + +- **Fixed `amd-smi monitor` decoder utilization (`DEC%`) not showing up on MI3x ASICs**. + +- **Removed additional output after valid json for `amd-smi partition --json`**. + - Previously, when calling `amd-smi partition --json`, there was additional output after the valid json. + - This has been fixed to only show valid json output. + +### Upcoming changes + +- **`amd-smi metric` will also display gpuboard and baseboard temperatures**. + - This change is meant to follow the API change to amdsmi_get_temp_metric. If these measures are not available due + to hardware incompatibility, then they will simply not be displayed in the results when using the metric command. + +### Known issues + +- `amd-smi monitor` does not work on guest systems + + ```shell + $ amd-smi monitor + AttributeError: 'Namespace' object has no attribute 'violation' + ``` + +## amd_smi_lib for ROCm 6.4.2 + +### Added + +- **Added Compute Unit Occupancy information per process** + Measuring compute units are the best way currently to determine gfx usage on a per process basis + - Added `cu_occupancy` field to `amdsmi_proc_info_t` structure in C & Python APIs, in minor version update + - Added `CU_OCCUPANCY` to `amd-smi process` output. 
+ - Added `CU%` to `amd-smi monitor -q` + +- **Added support to get GPU Board voltage**. + + ```console + $ amd-smi metric --voltage + GPU: 0 + VOLTAGE: + VDDBOARD: 52536 mV + ... + ``` + +- **Added new firmware PLDM_BUNDLE**. + - `amd-smi firmware` can now show the PLDM Bundle on supported systems. + +- **Added `amd-smi ras --afid --cper-file ` to decode CPER records** + - Python and C have added the `amdsmi_get_afids_from_cper()` to decode + +### Changed + +- **Padded `asic_serial` in `amdsmi_get_asic_info` with 0s**. + +- **Renamed fields `COMPUTE_PARTITION` to `ACCELERATOR_PARTITION` in CLI call `amd-smi --partition`**. + - We are changing the field named `COMPUTE_PARTITION` to `ACCELERATOR_PARTITION`. + - API and associated struct naming will remain the same + - Reason(s) for this change: + - Align with host AMD SMI's `static --partition` field naming + - Align with naming seen in `amd-smi partition` + + *Previous Output:* + + ```console + $ amd-smi static --partition + GPU: 0 + PARTITION: + COMPUTE_PARTITION: SPX + MEMORY_PARTITION: NPS1 + PARTITION_ID: 0 + ``` + + *New Output:* + + ```console + $ amd-smi static --partition + GPU: 0 + PARTITION: + ACCELERATOR_PARTITION: SPX + MEMORY_PARTITION: NPS1 + PARTITION_ID: 0 + ``` + +### Removed + +- N/A + +### Optimized + +- N/A + +### Resolved issues + +- **Corrected VRAM memory calculation in `amdsmi_get_gpu_process_list`**. + - Previously, the VRAM memory usage reported by `amdsmi_get_gpu_process_list` was inaccurate and calculated using KB vs KiB. + +### Upcoming changes + +- N/A + +### Known issues + +- N/A + +## amd_smi_lib for ROCm 6.4.1 + +### Added + +- **Added dumping CPER entries from RAS tool `amdsmi_get_gpu_cper_entries()` to Python & C APIs**. 
+ - CPER entries consist of `amdsmi_cper_hdr_t` + + ```C + typedef struct { + char signature[4]; /* "CPER" */ + uint16_t revision; + uint32_t signature_end; /* 0xFFFFFFFF */ + uint16_t sec_cnt; + amdsmi_cper_sev_t error_severity; + //valid_bits_t valid_bits; + //uint32_t valid_mask; + amdsmi_cper_valid_bits_t cper_valid_bits; + uint32_t record_length; /* Total size of CPER Entry */ + amdsmi_cper_timestamp_t timestamp; + char platform_id[16]; + amdsmi_cper_guid_t partition_id; /* Reserved */ + char creator_id[16]; + amdsmi_cper_guid_t notify_type; /* CMC, MCE, can use amdsmi_cper_notifiy_type_t to decode*/ + char record_id[8]; /* Unique CPER Entry ID */ + uint32_t flags; /* Reserved */ + uint64_t persistence_info; /* Reserved */ + uint8_t reserved[12]; /* Reserved */ + } amdsmi_cper_hdr_t; + ``` + + - Dumping CPER entries is also enabled in the CLI interface via `sudo amd-smi ras --cper` + + ```console + $ sudo amd-smi ras --cper + Dumping CPER file header entries for GPU 0: + "0": { + "error_severity": "non_fatal_corrected", + "notify_type": "CMC", + "timestamp": "2025/04/08 18:23:44", + "signature": "CPER", + "revision": 256, + "signature_end": "0xffffffff", + "sec_cnt": 1, + "record_length": 472, + "platform_id": "0x1002:0x74A2", + "creator_id": "amdgpu", + "record_id": "5:1", + "flags": 0, + "persistence_info": 0 + } + ``` + +- **Added `amdsmi_get_gpu_busy_percent` to the C API**. + - This function retrieves the GPU busy percentage from the `gpu_busy_percent` sysfs file. + +### Changed + +- **Modified VRAM display for `amd-smi monitor -v`**. + - Added free VRAM and VRAM percentage. + + ```console + $ amd-smi monitor -v + GPU VRAM_USED VRAM_FREE VRAM_TOTAL VRAM% + 0 174 MB 16011 MB 16185 MB 0.01 % + 1 78 MB 347 MB 425 MB 0.18 % + ... + ``` + +### Removed + +- N/A + +### Optimized + +- **Improved load times for CLI commands when the GPU has multiple partitions**.
+ +### Resolved issues + +- **Fixed partition enumeration - `amd-smi list -e`, `amdsmi_get_gpu_enumeration_info()`'s `amdsmi_enumeration_info_t` `drm_card` and `drm_render` fields** + Previously, partitions incorrectly reflected the primary node (1st GPU) and showed the DRM Render Minor as renderD128. Partition nodes mirrored renderD128's information, which was incorrect. See the "*Previous Outputs in CPX*" example below. + + Device enumeration was updated to correctly map DRM Render Minor paths. See the "*Corrected Outputs in CPX*" example below. + + These changes impact what information is readable/writable for the partition nodes. + + ***Example: Previous Outputs in CPX*** + + ```console + $ amd-smi list -e + GPU: 0 + BDF: 0000:0c:00.0 + UUID: + KFD_ID: 18421 + NODE_ID: 2 + PARTITION_ID: 0 + RENDER: renderD128 + CARD: card0 + HSA_ID: 2 + HIP_ID: 0 + HIP_UUID: + + GPU: 1 + BDF: 0000:0c:00.1 + UUID: + KFD_ID: 48116 + NODE_ID: 3 + PARTITION_ID: 1 + RENDER: N/A + CARD: N/A + HSA_ID: 3 + HIP_ID: 1 + HIP_UUID: GPU- + ... + ``` + + ```console + $ amd-smi monitor + GPU POWER GPU_T MEM_T GFX_CLK GFX% MEM% ENC% DEC% VRAM_USAGE + 0 201 W 46 °C 42 °C 2107 MHz 0 % 0 % N/A 0 % 0.3/192.0 GB + 1 201 W 46 °C 42 °C 2107 MHz 0 % 0 % N/A 0 % 0.3/192.0 GB + 2 201 W 46 °C 42 °C 2107 MHz 0 % 0 % N/A 0 % 0.3/192.0 GB + 3 201 W 46 °C 42 °C 2107 MHz 0 % 0 % N/A 0 % 0.3/192.0 GB + 4 201 W 46 °C 42 °C 2107 MHz 0 % 0 % N/A 0 % 0.3/192.0 GB + 5 201 W 46 °C 42 °C 2107 MHz 0 % 0 % N/A 0 % 0.3/192.0 GB + 6 201 W 46 °C 42 °C 2107 MHz 0 % 0 % N/A 0 % 0.3/192.0 GB + 7 201 W 46 °C 42 °C 2107 MHz 0 % 0 % N/A 0 % 0.3/192.0 GB + 8 210 W 46 °C 42 °C 2104 MHz 0 % 0 % N/A 0 % 0.3/192.0 GB + ... 
+ ``` + + ***Example: Corrected outputs in CPX*** + + ```console + $ amd-smi list -e + GPU: 0 + BDF: 0000:0c:00.0 + UUID: + KFD_ID: 18421 + NODE_ID: 2 + PARTITION_ID: 0 + RENDER: renderD128 + CARD: card0 + HSA_ID: 2 + HIP_ID: 0 + HIP_UUID: GPU- + + GPU: 1 + BDF: 0000:0c:00.1 + UUID: + KFD_ID: 48116 + NODE_ID: 3 + PARTITION_ID: 1 + RENDER: renderD129 + CARD: card1 + HSA_ID: 3 + HIP_ID: 1 + HIP_UUID: GPU- + ... + ``` + + ```console + $ amd-smi monitor + GPU POWER GPU_T MEM_T GFX_CLK GFX% MEM% ENC% DEC% VRAM_USAGE + 0 202 W 46 °C 42 °C 2107 MHz 0 % 0 % N/A 0 % 0.3/192.0 GB + 1 N/A N/A N/A N/A N/A N/A N/A N/A 0.5/ 24.0 GB + 2 N/A N/A N/A N/A N/A N/A N/A N/A 0.5/ 24.0 GB + 3 N/A N/A N/A N/A N/A N/A N/A N/A 0.5/ 24.0 GB + 4 N/A N/A N/A N/A N/A N/A N/A N/A 0.5/ 24.0 GB + 5 N/A N/A N/A N/A N/A N/A N/A N/A 0.5/ 24.0 GB + 6 N/A N/A N/A N/A N/A N/A N/A N/A 0.5/ 24.0 GB + 7 N/A N/A N/A N/A N/A N/A N/A N/A 0.5/ 24.0 GB + 8 210 W 46 °C 42 °C 2104 MHz 0 % 0 % N/A 0 % 0.3/192.0 GB + ... + ``` + +### Upcoming changes + +- N/A + +### Known issues + +- N/A + +## amd_smi_lib for ROCm 6.4.0 + +### Added + +- **Added enumeration mapping `amdsmi_get_gpu_enumeration_info()` to Python & C APIs**. 
+ - Enumeration mapping consists of `amdsmi_enumeration_info_t` + + ```C + typedef struct { + uint32_t drm_render; // the render node under /sys/class/drm/renderD* + uint32_t drm_card; // the graphic card device under /sys/class/drm/card* + uint32_t hsa_id; // the HSA enumeration ID + uint32_t hip_id; // the HIP enumeration ID + char hip_uuid[AMDSMI_MAX_STRING_LENGTH]; // the HIP unique identifier + } amdsmi_enumeration_info_t; + ``` + + - The mapping is also enabled in the CLI interface via `amd-smi list -e` + + ```console + $ amd-smi list -e + GPU: 0 + BDF: 0000:23:00.0 + UUID: XXXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX + KFD_ID: 45412 + NODE_ID: 1 + PARTITION_ID: 0 + RENDER: renderD128 + CARD: card0 + HSA_ID: 1 + HIP_ID: 0 + HIP_UUID: GPU-XXXXXXXXXXXXXXXX + ``` + +- **Added dynamic virtualization mode detection**. + - Added new C and Python API `amdsmi_get_gpu_virtualization_mode` + - Added new C and Python enum `amdsmi_virtualization_mode_t` + +- **Added TVIOL_ACTIVE to `amd-smi monitor`**. + - Added temperature violation active or not status to `amd-smi monitor`. TVIOL_ACTIVE will be displayed as below: + - True if active + - False if not active + - N/A if not supported. + + Example CLI output: + + ```console + $ amd-smi monitor --viol + GPU PVIOL TVIOL TVIOL_ACTIVE PHOT_TVIOL VR_TVIOL HBM_TVIOL + 0 100 % 1 % True 0 % 0 % 0 % + 1 100 % 0 % False 0 % 0 % 0 % + 2 100 % 0 % False 0 % 0 % 0 % + 3 100 % 0 % False 0 % 0 % 0 % + 4 100 % 0 % False 0 % 0 % 0 % + 5 100 % 3 % True 0 % 0 % 0 % + 6 100 % 0 % False 0 % 0 % 0 % + 7 100 % 0 % False 0 % 0 % 0 % + ``` + +- **Added support for GPU metrics 1.7 to `amdsmi_get_gpu_metrics_info()`**. 
+Updated `amdsmi_get_gpu_metrics_info()` and structure `amdsmi_gpu_metrics_t` to include new fields for XGMI Link Status, graphics clocks below host limit (per XCP), and VRAM max bandwidth: + - `uint64_t vram_max_bandwidth` - VRAM max bandwidth at max memory clock (GB/s) + - `uint16_t xgmi_link_status[MAX_NUM_XGMI_LINKS]` - XGMI link status, 1=Up 0=Down + - `uint64_t gfx_below_host_limit_acc[MAX_NUM_XCC]` - graphics clocks below host limit (per XCP) accumulators. Used for graphic clk below host limit violation status. + +- **Added new API `amdsmi_get_gpu_xgmi_link_status()` and CLI `amd-smi xgmi --link-status`** + + New API is defined as: + + ```C + typedef enum { + AMDSMI_XGMI_LINK_DOWN, //!< The XGMI Link is down + AMDSMI_XGMI_LINK_UP, //!< The XGMI Link is up + AMDSMI_XGMI_LINK_DISABLE, //!< The XGMI Link is disabled + } amdsmi_xgmi_link_status_type_t; + + typedef struct { + uint32_t total_links; //!< The total links in the status array + amdsmi_xgmi_link_status_type_t status[AMDSMI_MAX_NUM_XGMI_LINKS]; + uint64_t reserved[7]; + } amdsmi_xgmi_link_status_t; + + amdsmi_status_t amdsmi_get_gpu_xgmi_link_status(amdsmi_processor_handle processor_handle, amdsmi_xgmi_link_status_t *link_status) + ``` + + Example CLI output: + + ```console + $ amd-smi xgmi --link-status + + XGMI LINK STATUS: + bdf link_status + GPU0 0000:08:00.0 U U U U D U D X + GPU1 0000:44:00.0 U U U U D U D X + ... + + * U:Up D:Down X:Disabled + ``` + +- **Added fclk and socclk info to `amd-smi metric -c/--clock`**. + - fclk and socclk information such as min and max clock have been added to the metric command, in line with all the other clocks. + + ```shell + $ amd-smi metric -c -g 1 + ... + FCLK_0: + CLK: 2301 MHz + MIN_CLK: 601 MHz + MAX_CLK: 2301 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: DISABLED + SOCCLK_0: + CLK: 1500 MHz + MIN_CLK: 500 MHz + MAX_CLK: 1500 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: DISABLED + ``` + +- **Added new command `amd-smi set -c/--clock-level`**.
+ - This new command sets the performance level of the selected clock on the desired GPUs. + - The command can accept a range of acceptable levels, but will not set the level when a level is beyond the number of frequency levels as shown in `amd-smi static -C/--clock`. + + ```console + $ sudo amd-smi set -c sclk 5 6 + GPU: 0 + CLK_LEVEL: Successfully changed sclk perf level(s) to 5, 6 + +GPU: 1 + CLK_LEVEL: clock level(s) 5, 6 is/are greater than sclk frequency levels supported for device GPU ID: 1 BDF:0000:46:00.0 +``` + +- **Added new command `amd-smi static -C/--clock`**. + - This new command displays the clock frequency performance levels for the selected GPUs and clocks. + + ```console + $ amd-smi static --clock all -g 0 + GPU: 0 + CLOCK: + SYS: + CURRENT LEVEL: 2 + FREQUENCY_LEVELS: + 0: 300 MHz + 1: 904 MHz + 2: 1165 MHz + 3: 1360 MHz + 4: 1440 MHz + 5: 1544 MHz + 6: 1627 MHz + 7: 1720 MHz + 8: 1800 MHz + MEM: + CURRENT LEVEL: 0 + FREQUENCY_LEVELS: + 0: 167 MHz + DF: + CURRENT LEVEL: 0 + FREQUENCY_LEVELS: + 0: 1400 MHz + SOC: + CURRENT LEVEL: 0 + FREQUENCY_LEVELS: + 0: 302 MHz + DCEF: N/A + VCLK0: N/A + VCLK1: N/A + DCLK0: N/A + DCLK1: N/A + ``` + +### Changed + +- **AMDSMI Library Version number to reflect changes in backwards compatibility**. + - Removed Year from AMDSMI Library version number. + - Version changed from 25.2.0.0 (Year.Major.Minor.Patch) to 25.2.0 (Major.Minor.Patch) + - Removed year in all version references + +- **Removed initialization requirements for `amdsmi_get_lib_version()` and added `amdsmi_get_rocm_version()` to the python API & CLI**. + +- **Added `amdsmi_get_power_info_v2()` with `sensor_ind`**. + - Python API now accepts sensor_ind as an optional argument, does not impact previous usage + +- **Deprecated enum `AMDSMI_NORMAL_STRING_LENGTH` in favor of `AMDSMI_MAX_STRING_LENGTH`**. + +- **Changed to use thread local mutex by default**.
+ - Most sysfs reads do not require cross-process level mutex, and writes to sysfs should be protected by the kernel already. + - Users can still switch to the old behavior by setting the environment variable `AMDSMI_MUTEX_CROSS_PROCESS=1`. + +- **Changed `amdsmi_vram_vendor_type_t` enum names impacting `amdsmi_vram_info_t` structure**. +This change also impacts usage of the vram_vendor output of `amdsmi_get_gpu_vram_info()` + +- **Changed `amdsmi_nps_caps_t` struct impacting `amdsmi_memory_partition_config_t`, `amdsmi_accelerator_partition_t`, `amdsmi_accelerator_partition_profile_config_t`**. +Functions affected by struct change are: + - `amdsmi_get_gpu_memory_partition_config()` + - `amdsmi_get_gpu_accelerator_partition_profile()` + - `amdsmi_get_gpu_accelerator_partition_profile_config()` + +- **Corrected CLI CPU argument name**. + - `--cpu-pwr-svi-telemtry-rails` to `--cpu-pwr-svi-telemetry-rails` + +- **Added amdgpu driver version and amd_hsmp driver version to `amd-smi version` command**. + - The `amd-smi version` command can now also display the amdgpu driver version using the `-g` flag. + - The amd_hsmp driver version can also be displayed using the `-c` flag. + - The new default for the `version` command is to display all the version information, including both amdgpu and amd_hsmp driver versions. + + ```console + $ amd-smi version + AMDSMI Tool: 24.7.1+b446d6c-dirty | AMDSMI Library version: 24.7.2.0 | ROCm version: N/A | amdgpu version: 6.10.10 | amd_hsmp version: 2.2 + + $ amd-smi version -g + AMDSMI Tool: 24.7.1+b446d6c-dirty | AMDSMI Library version: 24.7.2.0 | ROCm version: N/A | amdgpu version: 6.10.10 + + $ amd-smi version -c + AMDSMI Tool: 24.7.1+b446d6c-dirty | AMDSMI Library version: 24.7.2.0 | ROCm version: N/A | amd_hsmp version: 2.2 + ``` + +- **All `amd-smi set` and `amd-smi reset` options are now mutually exclusive**. + - Users can only use one set option at a time now.
+ +- **Python API for `amdsmi_get_energy_count()` will change the name for the `power` field to `energy_accumulator`**. + +- **Added violation status output for Graphics Clock Below Host Limit to our CLI: `amdsmi_get_violation_status()`, `amd-smi metric --throttle`, and `amd-smi monitor --violation`**. + ***Only available for MI300+ ASICs.*** + Users can retrieve violation status' through either our Python or C++ APIs. + Additionally, we have added capability to view these outputs conveniently through `amd-smi metric --throttle` and `amd-smi monitor --violation`. + Example outputs are listed below (below is for reference, output is subject to change): + + ```console + $ amd-smi monitor --violation + GPU PVIOL TVIOL TVIOL_ACTIVE PHOT_TVIOL VR_TVIOL HBM_TVIOL GFX_CLKVIOL + 0 0 % 0 % False 0 % 0 % 0 % 0 % + 1 0 % 0 % False 0 % 0 % 0 % 0 % + ... + ``` + + ```console + $ amd-smi metric --throttle + GPU: 0 + THROTTLE: + ACCUMULATION_COUNTER: 11240028 + PROCHOT_ACCUMULATED: 0 + PPT_ACCUMULATED: 0 + SOCKET_THERMAL_ACCUMULATED: 0 + VR_THERMAL_ACCUMULATED: 0 + HBM_THERMAL_ACCUMULATED: 0 + GFX_CLK_BELOW_HOST_LIMIT_ACCUMULATED: N/A + PROCHOT_VIOLATION_STATUS: NOT ACTIVE + PPT_VIOLATION_STATUS: NOT ACTIVE + SOCKET_THERMAL_VIOLATION_STATUS: NOT ACTIVE + VR_THERMAL_VIOLATION_STATUS: NOT ACTIVE + HBM_THERMAL_VIOLATION_STATUS: NOT ACTIVE + GFX_CLK_BELOW_HOST_LIMIT_VIOLATION_STATUS: N/A + PROCHOT_VIOLATION_ACTIVITY: 0 % + PPT_VIOLATION_ACTIVITY: 0 % + SOCKET_THERMAL_VIOLATION_ACTIVITY: 0 % + VR_THERMAL_VIOLATION_ACTIVITY: 0 % + HBM_THERMAL_VIOLATION_ACTIVITY: 0 % + GFX_CLK_BELOW_HOST_LIMIT_VIOLATION_ACTIVITY: 0 % + + GPU: 1 + THROTTLE: + ACCUMULATION_COUNTER: 11238232 + PROCHOT_ACCUMULATED: 0 + PPT_ACCUMULATED: 0 + SOCKET_THERMAL_ACCUMULATED: 0 + VR_THERMAL_ACCUMULATED: 0 + HBM_THERMAL_ACCUMULATED: 0 + GFX_CLK_BELOW_HOST_LIMIT_ACCUMULATED: 0 + PROCHOT_VIOLATION_STATUS: NOT ACTIVE + PPT_VIOLATION_STATUS: NOT ACTIVE + SOCKET_THERMAL_VIOLATION_STATUS: NOT ACTIVE + 
VR_THERMAL_VIOLATION_STATUS: NOT ACTIVE + HBM_THERMAL_VIOLATION_STATUS: NOT ACTIVE + GFX_CLK_BELOW_HOST_LIMIT_VIOLATION_STATUS: NOT ACTIVE + PROCHOT_VIOLATION_ACTIVITY: 0 % + PPT_VIOLATION_ACTIVITY: 0 % + SOCKET_THERMAL_VIOLATION_ACTIVITY: 0 % + VR_THERMAL_VIOLATION_ACTIVITY: 0 % + HBM_THERMAL_VIOLATION_ACTIVITY: 0 % + GFX_CLK_BELOW_HOST_LIMIT_VIOLATION_ACTIVITY: 0 % + ... + ``` + +- **Updated API `amdsmi_get_violation_status()` structure and CLI `amdsmi_violation_status_t` to include GFX Clk below host limit** + Updated structure `amdsmi_violation_status_t`: + + ```C + typedef struct { + ... + uint64_t acc_gfx_clk_below_host_limit; //!< Current graphic clock below host limit count; Max uint64 means unsupported + ... + uint64_t per_gfx_clk_below_host_limit; //!< Graphics clock below host limit violation % (greater than 0% is a violation); Max uint64 means unsupported + ... + uint8_t active_gfx_clk_below_host_limit; //!< Graphics clock below host limit violation; 1 = active 0 = not active; Max uint8 means unsupported + ... + } amdsmi_violation_status_t; + ``` + +- **Updated API `amdsmi_get_gpu_vram_info()` structure and CLI `amd-smi static --vram`** + Updated structure `amdsmi_vram_info_t`: + + ```C + typedef struct { + amdsmi_vram_type_t vram_type; + amdsmi_vram_vendor_type_t vram_vendor; + uint64_t vram_size; + uint32_t vram_bit_width; + uint64_t vram_max_bandwidth; //!< The VRAM max bandwidth at current memory clock (GB/s) + uint64_t reserved[4]; + } amdsmi_vram_info_t; + + amdsmi_status_t amdsmi_get_gpu_vram_info(amdsmi_processor_handle processor_handle, amdsmi_vram_info_t *info) + ``` + + Example CLI output: + + ```console + $ amd-smi static --vram + GPU: 0 + VRAM: + TYPE: GDDR6 + VENDOR: N/A + SIZE: 16368 MB + BIT_WIDTH: 256 + MAX_BANDWIDTH: 1555 GB/s + GPU: 1 + VRAM: + TYPE: GDDR6 + VENDOR: N/A + SIZE: 30704 MB + BIT_WIDTH: 256 + MAX_BANDWIDTH: 1555 GB/s + ... 
+ ``` + +- **Changed amd-smi partition --accelerator & `amdsmi_get_gpu_accelerator_partition_profile_config()` detect users running without root/sudo permissions** + - Updated `amdsmi_get_gpu_accelerator_partition_profile_config()` to return `AMDSMI_STATUS_NO_PERM` immediately if users run without root/sudo permissions. + - Updated `amd-smi partition --accelerator` to provide a warning for users without root/sudo permissions (see example below, ***output subject to change***). + + ```console + $ amd-smi partition --accelerator + + ACCELERATOR_PARTITION_PROFILES: + + *************************************************************************** + ** WARNING: ** + ** ACCELERATOR_PARTITION_PROFILES requires sudo/root permissions to run. ** + ** Please run the command with sudo permissions to get accurate results. ** + *************************************************************************** + + GPU_ID PROFILE_INDEX MEMORY_PARTITION_CAPS ACCELERATOR_TYPE PARTITION_ID NUM_PARTITIONS NUM_RESOURCES RESOURCE_INDEX RESOURCE_TYPE RESOURCE_INSTANCES RESOURCES_SHARED + N/A N/A N/A N/A 0 N/A N/A N/A N/A N/A N/A + N/A N/A N/A N/A 0 N/A N/A N/A N/A N/A N/A + N/A N/A N/A N/A 0 N/A N/A N/A N/A N/A N/A + N/A N/A N/A N/A 0 N/A N/A N/A N/A N/A N/A + N/A N/A N/A N/A 0 N/A N/A N/A N/A N/A N/A + N/A N/A N/A N/A 0 N/A N/A N/A N/A N/A N/A + N/A N/A N/A N/A 0 N/A N/A N/A N/A N/A N/A + N/A N/A N/A N/A 0 N/A N/A N/A N/A N/A N/A + + ACCELERATOR_PARTITION_RESOURCES: + RESOURCE_INDEX RESOURCE_TYPE RESOURCE_INSTANCES RESOURCES_SHARED + N/A N/A N/A N/A + N/A N/A N/A N/A + N/A N/A N/A N/A + N/A N/A N/A N/A + N/A N/A N/A N/A + N/A N/A N/A N/A + N/A N/A N/A N/A + N/A N/A N/A N/A + + + Legend: + * = Current mode + ``` + +- **Changed `amd-smi partition --current`, `amd-smi partition --accelerator`, and `amdsmi_get_gpu_accelerator_partition_profile()` to display partition ID for each individual partition** + - Host will continue to display in the full array format, they do not display the individual 
partitions as Baremetal/Guest setups. + - Baremetal and Guest MI3x setups will change to reflect each individual partition ID, now provided in `partition_id[0]` location (as seen in other amd-smi CLI commands). + - This change was needed for BM/Guest setups due to other related partition outputs seen in (`amd-smi list` and `amd-smi static --partition`) and individual logical partition devices displayed. + + Previous output: + + ```console + $ amd-smi partition --current + + CURRENT_PARTITION: + GPU_ID MEMORY ACCELERATOR_TYPE ACCELERATOR_PROFILE_INDEX PARTITION_ID + 0 NPS1 CPX 3 0,1,2,3,4,5,6,7 + 1 NPS1 CPX 3 N/A + 2 NPS1 CPX 3 N/A + 3 NPS1 CPX 3 N/A + 4 NPS1 CPX 3 N/A + 5 NPS1 CPX 3 N/A + 6 NPS1 CPX 3 N/A + 7 NPS1 CPX 3 N/A + 8 NPS1 CPX 3 0,1,2,3,4,5,6,7 + 9 NPS1 CPX 3 N/A + 10 NPS1 CPX 3 N/A + ... + ``` + + New output: + + ```console + amd-smi partition --current + CURRENT_PARTITION: + GPU_ID MEMORY ACCELERATOR_TYPE ACCELERATOR_PROFILE_INDEX PARTITION_ID + 0 NPS1 CPX 3 0 + 1 NPS1 CPX 3 1 + 2 NPS1 CPX 3 2 + 3 NPS1 CPX 3 3 + 4 NPS1 CPX 3 4 + 5 NPS1 CPX 3 5 + 6 NPS1 CPX 3 6 + 7 NPS1 CPX 3 7 + 8 NPS1 CPX 3 0 + 9 NPS1 CPX 3 1 + 10 NPS1 CPX 3 2 + ... + ``` + +### Removed + +- **Removed `GFX_BUSY_ACC` from `amd-smi metric --usage`**. + - Displaying `GFX_BUSY_ACC` does not provide helpful outputs for users. 
+ + Old output: + + ```console + $ amd-smi metric --usage + GPU: 0 + USAGE: + GFX_ACTIVITY: 0 % + UMC_ACTIVITY: 0 % + MM_ACTIVITY: N/A + VCN_ACTIVITY: [0 %, 0 %, 0 %, 0 %] + JPEG_ACTIVITY: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %] + GFX_BUSY_INST: + XCP_0: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %] + JPEG_BUSY: + XCP_0: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %] + VCN_BUSY: + XCP_0: [0 %, 0 %, 0 %, 0 %] + GFX_BUSY_ACC: + XCP_0: [N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + ... + ``` + + New Output: + + ```console + $ amd-smi metric --usage + GPU: 0 + USAGE: + GFX_ACTIVITY: 0 % + UMC_ACTIVITY: 0 % + MM_ACTIVITY: N/A + VCN_ACTIVITY: [0 %, 0 %, 0 %, 0 %] + JPEG_ACTIVITY: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %] + GFX_BUSY_INST: + XCP_0: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %] + JPEG_BUSY: + XCP_0: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %] + VCN_BUSY: + XCP_0: [0 %, 0 %, 0 %, 0 %] + ... + ``` + +### Optimized + +- **Added additional help information to `amd-smi set --help` command**. + - sub commands now detail what values are acceptable as input. 
These include: + - `amd-smi set --perf-level` with performance levels + - `amd-smi set --profile` with power profiles + - `amd-smi set --perf-determinism` with preset GPU frequency limits + - `amd-smi set --power-cap` with valid power cap values + - `amd-smi set --soc-pstate` with soc pstate policy ids + - `amd-smi set --xgmi-plpd` with xgmi per link power down policy ids + +- **Modified `amd-smi` CLI to allow case insensitive arguments if the argument does not begin with a single dash**. + - With this change `amd-smi version` and `amd-smi VERSION` will now yield the same output. + - `amd-smi static --bus` and `amd-smi STATIC --BUS` will produce identical results. + - `amd-smi static -b` and `amd-smi static -B` will still return different results (-b for bus and -B for board). + +- **Converted xgmi read and write from KB's to readable units**. + - With this change `amd-smi xgmi` will now display the statistics in dynamically selected readable units. + - Example CLI output: + + ```console + $ amd-smi xgmi + LINK METRIC TABLE: + bdf bit_rate max_bandwidth link_type 0000:05:00.0 0000:26:00.0 0000:46:00.0 0000:65:00.0 0000:85:00.0 0000:a6:00.0 0000:c6:00.0 0000:e5:00.0 + GPU0 0000:05:00.0 32 Gb/s 512 Gb/s XGMI + Read N/A 1.123 PB 1.123 PB 1.123 PB 1.123 PB 1.123 PB 1.123 PB 1.123 PB + Write N/A 229.1 MB 229.1 MB 229.1 MB 229.1 MB 229.1 MB 229.1 MB 229.1 MB + GPU1 0000:26:00.0 32 Gb/s 512 Gb/s XGMI + Read 1.123 PB N/A 1.123 PB 1.123 PB 1.123 PB 1.123 PB 1.123 PB 1.123 PB + Write 229.1 MB N/A 229.1 MB 229.1 MB 229.1 MB 229.1 MB 229.1 MB 229.1 MB + GPU2 0000:46:00.0 32 Gb/s 512 Gb/s XGMI + Read 1.123 PB 1.123 PB N/A 1.123 PB 1.123 PB 1.123 PB 1.123 PB 1.123 PB + Write 229.1 MB 229.1 MB N/A 229.1 MB 229.1 MB 229.1 MB 229.1 MB 229.1 MB + ...
+ ``` + +### Resolved issues + +- **Fixed `amd-smi static --partition` for guest systems with MIx ASICs being unable to run** + +- **Fixed `amdsmi_get_gpu_asic_info` and `amd-smi static --asic` not displaying graphics version properly for MI2x, MI1x or Navi 3x ASICs**. + + Before on MI100: + + ```console + $ amd-smi static --asic | grep TARGET_GRAPHICS_VERSION + TARGET_GRAPHICS_VERSION: gfx9008 + TARGET_GRAPHICS_VERSION: gfx9008 + ``` + + After on MI100: + + ```console + $ amd-smi static --asic | grep TARGET_GRAPHICS_VERSION + TARGET_GRAPHICS_VERSION: gfx908 + TARGET_GRAPHICS_VERSION: gfx908 + ``` + +- **Fixed `amd-smi static --partition` for guest systems with MIx ASICs being unable to run** + +### Upcoming changes + +- **Deprecation in ROCm 7.0 of the `AMDSMI_LIB_VERSION_YEAR` enum and API fields**. + +- **Deprecation in ROCm 7.0 of the `pasid` field within struct `amdsmi_process_info_t`** + +### Known issues + +- **AMD SMI only reports 63 GPU devices when setting CPX on all 8 GPUs** + When setting CPX as a partition mode, there is a DRM node limitation of 64. + This is a known limitation of the Linux kernel, not the driver. Other drivers, such as those using PCIe space (e.g., ast), may be occupying the necessary DRM nodes. + The number of DRM nodes used can be checked via `ls /sys/class/drm` + + - References to kernel changes: + - [Updates to number of node](https://cgit.freedesktop.org/drm/libdrm/commit/?id=7130cb163eb860d4a965c6708b64fe87cee881d6) + - [Identification of node type](https://cgit.freedesktop.org/drm/libdrm/commit/?id=3bc3cca230c5a064b2f554f26fdec27db0f5ead8) + + Options are as follows: + 1) ***Workaround - removing other devices using DRM nodes*** + + Recommended steps for removing unnecessary drivers: + a. Unload amdgpu - `sudo rmmod amdgpu` + b. Remove unnecessary driver(s) - ex. `sudo rmmod ast` + c. Reload amdgpu - `sudo modprobe amdgpu` + d.
Confirm `amd-smi list` reports all nodes (this can vary per MI ASIC) + + 2) ***Update your OS' kernel*** + 3) ***Building and installing your own kernel*** + +## amd_smi_lib for ROCm 6.3.1 + +### Added + +### Changed + +- **Changed `amd-smi monitor`: No longer display `ENC_CLOCK`/`DEC_CLOCK` but `VCLOCK` and `DCLOCK`**. + Due to fix mentioned in `Resolved Issues`, this change was needed. + Reason: Navi products use vclk and dclk for both encode and decode. On MI products, only decode is supported. + Before: + + ```console + $ amd-smi monitor -n -d + GPU ENC_UTIL ENC_CLOCK DEC_UTIL DEC_CLOCK + 0 0.0 % 29 MHz N/A 22 MHz + 1 0.0 % 29 MHz N/A 22 MHz + 2 0.0 % 29 MHz N/A 22 MHz + 3 0.0 % 29 MHz N/A 22 MHz + 4 0.0 % 29 MHz N/A 22 MHz + 5 0.0 % 29 MHz N/A 22 MHz + 6 0.0 % 29 MHz N/A 22 MHz + 7 0.0 % 29 MHz N/A 22 MHz + ``` + + After: + + ```console + $ amd-smi monitor -n -d + GPU ENC_UTIL DEC_UTIL VCLOCK DCLOCK + 0 N/A 0.0 % 29 MHz 22 MHz + 1 N/A 0.0 % 29 MHz 22 MHz + 2 N/A 0.0 % 29 MHz 22 MHz + 3 N/A 0.0 % 29 MHz 22 MHz + 4 N/A 0.0 % 29 MHz 22 MHz + 5 N/A 0.0 % 29 MHz 22 MHz + 6 N/A 0.0 % 29 MHz 22 MHz + 7 N/A 0.0 % 29 MHz 22 MHz + ``` + +### Removed + +### Optimized + +### Resolved issues + +- **Fixed `amd-smi monitor`'s encode/decode: `ENC_UTIL`, `DEC_UTIL`, and now associate `VCLOCK`/`DCLOCK` with both**. + Navi products use vclk and dclk for both encode and decode. On MI products, only decode is supported. + + Navi products cannot support displaying ENC_UTIL % at this time. 
+ + Before: + ```console + $ amd-smi monitor -n -d + GPU ENC_UTIL ENC_CLOCK DEC_UTIL DEC_CLOCK + 0 0.0 % 29 MHz N/A 22 MHz + 1 0.0 % 29 MHz N/A 22 MHz + 2 0.0 % 29 MHz N/A 22 MHz + 3 0.0 % 29 MHz N/A 22 MHz + 4 0.0 % 29 MHz N/A 22 MHz + 5 0.0 % 29 MHz N/A 22 MHz + 6 0.0 % 29 MHz N/A 22 MHz + 7 0.0 % 29 MHz N/A 22 MHz + ``` + + After: + ```console + $ amd-smi monitor -n -d + GPU ENC_UTIL DEC_UTIL VCLOCK DCLOCK + 0 N/A 0.0 % 29 MHz 22 MHz + 1 N/A 0.0 % 29 MHz 22 MHz + 2 N/A 0.0 % 29 MHz 22 MHz + 3 N/A 0.0 % 29 MHz 22 MHz + 4 N/A 0.0 % 29 MHz 22 MHz + 5 N/A 0.0 % 29 MHz 22 MHz + 6 N/A 0.0 % 29 MHz 22 MHz + 7 N/A 0.0 % 29 MHz 22 MHz + ``` + +### Upcoming changes + +### Known issues + +## amd_smi_lib for ROCm 6.3.0 + +### Added + +- **Added support for `amd-smi metric --ecc` & `amd-smi metric --ecc-blocks` on Guest VMs**. +Guest VMs now support getting current ECC counts and ras information from the Host cards. + +- **Added support for GPU metrics 1.6 to `amdsmi_get_gpu_metrics_info()`**. +Updated `amdsmi_get_gpu_metrics_info()` and structure `amdsmi_gpu_metrics_t` to include new fields for PVIOL / TVIOL, XCP (Graphics Compute Partitions) stats, and pcie_lc_perf_other_end_recovery: + - `uint64_t accumulation_counter` - used for all throttled calculations + - `uint64_t prochot_residency_acc` - Processor hot accumulator + - `uint64_t ppt_residency_acc` - Package Power Tracking (PPT) accumulator (used in PVIOL calculations) + - `uint64_t socket_thm_residency_acc` - Socket thermal accumulator - (used in TVIOL calculations) + - `uint64_t vr_thm_residency_acc` - Voltage Rail (VR) thermal accumulator + - `uint64_t hbm_thm_residency_acc` - High Bandwidth Memory (HBM) thermal accumulator + - `uint16_t num_partition` - corresponds to the current total number of partitions + - `struct amdgpu_xcp_metrics_t xcp_stats[MAX_NUM_XCP]` - for each partition associated with current GPU, provides gfx busy & accumulators, jpeg, and decoder (VCN) engine utilizations + - `uint32_t 
gfx_busy_inst[MAX_NUM_XCC]` - graphic engine utilization (%) + - `uint16_t jpeg_busy[MAX_NUM_JPEG_ENGS]` - jpeg engine utilization (%) + - `uint16_t vcn_busy[MAX_NUM_VCNS]` - decoder (VCN) engine utilization (%) + - `uint64_t gfx_busy_acc[MAX_NUM_XCC]` - graphic engine utilization accumulated (%) + - `uint32_t pcie_lc_perf_other_end_recovery` - corresponds to the pcie other end recovery counter + +- **Added new violation status outputs and APIs: `amdsmi_status_t amdsmi_get_violation_status()`, `amd-smi metric --throttle`, and `amd-smi monitor --violation`**. + ***Only available for MI300+ ASICs.*** + Users can now retrieve violation statuses through either our Python or C++ APIs. Additionally, we have + added capability to view these outputs conveniently through `amd-smi metric --throttle` and `amd-smi monitor --violation`. + Example outputs are listed below (below is for reference, output is subject to change): + +```shell +$ amd-smi metric --throttle +GPU: 0 + THROTTLE: + ACCUMULATION_COUNTER: 3808991 + PROCHOT_ACCUMULATED: 0 + PPT_ACCUMULATED: 585613 + SOCKET_THERMAL_ACCUMULATED: 2190 + VR_THERMAL_ACCUMULATED: 0 + HBM_THERMAL_ACCUMULATED: 0 + PROCHOT_VIOLATION_STATUS: NOT ACTIVE + PPT_VIOLATION_STATUS: NOT ACTIVE + SOCKET_THERMAL_VIOLATION_STATUS: NOT ACTIVE + VR_THERMAL_VIOLATION_STATUS: NOT ACTIVE + HBM_THERMAL_VIOLATION_STATUS: NOT ACTIVE + PROCHOT_VIOLATION_ACTIVITY: 0 % + PPT_VIOLATION_ACTIVITY: 0 % + SOCKET_THERMAL_VIOLATION_ACTIVITY: 0 % + VR_THERMAL_VIOLATION_ACTIVITY: 0 % + HBM_THERMAL_VIOLATION_ACTIVITY: 0 % + + + +GPU: 1 + THROTTLE: + ACCUMULATION_COUNTER: 3806335 + PROCHOT_ACCUMULATED: 0 + PPT_ACCUMULATED: 586332 + SOCKET_THERMAL_ACCUMULATED: 18010 + VR_THERMAL_ACCUMULATED: 0 + HBM_THERMAL_ACCUMULATED: 0 + PROCHOT_VIOLATION_STATUS: NOT ACTIVE + PPT_VIOLATION_STATUS: NOT ACTIVE + SOCKET_THERMAL_VIOLATION_STATUS: NOT ACTIVE + VR_THERMAL_VIOLATION_STATUS: NOT ACTIVE + HBM_THERMAL_VIOLATION_STATUS: NOT ACTIVE + PROCHOT_VIOLATION_ACTIVITY: 0 % +
PPT_VIOLATION_ACTIVITY: 0 % + SOCKET_THERMAL_VIOLATION_ACTIVITY: 0 % + VR_THERMAL_VIOLATION_ACTIVITY: 0 % + HBM_THERMAL_VIOLATION_ACTIVITY: 0 % + +... +``` + +```shell +$ amd-smi monitor --violation +GPU PVIOL TVIOL PHOT_TVIOL VR_TVIOL HBM_TVIOL + 0 0 % 0 % 0 % 0 % 0 % + 1 0 % 0 % 0 % 0 % 0 % + 2 0 % 0 % 0 % 0 % 0 % + 3 0 % 0 % 0 % 0 % 0 % + 4 0 % 0 % 0 % 0 % 0 % + 5 0 % 0 % 0 % 0 % 0 % + 6 0 % 0 % 0 % 0 % 0 % + 7 0 % 0 % 0 % 0 % 0 % + 8 0 % 0 % 0 % 0 % 0 % + 9 0 % 0 % 0 % 0 % 0 % + 10 0 % 0 % 0 % 0 % 0 % + 11 0 % 0 % 0 % 0 % 0 % + 12 0 % 0 % 0 % 0 % 0 % + 13 0 % 0 % 0 % 0 % 0 % + 14 0 % 0 % 0 % 0 % 0 % + 15 0 % 0 % 0 % 0 % 0 % +... +``` + +- **Added ability to view XCP (Graphics Compute Partition) activity within `amd-smi metric --usage`**. + ***Partition specific features are only available on MI300+ ASICs*** + Users can now retrieve graphic utilization statistic on a per-XCP (per-partition) basis. Here all XCP activities will be listed, + but the current XCP is the partition id listed under both `amd-smi list` and `amd-smi static --partition`. 
+ Example outputs are listed below (below is for reference, output is subject to change): + +```shell +$ amd-smi metric --usage +GPU: 0 + USAGE: + GFX_ACTIVITY: 0 % + UMC_ACTIVITY: 0 % + MM_ACTIVITY: N/A + VCN_ACTIVITY: [0 %, N/A, N/A, N/A] + JPEG_ACTIVITY: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A] + GFX_BUSY_INST: + XCP_0: [0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_1: [0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_2: [0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_3: [0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_4: [0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_5: [0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_6: [0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_7: [0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + JPEG_BUSY: + XCP_0: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A] + XCP_1: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A] + XCP_2: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A] + XCP_3: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A] + XCP_4: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A] + XCP_5: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A] + XCP_6: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A, N/A, 
N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A] + XCP_7: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A] + VCN_BUSY: + XCP_0: [0 %, N/A, N/A, N/A] + XCP_1: [0 %, N/A, N/A, N/A] + XCP_2: [0 %, N/A, N/A, N/A] + XCP_3: [0 %, N/A, N/A, N/A] + XCP_4: [0 %, N/A, N/A, N/A] + XCP_5: [0 %, N/A, N/A, N/A] + XCP_6: [0 %, N/A, N/A, N/A] + XCP_7: [0 %, N/A, N/A, N/A] + GFX_BUSY_ACC: + XCP_0: [N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_1: [N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_2: [N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_3: [N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_4: [N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_5: [N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_6: [N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_7: [N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + +GPU: 1 + USAGE: + GFX_ACTIVITY: 0 % + UMC_ACTIVITY: 0 % + MM_ACTIVITY: N/A + VCN_ACTIVITY: [0 %, N/A, N/A, N/A] + JPEG_ACTIVITY: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A] + GFX_BUSY_INST: + XCP_0: [0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_1: [0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_2: [0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_3: [0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_4: [0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_5: [0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_6: [0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_7: [0 %, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + JPEG_BUSY: + XCP_0: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A] + XCP_1: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A] + XCP_2: 
[0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A] + XCP_3: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A] + XCP_4: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A] + XCP_5: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A] + XCP_6: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A] + XCP_7: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, + N/A, N/A, N/A] + VCN_BUSY: + XCP_0: [0 %, N/A, N/A, N/A] + XCP_1: [0 %, N/A, N/A, N/A] + XCP_2: [0 %, N/A, N/A, N/A] + XCP_3: [0 %, N/A, N/A, N/A] + XCP_4: [0 %, N/A, N/A, N/A] + XCP_5: [0 %, N/A, N/A, N/A] + XCP_6: [0 %, N/A, N/A, N/A] + XCP_7: [0 %, N/A, N/A, N/A] + GFX_BUSY_ACC: + XCP_0: [N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_1: [N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_2: [N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_3: [N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_4: [N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_5: [N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_6: [N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + XCP_7: [N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A] + +... +``` + +- **Added `LC_PERF_OTHER_END_RECOVERY` CLI output to `amd-smi metric --pcie` and updated `amdsmi_get_pcie_info()` to include this value**. 
+ ***Feature is only available on MI300+ ASICs*** + Users can now retrieve both through `amdsmi_get_pcie_info()` which has an updated structure: + +```C +typedef struct { + ... + struct pcie_metric_ { + uint16_t pcie_width; //!< current PCIe width + uint32_t pcie_speed; //!< current PCIe speed in MT/s + uint32_t pcie_bandwidth; //!< current instantaneous PCIe bandwidth in Mb/s + uint64_t pcie_replay_count; //!< total number of the replays issued on the PCIe link + uint64_t pcie_l0_to_recovery_count; //!< total number of times the PCIe link transitioned from L0 to the recovery state + uint64_t pcie_replay_roll_over_count; //!< total number of replay rollovers issued on the PCIe link + uint64_t pcie_nak_sent_count; //!< total number of NAKs issued on the PCIe link by the device + uint64_t pcie_nak_received_count; //!< total number of NAKs issued on the PCIe link by the receiver + uint32_t pcie_lc_perf_other_end_recovery_count; //!< PCIe other end recovery counter + uint64_t reserved[12]; + } pcie_metric; + uint64_t reserved[32]; +} amdsmi_pcie_info_t; +``` + + - Example outputs are listed below (below is for reference, output is subject to change): + +```shell +$ amd-smi metric --pcie +GPU: 0 + PCIE: + WIDTH: 16 + SPEED: 32 GT/s + BANDWIDTH: 18 Mb/s + REPLAY_COUNT: 0 + L0_TO_RECOVERY_COUNT: 0 + REPLAY_ROLL_OVER_COUNT: 0 + NAK_SENT_COUNT: 0 + NAK_RECEIVED_COUNT: 0 + CURRENT_BANDWIDTH_SENT: N/A + CURRENT_BANDWIDTH_RECEIVED: N/A + MAX_PACKET_SIZE: N/A + LC_PERF_OTHER_END_RECOVERY: 0 + +GPU: 1 + PCIE: + WIDTH: 16 + SPEED: 32 GT/s + BANDWIDTH: 18 Mb/s + REPLAY_COUNT: 0 + L0_TO_RECOVERY_COUNT: 0 + REPLAY_ROLL_OVER_COUNT: 0 + NAK_SENT_COUNT: 0 + NAK_RECEIVED_COUNT: 0 + CURRENT_BANDWIDTH_SENT: N/A + CURRENT_BANDWIDTH_RECEIVED: N/A + MAX_PACKET_SIZE: N/A + LC_PERF_OTHER_END_RECOVERY: 0 +... +``` + +- **Added retrieving a set of GPUs that are nearest to a given device at a specific link type level**. 
+ - Added `amdsmi_get_link_topology_nearest()` function to amd-smi C and Python Libraries. + +- **Added more supported utilization count types to `amdsmi_get_utilization_count()`**. + +- **Added `amd-smi set -L/--clk-limit ...` command**. + Equivalent to rocm-smi's '--extremum' command which sets sclk's or mclk's soft minimum or soft maximum clock frequency. + +- **Added unittest functionality to test amdsmi API calls in Python**. + +- **Changed the `power` parameter in `amdsmi_get_energy_count()` to `energy_accumulator`**. + - Changes propagate forwards into the python interface as well, however we are maintaing backwards compatibility and keeping the `power` field in the python API until ROCm 6.4. + +- **Added GPU memory overdrive percentage to `amd-smi metric -o`**. + - Added `amdsmi_get_gpu_mem_overdrive_level()` function to amd-smi C and Python Libraries. + +- **Added retrieving connection type and P2P capabilities between two GPUs**. + - Added `amdsmi_topo_get_p2p_status()` function to amd-smi C and Python Libraries. + - Added retrieving P2P link capabilities to CLI `amd-smi topology`. + +```shell +$ amd-smi topology -h +usage: amd-smi topology [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL] + [-g GPU [GPU ...]] [-a] [-w] [-o] [-t] [-b] + +If no GPU is specified, returns information for all GPUs on the system. +If no topology argument is provided all topology information will be displayed. + +Topology arguments: + -h, --help show this help message and exit + -g, --gpu GPU [GPU ...] 
Select a GPU ID, BDF, or UUID from the possible choices: + ID: 0 | BDF: 0000:0c:00.0 | UUID: + ID: 1 | BDF: 0000:22:00.0 | UUID: + ID: 2 | BDF: 0000:38:00.0 | UUID: + ID: 3 | BDF: 0000:5c:00.0 | UUID: + ID: 4 | BDF: 0000:9f:00.0 | UUID: + ID: 5 | BDF: 0000:af:00.0 | UUID: + ID: 6 | BDF: 0000:bf:00.0 | UUID: + ID: 7 | BDF: 0000:df:00.0 | UUID: + all | Selects all devices + + -a, --access Displays link accessibility between GPUs + -w, --weight Displays relative weight between GPUs + -o, --hops Displays the number of hops between GPUs + -t, --link-type Displays the link type between GPUs + -b, --numa-bw Display max and min bandwidth between nodes + -c, --coherent Display cache coherant (or non-coherant) link capability between nodes + -n, --atomics Display 32 and 64-bit atomic io link capability between nodes + -d, --dma Display P2P direct memory access (DMA) link capability between nodes + -z, --bi-dir Display P2P bi-directional link capability between nodes + +Command Modifiers: + --json Displays output in JSON format (human readable by default). + --csv Displays output in CSV format (human readable by default). + --file FILE Saves output into a file on the provided path (stdout by default). 
+ --loglevel LEVEL Set the logging level from the possible choices: + DEBUG, INFO, WARNING, ERROR, CRITICAL +``` + +```shell +$ amd-smi topology -cndz +CACHE COHERANCY TABLE: + 0000:0c:00.0 0000:22:00.0 0000:38:00.0 0000:5c:00.0 0000:9f:00.0 0000:af:00.0 0000:bf:00.0 0000:df:00.0 +0000:0c:00.0 SELF C NC NC C C C NC +0000:22:00.0 C SELF NC C C C NC C +0000:38:00.0 NC NC SELF C C NC C NC +0000:5c:00.0 NC C C SELF NC C NC NC +0000:9f:00.0 C C C NC SELF NC NC C +0000:af:00.0 C C NC C NC SELF C C +0000:bf:00.0 C NC C NC NC C SELF NC +0000:df:00.0 NC C NC NC C C NC SELF + +ATOMICS TABLE: + 0000:0c:00.0 0000:22:00.0 0000:38:00.0 0000:5c:00.0 0000:9f:00.0 0000:af:00.0 0000:bf:00.0 0000:df:00.0 +0000:0c:00.0 SELF 64,32 64,32 64 32 32 N/A 64,32 +0000:22:00.0 64,32 SELF 64 32 32 N/A 64,32 64,32 +0000:38:00.0 64,32 64 SELF 32 N/A 64,32 64,32 64,32 +0000:5c:00.0 64 32 32 SELF 64,32 64,32 64,32 32 +0000:9f:00.0 32 32 N/A 64,32 SELF 64,32 32 32 +0000:af:00.0 32 N/A 64,32 64,32 64,32 SELF 32 N/A +0000:bf:00.0 N/A 64,32 64,32 64,32 32 32 SELF 64,32 +0000:df:00.0 64,32 64,32 64,32 32 32 N/A 64,32 SELF + +DMA TABLE: + 0000:0c:00.0 0000:22:00.0 0000:38:00.0 0000:5c:00.0 0000:9f:00.0 0000:af:00.0 0000:bf:00.0 0000:df:00.0 +0000:0c:00.0 SELF T T F F T F T +0000:22:00.0 T SELF F F T F T T +0000:38:00.0 T F SELF T F T T T +0000:5c:00.0 F F T SELF T T T F +0000:9f:00.0 F T F T SELF T F F +0000:af:00.0 T F T T T SELF F T +0000:bf:00.0 F T T T F F SELF F +0000:df:00.0 T T T F F T F SELF + +BI-DIRECTIONAL TABLE: + 0000:0c:00.0 0000:22:00.0 0000:38:00.0 0000:5c:00.0 0000:9f:00.0 0000:af:00.0 0000:bf:00.0 0000:df:00.0 +0000:0c:00.0 SELF T T F F T F T +0000:22:00.0 T SELF F F T F T T +0000:38:00.0 T F SELF T F T T T +0000:5c:00.0 F F T SELF T T T F +0000:9f:00.0 F T F T SELF T F F +0000:af:00.0 T F T T T SELF F T +0000:bf:00.0 F T T T F F SELF F +0000:df:00.0 T T T F F T F SELF + +Legend: + SELF = Current GPU + ENABLED / DISABLED = Link is enabled or disabled + N/A = Not supported + T/F = True / 
False + C/NC = Coherant / Non-Coherant io links + 64,32 = 64 bit and 32 bit atomic support + - +``` + +- **Created new amdsmi_kfd_info_t and added information under `amd-smi list`**. + - Due to fixes needed to properly enumerate all logical GPUs in CPX, new device identifiers were added in to a new `amdsmi_kfd_info_t` which gets populated via the API `amdsmi_get_gpu_kfd_info()`. + - This info has been added to the `amd-smi list`. + - These new fields are only available for BM/Guest Linux devices at this time. + +```C +typedef struct { + uint64_t kfd_id; //< 0xFFFFFFFFFFFFFFFF if not supported + uint32_t node_id; //< 0xFFFFFFFF if not supported + uint32_t current_partition_id; //< 0xFFFFFFFF if not supported + uint32_t reserved[12]; +} amdsmi_kfd_info_t; +``` + +```shell +$ amd-smi list +GPU: 0 + BDF: 0000:23:00.0 + UUID: + KFD_ID: 45412 + NODE_ID: 1 + PARTITION_ID: 0 + +GPU: 1 + BDF: 0000:26:00.0 + UUID: + KFD_ID: 59881 + NODE_ID: 2 + PARTITION_ID: 0 +``` + +- **Added Subsystem Device ID to `amd-smi static --asic`**. + - No underlying changes to amdsmi_get_gpu_asic_info + +```shell +$ amd-smi static --asic +GPU: 0 + ASIC: + MARKET_NAME: MI308X + VENDOR_ID: 0x1002 + VENDOR_NAME: Advanced Micro Devices Inc. [AMD/ATI] + SUBVENDOR_ID: 0x1002 + DEVICE_ID: 0x74a2 + SUBSYSTEM_ID: 0x74a2 + REV_ID: 0x00 + ASIC_SERIAL: + OAM_ID: 5 + NUM_COMPUTE_UNITS: 20 + TARGET_GRAPHICS_VERSION: gfx942 +``` + +- **Added Target_Graphics_Version to `amd-smi static --asic` and `amdsmi_get_gpu_asic_info()`**. + +```C +typedef struct { + char market_name[AMDSMI_256_LENGTH]; + uint32_t vendor_id; //< Use 32 bit to be compatible with other platform. 
+ char vendor_name[AMDSMI_MAX_STRING_LENGTH]; + uint32_t subvendor_id; //< The subsystem vendor id + uint64_t device_id; //< The device id of a GPU + uint32_t rev_id; + char asic_serial[AMDSMI_NORMAL_STRING_LENGTH]; + uint32_t oam_id; //< 0xFFFF if not supported + uint32_t num_of_compute_units; //< 0xFFFFFFFF if not supported + uint64_t target_graphics_version; //< 0xFFFFFFFFFFFFFFFF if not supported + uint32_t reserved[15]; +} amdsmi_asic_info_t; +``` + +```shell +$ amd-smi static --asic +GPU: 0 + ASIC: + MARKET_NAME: MI308X + VENDOR_ID: 0x1002 + VENDOR_NAME: Advanced Micro Devices Inc. [AMD/ATI] + SUBVENDOR_ID: 0x1002 + DEVICE_ID: 0x74a2 + SUBSYSTEM_ID: 0x74a2 + REV_ID: 0x00 + ASIC_SERIAL: + OAM_ID: 5 + NUM_COMPUTE_UNITS: 20 + TARGET_GRAPHICS_VERSION: gfx942 +``` + +### Changed + +- **Improvement: Users now have the ability to set and reset without providing `-g all` using AMD SMI CLI**. +Users can now provide set and reset without `-g all`. Previously, users were required to provide: +`sudo amd-smi set -g all ` or `sudo amd-smi reset -g all ` +This update allows users to set or reset without providing `-g all` arguments. Allowing commands: +`sudo amd-smi set ` or `sudo amd-smi reset ` +This action will default to try to set/reset for all AMD GPUs on the user's system. + +- **Improvement: `amd-smi set --memory-partition` now includes a warning banner and progress bar**. +For devices which support dynamically changing memory partitions, we now provide a warning for users. We provide this warning to provide users knowledge that this action requires users to quit any gpu workloads. Also to let them know this process will trigger an AMD GPU driver reload. Since this process takes time to complete, a progress bar has been provided until actions can verified as a successful change. Otherwise, AMD SMI will report any errors to users and what actions can be taken. 
See example below: +```shell +$ sudo amd-smi set -M NPS1 + + ****** WARNING ****** + + Setting Dynamic Memory (NPS) partition modes require users to quit all GPU workloads. + AMD SMI will then attempt to change memory (NPS) partition mode. + Upon a successful set, AMD SMI will then initiate an action to restart amdgpu driver. + This action will change all GPU's in the hive to the requested memory (NPS) partition mode. + + Please use this utility with caution. + +Do you accept these terms? [Y/N] y + +Updating memory partition for gpu 0: [████████████████████████████████████████] 40/40 secs remain + +GPU: 0 + MEMORYPARTITION: Successfully set memory partition to NPS1 + +GPU: 1 + MEMORYPARTITION: Successfully set memory partition to NPS1 + +GPU: 2 + MEMORYPARTITION: Successfully set memory partition to NPS1 +... +``` + +- **Updated `amdsmi_get_gpu_accelerator_partition_profile` to provide driver memory partition capablities**. +Driver now has the ability to report what the user can set memory partition modes to. User can now see available +memory partition modes upon an invalid argument return from memory partition mode set (`amdsmi_set_gpu_memory_partition`). +This change also updates `amd-smi partition`, `amd-smi partition --memory`, and `amd-smi partition --accelerator` (*see note below) +***Note: *Subject to change for ROCm 6.4*** + +- **Updated `amdsmi_set_gpu_memory_partition` to not return until a successful restart of AMD GPU Driver**. +This change keeps checking for ~ up to 40 seconds for a successful restart of the AMD GPU driver. Additionally, the API call continues to check if memory partition (NPS) SYSFS files are successfully updated to reflect the user's requested memory partition (NPS) mode change. Otherwise, reports an error back to the user. Due to these changes, we have updated AMD SMI's CLI to reflect the maximum wait of 40 seconds, while a memory partition change is in progress. 
+ +- **All APIs now have the ability to catch driver reporting invalid arguments**. +Now AMD SMI APIs can show AMDSMI_STATUS_INVAL when driver returns EINVAL. +For example, if user tries to set to NPS8, but the memory partition mode is not an available mode to set to. Commonly referred to as `CAPS` (see `amd-smi partition --memory`), provided by `amdsmi_get_gpu_accelerator_partition_profile`(*see note below). +***Note: *Subject to change for ROCm 6.4*** + +- **Updated BDF commands to look use KFD SYSFS for BDF: `amdsmi_get_gpu_device_bdf()`**. +This aligns BDF output with ROCm SMI. +See below for overview as seen from `rsmi_dev_pci_id_get()` now provides partition ID. See API for better detail. Previously these bits were reserved bits (right before domain) and partition id was within function. + - bits [63:32] = domain + - bits [31:28] = partition id + - bits [27:16] = reserved + - bits [15: 0] = pci bus/device/function + +- **Moved python tests directory path install location**. + - `/opt//share/amd_smi/pytest/..` to `/opt//share/amd_smi/tests/python_unittest/..` + - On amd-smi-lib-tests uninstall, the amd_smi tests folder is removed + - Removed pytest dependency, our python testing now only depends on the unittest framework. + +- **Updated Partition APIs and struct information and added and partition_id to `amd-smi static --partition`**. + - As part of an overhaul to partition information, some partition information will be made available in the `amdsmi_accelerator_partition_profile_t`. + - This struct will be filled out by a new API, `amdsmi_get_gpu_accelerator_partition_profile()`. + - Future data from these APIs wil will eventually get added to `amd-smi partition`. + +```C +#define AMDSMI_MAX_ACCELERATOR_PROFILE 32 +#define AMDSMI_MAX_CP_PROFILE_RESOURCES 32 +#define AMDSMI_MAX_ACCELERATOR_PARTITIONS 8 + +/** + * @brief Accelerator Partition. This enum is used to identify + * various accelerator partitioning settings. 
+ */ +typedef enum { + AMDSMI_ACCELERATOR_PARTITION_INVALID = 0, + AMDSMI_ACCELERATOR_PARTITION_SPX, //!< Single GPU mode (SPX)- All XCCs work + //!< together with shared memory + AMDSMI_ACCELERATOR_PARTITION_DPX, //!< Dual GPU mode (DPX)- Half XCCs work + //!< together with shared memory + AMDSMI_ACCELERATOR_PARTITION_TPX, //!< Triple GPU mode (TPX)- One-third XCCs + //!< work together with shared memory + AMDSMI_ACCELERATOR_PARTITION_QPX, //!< Quad GPU mode (QPX)- Quarter XCCs + //!< work together with shared memory + AMDSMI_ACCELERATOR_PARTITION_CPX, //!< Core mode (CPX)- Per-chip XCC with + //!< shared memory +} amdsmi_accelerator_partition_type_t; + +/** + * @brief Possible Memory Partition Modes. + * This union is used to identify various memory partitioning settings. + */ +typedef union { + struct { + uint32_t nps1_cap :1; // bool 1 = true; 0 = false; Max uint32 means unsupported + uint32_t nps2_cap :1; // bool 1 = true; 0 = false; Max uint32 means unsupported + uint32_t nps4_cap :1; // bool 1 = true; 0 = false; Max uint32 means unsupported + uint32_t nps8_cap :1; // bool 1 = true; 0 = false; Max uint32 means unsupported + uint32_t reserved :28; + } amdsmi_nps_flags_t; + + uint32_t nps_cap_mask; +} amdsmi_nps_caps_t; + +typedef struct { + amdsmi_accelerator_partition_type_t profile_type; // SPX, DPX, QPX, CPX and so on + uint32_t num_partitions; // On MI300X, SPX: 1, DPX: 2, QPX: 4, CPX: 8, length of resources array + uint32_t profile_index; + amdsmi_nps_caps_t memory_caps; // Possible memory partition capabilities + uint32_t num_resources; // length of index_of_resources_profile + uint32_t resources[AMDSMI_MAX_ACCELERATOR_PARTITIONS][AMDSMI_MAX_CP_PROFILE_RESOURCES]; + uint64_t reserved[6]; +} amdsmi_accelerator_partition_profile_t; +``` + +```shell +$ amd-smi static --partition +GPU: 0 + PARTITION: + COMPUTE_PARTITION: CPX + MEMORY_PARTITION: NPS4 + PARTITION_ID: 0 +``` + +### Removed + +- **Removed `amd-smi reset --compute-partition` and `... 
--memory-partition` and associated APIs**. + - This change is part of the partition redesign. Reset functionality will be reintroduced in a later update. + - associated APIs include `amdsmi_reset_gpu_compute_partition()` and `amdsmi_reset_gpu_memory_partition()` + +- **Removed usage of _validate_positive in Parser and replaced with _positive_int and _not_negative_int as appropriate**. + - This will allow 0 to be a valid input for several options in setting CPUs where appropriate (for example, as a mode or NBIOID) + +### Optimized + +- **Adjusted ordering of gpu_metrics calls to ensure that pcie_bw values remain stable in `amd-smi metric` & `amd-smi monitor`**. + - With this change additional padding was added to PCIE_BW `amd-smi monitor --pcie` + +### Resolved issues + +- **Fixed `amdsmi_get_gpu_asic_info`'s `target_graphics_version` and `amd-smi --asic` not displaying properly for MI2x or Navi 3x ASICs**. + +- **Fixed `amd-smi reset` commands showing an AttributeError**. + +- **Improved Offline install process & lowered dependency for PyYAML**. + +- **Fixed CPX not showing total number of logical GPUs**. + - Updates were made to `amdsmi_init()` and `amdsmi_get_gpu_bdf_id(..)`. In order to display all logical devices, we needed a way to provide order to GPU's enumerated. This was done by adding a partition_id within the BDF optional pci_id bits. + - Due to driver changes in KFD, some devices may report bits [31:28] or [2:0]. With the newly added `amdsmi_get_gpu_bdf_id(..)`, we provided this fallback to properly retreive partition ID. We +plan to eventually remove partition ID from the function portion of the BDF (Bus Device Function). See below for PCI ID description. + + - bits [63:32] = domain + - bits [31:28] or bits [2:0] = partition id + - bits [27:16] = reserved + - bits [15:8] = Bus + - bits [7:3] = Device + - bits [2:0] = Function (partition id maybe in bits [2:0]) <-- Fallback for non SPX modes + + - Previously in non-SPX modes (ex. 
CPX/TPX/DPX/etc) some MI3x ASICs would not report all logical GPU devices within AMD SMI. + +```shell +$ amd-smi monitor -p -t -v +GPU POWER GPU_TEMP MEM_TEMP VRAM_USED VRAM_TOTAL + 0 248 W 55 °C 48 °C 283 MB 196300 MB + 1 247 W 55 °C 48 °C 283 MB 196300 MB + 2 247 W 55 °C 48 °C 283 MB 196300 MB + 3 247 W 55 °C 48 °C 283 MB 196300 MB + 4 221 W 50 °C 42 °C 283 MB 196300 MB + 5 221 W 50 °C 42 °C 283 MB 196300 MB + 6 222 W 50 °C 42 °C 283 MB 196300 MB + 7 221 W 50 °C 42 °C 283 MB 196300 MB + 8 239 W 53 °C 46 °C 283 MB 196300 MB + 9 239 W 53 °C 46 °C 283 MB 196300 MB + 10 239 W 53 °C 46 °C 283 MB 196300 MB + 11 239 W 53 °C 46 °C 283 MB 196300 MB + 12 219 W 51 °C 48 °C 283 MB 196300 MB + 13 219 W 51 °C 48 °C 283 MB 196300 MB + 14 219 W 51 °C 48 °C 283 MB 196300 MB + 15 219 W 51 °C 48 °C 283 MB 196300 MB + 16 222 W 51 °C 47 °C 283 MB 196300 MB + 17 222 W 51 °C 47 °C 283 MB 196300 MB + 18 222 W 51 °C 47 °C 283 MB 196300 MB + 19 222 W 51 °C 48 °C 283 MB 196300 MB + 20 241 W 55 °C 48 °C 283 MB 196300 MB + 21 241 W 55 °C 48 °C 283 MB 196300 MB + 22 241 W 55 °C 48 °C 283 MB 196300 MB + 23 240 W 55 °C 48 °C 283 MB 196300 MB + 24 211 W 51 °C 45 °C 283 MB 196300 MB + 25 211 W 51 °C 45 °C 283 MB 196300 MB + 26 211 W 51 °C 45 °C 283 MB 196300 MB + 27 211 W 51 °C 45 °C 283 MB 196300 MB + 28 227 W 51 °C 49 °C 283 MB 196300 MB + 29 227 W 51 °C 49 °C 283 MB 196300 MB + 30 227 W 51 °C 49 °C 283 MB 196300 MB + 31 227 W 51 °C 49 °C 283 MB 196300 MB +``` + +- **Fixed incorrect implementation of the Python API `amdsmi_get_gpu_metrics_header_info()`**. + +- **`amdsmitst` TestGpuMetricsRead now prints metric in correct units**. + +### Upcoming changes + +- **Python API for `amdsmi_get_energy_count()` will deprecate the `power` field in ROCm 6.4 and use `energy_accumulator` field instead**. + +- **New memory and compute partition APIs incoming for ROCm 6.4**. + - These APIs will be updated to fully populate the CLI and allowing compute (accelerator) partitions to be set by profile ID. 
+ - One API will be provided, to reset both memory and compute (accelerator). + - There are dependencies regarding available compute partitions when in other memory modes. + - Driver will be providing these default modes + - Memory partition resets (for BM) require driver reloads - this will allow us to notify users before taking this action, then change to the default compute partition modes. + - The following APIs will remain: + +```C +amdsmi_status_t +amdsmi_set_gpu_compute_partition(amdsmi_processor_handle processor_handle, + amdsmi_compute_partition_type_t compute_partition); +amdsmi_status_t +amdsmi_get_gpu_compute_partition(amdsmi_processor_handle processor_handle, + char *compute_partition, uint32_t len); +amdsmi_status_t +amdsmi_get_gpu_memory_partition(amdsmi_processor_handle processor_handle, + + char *memory_partition, uint32_t len); +amdsmi_status_t +amdsmi_set_gpu_memory_partition(amdsmi_processor_handle processor_handle, + amdsmi_memory_partition_type_t memory_partition); +``` + +- **`amd-smi set --compute-partition` "SPX/DPX/CPX..." will modified to accept profile IDs in ROCm 6.4**. + - This is due to aligning with Host setups and providing more robust partition information through the APIs outlined above. Furthermore, new APIs which will be available on both BM/Host can set by profile ID. (functionality coming soon!) + +- **Added preliminary `amd-smi partition` command**. + - The new partition command can be used to display GPU information, including memory and accelerator partition information. + - The command will be at full functionality once additional partition information from `amdsmi_get_gpu_accelerator_partition_profile()` has been implemented. + +## amd_smi_lib for ROCm 6.2.1 + +### Added + +- **Removed `amd-smi metric --ecc` & `amd-smi metric --ecc-blocks` on Guest VMs**. +Guest VMs do not support getting current ECC counts from the Host cards. + +- **Added `amd-smi static --ras`on Guest VMs**. 
+Guest VMs can view enabled/disabled ras features that are on Host cards. + +### Resolved issues + +- **Fixed TypeError in `amd-smi process -G`**. + +- **Updated CLI error strings to handle empty and invalid GPU/CPU inputs**. + +- **Fixed Guest VM showing passthrough options**. + +- **Fixed firmware formatting where leading 0s were missing**. + +## amd_smi_lib for ROCm 6.2.0 + +### Added + +- **`amd-smi dmon` is now available as an alias to `amd-smi monitor`**. + +- **Added optional process table under `amd-smi monitor -q`**. +The monitor subcommand within the CLI Tool now has the `-q` option to enable an optional process table underneath the original monitored output. + +```shell +$ amd-smi monitor -q +GPU POWER GPU_TEMP MEM_TEMP GFX_UTIL GFX_CLOCK MEM_UTIL MEM_CLOCK ENC_UTIL ENC_CLOCK DEC_UTIL DEC_CLOCK SINGLE_ECC DOUBLE_ECC PCIE_REPLAY VRAM_USED VRAM_TOTAL PCIE_BW + 0 199 W 103 °C 84 °C 99 % 1920 MHz 31 % 1000 MHz N/A 0 MHz N/A 0 MHz 0 0 0 1235 MB 16335 MB N/A Mb/s + +PROCESS INFO: +GPU NAME PID GTT_MEM CPU_MEM VRAM_MEM MEM_USAGE GFX ENC + 0 rvs 1564865 0.0 B 0.0 B 1.1 GB 0.0 B 0 ns 0 ns +``` + +- **Added Handling to detect VMs with passthrough configurations in CLI Tool**. +CLI Tool had only allowed a restricted set of options for Virtual Machines with passthrough GPUs. Now we offer an expanded set of functions availble to passthrough configured GPUs. + +- **Added Process Isolation and Clear SRAM functionality to the CLI Tool for VMs**. +VMs now have the ability to set the process isolation and clear the sram from the CLI tool. Using the following commands + +```shell +amd-smi set --process-isolation <0 or 1> +amd-smi reset --clean_local_data +``` + +- **Added macros that were in `amdsmi.h` to the amdsmi Python library `amdsmi_interface.py`**. +Added macros to reference max size limitations for certain amdsmi functions such as max dpm policies and max fanspeed. + +- **Added Ring Hang event**. 
+Added `AMDSMI_EVT_NOTIF_RING_HANG` to the possible events in the `amdsmi_evt_notification_type_t` enum. + +### Optimized + +- **Updated CLI error strings to specify invalid device type queried** + +```shell +$ amd-smi static --asic --gpu 123123 +Can not find a device: GPU '123123' Error code: -3 +``` + +- **Removed elevated permission requirements for `amdsmi_get_gpu_process_list()`**. +Previously if a processes with elevated permissions was running amd-smi would required sudo to display all output. Now amd-smi will populate all process data and return N/A for elevated process names instead. However if ran with sudo you will be able to see the name like so: + +```shell +$ amd-smi process +GPU: 0 + PROCESS_INFO: + NAME: N/A + PID: 1693982 + MEMORY_USAGE: + GTT_MEM: 0.0 B + CPU_MEM: 0.0 B + VRAM_MEM: 10.1 GB + MEM_USAGE: 0.0 B + USAGE: + GFX: 0 ns + ENC: 0 ns +``` + +```shell +$ sudo amd-smi process +GPU: 0 + PROCESS_INFO: + NAME: TransferBench + PID: 1693982 + MEMORY_USAGE: + GTT_MEM: 0.0 B + CPU_MEM: 0.0 B + VRAM_MEM: 10.1 GB + MEM_USAGE: 0.0 B + USAGE: + GFX: 0 ns + ENC: 0 ns +``` + +- **Updated naming for `amdsmi_set_gpu_clear_sram_data()` to `amdsmi_clean_gpu_local_data()`**. +Changed the naming to be more accurate to what the function was doing. This change also extends to the CLI where we changed the `clear-sram-data` command to `clean_local_data`. + +- **Updated `amdsmi_clk_info_t` struct in amdsmi.h and amdsmi_interface.py to align with host/guest**. +Changed cur_clk to clk, changed sleep_clk to clk_deep_sleep, and added clk_locked value. New struct will be in the following format: + +```shell + typedef struct { ++ uint32_t clk; + uint32_t min_clk; + uint32_t max_clk; ++ uint8_t clk_locked; ++ uint8_t clk_deep_sleep; + uint32_t reserved[4]; + } amdsmi_clk_info_t; +``` + +- **Multiple structure updates in amdsmi.h and amdsmi_interface.py to align with host/guest**. 
+Multiple structures used by APIs were changed for alignment unification: + - Changed `amdsmi_vram_info_t` `vram_size_mb` field changed to to `vram_size` + - Updated `amdsmi_vram_type_t` struct updated to include new enums and added `AMDSMI` prefix + - Updated `amdsmi_status_t` some enums were missing the `AMDSMI_STATUS` prefix + - Added `AMDSMI_PROCESSOR_TYPE` prefix to `processor_type_t` enums + - Removed the fields structure definition in favor for an anonymous definition in `amdsmi_bdf_t` + +- **Added `AMDSMI` prefix in amdsmi.h and amdsmi_interface.py to align with host/guest**. +Multiple structures used by APIs were changed for alignment unification. `AMDSMI` prefix was added to the following structures: + - Added AMDSMI prefix to `amdsmi_container_types_t` enums + - Added AMDSMI prefix to `amdsmi_clk_type_t` enums + - Added AMDSMI prefix to `amdsmi_compute_partition_type_t` enums + - Added AMDSMI prefix to `amdsmi_memory_partition_type_t` enums + - Added AMDSMI prefix to `amdsmi_clk_type_t` enums + - Added AMDSMI prefix to `amdsmi_temperature_type_t` enums + - Added AMDSMI prefix to `amdsmi_fw_block_t` enums + +- **Changed dpm_policy references to soc_pstate**. +The file structure referenced to dpm_policy changed to soc_pstate and we have changed the APIs and CLI tool to be inline with the current structure. `amdsmi_get_dpm_policy()` and `amdsmi_set_dpm_policy()` is no longer valid with the new API being `amdsmi_get_soc_pstate()` and `amdsmi_set_soc_pstate()`. The CLI tool has been changed from `--policy` to `--soc-pstate` + +- **Updated `amdsmi_get_gpu_board_info()` product_name to fallback to pciids**. +Previously on devices without a FRU we would not populate the product name in the `amdsmi_board_info_t` structure, now we will fallback to using the name listed according to the pciids file if available. + +- **Updated CLI voltage curve command output**. 
+The output for `amd-smi metric --voltage-curve` now splits the frequency and voltage output by curve point or outputs N/A for each curve point if not applicable + +```shell +GPU: 0 + VOLTAGE_CURVE: + POINT_0_FREQUENCY: 872 Mhz + POINT_0_VOLTAGE: 736 mV + POINT_1_FREQUENCY: 1354 Mhz + POINT_1_VOLTAGE: 860 mV + POINT_2_FREQUENCY: 1837 Mhz + POINT_2_VOLTAGE: 1186 mV +``` + +- **Updated `amdsmi_get_gpu_board_info()` now has larger structure sizes for `amdsmi_board_info_t`**. +Updated sizes that work for retreiving relavant board information across AMD's +ASIC products. This requires users to update any ABIs using this structure. + +### Resolved issues + +- **Fixed Leftover Mutex deadlock when running multiple instances of the CLI tool**. +When running `amd-smi reset --gpureset --gpu all` and then running an instance of `amd-smi static` (or any other subcommand that access the GPUs) a mutex would lock and not return requiring either a clear of the mutex in /dev/shm or rebooting the machine. + +- **Fixed multiple processes not being registered in `amd-smi process` with json and csv format**. +Multiple process outputs in the CLI tool were not being registered correctly. The json output did not handle multiple processes and is now in a new valid json format: + +```shell +[ + { + "gpu": 0, + "process_list": [ + { + "process_info": { + "name": "TransferBench", + "pid": 420157, + "mem_usage": { + "value": 0, + "unit": "B" + } + } + }, + { + "process_info": { + "name": "rvs", + "pid": 420315, + "mem_usage": { + "value": 0, + "unit": "B" + } + } + } + ] + } +] +``` + +- **Removed `throttle-status` from `amd-smi monitor` as it is no longer reliably supported**. +Throttle status may work for older ASICs, but will be replaced with PVIOL and TVIOL metrics for future ASIC support. It remains a field in the gpu_metrics API and in `amd-smi metric --power`. + +- **`amdsmi_get_gpu_board_info()` no longer returns junk char strings**. 
+Previously if there was a partial failure to retrieve character strings, we would return +garbage output to users using the API. This fix intends to populate as many values as possible. +Then any failure(s) found along the way, `\0` is provided to `amdsmi_board_info_t` +structures data members which cannot be populated. Ensuring empty char string values. + +- **Fixed parsing of `pp_od_clk_voltage` within `amdsmi_get_gpu_od_volt_info`**. +The parsing of `pp_od_clk_voltage` was not dynamic enough to work with the dropping of voltage curve support on MI series cards. This propagates down to correcting the CLI's output `amd-smi metric --voltage-curve` to N/A if voltage curve is not enabled. + +### Known issues + +- **`amdsmi_get_gpu_process_isolation` and `amdsmi_clean_gpu_local_data` commands do no currently work and will be supported in a future release**. + +## amd_smi_lib for ROCm 6.1.2 + +### Added + +- **Added process isolation and clean shader APIs and CLI commands**. +Added APIs CLI and APIs to address LeftoverLocals security issues. Allowing clearing the sram data and setting process isolation on a per GPU basis. New APIs: + - `amdsmi_get_gpu_process_isolation()` + - `amdsmi_set_gpu_process_isolation()` + - `amdsmi_set_gpu_clear_sram_data()` + +- **Added `MIN_POWER` to output of `amd-smi static --limit`**. +This change helps users identify the range to which they can change the power cap of the GPU. The change is added to simplify why a device supports (or does not support) power capping (also known as overdrive). See `amd-smi set -g all --power-cap ` or `amd-smi reset -g all --power-cap`. 
+ +```shell +$ amd-smi static --limit +GPU: 0 + LIMIT: + MAX_POWER: 203 W + MIN_POWER: 0 W + SOCKET_POWER: 203 W + SLOWDOWN_EDGE_TEMPERATURE: 100 °C + SLOWDOWN_HOTSPOT_TEMPERATURE: 110 °C + SLOWDOWN_VRAM_TEMPERATURE: 100 °C + SHUTDOWN_EDGE_TEMPERATURE: 105 °C + SHUTDOWN_HOTSPOT_TEMPERATURE: 115 °C + SHUTDOWN_VRAM_TEMPERATURE: 105 °C + +GPU: 1 + LIMIT: + MAX_POWER: 213 W + MIN_POWER: 213 W + SOCKET_POWER: 213 W + SLOWDOWN_EDGE_TEMPERATURE: 109 °C + SLOWDOWN_HOTSPOT_TEMPERATURE: 110 °C + SLOWDOWN_VRAM_TEMPERATURE: 100 °C + SHUTDOWN_EDGE_TEMPERATURE: 114 °C + SHUTDOWN_HOTSPOT_TEMPERATURE: 115 °C + SHUTDOWN_VRAM_TEMPERATURE: 105 °C +``` + +### Optimized + +- **Updated `amd-smi monitor --pcie` output**. +The source for pcie bandwidth monitor output was a legacy file we no longer support and was causing delays within the monitor command. The output is no longer using TX/RX but instantaneous bandwidth from gpu_metrics instead; updated output: + +```shell +$ amd-smi monitor --pcie +GPU PCIE_BW + 0 26 Mb/s +``` + +- **`amdsmi_get_power_cap_info` now returns values in uW instead of W**. +`amdsmi_get_power_cap_info` will return in uW as originally reflected by driver. Previously `amdsmi_get_power_cap_info` returned W values, this conflicts with our sets and modifies values retrieved from driver. We decided to keep the values returned from driver untouched (in original units, uW). Then in CLI we will convert to watts (as previously done - no changes here). Additionally, driver made updates to min power cap displayed for devices when overdrive is disabled which prompted for this change (in this case min_power_cap and max_power_cap are the same). + +- **Updated Python Library return types for amdsmi_get_gpu_memory_reserved_pages & amdsmi_get_gpu_bad_page_info**. +Previously calls were returning "No bad pages found." if no pages were found, now it only returns the list type and can be empty. + +- **Updated `amd-smi metric --ecc-blocks` output**. 
+The `--ecc-blocks` argument was outputting blocks without counters available; the filtering was updated to show only blocks for which counters are available: + +``` shell +$ amd-smi metric --ecc-block +GPU: 0 + ECC_BLOCKS: + UMC: + CORRECTABLE_COUNT: 0 + UNCORRECTABLE_COUNT: 0 + DEFERRED_COUNT: 0 + SDMA: + CORRECTABLE_COUNT: 0 + UNCORRECTABLE_COUNT: 0 + DEFERRED_COUNT: 0 + GFX: + CORRECTABLE_COUNT: 0 + UNCORRECTABLE_COUNT: 0 + DEFERRED_COUNT: 0 + MMHUB: + CORRECTABLE_COUNT: 0 + UNCORRECTABLE_COUNT: 0 + DEFERRED_COUNT: 0 + PCIE_BIF: + CORRECTABLE_COUNT: 0 + UNCORRECTABLE_COUNT: 0 + DEFERRED_COUNT: 0 + HDP: + CORRECTABLE_COUNT: 0 + UNCORRECTABLE_COUNT: 0 + DEFERRED_COUNT: 0 + XGMI_WAFL: + CORRECTABLE_COUNT: 0 + UNCORRECTABLE_COUNT: 0 + DEFERRED_COUNT: 0 +``` + +- **Removed `amdsmi_get_gpu_process_info` from Python library**. +`amdsmi_get_gpu_process_info` was removed from the C library in an earlier build, but the API was still in the Python interface. + +### Resolved issues + +- **Fixed `amd-smi metric --power` so that it now provides power output for Navi2x/Navi3x/MI1x**. +These systems use an older version of gpu_metrics in amdgpu. This fix only updates what the CLI outputs. +No change in any of our APIs. + +```shell +$ amd-smi metric --power +GPU: 0 + POWER: + SOCKET_POWER: 11 W + GFX_VOLTAGE: 768 mV + SOC_VOLTAGE: 925 mV + MEM_VOLTAGE: 1250 mV + POWER_MANAGEMENT: ENABLED + THROTTLE_STATUS: UNTHROTTLED + +GPU: 1 + POWER: + SOCKET_POWER: 17 W + GFX_VOLTAGE: 781 mV + SOC_VOLTAGE: 806 mV + MEM_VOLTAGE: 1250 mV + POWER_MANAGEMENT: ENABLED + THROTTLE_STATUS: UNTHROTTLED +``` + +- **Fixed `amdsmitstReadWrite.TestPowerCapReadWrite` test for Navi3X, Navi2X, MI100**. +Updates required `amdsmi_get_power_cap_info` to return in uW as originally reflected by driver. Previously `amdsmi_get_power_cap_info` returned W values, this conflicts with our sets and modifies values retrieved from driver. We decided to keep the values returned from driver untouched (in original units, uW).
Then in CLI we will convert to watts (as previously done - no changes here). Additionally, driver made updates to min power cap displayed for devices when overdrive is disabled which prompted for this change (in this case min_power_cap and max_power_cap are the same). + +- **Fixed Python interface call amdsmi_get_gpu_memory_reserved_pages & amdsmi_get_gpu_bad_page_info**. +Previously Python interface calls to populated bad pages resulted in a `ValueError: NULL pointer access`. This fixes the bad-pages subcommand CLI subcommand as well. + +## amd_smi_lib for ROCm 6.1.1 + +### Changed + +- **Updated metrics --clocks**. +Output for `amd-smi metric --clock` is updated to reflect each engine and bug fixes for the clock lock status and deep sleep status. + +``` shell +$ amd-smi metric --clock +GPU: 0 + CLOCK: + GFX_0: + CLK: 113 MHz + MIN_CLK: 500 MHz + MAX_CLK: 1800 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_1: + CLK: 113 MHz + MIN_CLK: 500 MHz + MAX_CLK: 1800 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_2: + CLK: 112 MHz + MIN_CLK: 500 MHz + MAX_CLK: 1800 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_3: + CLK: 113 MHz + MIN_CLK: 500 MHz + MAX_CLK: 1800 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_4: + CLK: 113 MHz + MIN_CLK: 500 MHz + MAX_CLK: 1800 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_5: + CLK: 113 MHz + MIN_CLK: 500 MHz + MAX_CLK: 1800 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_6: + CLK: 113 MHz + MIN_CLK: 500 MHz + MAX_CLK: 1800 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_7: + CLK: 113 MHz + MIN_CLK: 500 MHz + MAX_CLK: 1800 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + MEM_0: + CLK: 900 MHz + MIN_CLK: 900 MHz + MAX_CLK: 1200 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: DISABLED + VCLK_0: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1480 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + VCLK_1: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1480 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + VCLK_2: 
+ CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1480 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + VCLK_3: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1480 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_0: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1233 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_1: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1233 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_2: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1233 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_3: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1233 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED +``` + +- **Added deferred ecc counts**. +Added deferred error correctable counts to `amd-smi metric --ecc --ecc-blocks` + +```shell +$ amd-smi metric --ecc --ecc-blocks +GPU: 0 + ECC: + TOTAL_CORRECTABLE_COUNT: 0 + TOTAL_UNCORRECTABLE_COUNT: 0 + TOTAL_DEFERRED_COUNT: 0 + CACHE_CORRECTABLE_COUNT: 0 + CACHE_UNCORRECTABLE_COUNT: 0 + ECC_BLOCKS: + UMC: + CORRECTABLE_COUNT: 0 + UNCORRECTABLE_COUNT: 0 + DEFERRED_COUNT: 0 + SDMA: + CORRECTABLE_COUNT: 0 + UNCORRECTABLE_COUNT: 0 + DEFERRED_COUNT: 0 + ... +``` + +- **Updated `amd-smi topology --json` to align with host/guest**. +Topology's `--json` output now is changed to align with output host/guest systems. Additionally, users can select/filter specific topology details as desired (refer to `amd-smi topology -h` for full list). See examples shown below. 
+ +*Previous format:* + +```shell +$ amd-smi topology --json +[ + { + "gpu": 0, + "link_accessibility": { + "gpu_0": "ENABLED", + "gpu_1": "DISABLED" + }, + "weight": { + "gpu_0": 0, + "gpu_1": 40 + }, + "hops": { + "gpu_0": 0, + "gpu_1": 2 + }, + "link_type": { + "gpu_0": "SELF", + "gpu_1": "PCIE" + }, + "numa_bandwidth": { + "gpu_0": "N/A", + "gpu_1": "N/A" + } + }, + { + "gpu": 1, + "link_accessibility": { + "gpu_0": "DISABLED", + "gpu_1": "ENABLED" + }, + "weight": { + "gpu_0": 40, + "gpu_1": 0 + }, + "hops": { + "gpu_0": 2, + "gpu_1": 0 + }, + "link_type": { + "gpu_0": "PCIE", + "gpu_1": "SELF" + }, + "numa_bandwidth": { + "gpu_0": "N/A", + "gpu_1": "N/A" + } + } +] +``` + +*New format:* + +```shell +$ amd-smi topology --json +[ + { + "gpu": 0, + "bdf": "0000:01:00.0", + "links": [ + { + "gpu": 0, + "bdf": "0000:01:00.0", + "weight": 0, + "link_status": "ENABLED", + "link_type": "SELF", + "num_hops": 0, + "bandwidth": "N/A", + }, + { + "gpu": 1, + "bdf": "0001:01:00.0", + "weight": 15, + "link_status": "ENABLED", + "link_type": "XGMI", + "num_hops": 1, + "bandwidth": "50000-100000", + }, + ... + ] + }, + ... +] +``` + +```shell +$ /opt/rocm/bin/amd-smi topology -a -t --json +[ + { + "gpu": 0, + "bdf": "0000:08:00.0", + "links": [ + { + "gpu": 0, + "bdf": "0000:08:00.0", + "link_status": "ENABLED", + "link_type": "SELF" + }, + { + "gpu": 1, + "bdf": "0000:44:00.0", + "link_status": "DISABLED", + "link_type": "PCIE" + } + ] + }, + { + "gpu": 1, + "bdf": "0000:44:00.0", + "links": [ + { + "gpu": 0, + "bdf": "0000:08:00.0", + "link_status": "DISABLED", + "link_type": "PCIE" + }, + { + "gpu": 1, + "bdf": "0000:44:00.0", + "link_status": "ENABLED", + "link_type": "SELF" + } + ] + } +] +``` + +### Resolved issues + +- **Fix for GPU reset error on non-amdgpu cards**. +Previously our reset could attempting to reset non-amd GPUS- resuting in "Unable to reset non-amd GPU" error. Fix +updates CLI to target only AMD ASICs. 
+ +- **Fix for `amd-smi static --pcie` and `amdsmi_get_pcie_info()` on Navi32/31 cards**. +Updated API to include `amdsmi_card_form_factor_t.AMDSMI_CARD_FORM_FACTOR_CEM`. Previously, this would report "UNKNOWN". This fix +provides the correct board `SLOT_TYPE` associated with these ASICs (and other Navi cards). + +- **Fix for `amd-smi process`**. +Fixed output results when getting processes running on a device. + +- **Improved Error handling for `amd-smi process`**. +Fixed Attribute Error when getting processes in CSV format. + +### Known issues + +- `amd-smi bad-pages` can result in a "ValueError: NULL pointer access" error with certain PM FW versions. + +## amd_smi_lib for ROCm 6.1.0 + +### Added + +- **Added Monitor Command**. +Provides users the ability to customize GPU metrics to capture, collect, and observe. Output is provided in a table view. This aligns more closely with ROCm SMI's `rocm-smi` (no argument) output and additionally allows users to customize what data is helpful for their use-case. + +```shell +$ amd-smi monitor -h +usage: amd-smi monitor [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL] + [-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE ...]] + [-w INTERVAL] [-W TIME] [-i ITERATIONS] [-p] [-t] [-u] [-m] [-n] + [-d] [-s] [-e] [-v] [-r] + +Monitor a target device for the specified arguments. +If no arguments are provided, all arguments will be enabled. +Use the watch arguments to run continuously + +Monitor Arguments: + -h, --help show this help message and exit + -g, --gpu GPU [GPU ...] Select a GPU ID, BDF, or UUID from the possible choices: + ID: 0 | BDF: 0000:01:00.0 | UUID: + all | Selects all devices + -U, --cpu CPU [CPU ...] Select a CPU ID from the possible choices: + ID: 0 + all | Selects all devices + -O, --core CORE [CORE ...]
Select a Core ID from the possible choices: + ID: 0 - 23 + all | Selects all devices + -w, --watch INTERVAL Reprint the command in a loop of INTERVAL seconds + -W, --watch_time TIME The total TIME to watch the given command + -i, --iterations ITERATIONS Total number of ITERATIONS to loop on the given command + -p, --power-usage Monitor power usage in Watts + -t, --temperature Monitor temperature in Celsius + -u, --gfx Monitor graphics utilization (%) and clock (MHz) + -m, --mem Monitor memory utilization (%) and clock (MHz) + -n, --encoder Monitor encoder utilization (%) and clock (MHz) + -d, --decoder Monitor decoder utilization (%) and clock (MHz) + -s, --throttle-status Monitor thermal throttle status + -e, --ecc Monitor ECC single bit, ECC double bit, and PCIe replay error counts + -v, --vram-usage Monitor memory usage in MB + -r, --pcie Monitor PCIe Tx/Rx in MB/s + +Command Modifiers: + --json Displays output in JSON format (human readable by default). + --csv Displays output in CSV format (human readable by default). + --file FILE Saves output into a file on the provided path (stdout by default). + --loglevel LEVEL Set the logging level from the possible choices: + DEBUG, INFO, WARNING, ERROR, CRITICAL +``` + +```shell +$ amd-smi monitor -ptumv +GPU POWER GPU_TEMP MEM_TEMP GFX_UTIL GFX_CLOCK MEM_UTIL MEM_CLOCK VRAM_USED VRAM_TOTAL + 0 171 W 32 °C 33 °C 0 % 114 MHz 0 % 900 MHz 283 MB 196300 MB + 1 175 W 33 °C 34 °C 0 % 113 MHz 0 % 900 MHz 283 MB 196300 MB + 2 177 W 31 °C 33 °C 0 % 113 MHz 0 % 900 MHz 283 MB 196300 MB + 3 172 W 33 °C 32 °C 0 % 113 MHz 0 % 900 MHz 283 MB 196300 MB + 4 178 W 32 °C 32 °C 0 % 113 MHz 0 % 900 MHz 284 MB 196300 MB + 5 176 W 33 °C 35 °C 0 % 113 MHz 0 % 900 MHz 283 MB 196300 MB + 6 176 W 32 °C 32 °C 0 % 113 MHz 0 % 900 MHz 283 MB 196300 MB + 7 175 W 34 °C 32 °C 0 % 113 MHz 0 % 900 MHz 283 MB 196300 MB +``` + +- **Integrated ESMI Tool**. +Users can get CPU metrics and telemetry through our API and CLI tools. 
This information can be seen in `amd-smi static` and `amd-smi metric` commands. Only available for limited target processors. As of ROCm 6.0.2, this is listed as: + - AMD Zen3 based CPU Family 19h Models 0h-Fh and 30h-3Fh + - AMD Zen4 based CPU Family 19h Models 10h-1Fh and A0-AFh + + See a few examples listed below. + +```shell +$ amd-smi static -U all +CPU: 0 + SMU: + FW_VERSION: 85.90.0 + INTERFACE_VERSION: + PROTO VERSION: 6 +``` + +```shell +$ amd-smi metric -O 0 1 2 +CORE: 0 + BOOST_LIMIT: + VALUE: 400 MHz + CURR_ACTIVE_FREQ_CORE_LIMIT: + VALUE: 400 MHz + CORE_ENERGY: + VALUE: N/A + +CORE: 1 + BOOST_LIMIT: + VALUE: 400 MHz + CURR_ACTIVE_FREQ_CORE_LIMIT: + VALUE: 400 MHz + CORE_ENERGY: + VALUE: N/A + +CORE: 2 + BOOST_LIMIT: + VALUE: 400 MHz + CURR_ACTIVE_FREQ_CORE_LIMIT: + VALUE: 400 MHz + CORE_ENERGY: + VALUE: N/A +``` + +```shell +$ amd-smi metric -U all +CPU: 0 + POWER_METRICS: + SOCKET POWER: 102675 mW + SOCKET POWER LIMIT: 550000 mW + SOCKET MAX POWER LIMIT: 550000 mW + PROCHOT: + PROCHOT_STATUS: 0 + FREQ_METRICS: + FCLKMEMCLK: + FCLK: 2000 MHz + MCLK: 1300 MHz + CCLKFREQLIMIT: 400 MHz + SOC_CURRENT_ACTIVE_FREQ_LIMIT: + FREQ: 400 MHz + FREQ_SRC: [HSMP Agent] + SOC_FREQ_RANGE: + MAX_SOCKET_FREQ: 3700 MHz + MIN_SOCKET_FREQ: 400 MHz + C0_RESIDENCY: + RESIDENCY: 4 % + SVI_TELEMETRY_ALL_RAILS: + POWER: 102673 mW + METRIC_VERSION: + VERSION: 11 + METRICS_TABLE: + CPU_FAMILY: 25 + CPU_MODEL: 144 + RESPONSE: + MTBL_ACCUMULATION_COUNTER: 2887162626 + MTBL_MAX_SOCKET_TEMPERATURE: 41.0 °C + MTBL_MAX_VR_TEMPERATURE: 39.0 °C + MTBL_MAX_HBM_TEMPERATURE: 40.0 °C + MTBL_MAX_SOCKET_TEMPERATURE_ACC: 108583340881.125 °C + MTBL_MAX_VR_TEMPERATURE_ACC: 109472702595.0 °C + MTBL_MAX_HBM_TEMPERATURE_ACC: 111516663941.0 °C + MTBL_SOCKET_POWER_LIMIT: 550.0 W + MTBL_MAX_SOCKET_POWER_LIMIT: 550.0 W + MTBL_SOCKET_POWER: 102.678 W + MTBL_TIMESTAMP_RAW: 288731677361880 + MTBL_TIMESTAMP_READABLE: Tue Mar 19 12:32:21 2024 + MTBL_SOCKET_ENERGY_ACC: 166127.84 kJ + MTBL_CCD_ENERGY_ACC: 
3317.837 kJ + MTBL_XCD_ENERGY_ACC: 21889.147 kJ + MTBL_AID_ENERGY_ACC: 121932.397 kJ + MTBL_HBM_ENERGY_ACC: 18994.108 kJ + MTBL_CCLK_FREQUENCY_LIMIT: 3.7 GHz + MTBL_GFXCLK_FREQUENCY_LIMIT: 0.0 MHz + MTBL_FCLK_FREQUENCY: 1999.988 MHz + MTBL_UCLK_FREQUENCY: 1299.993 MHz + MTBL_SOCCLK_FREQUENCY: [35.716, 35.715, 35.714, 35.714] MHz + MTBL_VCLK_FREQUENCY: [0.0, 53.749, 53.749, 53.749] MHz + MTBL_DCLK_FREQUENCY: [7.143, 44.791, 44.791, 44.791] MHz + MTBL_LCLK_FREQUENCY: [20.872, 18.75, 35.938, 599.558] MHz + MTBL_FCLK_FREQUENCY_TABLE: [1200.0, 1600.0, 1900.0, 2000.0] MHz + MTBL_UCLK_FREQUENCY_TABLE: [900.0, 1100.0, 1200.0, 1300.0] MHz + MTBL_SOCCLK_FREQUENCY_TABLE: [800.0, 1000.0, 1142.857, 1142.857] MHz + MTBL_VCLK_FREQUENCY_TABLE: [914.286, 1300.0, 1560.0, 1720.0] MHz + MTBL_DCLK_FREQUENCY_TABLE: [711.111, 975.0, 1300.0, 1433.333] MHz + MTBL_LCLK_FREQUENCY_TABLE: [600.0, 844.444, 1150.0, 1150.0] MHz + MTBL_CCLK_FREQUENCY_ACC: [4399751656.639, 4399751656.639, 4399751656.639, 4399751656.639, + 4399751656.639, 4399751656.639, 4399751656.639, 4399751656.639, 4399751656.639, + 4399751656.639, 4399751656.639, 4399751656.639, 4399751656.639, 4399751656.639, + 4399751656.639, 4399751656.639, 4399751656.639, 4399751656.639, 4399751656.639, + 4399751656.639, 4399751656.639, 4399751656.639, 4399751656.639, 4399751656.639, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] GHz + MTBL_GFXCLK_FREQUENCY_ACC: [0.0, 0.0, 250534397827.603, 251546257401.82, 250811364089.836, + 249999070486.505, 251622633562.855, 251342375116.05] MHz + MTBL_GFXCLK_FREQUENCY: [0.0, 0.0, 31.091, 31.414, 31.141, 31.478, 31.32, 31.453] + MHz + MTBL_MAX_CCLK_FREQUENCY: 3.7 GHz + 
MTBL_MIN_CCLK_FREQUENCY: 0.4 GHz + MTBL_MAX_GFXCLK_FREQUENCY: 2100.0 MHz + MTBL_MIN_GFXCLK_FREQUENCY: 500.0 MHz + MTBL_MAX_LCLK_DPM_RANGE: 2 + MTBL_MIN_LCLK_DPM_RANGE: 0 + MTBL_XGMI_WIDTH: 0.0 + MTBL_XGMI_BITRATE: 0.0 Gbps + MTBL_XGMI_READ_BANDWIDTH_ACC: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] Gbps + MTBL_XGMI_WRITE_BANDWIDTH_ACC: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] Gbps + MTBL_SOCKET_C0_RESIDENCY: 4.329 % + MTBL_SOCKET_GFX_BUSY: 0.0 % + MTBL_HBM_BANDWIDTH_UTILIZATION: 0.001 % + MTBL_SOCKET_C0_RESIDENCY_ACC: 311523106.34 + MTBL_SOCKET_GFX_BUSY_ACC: 84739.281 + MTBL_HBM_BANDWIDTH_ACC: 33231180.073 Gbps + MTBL_MAX_HBM_BANDWIDTH: 5324.801 Gbps + MTBL_DRAM_BANDWIDTH_UTILIZATION_ACC: 612843.699 + MTBL_PCIE_BANDWIDTH_ACC: [0.0, 0.0, 0.0, 0.0] Gbps + MTBL_PROCHOT_RESIDENCY_ACC: 0 + MTBL_PPT_RESIDENCY_ACC: 2887162626 + MTBL_SOCKET_THM_RESIDENCY_ACC: 2887162626 + MTBL_VR_THM_RESIDENCY_ACC: 0 + MTBL_HBM_THM_RESIDENCY_ACC: 2887162626 + SOCKET_ENERGY: + RESPONSE: N/A + DDR_BANDWIDTH: + RESPONSE: N/A + CPU_TEMP: + RESPONSE: N/A +``` + +- **Added support for new metrics: VCN, JPEG engines, and PCIe errors**. +Using the AMD SMI tool, users can retreive VCN, JPEG engines, and PCIe errors by calling `amd-smi metric -P` or `amd-smi metric --usage`. Depending on device support, `VCN_ACTIVITY` will update for MI3x ASICs (with 4 separate VCN engine activities) for older asics `MM_ACTIVITY` with UVD/VCN engine activity (average of all engines). `JPEG_ACTIVITY` is a new field for MI3x ASICs, where device can support up to 32 JPEG engine activities. See our documentation for more in-depth understanding of these new fields. 
+ +```shell +$ amd-smi metric -P +GPU: 0 + PCIE: + WIDTH: 16 + SPEED: 16 GT/s + REPLAY_COUNT: 0 + L0_TO_RECOVERY_COUNT: 1 + REPLAY_ROLL_OVER_COUNT: 0 + NAK_SENT_COUNT: 0 + NAK_RECEIVED_COUNT: 0 + CURRENT_BANDWIDTH_SENT: N/A + CURRENT_BANDWIDTH_RECEIVED: N/A + MAX_PACKET_SIZE: N/A +``` + +```shell +$ amd-smi metric --usage +GPU: 0 + USAGE: + GFX_ACTIVITY: 0 % + UMC_ACTIVITY: 0 % + MM_ACTIVITY: N/A + VCN_ACTIVITY: [0 %, 0 %, 0 %, 0 %] + JPEG_ACTIVITY: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 + %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, + 0 %, 0 %, 0 %, 0 %] + +``` + +- **Added AMDSMI Tool Version**. +AMD SMI will report ***three versions***: AMDSMI Tool, AMDSMI Library version, and ROCm version. +The AMDSMI Tool version is the CLI/tool version number with commit ID appended after `+` sign. +The AMDSMI Library version is the library package version number. +The ROCm version is the system's installed ROCm version, if ROCm is not installed it will report N/A. + +```shell +$ amd-smi version +AMDSMI Tool: 23.4.2+505b858 | AMDSMI Library version: 24.2.0.0 | ROCm version: 6.1.0 +``` + +- **Added XGMI table**. +Displays XGMI information for AMD GPU devices in a table format. Only available on supported ASICs (eg. MI300). Here users can view read/write data XGMI or PCIe accumulated data transfer size (in KiloBytes). 
+ +```shell +$ amd-smi xgmi +LINK METRIC TABLE: + bdf bit_rate max_bandwidth link_type 0000:0c:00.0 0000:22:00.0 0000:38:00.0 0000:5c:00.0 0000:9f:00.0 0000:af:00.0 0000:bf:00.0 0000:df:00.0 +GPU0 0000:0c:00.0 32 Gb/s 512 Gb/s XGMI + Read N/A 2 KB 2 KB 1 KB 2 KB 1 KB 2 KB 2 KB + Write N/A 1 KB 1 KB 1 KB 1 KB 1 KB 1 KB 1 KB +GPU1 0000:22:00.0 32 Gb/s 512 Gb/s XGMI + Read 0 KB N/A 2 KB 2 KB 1 KB 2 KB 1 KB 2 KB + Write 0 KB N/A 1 KB 1 KB 1 KB 1 KB 1 KB 1 KB +GPU2 0000:38:00.0 32 Gb/s 512 Gb/s XGMI + Read 0 KB 1 KB N/A 2 KB 1 KB 2 KB 0 KB 0 KB + Write 0 KB 1 KB N/A 1 KB 1 KB 1 KB 1 KB 1 KB +GPU3 0000:5c:00.0 32 Gb/s 512 Gb/s XGMI + Read 0 KB 0 KB 2 KB N/A 1 KB 0 KB 0 KB 2 KB + Write 0 KB 1 KB 1 KB N/A 1 KB 1 KB 1 KB 1 KB +GPU4 0000:9f:00.0 32 Gb/s 512 Gb/s XGMI + Read 0 KB 1 KB 0 KB 0 KB N/A 2 KB 0 KB 2 KB + Write 0 KB 1 KB 1 KB 1 KB N/A 1 KB 1 KB 1 KB +GPU5 0000:af:00.0 32 Gb/s 512 Gb/s XGMI + Read 0 KB 2 KB 0 KB 0 KB 0 KB N/A 2 KB 0 KB + Write 0 KB 1 KB 1 KB 1 KB 1 KB N/A 1 KB 1 KB +GPU6 0000:bf:00.0 32 Gb/s 512 Gb/s XGMI + Read 0 KB 0 KB 0 KB 0 KB 0 KB 0 KB N/A 0 KB + Write 0 KB 1 KB 1 KB 1 KB 1 KB 1 KB N/A 1 KB +GPU7 0000:df:00.0 32 Gb/s 512 Gb/s XGMI + Read 0 KB 0 KB 0 KB 0 KB 0 KB 0 KB 0 KB N/A + Write 0 KB 1 KB 1 KB 1 KB 1 KB 1 KB 1 KB N/A + +``` + +- **Added units of measure to JSON output**. +We added unit of measure to JSON/CSV `amd-smi metric`, `amd-smi static`, and `amd-smi monitor` commands. + +Ex. + +```shell +amd-smi metric -p --json +[ + { + "gpu": 0, + "power": { + "socket_power": { + "value": 10, + "unit": "W" + }, + "gfx_voltage": { + "value": 6, + "unit": "mV" + }, + "soc_voltage": { + "value": 918, + "unit": "mV" + }, + "mem_voltage": { + "value": 1250, + "unit": "mV" + }, + "power_management": "ENABLED", + "throttle_status": "UNTHROTTLED" + } + } +] +``` + +### Changed + +- **Topology is now left-aligned with BDF of each device listed individual table's row/coloumns**. 
+We provided each device's BDF for every table's row/columns, then left aligned data. We want AMD SMI Tool output to be easy to understand and digest for our users. Having users scroll up to find this information made it difficult to follow, especially for devices which have many devices associated with one ASIC. + +```shell +$ amd-smi topology +ACCESS TABLE: + 0000:0c:00.0 0000:22:00.0 0000:38:00.0 0000:5c:00.0 0000:9f:00.0 0000:af:00.0 0000:bf:00.0 0000:df:00.0 +0000:0c:00.0 ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED +0000:22:00.0 ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED +0000:38:00.0 ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED +0000:5c:00.0 ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED +0000:9f:00.0 ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED +0000:af:00.0 ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED +0000:bf:00.0 ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED +0000:df:00.0 ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED + +WEIGHT TABLE: + 0000:0c:00.0 0000:22:00.0 0000:38:00.0 0000:5c:00.0 0000:9f:00.0 0000:af:00.0 0000:bf:00.0 0000:df:00.0 +0000:0c:00.0 0 15 15 15 15 15 15 15 +0000:22:00.0 15 0 15 15 15 15 15 15 +0000:38:00.0 15 15 0 15 15 15 15 15 +0000:5c:00.0 15 15 15 0 15 15 15 15 +0000:9f:00.0 15 15 15 15 0 15 15 15 +0000:af:00.0 15 15 15 15 15 0 15 15 +0000:bf:00.0 15 15 15 15 15 15 0 15 +0000:df:00.0 15 15 15 15 15 15 15 0 + +HOPS TABLE: + 0000:0c:00.0 0000:22:00.0 0000:38:00.0 0000:5c:00.0 0000:9f:00.0 0000:af:00.0 0000:bf:00.0 0000:df:00.0 +0000:0c:00.0 0 1 1 1 1 1 1 1 +0000:22:00.0 1 0 1 1 1 1 1 1 +0000:38:00.0 1 1 0 1 1 1 1 1 +0000:5c:00.0 1 1 1 0 1 1 1 1 +0000:9f:00.0 1 1 1 1 0 1 1 1 +0000:af:00.0 1 1 1 1 1 0 1 1 +0000:bf:00.0 1 1 1 1 1 1 0 1 +0000:df:00.0 1 1 1 1 1 1 1 0 + +LINK TYPE TABLE: + 0000:0c:00.0 0000:22:00.0 0000:38:00.0 0000:5c:00.0 0000:9f:00.0 0000:af:00.0 0000:bf:00.0 0000:df:00.0 
+0000:0c:00.0 SELF XGMI XGMI XGMI XGMI XGMI XGMI XGMI +0000:22:00.0 XGMI SELF XGMI XGMI XGMI XGMI XGMI XGMI +0000:38:00.0 XGMI XGMI SELF XGMI XGMI XGMI XGMI XGMI +0000:5c:00.0 XGMI XGMI XGMI SELF XGMI XGMI XGMI XGMI +0000:9f:00.0 XGMI XGMI XGMI XGMI SELF XGMI XGMI XGMI +0000:af:00.0 XGMI XGMI XGMI XGMI XGMI SELF XGMI XGMI +0000:bf:00.0 XGMI XGMI XGMI XGMI XGMI XGMI SELF XGMI +0000:df:00.0 XGMI XGMI XGMI XGMI XGMI XGMI XGMI SELF + +NUMA BW TABLE: + 0000:0c:00.0 0000:22:00.0 0000:38:00.0 0000:5c:00.0 0000:9f:00.0 0000:af:00.0 0000:bf:00.0 0000:df:00.0 +0000:0c:00.0 N/A 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 +0000:22:00.0 50000-50000 N/A 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 +0000:38:00.0 50000-50000 50000-50000 N/A 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 +0000:5c:00.0 50000-50000 50000-50000 50000-50000 N/A 50000-50000 50000-50000 50000-50000 50000-50000 +0000:9f:00.0 50000-50000 50000-50000 50000-50000 50000-50000 N/A 50000-50000 50000-50000 50000-50000 +0000:af:00.0 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 N/A 50000-50000 50000-50000 +0000:bf:00.0 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 N/A 50000-50000 +0000:df:00.0 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 N/A +``` + +### Resolved issues + +- **Fix for Navi3X/Navi2X/MI100 `amdsmi_get_gpu_pci_bandwidth()` in frequencies_read tests**. +For devices which do not report (e.g. Navi3X/Navi2X/MI100), we have added checks to confirm these devices return `AMDSMI_STATUS_NOT_SUPPORTED`. Otherwise, tests now display a return string. +- **Fix for devices which have an older pyyaml installed**. +On platforms which are identified as having an older pyyaml version or pip, we now manually update both pip and pyyaml as needed. This corrects the issue shown below.
Fix impacts the following CLI commands: + - `amd-smi list` + - `amd-smi static` + - `amd-smi firmware` + - `amd-smi metric` + - `amd-smi topology` + +```shell +TypeError: dump_all() got an unexpected keyword argument 'sort_keys' +``` + +- **Fix for crash when user is not a member of video/render groups**. +AMD SMI now uses the same mutex handler for devices as rocm-smi. This helps avoid crashes when DRM/device data is inaccessible to the logged-in user. + +## amd_smi_lib for ROCm 6.0.0 + +### Added + +- **Integrated the E-SMI (EPYC-SMI) library**. +You can now query CPU-related information directly through AMD SMI. Metrics include power, energy, performance, and other system details. + +- **Added support for gfx942 metrics**. +You can now query MI300 device metrics to get real-time information. Metrics include power, temperature, energy, and performance. + +- **Compute and memory partition support**. +Users can now view, set, and reset partitions. The topology display can provide a more in-depth look at the device's current configuration. + +### Changed + +- **GPU index sorting made consistent with other tools**. +To ensure alignment with other ROCm software tools, GPU index sorting is optimized to use Bus:Device.Function (BDF) rather than the card number. +- **Topology output is now aligned with GPU BDF table**. +Earlier versions of the topology output were difficult to read since each GPU was displayed linearly. +Now the information is displayed as a table by each GPU's BDF, which more closely resembles rocm-smi output. + +### Optimized + +- Updated to C++17, gtest-1.14, and cmake 3.14 + +### Resolved issues + +- **Fix for driver not initialized**. +If the driver module is not loaded, the user receives an error response indicating that the amdgpu module is not loaded.
diff --git a/projects/amdsmi/CMakeLists.txt b/projects/amdsmi/CMakeLists.txt new file mode 100644 index 0000000000..444ec4e56a --- /dev/null +++ b/projects/amdsmi/CMakeLists.txt @@ -0,0 +1,511 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +# +# Minimum version of cmake required +# +cmake_minimum_required(VERSION 3.20) + +set(AMD_SMI "amd_smi") +set(AMD_SMI_LIBS_TARGET "${AMD_SMI}_lib") +set(CPACK_PACKAGE_NAME amd-smi-lib CACHE STRING "") + +set(BUILD_SHARED_LIBS ON CACHE BOOL "Build shared library (.so) or not.") + +set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules/" CACHE INTERNAL "Default module path.") +## Include common cmake modules; pkg-config is mandatory because libdrm is located through pkg_check_modules() +include(utils) +include(help_package) +find_package(PkgConfig REQUIRED) + +generic_add_rocm() + +# Locate the git executable; the result is stored in the GIT variable used by the version utilities +find_program(GIT NAMES git) + +## Setup the package version based on git tags.
+set(PKG_VERSION_GIT_TAG_PREFIX "amdsmi_pkg_ver") +get_version_from_file("include/amd_smi/amdsmi.h" "MAJOR") +get_version_from_file("include/amd_smi/amdsmi.h" "MINOR") +get_version_from_file("include/amd_smi/amdsmi.h" "RELEASE") +set(DEFAULT_VERSION "${MAJOR}.${MINOR}.${RELEASE}") +get_package_version_number(${DEFAULT_VERSION} ${PKG_VERSION_GIT_TAG_PREFIX} GIT) +message("Package version: ${PKG_VERSION_STR}") +set(${AMD_SMI_LIBS_TARGET}_VERSION_MAJOR "${CPACK_PACKAGE_VERSION_MAJOR}") +set(${AMD_SMI_LIBS_TARGET}_VERSION_MINOR "${CPACK_PACKAGE_VERSION_MINOR}") +set(${AMD_SMI_LIBS_TARGET}_VERSION_PATCH "${CPACK_PACKAGE_VERSION_PATCH}") +set(${AMD_SMI_LIBS_TARGET}_VERSION_BUILD "0") +set(${AMD_SMI_LIBS_TARGET}_VERSION_HASH "${PKG_VERSION_HASH}") +set(${AMD_SMI_LIBS_TARGET}_VERSION_STRING + "${${AMD_SMI_LIBS_TARGET}_VERSION_MAJOR}.${${AMD_SMI_LIBS_TARGET}_VERSION_MINOR}.${${AMD_SMI_LIBS_TARGET}_VERSION_PATCH}+${${AMD_SMI_LIBS_TARGET}_VERSION_HASH}" +) + +set(DEFAULT_VERSION "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}") +update_version_in_file("include/amd_smi/amdsmi.h" ${DEFAULT_VERSION} "#define AMDSMI_LIB_VERSION_" " *" " ") +update_version_in_file("rust-interface/src/amdsmi_wrapper.rs" ${DEFAULT_VERSION} "AMDSMI_LIB_VERSION_" " *: *u32 *= *" + ": u32 = ") + +# Version suffix appended as the fourth component of CPACK_PACKAGE_VERSION below. +# Defaults to 99999 unless the ROCM_LIBPATCH_VERSION environment variable is defined. +set(ROCM_VERSION_FOR_PACKAGE "99999") +if(DEFINED ENV{ROCM_LIBPATCH_VERSION}) + set(ROCM_VERSION_FOR_PACKAGE $ENV{ROCM_LIBPATCH_VERSION}) +endif() +# Prepare the final version string for CPack: <major>.<minor>.<patch>.<ROCM_VERSION_FOR_PACKAGE> +set(CPACK_PACKAGE_VERSION + "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}.${ROCM_VERSION_FOR_PACKAGE}" +) + +# The following default version values should be updated as appropriate for +# ABI breaks (update MAJOR and MINOR), and ABI/API additions (update MINOR). +# Until ABI stabilizes VERSION_MAJOR will be 0.
This should be overridden +# by git tags (through "git describe") when they are present. +set(PKG_VERSION_MAJOR "${CPACK_PACKAGE_VERSION_MAJOR}") +set(PKG_VERSION_MINOR "${CPACK_PACKAGE_VERSION_MINOR}") +set(PKG_VERSION_PATCH "${CPACK_PACKAGE_VERSION_PATCH}") +set(PKG_VERSION_NUM_COMMIT 0) + +project(${AMD_SMI_LIBS_TARGET} DESCRIPTION "AMD System Management libraries" + HOMEPAGE_URL "https://github.com/ROCm/amdsmi") + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + +# Link with stdc++fs for filesystem support (only for GCC < 9.0) +set(FILESYSTEM_LIB "") +if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.0) + set(FILESYSTEM_LIB stdc++fs) + message(STATUS "GCC ${CMAKE_CXX_COMPILER_VERSION} detected, linking with stdc++fs for filesystem support") + endif() +endif() + +include(GNUInstallDirs) + +option(BUILD_TESTS "Build test suite" OFF) +option(ENABLE_ASAN_PACKAGING "" OFF) +option(ENABLE_ESMI_LIB "Build ESMI Library" ON) +option(BUILD_EXAMPLES "Build examples" OFF) + +# If amdsmi is built as a static library, it should support being embedded in other programs. The setting below essentially enables the -fPIC flag. +set(CMAKE_POSITION_INDEPENDENT_CODE ON CACHE BOOL "Enable position independent code for all targets") +mark_as_advanced(CMAKE_POSITION_INDEPENDENT_CODE) + +include(CMakeDependentOption) +# These options are forced to OFF unless BUILD_SHARED_LIBS is enabled +cmake_dependent_option(BUILD_WRAPPER "Rebuild AMDSMI-wrapper" OFF "BUILD_SHARED_LIBS" OFF) +cmake_dependent_option(BUILD_CLI "Build AMDSMI-CLI and install" ON "BUILD_SHARED_LIBS" OFF) +cmake_dependent_option(BUILD_RUST_WRAPPER "Build rust wrapper and install" OFF "BUILD_SHARED_LIBS" OFF) +cmake_dependent_option(ENABLE_LDCONFIG "Set library links and caches using ldconfig."
ON "BUILD_SHARED_LIBS" OFF) + +# Set share path here because project name != amd_smi +set(SHARE_INSTALL_PREFIX "${CMAKE_INSTALL_DATAROOTDIR}/${AMD_SMI}" CACHE STRING "Tests and Example install directory") + +# Packaging directives +set(CPACK_PACKAGE_CONTACT "AMD-SMILib Support " CACHE STRING "") + +generic_package() + +# Dependencies +find_package(Threads REQUIRED) +pkg_check_modules(DRM REQUIRED IMPORTED_TARGET libdrm) +pkg_check_modules(DRM_AMDGPU REQUIRED IMPORTED_TARGET libdrm_amdgpu) + +# Configuration: get_imported_soname(<target> <out_var>) runs `objdump -p` on the single library listed in <target>'s INTERFACE_LINK_LIBRARIES and stores its SONAME in <out_var> (parent scope); any failure is a FATAL_ERROR +function(get_imported_soname target out_var) + get_target_property(link_libs ${target} INTERFACE_LINK_LIBRARIES) + set(result) + foreach(link_lib ${link_libs}) + if(result) + message(FATAL_ERROR "Target ${target} has multiple link libraries: ${link_libs}") + endif() + execute_process( + COMMAND objdump -p "${link_lib}" + OUTPUT_VARIABLE OBJDUMP_OUTPUT + RESULT_VARIABLE OBJDUMP_RESULT + ) + if(OBJDUMP_RESULT EQUAL 0) + string(REGEX MATCH "SONAME +([^ \n]+)" SONAME_MATCH "${OBJDUMP_OUTPUT}") + if(SONAME_MATCH) + set(SONAME_OF_MY_PKG "${CMAKE_MATCH_1}") + message(STATUS "SONAME of ${link_lib}: ${SONAME_OF_MY_PKG}") + else() + message(FATAL_ERROR "Could not find SONAME in objdump output for ${link_lib}") + endif() + set(result "${SONAME_OF_MY_PKG}") + else() + message(FATAL_ERROR "objdump failed for ${link_lib}") + endif() + endforeach() + if(NOT result) + message(FATAL_ERROR "Could not find SONAME for target ${target} libs: ${link_libs}") + endif() + set("${out_var}" "${result}" PARENT_SCOPE) +endfunction() + +get_imported_soname(PkgConfig::DRM_AMDGPU LIBDRM_AMDGPU_SONAME) +configure_file( + "${CMAKE_CURRENT_SOURCE_DIR}/include/config/amd_smi_config.h.in" + "${CMAKE_CURRENT_BINARY_DIR}/include/config/amd_smi_config.h" + @ONLY +) + +## Compiler flags +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -fno-rtti") +if(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 -msse -msse2") +endif() +# Security options
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wconversion -Wcast-align") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wformat=2 -fno-common -Wstrict-overflow") +# Intentionally leave out -Wsign-promo. It causes spurious warnings. +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Woverloaded-virtual -Wreorder") + +set(ROCM_SRC_DIR "${PROJECT_SOURCE_DIR}/rocm_smi/src") +set(ROCM_INC_DIR "${PROJECT_SOURCE_DIR}/rocm_smi/include/rocm_smi") +set(SHR_MUTEX_DIR "${PROJECT_SOURCE_DIR}/third_party/shared_mutex") +if(ENABLE_ESMI_LIB) + # Supported esmi library version tag + set(current_esmi_tag "esmi_pkg_ver-4.2") + + if(NOT EXISTS ${PROJECT_SOURCE_DIR}/esmi_ib_library/src) + # TODO: use ExternalProject_Add instead or a submodule + message(STATUS "Adding esmi_ib_library...") + execute_process(COMMAND git clone --depth=1 -b ${current_esmi_tag} https://github.com/amd/esmi_ib_library.git + ${PROJECT_SOURCE_DIR}/esmi_ib_library) + else() + message(STATUS "esmi_ib_library already installed, checking version...") + + # Grab latest commit and get the tag + execute_process( + COMMAND git rev-list --tags --max-count=1 + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/esmi_ib_library + OUTPUT_VARIABLE latest_commit + OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process( + COMMAND git describe --tags ${latest_commit} --match "*pkg*" + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/esmi_ib_library + OUTPUT_VARIABLE latest_esmi_tag + OUTPUT_STRIP_TRAILING_WHITESPACE) + + # Update to latest tags if not matched + if(NOT latest_esmi_tag STREQUAL current_esmi_tag) + message(STATUS "Updating esmi_ib_library...") + execute_process( + COMMAND git clone --depth=1 -b ${current_esmi_tag} https://github.com/amd/esmi_ib_library.git + ${PROJECT_SOURCE_DIR}/esmi_ib_library_temp RESULT_VARIABLE clone_result) + if(clone_result EQUAL 0) + file(REMOVE_RECURSE ${PROJECT_SOURCE_DIR}/esmi_ib_library) + file(RENAME ${PROJECT_SOURCE_DIR}/esmi_ib_library_temp ${PROJECT_SOURCE_DIR}/esmi_ib_library) + message(STATUS "Successfully cloned updated 
esmi_ib_library") + else() + file(REMOVE_RECURSE ${PROJECT_SOURCE_DIR}/esmi_ib_library_temp) + message(FATAL_ERROR "Failed to clone updated esmi_ib_library") + endif() + else() + message(STATUS "esmi_ib_library is the latest version: ${current_esmi_tag}...") + endif() + endif() + + # Make sure to update the amd_hsmp.h file with the corresponding esmi version + file(COPY "${PROJECT_SOURCE_DIR}/include/amd_smi/impl/amd_hsmp.h" + DESTINATION "${PROJECT_SOURCE_DIR}/esmi_ib_library/include/asm") + + add_definitions("-DENABLE_ESMI_LIB=1") + set(ESMI_INC_DIR "${PROJECT_SOURCE_DIR}/esmi_ib_library/include") + set(ESMI_SRC_DIR "${PROJECT_SOURCE_DIR}/esmi_ib_library/src") + # esmi has a lot of write-strings warnings - silence them + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-write-strings") +endif() + +include_directories( + ${CMAKE_CURRENT_SOURCE_DIR}/include + ${CMAKE_CURRENT_BINARY_DIR}/include + ${CMAKE_CURRENT_SOURCE_DIR}/third_party/shared_mutex + ${CMAKE_CURRENT_SOURCE_DIR}/include/amd_smi + ${ESMI_INC_DIR} +) + +set(CMN_SRC_LIST + "${ROCM_SRC_DIR}/rocm_smi_device.cc" + "${ROCM_SRC_DIR}/rocm_smi_main.cc" + "${ROCM_SRC_DIR}/rocm_smi_monitor.cc" + "${ROCM_SRC_DIR}/rocm_smi_power_mon.cc" + "${ROCM_SRC_DIR}/rocm_smi_utils.cc" + "${ROCM_SRC_DIR}/rocm_smi_counters.cc" + "${ROCM_SRC_DIR}/rocm_smi_kfd.cc" + "${ROCM_SRC_DIR}/rocm_smi_io_link.cc" + "${ROCM_SRC_DIR}/rocm_smi_gpu_metrics.cc" + "${ROCM_SRC_DIR}/rocm_smi_dyn_gpu_metrics.cc" + "${ROCM_SRC_DIR}/rocm_smi.cc" + "${ROCM_SRC_DIR}/rocm_smi_logger.cc" + "${SHR_MUTEX_DIR}/shared_mutex.cc" + "${ROCM_SRC_DIR}/rocm_smi_binary_parser.cc" + "${ROCM_SRC_DIR}/rocm_smi_board_temp.cc" + "${ROCM_SRC_DIR}/rocm_smi_npm.cc") + +if(ENABLE_ESMI_LIB) + list(APPEND CMN_SRC_LIST ${ESMI_SRC_DIR}/e_smi.c) + list(APPEND CMN_SRC_LIST ${ESMI_SRC_DIR}/e_smi_monitor.c) + list(APPEND CMN_SRC_LIST ${ESMI_SRC_DIR}/e_smi_plat.c) + list(APPEND CMN_SRC_LIST ${ESMI_SRC_DIR}/e_smi_utils.c) +endif() + +set(CMN_INC_LIST + 
"${ROCM_INC_DIR}/rocm_smi_device.h" + "${ROCM_INC_DIR}/rocm_smi_main.h" + "${ROCM_INC_DIR}/rocm_smi_monitor.h" + "${ROCM_INC_DIR}/rocm_smi_power_mon.h" + "${ROCM_INC_DIR}/rocm_smi_utils.h" + "${ROCM_INC_DIR}/rocm_smi_common.h" + "${ROCM_INC_DIR}/rocm_smi_exception.h" + "${ROCM_INC_DIR}/rocm_smi_counters.h" + "${ROCM_INC_DIR}/rocm_smi_kfd.h" + "${ROCM_INC_DIR}/rocm_smi_io_link.h" + "${ROCM_INC_DIR}/rocm_smi_gpu_metrics.h" + "${ROCM_INC_DIR}/rocm_smi_dyn_gpu_metrics.h" + "${ROCM_INC_DIR}/rocm_smi.h" + "${ROCM_INC_DIR}/rocm_smi_logger.h" + "${SHR_MUTEX_DIR}/shared_mutex.h" + "${ROCM_INC_DIR}/rocm_smi_binary_parser.h" + "${ROCM_INC_DIR}/rocm_smi_board_temp.h" + "${ROCM_INC_DIR}/rocm_smi_npm.h") + +add_subdirectory("rocm_smi") +add_subdirectory("src") + +if(BUILD_TESTS) + set(TESTS_COMPONENT "tests") + #add_subdirectory("tests/rocm_smi_test") + add_subdirectory("tests/amd_smi_test") + add_subdirectory("tests/python_unittest") +endif() + +# python interface, CLI, and py-test depend on shared libraries +if(BUILD_SHARED_LIBS) + add_subdirectory("py-interface") + if(BUILD_CLI) + add_subdirectory("amdsmi_cli") + endif() + if(BUILD_RUST_WRAPPER) + add_subdirectory("rust-interface") + endif() +endif() + +if(BUILD_EXAMPLES) + add_subdirectory("example") +endif() + +include(CMakePackageConfigHelpers) + +configure_package_config_file( + amd_smi-config.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/amd_smi-config.cmake + INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${AMD_SMI} + PATH_VARS CMAKE_INSTALL_LIBDIR CMAKE_INSTALL_INCLUDEDIR CMAKE_INSTALL_BINDIR) + +write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/amd_smi-config-version.cmake + VERSION "${CPACK_PACKAGE_VERSION}" COMPATIBILITY SameMajorVersion) + +install( + FILES ${CMAKE_CURRENT_BINARY_DIR}/amd_smi-config.cmake ${CMAKE_CURRENT_BINARY_DIR}/amd_smi-config-version.cmake + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${AMD_SMI} + COMPONENT dev) + +# Create cmake target +# Add all targets to the build-tree export set 
+export(TARGETS ${AMD_SMI} FILE "${PROJECT_BINARY_DIR}/amd_smi_target.cmake") + +# Export the package for use from the build-tree +# (this registers the build-tree with a global CMake-registry) +export(PACKAGE ${AMD_SMI}) + +install( + EXPORT amd_smiTargets + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${AMD_SMI} + COMPONENT dev) + +set(CPACK_RPM_PACKAGE_LICENSE "MIT") +if(ENABLE_ASAN_PACKAGING) + # install license file in share/doc/amd_smi-asan folder + install( + FILES ${CPACK_RESOURCE_FILE_LICENSE} + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/doc/${CPACK_PACKAGE_NAME}-asan + RENAME LICENSE.txt + COMPONENT asan) +endif() +# docs are installed into different share directory from tests and examples +install( + FILES ${CPACK_RESOURCE_FILE_LICENSE} + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/doc/${CPACK_PACKAGE_NAME} + RENAME LICENSE.txt + COMPONENT dev) + +install( + FILES ${CMAKE_CURRENT_SOURCE_DIR}/README.md + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/doc/${CPACK_PACKAGE_NAME} + COMPONENT dev) +install( + DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/example + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${AMD_SMI} + COMPONENT dev + FILES_MATCHING + PATTERN "*.h" + PATTERN "*.cc" + PATTERN "*.txt" + PATTERN "build*" EXCLUDE + PATTERN ".cache*" EXCLUDE) + +# Make for goamdsmi_shim library +add_subdirectory(goamdsmi_shim) + +#Debian package specific variables +set(CPACK_DEBIAN_PACKAGE_RECOMMENDS "python3-argcomplete, libdrm-dev, libdrm-amdgpu-dev") +set(CPACK_DEBIAN_ASAN_PACKAGE_RECOMMENDS ${CPACK_DEBIAN_PACKAGE_RECOMMENDS}) +set(CPACK_DEBIAN_DEV_PACKAGE_RECOMMENDS ${CPACK_DEBIAN_PACKAGE_RECOMMENDS}) +set(CPACK_DEBIAN_PACKAGE_DEPENDS "sudo, libc6, python3 (>= 3.6.8), python3-pip, python3-setuptools, python3-wheel") +set(CPACK_DEBIAN_ASAN_PACKAGE_DEPENDS ${CPACK_DEBIAN_PACKAGE_DEPENDS}) +set(CPACK_DEBIAN_DEV_PACKAGE_DEPENDS ${CPACK_DEBIAN_PACKAGE_DEPENDS}) + +# $CURRENT_YEAR is used by copyright.in +string(TIMESTAMP CURRENT_YEAR "%Y") 
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/copyright.in DEBIAN/copyright @ONLY) + +## Process the Debian install/remove scripts to update the CPACK variables +configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/postinst.in + DEBIAN/postinst + @ONLY + FILE_PERMISSIONS + OWNER_READ + OWNER_WRITE + OWNER_EXECUTE + GROUP_READ + GROUP_EXECUTE + WORLD_READ + WORLD_EXECUTE) +configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/prerm.in + DEBIAN/prerm + @ONLY + FILE_PERMISSIONS + OWNER_READ + OWNER_WRITE + OWNER_EXECUTE + GROUP_READ + GROUP_EXECUTE + WORLD_READ + WORLD_EXECUTE) +list(APPEND CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${CMAKE_CURRENT_BINARY_DIR}/DEBIAN/postinst" + "${CMAKE_CURRENT_BINARY_DIR}/DEBIAN/prerm") + +# Configure pre-rm for tests only +configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/${CPACK_PACKAGE_NAME}-tests/prerm.in + DEBIAN/${CPACK_PACKAGE_NAME}-tests/prerm + @ONLY + FILE_PERMISSIONS + OWNER_READ + OWNER_WRITE + OWNER_EXECUTE + GROUP_READ + GROUP_EXECUTE + WORLD_READ + WORLD_EXECUTE) + +# Assign control scripts to the AMDSMI Lib & Tests packages +set(CPACK_DEBIAN_DEV_PACKAGE_CONTROL_EXTRA "${CMAKE_CURRENT_BINARY_DIR}/DEBIAN/prerm") +set(CPACK_DEBIAN_TESTS_PACKAGE_CONTROL_EXTRA "${CMAKE_CURRENT_BINARY_DIR}/DEBIAN/${CPACK_PACKAGE_NAME}-tests/prerm") + +# install copyright file into share/doc/amd-smi-lib/copyright +# required for debian package compliance +install( + FILES "${CMAKE_CURRENT_BINARY_DIR}/DEBIAN/copyright" + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/doc/${CPACK_PACKAGE_NAME} + COMPONENT dev) + +# RPM package specific variables +set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION + "${CPACK_PACKAGING_INSTALL_PREFIX} ${CPACK_PACKAGING_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}") +#Set rpm distro +if(CPACK_RPM_PACKAGE_RELEASE) + set(CPACK_RPM_PACKAGE_RELEASE_DIST ON) +endif() +# NOTE: RPM SUGGESTS DO NOT WORK! 
https://bugzilla.redhat.com/show_bug.cgi?id=1811358 +set(CPACK_RPM_PACKAGE_SUGGESTS "python3-argcomplete, libdrm-dev, libdrm-amdgpu-dev") +set(CPACK_RPM_DEV_PACKAGE_SUGGESTS ${CPACK_RPM_PACKAGE_SUGGESTS}) +set(CPACK_RPM_ASAN_PACKAGE_SUGGESTS ${CPACK_RPM_PACKAGE_SUGGESTS}) +# python version gated by rhel8 :( +set(CPACK_RPM_PACKAGE_REQUIRES "sudo, python3 >= 3.6.8, python3-pip, python3-wheel, python3-setuptools") +set(CPACK_RPM_DEV_PACKAGE_REQUIRES ${CPACK_RPM_PACKAGE_REQUIRES}) +set(CPACK_RPM_ASAN_PACKAGE_REQUIRES ${CPACK_RPM_PACKAGE_REQUIRES}) + +# don't terminate if bytecompile of python files fails +set(CPACK_RPM_SPEC_MORE_DEFINE "%define _python_bytecompile_errors_terminate_build 0") +# Cpack converts !/usr/bin/env python3 to /usr/libexec/platform-python in RHEL8. +# prevent the BRP(buildroot policy) script from checking and modifying interpreter directives +string(APPEND CPACK_RPM_SPEC_MORE_DEFINE "\n%undefine __brp_mangle_shebangs") + +# Add rocm-core dependency if -DROCM_DEP_ROCMCORE=ON is passed +if(ROCM_DEP_ROCMCORE) + string(APPEND CPACK_DEBIAN_ASAN_PACKAGE_DEPENDS ", rocm-core-asan") + string(APPEND CPACK_RPM_ASAN_PACKAGE_REQUIRES ", rocm-core-asan") + string(APPEND CPACK_DEBIAN_DEV_PACKAGE_DEPENDS ", rocm-core") + string(APPEND CPACK_RPM_DEV_PACKAGE_REQUIRES ", rocm-core") + string(APPEND CPACK_DEBIAN_PACKAGE_DEPENDS ", rocm-core") + string(APPEND CPACK_RPM_PACKAGE_REQUIRES ", rocm-core") +endif() + +## Enable Component Mode and set component specific flags +set(CPACK_DEB_COMPONENT_INSTALL ON) +set(CPACK_DEBIAN_DEV_PACKAGE_NAME "${CPACK_PACKAGE_NAME}") +set(CPACK_DEBIAN_TESTS_PACKAGE_NAME "${CPACK_PACKAGE_NAME}-tests") +set(CPACK_DEBIAN_ASAN_PACKAGE_NAME "${CPACK_PACKAGE_NAME}-asan") +set(CPACK_RPM_COMPONENT_INSTALL ON) +set(CPACK_RPM_DEV_PACKAGE_NAME "${CPACK_PACKAGE_NAME}") +set(CPACK_RPM_TESTS_PACKAGE_NAME "${CPACK_PACKAGE_NAME}-tests") +set(CPACK_RPM_ASAN_PACKAGE_NAME "${CPACK_PACKAGE_NAME}-asan") +if(ENABLE_ASAN_PACKAGING) + # ASAN Package requires 
only asan component with libraries and license file + set(CPACK_COMPONENTS_ALL asan) +else() + set(CPACK_COMPONENTS_ALL dev tests) +endif() + +# The line below doesn't currently work; it may be this issue: +# https://bugzilla.redhat.com/show_bug.cgi?id=1811358 +# set(CPACK_RPM_PACKAGE_SUGGESTS "sudo, libdrm-dev") + +## Process the Rpm install/remove scripts to update the CPACK variables +configure_file("${CMAKE_CURRENT_SOURCE_DIR}/RPM/post.in" RPM/post @ONLY) +configure_file("${CMAKE_CURRENT_SOURCE_DIR}/RPM/preun.in" RPM/preun @ONLY) +configure_file("${CMAKE_CURRENT_SOURCE_DIR}/RPM/postun.in" RPM/postun @ONLY) +set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_BINARY_DIR}/RPM/post") +set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_BINARY_DIR}/RPM/preun") +set(CPACK_RPM_POST_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_BINARY_DIR}/RPM/postun") + +#Set the names now using CPACK utility +set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT") +set(CPACK_RPM_FILE_NAME "RPM-DEFAULT") + +include(CPack) + +generic_package_post() diff --git a/projects/amdsmi/CPPLINT.cfg b/projects/amdsmi/CPPLINT.cfg new file mode 100644 index 0000000000..b63692c6df --- /dev/null +++ b/projects/amdsmi/CPPLINT.cfg @@ -0,0 +1,3 @@ +set noparent +linelength=100 +filter=-build/include_subdir,-legal/copyright,-runtime/printf,-build/c++11,-runtime/int,-build/header_guard diff --git a/projects/amdsmi/DEBIAN/amd-smi-lib-tests/prerm.in b/projects/amdsmi/DEBIAN/amd-smi-lib-tests/prerm.in new file mode 100755 index 0000000000..8fdb7097f7 --- /dev/null +++ b/projects/amdsmi/DEBIAN/amd-smi-lib-tests/prerm.in @@ -0,0 +1,127 @@ +#!/bin/bash + +# +# Copyright (C) Advanced Micro Devices. All rights reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +# Other prerm actions +rm_ldconfig() { + # left-hand term originates from ENABLE_LDCONFIG = ON/OFF at package build + if [ "@ENABLE_LDCONFIG@" == "ON" ]; then + rm -f /etc/ld.so.conf.d/x86_64-libamd_smi_lib.conf + ldconfig + fi +} + + +rm_leftovers() { + # remove pyc files generated by python + rm -rf "@CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBEXECDIR@/amdsmi_cli/__pycache__" + rm -rf "@CPACK_PACKAGING_INSTALL_PREFIX@/@SHARE_INSTALL_PREFIX@/amdsmi/__pycache__" + + # remove build and egg files + rm -rf "@CPACK_PACKAGING_INSTALL_PREFIX@/@SHARE_INSTALL_PREFIX@/amdsmi.egg-info" + rm -rf "@CPACK_PACKAGING_INSTALL_PREFIX@/@SHARE_INSTALL_PREFIX@/build" + + # remove leftover doc files; the glob stays outside the quotes so the shell actually expands it + for leftover_doc in "@CPACK_PACKAGING_INSTALL_PREFIX@/@SHARE_INSTALL_PREFIX@/../doc/"amd_smi*; do + rm -rf "$leftover_doc" + done +} + +rm_logFolder() { + rm -rf /var/log/amd_smi_lib +} + + +rm_rocm_tests_dir(){ + if [ -d "@CPACK_PACKAGING_INSTALL_PREFIX@/share/amd_smi/tests/" ]; then + rm -rf "@CPACK_PACKAGING_INSTALL_PREFIX@/share/amd_smi/tests/" + echo "Removed ROCm tests directory." + fi +} + + +return_logrotateToOrigConfig() { + local logrotateConfFile=/etc/logrotate.d/amd_smi.conf + if [ -f "$logrotateConfFile" ]; then + rm -rf "$logrotateConfFile" + fi + if [ -f /etc/cron.hourly/logrotate ]; then + mv /etc/cron.hourly/logrotate /etc/cron.daily/logrotate + fi + if [ -f /lib/systemd/system/logrotate.timer.backup ]; then + cp /lib/systemd/system/logrotate.timer.backup /lib/systemd/system/logrotate.timer + rm -rf /lib/systemd/system/logrotate.timer.backup + systemctl reenable --now logrotate.timer + fi +} + +rm_python_lib() { + # get python version + local python3_minor_version + python3_minor_version=$(python3 -c 'import sys;print(sys.version_info.minor)') + if [ $? -ne 0 ]; then + echo "[WARNING] Could not determine python version. "\ + "AMD-SMI python library will not be uninstalled."
+ return + fi + + # check if python version is supported + if [ "$python3_minor_version" -lt 6 ]; then + echo "[WARNING] AMD-SMI python library is not supported on python version 3.$python3_minor_version. "\ + "AMD-SMI python library will not be uninstalled." + return + fi + + # Remove old python library + local pip_list_output + pip_list_output=$(python3 -m pip list --format=columns --disable-pip-version-check) + # check pip list output for amdsmi + if [[ $pip_list_output == *"amdsmi"* ]]; then + PIP_ROOT_USER_ACTION=ignore PIP_BREAK_SYSTEM_PACKAGES=1 python3 -m pip uninstall amdsmi --yes --quiet --disable-pip-version-check + fi + + pip_list_output=$(python3 -m pip list --format=columns --disable-pip-version-check) + # check pip list output for amdsmi + if [[ $pip_list_output == *"amdsmi"* ]]; then + echo "[WARNING] AMD-SMI python library (amdsmi) is still installed in pip. "\ + "Check post install to ensure version is correct" + else + echo "Removed AMD-SMI python library (amdsmi)..." + fi +} + + +case "$1" in + ( remove | upgrade) + # remove old gpuv-smi symlink + rm -f @CPACK_PACKAGING_INSTALL_PREFIX@/bin/gpuv-smi &> /dev/null + echo "Removing AMDSMI Lib Tests Packages..." + rm_ldconfig + echo "ldconfig removed" + rm_leftovers + echo "leftovers removed" + ;; + ( purge ) + ;; + ( * ) + exit 0 + ;; +esac diff --git a/projects/amdsmi/DEBIAN/copyright.in b/projects/amdsmi/DEBIAN/copyright.in new file mode 100644 index 0000000000..4d6e35ee9c --- /dev/null +++ b/projects/amdsmi/DEBIAN/copyright.in @@ -0,0 +1,26 @@ +Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ +Upstream-Name: amdsmi +Source: https://github.com/ROCm/amdsmi.git + +Files: * +Copyright: @CURRENT_YEAR@ Advanced Micro Devices, Inc. 
+License: MIT + +License: MIT + Permission is hereby granted, free of charge, to any person obtaining a + copy of this software and associated documentation files (the "Software"), + to deal in the Software without restriction, including without limitation + the rights to use, copy, modify, merge, publish, distribute, sublicense, + and/or sell copies of the Software, and to permit persons to whom the + Software is furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/projects/amdsmi/DEBIAN/postinst.in b/projects/amdsmi/DEBIAN/postinst.in new file mode 100755 index 0000000000..e6f55631d6 --- /dev/null +++ b/projects/amdsmi/DEBIAN/postinst.in @@ -0,0 +1,210 @@ +#!/bin/bash + +do_configureLogrotate() { + local IS_SYSTEMD=0 + local packageName="amd-smi-lib" + local logPath=/var/log/amd_smi_lib + local logFile="${logPath}/AMD-SMI-lib.log" + local logrotateConfFile=/etc/logrotate.d/amd_smi.conf + + mkdir -p "${logPath}" + touch "${logFile}" + chmod -R a+rw "${logPath}" + chmod a+rw "${logFile}" + + command -v logrotate &>/dev/null + if [ $? -ne 0 ]; then + echo "[WARNING] Detected logrotate is not installed."\ + "$packageName logs (when turned on) will not rotate properly." + return + fi + + if [ ! 
-f $logrotateConfFile ]; then + touch "${logrotateConfFile}" + chmod 644 "${logrotateConfFile}" # root r/w, all others read + # AMD SMI logging rotation, rotates files using root user/group + # Hourly logrotation check + # Only rotates if size grew larger than 1MB + # Max of 4 rotation files, oldest will be removed + # Rotated files get a date extension, e.g. AMD-SMI-lib.log.2023-05-09_16:51:42 + cat << EOF > "${logrotateConfFile}" +${logFile} { + su root root + hourly + missingok + notifempty + rotate 4 + size 1M + copytruncate + dateext + dateformat .%%Y-%%m-%%d_%H:%%M:%%S +} +EOF + # Fix for "% S argument not found": percent signs are written doubled (%%) above + # because the RPM build treated single ones as macros; collapse them back here + # https://gitlab.kitware.com/cmake/cmake/-/issues/22965 + # https://rpm-software-management.github.io/rpm/manual/spec.html + sed -i s/%%/%/g "${logrotateConfFile}" + # workaround: remove extra 'OURCE' text + # from amd_smi.conf. Unsure if CMAKE, + # bash, or here document + # issue (only seen on RHEL 8.7) + sed -i s/OURCE//g "${logrotateConfFile}" + fi + # check if logrotate uses systemd timers; Ubuntu and other modern OSes do. + # Several older OSes, like RHEL 8.7, do not and instead default + # to daily cron jobs - see https://stackoverflow.com/a/69465677 + if [ -d /run/systemd/system ]; then + systemctl list-timers | grep -iq logrotate + if [ $? -eq 0 ]; then + IS_SYSTEMD=1 + fi + fi + if [ "$IS_SYSTEMD" -eq 1 ]; then + # Configure systemd timers - the typical setup for modern Linux logrotation setups + if [ -f /lib/systemd/system/logrotate.timer ]; then + if [ !
-f /lib/systemd/system/logrotate.timer.backup ]; then + cp /lib/systemd/system/logrotate.timer /lib/systemd/system/logrotate.timer.backup + fi + cat << EOF > /lib/systemd/system/logrotate.timer +[Unit] +Description=Hourly rotation of log files +Documentation=man:logrotate(8) man:logrotate.conf(5) + +[Timer] +OnCalendar= +OnCalendar=hourly +AccuracySec=1m +Persistent=true + +[Install] +WantedBy=timers.target +EOF + systemctl reenable --now logrotate.timer + else + echo "[WARNING] Could not configure systemd timer for $packageName's logrotate."\ + "$packageName logs (when turned on) will not rotate properly." + fi + else + # $IS_SYSTEMD -eq 0 + if [ -f /etc/cron.daily/logrotate ]; then + # move logrotate daily to hourly + if [ -d /etc/cron.hourly ]; then + mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate + fi + fi + fi +} + +do_ldconfig() { + # left-hand term originates from ENABLE_LDCONFIG = ON/OFF at package build + if [ "@ENABLE_LDCONFIG@" == "ON" ]; then + echo @CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@ > /etc/ld.so.conf.d/x86_64-libamd_smi_lib.conf + ldconfig + fi +} + +do_install_amdsmi_python_lib() { + # get python version + local python3_minor_version + python3_minor_version=$(python3 -c 'import sys;print(sys.version_info.minor)') + if [ $? -ne 0 ]; then + echo "[WARNING] Could not determine python version. "\ + "AMD-SMI python library will not be installed." + return + fi + + # check if python version is supported + if [ "$python3_minor_version" -lt 6 ]; then + echo "[WARNING] AMD-SMI python library is not "\ + "supported on python version 3.$python3_minor_version. "\ + "AMD-SMI python library will not be installed." 
+ return + fi + + local PREVIOUS_PIP_ROOT_USER_ACTION="$PIP_ROOT_USER_ACTION" + export PIP_ROOT_USER_ACTION=ignore + # python3.11 requires --break-system-packages + local PREVIOUS_PIP_BREAK_SYSTEM_PACKAGES="$PIP_BREAK_SYSTEM_PACKAGES" + export PIP_BREAK_SYSTEM_PACKAGES=1 + + # Remove old python library + local amdsmi_pip_list_output + amdsmi_pip_list_output=$(python3 -m pip list --format=columns --disable-pip-version-check) + # check pip list output for amdsmi + if [[ $amdsmi_pip_list_output == *"amdsmi"* ]]; then + echo "Detected old AMD-SMI python library (amdsmi)..." + python3 -m pip uninstall amdsmi --yes --quiet --disable-pip-version-check + echo "Removed old AMD-SMI python library (amdsmi)..." + fi + + # static builds don't include python lib + if [ "@BUILD_SHARED_LIBS@" != "ON" ]; then + return + fi + + check_and_install_amdsmi() { + local setuptools_version + setuptools_version=$(python3 -c 'import setuptools; print(setuptools.__version__)') + if [ $? -ne 0 ]; then + echo "[WARNING] Could not determine setuptools version. "\ + "AMD-SMI python library will not be installed." + return + fi + + # install python library at @CPACK_PACKAGING_INSTALL_PREFIX@/@SHARE_INSTALL_PREFIX@/amdsmi + local python_lib_path=@CPACK_PACKAGING_INSTALL_PREFIX@/@SHARE_INSTALL_PREFIX@ + local amdsmi_python_lib_path="$python_lib_path" + local amdsmi_setup_py_path="$python_lib_path/setup.py" + + # Decide installation method based on setuptools version + if [[ "$(printf '%s\n' "$setuptools_version" "28.5" | sort -V | head -n1)" == "$setuptools_version" ]]; then + echo "[WARNING] Setuptools version is less than 28.5. AMD-SMI will not be installed." 
+ elif [[ "$(printf '%s\n' "$setuptools_version" "41.0.1" | sort -V | head -n1)" != "41.0.1" ]]; then + echo "Using setup.py for installation due to setuptools version $setuptools_version" + python3 "$amdsmi_setup_py_path" install + else + echo "Using pyproject.toml for installation due to setuptools version $setuptools_version" + python3 -m pip install "$amdsmi_python_lib_path" --quiet --disable-pip-version-check --no-build-isolation --no-index + fi +} + + # Call the function + check_and_install_amdsmi + + export PIP_ROOT_USER_ACTION="$PREVIOUS_PIP_ROOT_USER_ACTION" + export PIP_BREAK_SYSTEM_PACKAGES="$PREVIOUS_PIP_BREAK_SYSTEM_PACKAGES" + + # only try to activate argcomplete if such command exists + # python3-argcomplete is recommended but optional, we handle its absence gracefully + if command -v activate-global-python-argcomplete &>/dev/null; then + activate-global-python-argcomplete 2>/dev/null || { + echo "[INFO] Bash completion activation skipped. You can manually enable it with: activate-global-python-argcomplete" + } + else + # try older argcomplete3 version + if command -v activate-global-python-argcomplete3 &>/dev/null; then + activate-global-python-argcomplete3 2>/dev/null || { + echo "[INFO] Bash completion activation skipped. You can manually enable it with: activate-global-python-argcomplete3" + } + else + echo "[WARNING] Could not find argcomplete activation command. "\ + "Argument completion will not work. Install python3-argcomplete package to enable it." + fi + fi +} + + +case "$1" in + ( configure ) + do_install_amdsmi_python_lib + do_ldconfig + do_configureLogrotate || exit 0 + ;; + ( abort-upgrade | abort-remove | abort-deconfigure ) + echo "$1" + ;; + ( * ) + exit 0 + ;; +esac diff --git a/projects/amdsmi/DEBIAN/prerm.in b/projects/amdsmi/DEBIAN/prerm.in new file mode 100755 index 0000000000..71b86fa961 --- /dev/null +++ b/projects/amdsmi/DEBIAN/prerm.in @@ -0,0 +1,136 @@ +#!/bin/bash +# +# Copyright (C) Advanced Micro Devices. 
All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +echo "Removing AMDSMI LIB Packages..." 
+ +# Other prerm actions +rm_ldconfig() { + # left-hand term originates from ENABLE_LDCONFIG = ON/OFF at package build + if [ "@ENABLE_LDCONFIG@" == "ON" ]; then + rm -f /etc/ld.so.conf.d/x86_64-libamd_smi_lib.conf + ldconfig + fi +} + + +rm_leftovers() { + # remove pyc files generated by python + rm -rf "@CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBEXECDIR@/amdsmi_cli/__pycache__" + rm -rf "@CPACK_PACKAGING_INSTALL_PREFIX@/@SHARE_INSTALL_PREFIX@/amdsmi/__pycache__" + + # remove build and egg files + rm -rf "@CPACK_PACKAGING_INSTALL_PREFIX@/@SHARE_INSTALL_PREFIX@/amdsmi.egg-info" + rm -rf "@CPACK_PACKAGING_INSTALL_PREFIX@/@SHARE_INSTALL_PREFIX@/build" + + # remove leftover doc files; the glob stays outside the quotes so the shell actually expands it + for leftover_doc in "@CPACK_PACKAGING_INSTALL_PREFIX@/@SHARE_INSTALL_PREFIX@/../doc/"amd_smi*; do + rm -rf "$leftover_doc" + done +} + +rm_logFolder() { + rm -rf /var/log/amd_smi_lib +} + + +rm_rocm_tests_dir(){ + if [ -d "@CPACK_PACKAGING_INSTALL_PREFIX@/share/amd_smi/tests/" ]; then + rm -rf "@CPACK_PACKAGING_INSTALL_PREFIX@/share/amd_smi/tests/" + echo "Removed ROCm tests directory." + fi +} + + +return_logrotateToOrigConfig() { + local logrotateConfFile=/etc/logrotate.d/amd_smi.conf + if [ -f "$logrotateConfFile" ]; then + rm -rf "$logrotateConfFile" + fi + if [ -f /etc/cron.hourly/logrotate ]; then + mv /etc/cron.hourly/logrotate /etc/cron.daily/logrotate + fi + if [ -f /lib/systemd/system/logrotate.timer.backup ]; then + cp /lib/systemd/system/logrotate.timer.backup /lib/systemd/system/logrotate.timer + rm -rf /lib/systemd/system/logrotate.timer.backup + systemctl reenable --now logrotate.timer + fi +} + +rm_python_lib() { + # get python version + local python3_minor_version + python3_minor_version=$(python3 -c 'import sys;print(sys.version_info.minor)') + if [ $? -ne 0 ]; then + echo "[WARNING] Could not determine python version. "\ + "AMD-SMI python library will not be uninstalled."
+ return + fi + + # check if python version is supported + if [ "$python3_minor_version" -lt 6 ]; then + echo "[WARNING] AMD-SMI python library is not supported on python version 3.$python3_minor_version. "\ + "AMD-SMI python library will not be uninstalled." + return + fi + + # Remove old python library + local pip_list_output + pip_list_output=$(python3 -m pip list --format=columns --disable-pip-version-check) + # check pip list output for amdsmi + if [[ $pip_list_output == *"amdsmi"* ]]; then + PIP_ROOT_USER_ACTION=ignore PIP_BREAK_SYSTEM_PACKAGES=1 python3 -m pip uninstall amdsmi --yes --quiet --disable-pip-version-check + fi + + pip_list_output=$(python3 -m pip list --format=columns --disable-pip-version-check) + # check pip list output for amdsmi + if [[ $pip_list_output == *"amdsmi"* ]]; then + echo "[WARNING] AMD-SMI python library (amdsmi) is still installed in pip. "\ + "Check post install to ensure version is correct" + else + echo "Removed AMD-SMI python library (amdsmi)..." + fi +} + + +case "$1" in + ( remove | upgrade) + # remove old gpuv-smi symlink + rm -f @CPACK_PACKAGING_INSTALL_PREFIX@/bin/gpuv-smi &> /dev/null + echo "Removing AMDSMI Lib Packages..." 
+ rm_python_lib + echo "python library removed" + rm_ldconfig + echo "ldconfig removed" + rm_leftovers + echo "leftovers removed" + rm_logFolder + echo "log folder removed" + rm_rocm_tests_dir + echo "rocm tests directory removed" + return_logrotateToOrigConfig + echo "logrotate configuration restored" + ;; + ( purge ) + ;; + ( * ) + exit 0 + ;; +esac diff --git a/projects/amdsmi/DEBIAN/x86_64-libamd_smi_lib.conf b/projects/amdsmi/DEBIAN/x86_64-libamd_smi_lib.conf new file mode 100644 index 0000000000..a4551d4820 --- /dev/null +++ b/projects/amdsmi/DEBIAN/x86_64-libamd_smi_lib.conf @@ -0,0 +1 @@ +/opt/rocm/lib diff --git a/projects/amdsmi/Dockerfile b/projects/amdsmi/Dockerfile new file mode 100644 index 0000000000..6dbd6354a6 --- /dev/null +++ b/projects/amdsmi/Dockerfile @@ -0,0 +1,39 @@ +# Use rocm/dev-ubuntu-22.04 as the base image +FROM rocm/dev-ubuntu-22.04 + +# Set environment variables for build directories and package patterns +ENV BUILD_FOLDER=/home/amdsmi/build +ENV DEB_BUILD="amd-smi-lib*99999-local_amd64.deb" +ENV DEB_BUILD_TEST="amd-smi-lib-tests*99999-local_amd64.deb" + +# Set the working directory to /home +WORKDIR /home + +# Install necessary system packages +RUN apt update && apt-get install -y git build-essential rpm pkg-config g++ python3 python3-pip python3-wheel python3-setuptools + +# Upgrade pip and install cmake and virtualenv using pip +RUN python3 -m pip install --upgrade pip setuptools && \ + python3 -m pip install cmake virtualenv + +# Clone the AMD SMI repository from GitHub +RUN git clone -b amd-mainline https://github.com/ROCm/amdsmi.git + +# Navigate to the amdsmi directory +WORKDIR /home/amdsmi + +# Build and Install AMDSMI +RUN rm -rf ${BUILD_FOLDER} && \ + mkdir -p ${BUILD_FOLDER} && \ + cd ${BUILD_FOLDER} && \ + cmake .. 
-DBUILD_TESTS=ON -DENABLE_ESMI_LIB=ON && \ + make -j $(nproc) VERBOSE=1 && \ + make package && \ + sudo apt install -y --allow-downgrades ${BUILD_FOLDER}/${DEB_BUILD} && \ + sudo ln -s /opt/rocm/bin/amd-smi /usr/local/bin + +# Verify the installation of Python packages related to AMD SMI +RUN python3 -m pip list | grep -E "amd|pip|setuptools" + +# Set the entrypoint to bash for interactive use +ENTRYPOINT ["/bin/bash"] diff --git a/projects/amdsmi/LICENSE b/projects/amdsmi/LICENSE new file mode 100644 index 0000000000..3fe95c8e97 --- /dev/null +++ b/projects/amdsmi/LICENSE @@ -0,0 +1,19 @@ +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
diff --git a/projects/amdsmi/README.md b/projects/amdsmi/README.md new file mode 100644 index 0000000000..098c44c13e --- /dev/null +++ b/projects/amdsmi/README.md @@ -0,0 +1,221 @@ +# AMD System Management Interface (AMD SMI) library + +The AMD System Management Interface (AMD SMI) library offers a unified tool for managing and monitoring GPUs, +particularly in high-performance computing environments. It provides a user-space interface that allows applications to +control GPU operations, monitor performance, and retrieve information about the system's drivers and GPUs. + +For information on available features, installation steps, API reference material, and helpful tips, refer to the online +documentation at [rocm.docs.amd.com/projects/amdsmi](https://rocm.docs.amd.com/projects/amdsmi/en/latest/) + +>[!NOTE] +>This project is a successor to [rocm_smi_lib](https://github.com/ROCm/rocm_smi_lib) +>and [esmi_ib_library](https://github.com/amd/esmi_ib_library). +>This project is applicable to Linux Baremetal and Linux VM(Guest). To use AMD SMI for Virtualization, please refer to [AMD-SMI Virtualization](https://github.com/amd/MxGPU-Virtualization/tree/mainline/smi-lib). + +## Supported platforms + +The AMD SMI library supports Linux bare metal and Linux virtual machine guest +for AMD GPUs and AMD EPYC™ CPUs via +[esmi_ib_library](https://github.com/amd/esmi_ib_library). + +AMD SMI library can run on AMD ROCm supported platforms, refer to +[System requirements (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html) +for more information. + +## Installation + +* [Install the AMD SMI library and CLI tool](https://rocm.docs.amd.com/projects/amdsmi/en/latest/install/install.html) + +## Requirements + +The following are required to install and use the AMD SMI library through its language interfaces and CLI. + +* `amdgpu` driver must be loaded for [`amdsmi_init()`](./docs/how-to/amdsmi-cpp-lib#hello-amd-smi) to work. 
Refer to the [Instinct documentation](https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/install/detailed-install/prerequisites.html) for installation instructions. +* Export `LD_LIBRARY_PATH` to the `amdsmi` installation directory. + + ```bash + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib:/opt/rocm/lib64 + ``` + +### Python interface and CLI tool prerequisites + +* Python 3.6.8+ (64-bit) + +### Note: No module named more_itertools warning on Azure Linux 3 +During the driver installation process on Azure Linux 3, you might encounter the `ModuleNotFoundError: No module named 'more_itertools'` warning. This warning is a result of the reintroduction of `python3-wheel` and `python3-setuptools` dependencies in the CMake of AMD SMI, which requires `more_itertools` to build these Python libraries. This issue will be fixed in a future ROCm release. As a workaround, use the following command before installation: +``` +sudo python3 -m pip install more_itertools +``` + +### Go API prerequisites + +* Go version 1.20 or greater + +## AMD SMI basic usage + +### C++ library + +For developers focused on performance monitoring, system diagnostics, or resource management, the AMD SMI C++ library +offers a powerful and versatile tool to unlock the full capabilities of AMD hardware. + +Refer to the [user guide](https://rocm.docs.amd.com/projects/amdsmi/en/latest/how-to/amdsmi-cpp-lib.html) and the +detailed [C++ API reference](https://rocm.docs.amd.com/projects/amdsmi/en/latest/reference/amdsmi-cpp-api.html) in the +ROCm documentation portal. + +### Python library + +The AMD SMI Python interface provides an easy-to-use +[API](https://rocm.docs.amd.com/projects/amdsmi/en/latest/reference/amdsmi-py-lib.html) for interacting with AMD +hardware. It simplifies tasks like monitoring and controlling GPU operations, allowing for rapid development. 
+ +Refer to the [user guide](https://rocm.docs.amd.com/projects/amdsmi/en/latest/how-to/amdsmi-py-lib.html) and the +detailed [Python API reference](https://rocm.docs.amd.com/projects/amdsmi/en/latest/reference/amdsmi-py-api.html) in the +ROCm documentation portal. + +### Go library + +The AMD SMI Go interface provides a simple +[API](https://rocm.docs.amd.com/projects/amdsmi/en/latest/reference/amdsmi-go-lib.html) +for AMD hardware management. It streamlines hardware monitoring and control +while leveraging Golang's features. + +Refer to the [user guide](https://rocm.docs.amd.com/projects/amdsmi/en/latest/how-to/amdsmi-go-lib.html) and the +[Go API reference](https://rocm.docs.amd.com/projects/amdsmi/en/latest/reference/amdsmi-go-api.html) in the +ROCm documentation portal. + +### CLI tool + +A versatile command line tool for managing and monitoring AMD hardware. You can use `amd-smi` for: + +- Device information: Quickly retrieve detailed information about AMD GPUs + +- Performance monitoring: Real-time monitoring of GPU utilization, memory, temperature, and power consumption + +- Process information: Identify which processes are using GPUs + +- Configuration management: Adjust GPU settings like clock speeds and power limits + +- Error reporting: Monitor and report GPU errors for proactive maintenance + +Check out +[Getting to Know Your GPU: A Deep Dive into AMD SMI -- ROCm Blogs](https://rocm.blogs.amd.com/software-tools-optimization/amd-smi-overview/README.html) +for a rundown. + +### Docker container configuration + +To ensure proper functionality of AMD SMI within a Docker container, the +following configuration options must be included. These settings are +particularly important for managing memory partitions, as partitioning depends +on loading and unloading kernel drivers. 
+ +- `--cap-add=SYS_MODULE` + +- `-v /lib/modules:/lib/modules` + +See [Using AMD SMI in a Docker +container](https://rocm.docs.amd.com/projects/amdsmi/en/latest/how-to/setup-docker-container.html) +for more information. + +## Building AMD SMI + +This section describes the prerequisites and steps to build AMD SMI from source. + +### Required software + +To build the AMD SMI library, the following components are required. Note that the software versions specified were used +during development; earlier versions are not guaranteed to work. + +* CMake (v3.20.0 or later) -- `python3 -m pip install cmake` +* g++ (v5.4.0 or later) +* libdrm-dev (for Ubuntu and Debian) +* libdrm-devel (for RPM-based distributions) + +In order to build the AMD SMI Python package, the following components are required: + +* Python (3.6.8 or later) +* virtualenv -- `python3 -m pip install virtualenv` + +### Build steps + +1. Clone the AMD SMI repository to your local Linux machine. + + ```shell + git clone https://github.com/ROCm/amdsmi.git + ``` + +2. The default installation location for the library and headers is `/opt/rocm`. Before installation, any old ROCm + directories should be deleted: + + * `/opt/rocm` + * `/opt/rocm-` + +3. Build the library by following the typical CMake build sequence (run as root user or use `sudo` before `make install` + command); for instance: + + ```bash + mkdir -p build + cd build + cmake .. + make -j $(nproc) + make install + ``` + + The built library is located in the `build/` directory. To build the `rpm` and `deb` packages use the following + command: + + ```bash + make package + ``` + +### Rebuild the Python wrapper + +The Python wrapper for the AMD SMI library is found in the [auto-generated file](#py_lib_fs) +`py-interface/amdsmi_wrapper.py`. It is essential to regenerate this wrapper whenever there are changes to the C++ API. +It is not regenerated automatically. + +To regenerate the wrapper, use the following command. 
+ +```shell +./update_wrapper.sh +``` + +After this command, the file in `py-interface/amdsmi_wrapper.py` will be updated +on compile. + +>[!NOTE] +>You need Docker installed on your system to regenerate the Python wrapper. + +### Build the tests + +To verify the build and capabilities of AMD SMI on your system, as well as to see practical examples of its usage, you +can build and run the available [tests in the repository](https://github.com/ROCm/amdsmi/tree/amd-staging/tests). Follow +these steps to build the tests: + +```bash +mkdir -p build +cd build +cmake -DBUILD_TESTS=ON .. +make -j $(nproc) +``` + +#### Run the tests + +Once the tests are [built](#build-the-tests), you can run them by executing the `amdsmitst` program. The executable can +be found at `build/tests/amd_smi_test/`. + +### Build the docs + +To build the documentation, follow the instructions at +[Building documentation](https://rocm.docs.amd.com/en/latest/contribute/building.html). + +## DISCLAIMER + +The information contained herein is for informational purposes only, and is subject to change without notice. In +addition, any stated support is planned and is also subject to change. While every precaution has been taken in the +preparation of this document, it may contain technical inaccuracies, omissions and typographical errors, and AMD is +under no obligation to update or otherwise correct this information. Advanced Micro Devices, Inc. makes no +representations or warranties with respect to the accuracy or completeness of the contents of this document, and assumes +no liability of any kind, including the implied warranties of noninfringement, merchantability or fitness for particular +purposes, with respect to the operation or use of AMD hardware, software or other products described herein. + +© 2023-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
diff --git a/projects/amdsmi/RPM/post.in b/projects/amdsmi/RPM/post.in new file mode 100755 index 0000000000..86cac35a16 --- /dev/null +++ b/projects/amdsmi/RPM/post.in @@ -0,0 +1,205 @@ +#!/bin/bash + +do_configureLogrotate() { + local IS_SYSTEMD=0 + local packageName="amd-smi-lib" + local logPath=/var/log/amd_smi_lib + local logFile="${logPath}/AMD-SMI-lib.log" + local logrotateConfFile=/etc/logrotate.d/amd_smi.conf + + mkdir -p "${logPath}" + touch "${logFile}" + chmod -R a+rw "${logPath}" + chmod a+rw "${logFile}" + + if ! command -v logrotate &>/dev/null; then + echo "[WARNING] Detected logrotate is not installed."\ + "$packageName logs (when turned on) will not rotate properly." + return + fi + + if [ ! -f $logrotateConfFile ]; then + touch "${logrotateConfFile}" + chmod 644 "${logrotateConfFile}" # root r/w, all others read + # AMD SMI logging rotation, rotates files using root user/group + # Hourly logrotation check + # Only rotates if size grew larger than 1MB + # Max of 4 rotation files, oldest will be removed + # Rotated files use date extention of ex. AMD-SMI-lib.log.2023-05-09_16:51:42 + cat << EOF > "${logrotateConfFile}" +${logFile} { + su root root + hourly + missingok + notifempty + rotate 4 + size 1M + copytruncate + dateext + dateformat .%%Y-%%m-%%d_%H:%%M:%%S +} +EOF + # Fix for % S argument not found (now we escape with %%) + # issue was RPM build thought we were using macros + # https://gitlab.kitware.com/cmake/cmake/-/issues/22965 + # https://rpm-software-management.github.io/rpm/manual/spec.html + sed -i s/%%/%/g "${logrotateConfFile}" + # workaround: remove extra 'OURCE' text + # from amd_smi.conf. Unsure if CMAKE, + # bash, or here document + # issue (only seen on RHEL 8.7) + sed -i s/OURCE//g "${logrotateConfFile}" + fi + # check if logrotate uses system timers, Ubuntu/modern OS's do + # Several older OS's like RHEL 8.7, do not. 
Instead defaults + # to use daily cron jobs - see https://stackoverflow.com/a/69465677 + if [ -d /run/systemd/system ]; then + systemctl list-timers | grep -iq logrotate + if [ $? -eq 0 ]; then + IS_SYSTEMD=1 + fi + fi + if [ "$IS_SYSTEMD" -eq 1 ]; then + # Configure systemd timers - the typical setup for modern Linux logrotation setups + if [ -f /lib/systemd/system/logrotate.timer ]; then + if [ ! -f /lib/systemd/system/logrotate.timer.backup ]; then + cp /lib/systemd/system/logrotate.timer /lib/systemd/system/logrotate.timer.backup + fi + cat << EOF > /lib/systemd/system/logrotate.timer +[Unit] +Description=Hourly rotation of log files +Documentation=man:logrotate(8) man:logrotate.conf(5) + +[Timer] +OnCalendar= +OnCalendar=hourly +AccuracySec=1m +Persistent=true + +[Install] +WantedBy=timers.target +EOF + systemctl reenable --now logrotate.timer + else + echo "[WARNING] Could not configure systemd timer for $packageName's logrotate."\ + "$packageName logs (when turned on) will not rotate properly." + fi + else + # $IS_SYSTEMD -eq 0 + if [ -f /etc/cron.daily/logrotate ]; then + # move logrotate daily to hourly + if [ -d /etc/cron.hourly ]; then + mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate + fi + fi + fi +} + +do_ldconfig() { + # left-hand term originates from ENABLE_LDCONFIG = ON/OFF at package build + if [ "@ENABLE_LDCONFIG@" == "ON" ]; then + echo $RPM_INSTALL_PREFIX0/@CMAKE_INSTALL_LIBDIR@ > /etc/ld.so.conf.d/x86_64-libamd_smi_lib.conf + ldconfig + fi +} + +do_install_amdsmi_python_lib() { + # get python version + local python3_minor_version + python3_minor_version=$(python3 -c 'import sys;print(sys.version_info.minor)') + if [ $? -ne 0 ]; then + echo "[WARNING] Could not determine python version. "\ + "AMD-SMI python library will not be installed." 
+ return + fi + + # check if python version is supported + if [ "$python3_minor_version" -lt 6 ]; then + echo "[WARNING] AMD-SMI python library is not "\ + "supported on python version 3.$python3_minor_version. "\ + "AMD-SMI python library will not be installed." + return + fi + + local PREVIOUS_PIP_ROOT_USER_ACTION="$PIP_ROOT_USER_ACTION" + export PIP_ROOT_USER_ACTION=ignore + # python3.11 requires --break-system-packages + local PREVIOUS_PIP_BREAK_SYSTEM_PACKAGES="$PIP_BREAK_SYSTEM_PACKAGES" + export PIP_BREAK_SYSTEM_PACKAGES=1 + + + # Remove old python library + local amdsmi_pip_list_output + amdsmi_pip_list_output=$(python3 -m pip list --format=columns --disable-pip-version-check) + # check pip list output for amdsmi + if [[ $amdsmi_pip_list_output == *"amdsmi"* ]]; then + echo "Detected old AMD-SMI python library (amdsmi)..." + python3 -m pip uninstall amdsmi --yes --quiet --disable-pip-version-check + echo "Removed old AMD-SMI python library (amdsmi)..." + fi + + # static builds don't include python lib + if [ "@BUILD_SHARED_LIBS@" != "ON" ]; then + return + fi + + check_and_install_amdsmi() { + local setuptools_version + setuptools_version=$(python3 -c 'import setuptools; print(setuptools.__version__)') + if [ $? -ne 0 ]; then + echo "[WARNING] Could not determine setuptools version. "\ + "AMD-SMI python library will not be installed." + return + fi + + # install python library at $RPM_INSTALL_PREFIX0/@SHARE_INSTALL_PREFIX@/amdsmi + local python_lib_path=$RPM_INSTALL_PREFIX0/@SHARE_INSTALL_PREFIX@ + local amdsmi_python_lib_path="$python_lib_path" + local amdsmi_setup_py_path="$python_lib_path/setup.py" + + # Decide installation method based on setuptools version + if [[ "$(printf '%s\n' "$setuptools_version" "28.5" | sort -V | head -n1)" == "$setuptools_version" ]]; then + echo "[WARNING] Setuptools version is less than 28.5. AMD-SMI will not be installed." 
+ elif [[ "$(printf '%s\n' "$setuptools_version" "41.0.1" | sort -V | head -n1)" != "41.0.1" ]]; then + echo "Using setup.py for installation due to setuptools version $setuptools_version" + cd $amdsmi_python_lib_path + python3 setup.py install + cd - + else + echo "Using pyproject.toml for installation due to setuptools version $setuptools_version" + python3 -m pip install "$amdsmi_python_lib_path" --quiet --disable-pip-version-check --no-build-isolation --no-index + fi +} + + # Call the function + check_and_install_amdsmi + + export PIP_ROOT_USER_ACTION="$PREVIOUS_PIP_ROOT_USER_ACTION" + export PIP_BREAK_SYSTEM_PACKAGES="$PREVIOUS_PIP_BREAK_SYSTEM_PACKAGES" + + # only try to activate argcomplete if such command exists + # python3-argcomplete is recommended but optional, we handle its absence gracefully + if command -v activate-global-python-argcomplete &>/dev/null; then + activate-global-python-argcomplete 2>/dev/null || { + echo "[INFO] Bash completion activation skipped. You can manually enable it with: activate-global-python-argcomplete" + } + else + # try older argcomplete3 version + if command -v activate-global-python-argcomplete3 &>/dev/null; then + activate-global-python-argcomplete3 2>/dev/null || { + echo "[INFO] Bash completion activation skipped. You can manually enable it with: activate-global-python-argcomplete3" + } + else + echo "[WARNING] Could not find argcomplete activation command. "\ + "Argument completion will not work. Install python3-argcomplete package to enable it." 
+ fi + fi +} + + +# post install or upgrade, $i is 1 or 2 -> do these actions +if [ "$1" -ge 1 ]; then + do_install_amdsmi_python_lib + do_ldconfig + do_configureLogrotate || exit 0 +fi diff --git a/projects/amdsmi/RPM/postun.in b/projects/amdsmi/RPM/postun.in new file mode 100755 index 0000000000..143ce178e2 --- /dev/null +++ b/projects/amdsmi/RPM/postun.in @@ -0,0 +1,8 @@ +#!/bin/bash + +# second term originates from ENABLE_LDCONFIG = ON/OFF at package build +if [ "$1" -le 1 ] && [ "@ENABLE_LDCONFIG@" == "ON" ]; then + # perform the below actions for rpm remove($1=0) or upgrade($1=1) operations + rm -f /etc/ld.so.conf.d/x86_64-libamd_smi_lib.conf + ldconfig +fi diff --git a/projects/amdsmi/RPM/preun.in b/projects/amdsmi/RPM/preun.in new file mode 100755 index 0000000000..f24c9cff2d --- /dev/null +++ b/projects/amdsmi/RPM/preun.in @@ -0,0 +1,95 @@ +#!/bin/bash + +rm_leftovers() { + # remove pyc files generated by python + rm -rf "$RPM_INSTALL_PREFIX0/@CMAKE_INSTALL_LIBEXECDIR@/amdsmi_cli/__pycache__" + rm -rf "$RPM_INSTALL_PREFIX0/@SHARE_INSTALL_PREFIX@/amdsmi/__pycache__" + + # remove build and egg files + rm -rf "$RPM_INSTALL_PREFIX0/@SHARE_INSTALL_PREFIX@/amdsmi.egg-info" + rm -rf "$RPM_INSTALL_PREFIX0/@SHARE_INSTALL_PREFIX@/build" + + # remove dist files (only applies to old setuptools versions like on RHEL8) + rm -rf "$RPM_INSTALL_PREFIX0/@SHARE_INSTALL_PREFIX@/dist" + + # remove leftover doc files + if test -e "$RPM_INSTALL_PREFIX0/@SHARE_INSTALL_PREFIX@/../doc/amd_smi*"; then + rm -rf "$RPM_INSTALL_PREFIX0/@SHARE_INSTALL_PREFIX@/../doc/amd_smi*" + fi +} + + +rm_logFolder() { + rm -rf /var/log/amd_smi_lib +} + + +rm_rocm_tests_dir(){ + if [ -d "$RPM_INSTALL_PREFIX0/share/amd_smi/tests/" ]; then + rm -rf "$RPM_INSTALL_PREFIX0/share/amd_smi/tests/" + echo "Removed ROCm tests directory." 
+ fi +} + + +return_logrotateToOrigConfig() { + local logrotateConfFile=/etc/logrotate.d/amd_smi.conf + if [ -f $logrotateConfFile ]; then + rm -rf "$logrotateConfFile" + fi + if [ -f /etc/cron.hourly/logrotate ]; then + mv /etc/cron.hourly/logrotate /etc/cron.daily/logrotate + fi + if [ -f /lib/systemd/system/logrotate.timer.backup ]; then + cp /lib/systemd/system/logrotate.timer.backup /lib/systemd/system/logrotate.timer + rm -rf /lib/systemd/system/logrotate.timer.backup + systemctl reenable --now logrotate.timer + fi +} + +rm_python_lib() { + # get python version + local python3_minor_version + python3_minor_version=$(python3 -c 'import sys;print(sys.version_info.minor)') + if [ $? -ne 0 ]; then + echo "[WARNING] Could not determine python version. "\ + "AMD-SMI python library will not be uninstalled." + return + fi + + # check if python version is supported + if [ "$python3_minor_version" -lt 6 ]; then + echo "[WARNING] AMD-SMI python library is not supported on python version 3.$python3_minor_version. "\ + "AMD-SMI python library will not be uninstalled." + return + fi + + # Remove old python library + local pip_list_output + pip_list_output=$(python3 -m pip list --format=columns --disable-pip-version-check) + # check pip list output for amdsmi + if [[ $pip_list_output == *"amdsmi"* ]]; then + PIP_ROOT_USER_ACTION=ignore PIP_BREAK_SYSTEM_PACKAGES=1 python3 -m pip uninstall amdsmi --yes --quiet --disable-pip-version-check + fi + + pip_list_output=$(python3 -m pip list --format=columns --disable-pip-version-check) + # check pip list output for amdsmi + if [[ $pip_list_output == *"amdsmi"* ]]; then + echo "[WARNING] AMD-SMI python library (amdsmi) is still installed in pip. "\ + "Check post install to ensure version is correct" + else + echo "Removed AMD-SMI python library (amdsmi)..." 
+ fi +} + + +if [ "$1" -le 1 ]; then + # perform the below actions for rpm remove($1=0) or upgrade($1=1) operations + # remove old gpuv-smi symlink + rm -f $RPM_INSTALL_PREFIX0/bin/gpuv-smi &> /dev/null + rm_python_lib + rm_leftovers + rm_logFolder + rm_rocm_tests_dir + return_logrotateToOrigConfig +fi diff --git a/projects/amdsmi/amd_smi-config.cmake.in b/projects/amdsmi/amd_smi-config.cmake.in new file mode 100644 index 0000000000..ba12697633 --- /dev/null +++ b/projects/amdsmi/amd_smi-config.cmake.in @@ -0,0 +1,27 @@ +# - Config file for the amd_smi package +# It defines the following variables +# AMD_SMI_INCLUDE_DIRS - include directories for amd_smi +# AMD_SMI_LIBRARIES - libraries to link against + +# Compute paths +@PACKAGE_INIT@ +get_filename_component(AMD_SMI_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) + +set_and_check(amd_smi_INCLUDE_DIR "@PACKAGE_CMAKE_INSTALL_INCLUDEDIR@") +set_and_check(AMD_SMI_INCLUDE_DIR "@PACKAGE_CMAKE_INSTALL_INCLUDEDIR@") +set_and_check(AMD_SMI_INCLUDE_DIRS "@PACKAGE_CMAKE_INSTALL_INCLUDEDIR@") +set_and_check(amd_smi_LIB_DIR "@PACKAGE_CMAKE_INSTALL_LIBDIR@") +set_and_check(AMD_SMI_LIB_DIR "@PACKAGE_CMAKE_INSTALL_LIBDIR@") +set_and_check(AMD_SMI_LIB_DIRS "@PACKAGE_CMAKE_INSTALL_LIBDIR@") + +# Our library dependencies (contains definitions for IMPORTED targets) +if(NOT TARGET amd_smi AND NOT amd_smi_BINARY_DIR) + include("${AMD_SMI_CMAKE_DIR}/amd_smiTargets.cmake") +endif() + +# These are IMPORTED targets created by AmdSmiTargets.cmake +# TODO: Need to check if OAM libraries are needed here! +set(AMD_SMI_LIBRARIES amd_smi) +set(AMD_SMI_LIBRARY amd_smi) + +check_required_components(amd_smi) diff --git a/projects/amdsmi/amdsmi_cli/BDF.py b/projects/amdsmi/amdsmi_cli/BDF.py new file mode 100644 index 0000000000..871f20916a --- /dev/null +++ b/projects/amdsmi/amdsmi_cli/BDF.py @@ -0,0 +1,128 @@ +# Copyright (C) Advanced Micro Devices. All rights reserved. 
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

import logging
import re


class BDF():
    """PCI Bus:Device.Function (BDF) address with pythonic casting and comparison.

    Validates a BDF given as a string and converts it to an object. The string
    form is ``[SSSS:]BB:DD.F`` with hexadecimal fields; either ``:`` or ``.``
    is accepted as a separator and the text may be wrapped as ``BDF(...)``.
    Another BDF object can also be passed, in which case its fields are copied.

    Instances compare equal to equivalent BDF objects and BDF strings, hash by
    their canonical string, concatenate with strings and iterate over their
    four fields.

    Attributes:
        segment: PCI segment (domain), 0-65535; 0 when the string omits it.
        bus: PCI bus number, 0-255.
        device: PCI device number, 0-31.
        function: PCI function number, 0-7.
    """

    class BDFError(ValueError):
        """BDF Class Error.

        Derives from ValueError so callers that already catch the ValueError
        raised by a failed hexadecimal conversion keep working.
        """

    # (label used in error messages, largest accepted value), in field order.
    _FIELD_LIMITS = (("Segment", 65535), ("Bus", 255), ("Device", 31), ("Function", 7))

    def __init__(self, bdf):
        """Init a BDF object from a BDF string or from another BDF object.

        Args:
            bdf: A BDF instance to copy, or a string holding three
                (bus, device, function) or four (segment first) hex fields.

        Raises:
            BDFError: If a field is not hexadecimal, the number of fields is
                not 3 or 4, or a field is outside its valid range.
        """
        if isinstance(bdf, BDF):
            self.segment, self.bus, self.device, self.function = tuple(bdf)
            return

        # Accept the repr() form, e.g. "BDF(0000:01:00.0)".
        if bdf.startswith("BDF("):
            bdf = bdf.replace('BDF(', '').replace(')', '')

        try:
            bdf_components = [int(x, 16) for x in re.split('[:.]', bdf)]
        except ValueError as e:
            # int() reports a bad literal with ValueError, never BDFError.
            logging.error(f"Invalid string passed: {bdf}")
            raise self.BDFError(f"Invalid BDF string: {bdf}") from e

        if len(bdf_components) not in (3, 4):
            raise self.BDFError(
                f"BDF must have 3 or 4 fields, got {len(bdf_components)}: {bdf}")

        self.segment = bdf_components[0] if len(bdf_components) == 4 else 0
        self.bus, self.device, self.function = bdf_components[-3:]

        for (label, limit), value in zip(self._FIELD_LIMITS, self):
            if value < 0:
                raise self.BDFError(f"{label} can't be negative")
            if value > limit:
                raise self.BDFError(f"{label} can't be greater than {limit}")

    def __eq__(self, passed_bdf):
        """Overrides the == operator and allows for BDF objects to be compared to BDF strings"""
        # Only accept strings and BDF objects
        if isinstance(passed_bdf, str):
            if passed_bdf == '':
                return False
            passed_bdf = BDF(passed_bdf)
        elif not isinstance(passed_bdf, BDF):
            return False

        return tuple(self) == tuple(passed_bdf)

    def __ne__(self, passed_bdf):
        """Overrides the != operator and allows for BDF objects to be compared to BDF strings"""
        # == is already overridden, so != is simply its negation
        return not self == passed_bdf

    def __add__(self, passed_bdf):
        """Overrides the + operator and allows for string concatenation"""
        return str(self) + passed_bdf

    def __radd__(self, passed_bdf):
        """Overrides the reflected + operator and allows for string concatenation"""
        return passed_bdf + str(self)

    def __str__(self):
        """Cast BDF object to its canonical string, e.g. 0000:01:00.0"""
        # The function field is separated by '.', as in standard PCI notation.
        return "{:04X}:{:02X}:{:02X}.{}".format(self.segment, self.bus, self.device, self.function)

    def __repr__(self):
        """How the BDF object is represented"""
        return f"BDF({self})"

    def __hash__(self):
        """Allow the BDF object to be hashable"""
        return hash(str(self))

    def __iter__(self):
        """Make the BDF object iterable over its 4 values"""
        yield from (self.segment, self.bus, self.device, self.function)

    def __contains__(self, passed_bdf):
        """Overrides the 'in' operator: True when passed_bdf names this same BDF.

        Raises:
            BDFError: If passed_bdf is a string that is not a valid BDF.
        """
        # Normalising through BDF() validates the input; equality does the rest.
        return self == BDF(passed_bdf)
copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/__init__.py ${PY_PACKAGE_DIR}/ + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/amdsmi_cli.py ${PY_PACKAGE_DIR}/ + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/amdsmi_commands.py ${PY_PACKAGE_DIR}/ + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/amdsmi_helpers.py ${PY_PACKAGE_DIR}/ + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/amdsmi_init.py ${PY_PACKAGE_DIR}/ + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/amdsmi_logger.py ${PY_PACKAGE_DIR}/ + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/amdsmi_parser.py ${PY_PACKAGE_DIR}/ + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/amdsmi_cli_exceptions.py ${PY_PACKAGE_DIR}/ + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/BDF.py ${PY_PACKAGE_DIR}/ + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/README.md ${PY_PACKAGE_DIR}/ + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/Release_Notes.md ${PY_PACKAGE_DIR}/) + +# The CLI requires the python amdsmi wrapper to be installed +add_custom_target( + amdsmi_cli ALL + DEPENDS python_package + ${PY_PACKAGE_DIR}/__init__.py + ${PY_PACKAGE_DIR}/_version.py + ${PY_PACKAGE_DIR}/amdsmi_cli.py + ${PY_PACKAGE_DIR}/amdsmi_commands.py + ${PY_PACKAGE_DIR}/amdsmi_helpers.py + ${PY_PACKAGE_DIR}/amdsmi_init.py + ${PY_PACKAGE_DIR}/amdsmi_logger.py + ${PY_PACKAGE_DIR}/amdsmi_parser.py + ${PY_PACKAGE_DIR}/amdsmi_cli_exceptions.py + ${PY_PACKAGE_DIR}/BDF.py + ${PY_PACKAGE_DIR}/README.md + ${PY_PACKAGE_DIR}/Release_Notes.md) + +install( + DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/${PY_PACKAGE_DIR} + DESTINATION ${PY_CLI_INSTALL_DIR} + COMPONENT dev) + +install( + PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${PY_PACKAGE_DIR}/amdsmi_cli.py + DESTINATION ${PY_CLI_INSTALL_DIR}/${PY_PACKAGE_DIR} + 
COMPONENT dev) + +file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin) + +# symlink amdsmi_cli.py to amd-smi +add_custom_target( + link_amdsmi_cli ALL + DEPENDS amdsmi_cli + BYPRODUCTS ${CMAKE_CURRENT_BINARY_DIR}/bin/amd-smi + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + COMMAND ${CMAKE_COMMAND} -E create_symlink ../${PY_CLI_INSTALL_DIR}/${PY_PACKAGE_DIR}/amdsmi_cli.py + ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_INSTALL_BINDIR}/amd-smi) + +install( + FILES ${CMAKE_CURRENT_BINARY_DIR}/bin/amd-smi + DESTINATION ${CMAKE_INSTALL_BINDIR} + COMPONENT dev) diff --git a/projects/amdsmi/amdsmi_cli/README.md b/projects/amdsmi/amdsmi_cli/README.md new file mode 100644 index 0000000000..4487835c5e --- /dev/null +++ b/projects/amdsmi/amdsmi_cli/README.md @@ -0,0 +1,28 @@ +# AMD SMI CLI tool + +A command line tool for manipulating and monitoring the `amdgpu` kernel; +`amd-smi` is intended to replace and deprecate the existing +[`rocm-smi`](https://github.com/rocm/rocm_smi_lib) CLI tool. + +When using the CLI tool, you should have at least one AMD GPU and the driver +installed. + +>[!NOTE] +>The AMD SMI CLI tool is provided as an example code to aid the development of +>telemetry tools. The Python or C++ library is recommended as a robust data +>source. + +Find the documentation in the `docs/` directory. + +- [Install AMD SMI](../docs/install/install.md) +- [About the tool and how to get started](../docs/how-to/amdsmi-cli-tool.md) + +## Online documentation + +Explore the latest documentation on the [ROCm documentation +portal](https://rocm.docs.amd.com/projects/en/latest/index.html). + +- [Install AMD SMI](https://rocm.docs.amd.com/projects/amdsmi/en/latest/install/install.html) + +- [CLI tool usage](https://rocm.docs.amd.com/projects/amdsmi/en/latest/how-to/amdsmi-cli-tool.html). 
+ diff --git a/projects/amdsmi/amdsmi_cli/Release_Notes.md b/projects/amdsmi/amdsmi_cli/Release_Notes.md new file mode 100644 index 0000000000..ce0dca0faa --- /dev/null +++ b/projects/amdsmi/amdsmi_cli/Release_Notes.md @@ -0,0 +1,52 @@ +# Release Notes + +## Documentation + +Documentation for AMDSMI-CLI is available post install in /opt/rocm/libexec/amdsmi_cli/README.md + +## AMDSMI-CLI 23.3.1.0 + +- not all ecc fields are currently supported +- RHEL 8 & SLES 15 may have extra install steps + +## AMDSMI-CLI 23.0.1.1 + +### Known Issues + +- not all ecc fields are currently supported +- RHEL 8 & SLES 15 have extra install steps + +## AMDSMI-CLI 23.0.1.0 + +### Known Issues + +- not all ecc fields are currently supported +- RHEL 8 & SLES 15 have extra install steps + +## AMDSMI-CLI 23.0.0.4 + +### Added + +- AMDSMI-CLI tool enabled for Linux Baremetal & Guest +- Added CSV & Watch modifier +- Added topology subcommand + +### Known Issues + +- not all ecc fields are currently supported +- RHEL 8 & SLES 15 have extra install steps + +## AMDSMI-CLI 0.0.2 + +### Added + +- AMDSMI-CLI tool enabled for Linux Baremetal & Guest + +### Known Issues + +- ecc & ras subcommands will report N/A even if RAS is enabled +- process vram_mem's unit is listed as percentage vs bytes +- csv modifier does not work +- topology information is not yet enabled +- watch modifier not fully enabled +- limited guest support diff --git a/projects/amdsmi/amdsmi_cli/__init__.py b/projects/amdsmi/amdsmi_cli/__init__.py new file mode 100644 index 0000000000..6a5649080e --- /dev/null +++ b/projects/amdsmi/amdsmi_cli/__init__.py @@ -0,0 +1 @@ +from _version import __version__ diff --git a/projects/amdsmi/amdsmi_cli/_version.py.in b/projects/amdsmi/amdsmi_cli/_version.py.in new file mode 100644 index 0000000000..a8869a0165 --- /dev/null +++ b/projects/amdsmi/amdsmi_cli/_version.py.in @@ -0,0 +1 @@ +__version__ = "@amd_smi_lib_VERSION_STRING@" diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_cli.py
b/projects/amdsmi/amdsmi_cli/amdsmi_cli.py new file mode 100755 index 0000000000..1c43d48c8c --- /dev/null +++ b/projects/amdsmi/amdsmi_cli/amdsmi_cli.py @@ -0,0 +1,224 @@ +#!/usr/bin/env python3 +# PYTHON_ARGCOMPLETE_OK +# +# Copyright (C) Advanced Micro Devices. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +import logging +import sys +import os + +try: + import argcomplete +except ImportError as e: + logging.debug(f"Unhandled import error: {e}") + logging.debug("argcomplete module not found. 
Autocomplete will not work.") + +# from typing import TYPE_CHECKING +# # only used for type checking +# # pyright trips up and cannot find amdsmi scripts without it +# if TYPE_CHECKING: +# from amdsmi_commands import AMDSMICommands +# from amdsmi_parser import AMDSMIParser +# from amdsmi_logger import AMDSMILogger +# import amdsmi_cli_exceptions +# from amdsmi import amdsmi_interface +# from amdsmi import amdsmi_exception + +# Set the environment variable for GPU metrics cache duration +gpu_metrics_cache_ms = os.environ.setdefault("AMDSMI_GPU_METRICS_CACHE_MS", "100") +logging.debug("AMDSMI_GPU_METRICS_CACHE_MS = %sms", gpu_metrics_cache_ms) + +# Set the environment variable for ASIC cache duration +asic_info_cache_ms = os.environ.setdefault("AMDSMI_ASIC_INFO_CACHE_MS", "10000") # 10 seconds +logging.debug("AMDSMI_ASIC_INFO_CACHE_MS = %sms", asic_info_cache_ms) + +try: + from amdsmi_init import * + from amdsmi_helpers import AMDSMIHelpers + from amdsmi_commands import AMDSMICommands + from amdsmi_parser import AMDSMIParser + from amdsmi_logger import AMDSMILogger + import amdsmi_cli_exceptions +except ImportError: + current_path = os.path.dirname(os.path.abspath(__file__)) + cli_files_path = f"{current_path}/../libexec/amdsmi_cli" + sys.path.append(cli_files_path) + try: + from amdsmi_init import * + from amdsmi_helpers import AMDSMIHelpers + from amdsmi_commands import AMDSMICommands + from amdsmi_parser import AMDSMIParser + from amdsmi_logger import AMDSMILogger + import amdsmi_cli_exceptions + except ImportError as e: + print(f"Unhandled import error: {e}") + print(f"Unable to import amdsmi_cli files. Check {cli_files_path} if they are present.") + sys.exit(1) + +def _print_error(e, destination): + if destination in ['stdout', 'json', 'csv']: + print(e) + else: + f = open(destination, "w", encoding="utf-8") + f.write(e) + f.close() + print("Error occurred. 
Result written to " + str(destination) + " file") + +def configure_logging_and_execute(args, amd_smi_commands): + """ + Configures logging based on the provided arguments and executes the subcommand. + + Args: + args: Parsed command-line arguments. + amd_smi_commands: Instance of AMDSMICommands. + """ + # Remove previous log handlers + for handler in logging.root.handlers[:]: + logging.root.removeHandler(handler) + + # To enable debug logs in AMD SMI library: + # set RSMI_LOGGING = 1 for logging to files + # set RSMI_LOGGING = 2 for logging to stdout + # set RSMI_LOGGING = 3 for logging to stdout and files + # set RSMI_LOGGING = 0 to disable logging + # Files will be located in /var/log/amd_smi_lib/AMD-SMI-lib.log* + + # log string with the following format: + # loglevel | YYYY-MM-DD HH:MM:SS.ms | filename:line | message + logging_dict = { + 'DEBUG': logging.DEBUG, + 'INFO': logging.INFO, + 'WARNING': logging.WARNING, + 'ERROR': logging.ERROR, + 'CRITICAL': logging.CRITICAL + } + + time = '%(asctime)s.%(msecs)03d' + datefmt = '%Y-%m-%d %H:%M:%S' + logging.basicConfig(format='%(levelname)s | ' + time + ' | %(filename)s:%(lineno)d | %(message)s', + level=logging_dict[args.loglevel], datefmt=datefmt) + + # Disable traceback for non-debug log levels + if args.loglevel == "DEBUG": + sys.tracebacklimit = 10 + else: + sys.tracebacklimit = -1 + + logging.debug(args) + + # Execute subcommands + try: + args.func(args) + except amdsmi_cli_exceptions.AmdSmiException as e: + _print_error(str(e), amd_smi_commands.logger.destination) + except amdsmi_exception.AmdSmiLibraryException as e: + exc = amdsmi_cli_exceptions.AmdSmiLibraryErrorException(amd_smi_commands.logger.format, e.get_error_code()) + _print_error(str(exc), amd_smi_commands.logger.destination) + + +if __name__ == "__main__": + # Disable traceback before possible init errors in AMDSMICommands and AMDSMIParser + copy_argv = str(sys.argv.copy()).upper() + if "DEBUG" in copy_argv: + sys.tracebacklimit = 10 + else: + 
sys.tracebacklimit = -1 + + amd_smi_helpers = AMDSMIHelpers() + amd_smi_commands = AMDSMICommands(helpers=amd_smi_helpers) + amd_smi_parser = AMDSMIParser(amd_smi_commands.version, + amd_smi_commands.list, + amd_smi_commands.static, + amd_smi_commands.firmware, + amd_smi_commands.bad_pages, + amd_smi_commands.metric, + amd_smi_commands.process, + amd_smi_commands.profile, + amd_smi_commands.event, + amd_smi_commands.topology, + amd_smi_commands.set_value, + amd_smi_commands.reset, + amd_smi_commands.monitor, + amd_smi_commands.xgmi, + amd_smi_commands.partition, + amd_smi_commands.ras, + amd_smi_commands.node, + amd_smi_commands.default, + sys_argv=sys.argv, + helpers=amd_smi_helpers) + try: + argcomplete.autocomplete(amd_smi_parser) + except NameError: + logging.debug("argcomplete module not found. Autocomplete will not work.") + + # Store possible subcommands & aliases for later errors + valid_commands = amd_smi_parser.possible_commands + valid_commands += ['--help', '-h'] + + # Convert arguments to lowercase, but preserve case for folder path values + processed_argv = [] + # Arguments that should preserve case + case_sensitive_args = ['--folder', '--file', '--gpu', '--cpu', '--core', '--profile', '--cper-file'] + case_sensitive_prefixes = ['--folder=', '--file=', '--gpu=', '--cpu=', '--core=', '--profile=', '--cper-file='] + + preserve_case_for_next = False + for i, arg in enumerate(sys.argv): + if preserve_case_for_next: + # Preserve case for the next argument value + processed_argv.append(arg) + preserve_case_for_next = False + elif arg in case_sensitive_args: + # Convert flag to lowercase but preserve next value + processed_argv.append(arg.lower()) + preserve_case_for_next = True + elif any(arg.startswith(prefix) for prefix in case_sensitive_prefixes): + # Handle --arg=value format, preserve case for the value part + for prefix in case_sensitive_prefixes: + if arg.startswith(prefix): + flag = prefix.rstrip('=') + value = arg[len(prefix):] + 
processed_argv.append(flag.lower() + '=' + value) + break + elif arg.startswith('--') or not arg.startswith('-'): + # Convert other long options and positional arguments to lowercase + processed_argv.append(arg.lower()) + else: + # Preserve case for short options + processed_argv.append(arg) + sys.argv = processed_argv + + if len(sys.argv) == 1: + args = amd_smi_parser.parse_args(args=['default']) + elif sys.tracebacklimit == 10 and (sys.argv[1] == '--loglevel'): + args = amd_smi_parser.parse_args(args=['default', '--loglevel'] + sys.argv[2:]) + elif sys.argv[1] in valid_commands: + args = amd_smi_parser.parse_args(args=None) + else: + raise amdsmi_cli_exceptions.AmdSmiInvalidSubcommandException(sys.argv[1],amd_smi_commands.logger.destination) + + # Handle command modifiers before subcommand execution + # human readable is the default output format + if hasattr(args, 'json') and args.json: + amd_smi_commands.logger.format = amd_smi_commands.logger.LoggerFormat.json.value + if hasattr(args, 'csv') and args.csv: + amd_smi_commands.logger.format = amd_smi_commands.logger.LoggerFormat.csv.value + if hasattr(args, 'file') and args.file: + amd_smi_commands.logger.destination = args.file + configure_logging_and_execute(args, amd_smi_commands) diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_cli_exceptions.py b/projects/amdsmi/amdsmi_cli/amdsmi_cli_exceptions.py new file mode 100644 index 0000000000..a6f4aebaf4 --- /dev/null +++ b/projects/amdsmi/amdsmi_cli/amdsmi_cli_exceptions.py @@ -0,0 +1,309 @@ +#!/usr/bin/env python3 +# +# Copyright (C) Advanced Micro Devices. All rights reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +import json + + +AMDSMI_ERROR_MESSAGES = { + 0: "Sucess", + 1: "Invalid parameters", + 2: "Command not supported", + 3: "Command not yet implemented", + 4: "Failed load module", + 5: "Failed load symbole", + 6: "Drm error", + 7: "API call failed", + 8: "Timeout in API call", + 9: "Retry operation", + 10: "Permission Denied", + 11: "Interrupt ocurred during execution", + 12: "I/O Error", + 13: "Address fault", + 14: "Error opening file", + 15: "Not enough memory", + 16: "Internal error", + 17: "Out of bounds", + 18: "Initialization error", + 19: "Internal reference counter exceeded", + # Reserved for future error messages + 30: "Device busy", + 31: "Device Not found", + 32: "Device not initialized", + 33: "No more free slot", + 34: "Driver not loaded", + # Reserved for future error messages + 40: "No data was found for given input", + 41: "Insufficient size for operation", + 42: "Unexpected size of data was read", + 43: "The data read or provided was unexpected", + 44: "System has different cpu than AMD", + 45: "Energy driver not found", + 46: "MSR driver not found", + 47: "HSMP driver not found", + 48: "HSMP not supported", + 49: "HSMP message/feature not supported", + 50: "HSMP message timed out", + 51: "No Energy and HSMP driver present", + 52: "File or directory not found", + 53: "Parsed argument is invalid", + 54: "AMDGPU restart error", + 55: "Setting is not available", + 0xFFFFFFFE: "AMD-SMI Library error did not map to a status code", + 0xFFFFFFFF: "Unknown error" +} + +def _get_error_message(error_code): + if abs(error_code) in AMDSMI_ERROR_MESSAGES: + return AMDSMI_ERROR_MESSAGES[abs(error_code)] + return "Generic error" + + +class AmdSmiException(Exception): + def __init__(self): + self.json_message = {} + self.csv_message = '' + self.stdout_message = '' + self.message = '' + self.output_format = '' + self.device_type = '' + + def __str__(self): + # Return message according to the current output format + if self.output_format == 'json': + self.message 
= json.dumps(self.json_message) + elif self.output_format == 'csv': + self.message = self.csv_message + else: + self.message = self.stdout_message + + return self.message + + +class AmdSmiInvalidCommandException(AmdSmiException): + def __init__(self, command, outputformat: str, message=None): + super().__init__() + self.value = -1 + self.command = command + self.output_format = outputformat + + common_message = f"Command '{self.command}' is invalid. Run 'amd-smi -h' for more info." + + if message: + common_message = message + + self.json_message["error"] = common_message + self.json_message["code"] = self.value + self.csv_message = f"error,code\n{common_message}, {self.value}" + self.stdout_message = f"{common_message} Error code: {self.value}" + + +class AmdSmiInvalidParameterException(AmdSmiException): + def __init__(self, command, arg, outputformat: str): + super().__init__() + self.value = -2 + self.command = command + self.arg = arg + self.output_format = outputformat + + common_message = f"Parameter '{self.arg}' is invalid. Run 'amd-smi {self.command} -h' for more info." 
+ + self.json_message["error"] = common_message + self.json_message["code"] = self.value + self.csv_message = f"error,code\n{common_message}, {self.value}" + self.stdout_message = f"{common_message} Error code: {self.value}" + + +class AmdSmiDeviceNotFoundException(AmdSmiException): + def __init__(self, command, outputformat: str, gpu: bool, cpu: bool, core: bool): + super().__init__() + self.value = -3 + self.command = command + self.output_format = outputformat + + # Handle different devices + self.device_type = "" + if gpu: + self.device_type = "GPU" + elif cpu: + self.device_type = "CPU" + elif core: + self.device_type = "CPU CORE" + + common_message = f"Can not find a device: {self.device_type} '{self.command}'" + + self.json_message["error"] = common_message + self.json_message["code"] = self.value + self.csv_message = f"error,code\n{common_message}, {self.value}" + self.stdout_message = f"{common_message} Error code: {self.value}" + + +class AmdSmiInvalidFilePathException(AmdSmiException): + def __init__(self, command, outputformat: str, message=None): + super().__init__() + self.value = -4 + self.command = command + self.output_format = outputformat + + common_message = f"Path '{self.command}' cannot be found." + + if message: + common_message = message + + self.json_message["error"] = common_message + self.json_message["code"] = self.value + self.csv_message = f"error,code\n{common_message}, {self.value}" + self.stdout_message = f"{common_message} Error code: {self.value}" + + +class AmdSmiInvalidParameterValueException(AmdSmiException): + def __init__(self, command, arg, outputformat: str): + super().__init__() + self.value = -5 + self.command = command + self.arg = arg + self.output_format = outputformat + + common_message = f"Value '{self.arg}' is not of valid type or format. Run 'amd-smi {self.command} -h' for more info." 
+ + self.json_message["error"] = common_message + self.json_message["code"] = self.value + self.csv_message = f"error,code\n{common_message}, {self.value}" + self.stdout_message = f"{common_message} Error code: {self.value}" + + +class AmdSmiMissingParameterValueException(AmdSmiException): + def __init__(self, command, outputformat: str): + super().__init__() + self.value = -6 + self.command = command + self.output_format = outputformat + + common_message = f"Parameter '{self.command}' requires a value. Run '--help' for more info." + + self.json_message["error"] = common_message + self.json_message["code"] = self.value + self.csv_message = f"error,code\n{common_message}, {self.value}" + self.stdout_message = f"{common_message} Error code: {self.value}" + + +class AmdSmiCommandNotSupportedException(AmdSmiException): + def __init__(self, command, outputformat: str): + super().__init__() + self.value = -7 + self.command = command + self.output_format = outputformat + + common_message = f"Command '{self.command}' is not supported on the system. Run '--help' for more info." + + self.json_message["error"] = common_message + self.json_message["code"] = self.value + self.csv_message = f"error,code\n{common_message}, {self.value}" + self.stdout_message = f"{common_message} Error code: {self.value}" + + +class AmdSmiParameterNotSupportedException(AmdSmiException): + def __init__(self, command, outputformat: str): + super().__init__() + self.value = -8 + self.command = command + self.output_format = outputformat + + common_message = f"Parameter '{self.command}' is not supported on the system. Run '--help' for more info." 
+ + self.json_message["error"] = common_message + self.json_message["code"] = self.value + self.csv_message = f"error,code\n{common_message}, {self.value}" + self.stdout_message = f"{common_message} Error code: {self.value}" + + +class AmdSmiRequiredCommandException(AmdSmiException): + def __init__(self, command, outputformat: str): + super().__init__() + self.value = -9 + self.command = command + self.output_format = outputformat + + common_message = f"Command '{self.command}' requires a target argument. Run 'amd-smi {self.command} -h' for more info." + + self.json_message["error"] = common_message + self.json_message["code"] = self.value + self.csv_message = f"error,code\n{common_message}, {self.value}" + self.stdout_message = f"{common_message} Error code: {self.value}" + + +class AmdSmiInvalidSubcommandException(AmdSmiException): + def __init__(self, command, outputformat: str): + super().__init__() + self.value = -10 + self.command = command + self.output_format = outputformat + + common_message = f"AMD-SMI Command '{self.command}' is invalid. Must receive valid AMD-SMI Command first. Run 'amd-smi -h' for more info." 
+ + self.json_message["error"] = common_message + self.json_message["code"] = self.value + self.csv_message = f"error,code\n{common_message}, {self.value}" + self.stdout_message = f"{common_message} Error code: {self.value}" + + +class AmdSmiPermissionDeniedException(AmdSmiException): + def __init__(self, command, outputformat: str): + super().__init__() + self.value = -11 + self.command = command + self.output_format = outputformat + + common_message = f"AMD-SMI Command '{self.command}' requires elevation (sudo privileges required)" + + self.json_message["error"] = common_message + self.json_message["code"] = self.value + self.csv_message = f"error,code\n{common_message}, {self.value}" + self.stdout_message = f"{common_message} Error code: {self.value}" + + +class AmdSmiUnknownErrorException(AmdSmiException): + def __init__(self, command, outputformat: str): + super().__init__() + self.value = -100 + self.command = command + self.output_format = outputformat + + common_message = "An unknown error has occurred. Run 'help' for more info." 
+ + self.json_message["error"] = common_message + self.json_message["code"] = self.value + self.csv_message = f"error,code\n{common_message}, {self.value}" + self.stdout_message = f"{common_message} Error code: {self.value}" + + +class AmdSmiLibraryErrorException(AmdSmiException): + def __init__(self, outputformat: str, error_code): + super().__init__() + self.value = -1000 - abs(error_code) + self.smilibcode = error_code + self.output_format = outputformat + + common_message = f"AMDSMI has returned error '{self.value}' - '{AMDSMI_ERROR_MESSAGES[abs(self.smilibcode)]}'" + + self.json_message["error"] = common_message + self.json_message["code"] = self.value + self.csv_message = f"error,code\n{common_message}, {self.value}" + self.stdout_message = f"{common_message} Error code: {self.value}" diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py new file mode 100644 index 0000000000..9669683891 --- /dev/null +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -0,0 +1,7598 @@ +#!/usr/bin/env python3 +# +# Copyright (C) Advanced Micro Devices. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +import argparse +import json +import logging +import multiprocessing +import os +import signal +import sys +import threading +import time +import copy + +from _version import __version__ +from amdsmi_cli_exceptions import AmdSmiInvalidParameterException, AmdSmiRequiredCommandException, AmdSmiInvalidCommandException +from amdsmi_helpers import AMDSMIHelpers +from amdsmi_logger import AMDSMILogger +from amdsmi import amdsmi_exception, amdsmi_interface +from pathlib import Path + +class AMDSMICommands(): + """This class contains all the commands corresponding to AMDSMIParser + Each command function will interact with AMDSMILogger to handle + displaying the output to the specified format and destination. + """ + + def __init__(self, format='human_readable', destination='stdout', helpers=None) -> None: + if helpers is None: + # If helpers is not provided, create a new instance + self.helpers = AMDSMIHelpers() + else: + self.helpers = helpers + self.logger = AMDSMILogger(format=format, destination=destination, helpers=self.helpers) + self.device_handles = [] + self.cpu_handles = [] + self.core_handles = [] + self.node_handle = None + self.stop = '' + self.group_check_printed = False + + amdsmi_init_flag = self.helpers.get_amdsmi_init_flag() + logging.debug(f"AMDSMI Init Flag: {amdsmi_init_flag}") + exit_flag = False + + if self.helpers.is_amdgpu_initialized(): + try: + self.device_handles = amdsmi_interface.amdsmi_get_processor_handles() + except amdsmi_exception.AmdSmiLibraryException as e: + if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT, + amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED): + logging.error('Unable to get devices, driver not initialized (amdgpu 
not found in modules)') + else: + raise e + + if len(self.device_handles) == 0: + # No GPU's found post amdgpu driver initialization + logging.error('Unable to detect any GPU devices, check amdgpu version and module status (sudo modprobe amdgpu)') + exit_flag = True + + # Resolve the node handle. + for dev in self.device_handles: + try: + nh = amdsmi_interface.amdsmi_get_node_handle(dev) + if nh is not None: + self.node_handle = nh + continue + except amdsmi_exception.AmdSmiLibraryException as e: + if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED, + amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_INVAL): + logging.debug("Unable to get node handle: %s", e.get_error_info()) + else: + raise e + + if self.helpers.is_amd_hsmp_initialized(): + try: + self.cpu_handles = amdsmi_interface.amdsmi_get_cpusocket_handles() + except amdsmi_exception.AmdSmiLibraryException as e: + if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT, + amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_DRV): + logging.info('Unable to detect any CPU devices, check amd_hsmp (or) hsmp_acpi version and module status (sudo modprobe amd_hsmp (or) sudo modprobe hsmp_acpi)') + else: + raise e + + # core handles + try: + self.core_handles = amdsmi_interface.amdsmi_get_cpucore_handles() + except amdsmi_exception.AmdSmiLibraryException as e: + if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT, + amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_DRV): + logging.info('Unable to get CORE devices, amd_hsmp driver not loaded (sudo modprobe amd_hsmp)') + else: + raise e + + if len(self.cpu_handles) == 0 and len(self.core_handles) == 0: + # No CPU's found post amd_hsmp driver initialization + logging.error('Unable to detect any CPU devices, check amd_hsmp (or) hsmp_acpi version and module status (sudo modprobe amd_hsmp (or) sudo modprobe hsmp_acpi)') + exit_flag = True + + self.convert_clock_type = { + "sys": amdsmi_interface.AmdSmiClkType.SYS, + 
"mem": amdsmi_interface.AmdSmiClkType.MEM, + "df": amdsmi_interface.AmdSmiClkType.DF, + "soc": amdsmi_interface.AmdSmiClkType.SOC, + "dcef": amdsmi_interface.AmdSmiClkType.DCEF, + # vclk and dclk currently do not support levels so average clk is given for frequency levels + "vclk0": amdsmi_interface.AmdSmiClkType.VCLK0, + "vclk1": amdsmi_interface.AmdSmiClkType.VCLK1, + "dclk0": amdsmi_interface.AmdSmiClkType.DCLK0, + "dclk1": amdsmi_interface.AmdSmiClkType.DCLK1 + } + + if exit_flag: + version_args = argparse.Namespace() + version_args.gpu_version = False + version_args.cpu_version = False + self.version(version_args) + sys.exit(-1) + + + def version(self, args, gpu_version=None, cpu_version=None): + """Print Version String + + Args: + args (Namespace): Namespace containing the parsed CLI args + """ + + if gpu_version: + args.gpu_version = gpu_version + if cpu_version: + args.cpu_version = cpu_version + # if no args are given, display everything + if args.gpu_version is None and args.cpu_version is None: + args.gpu_version = True + args.cpu_version = True + + try: + amdsmi_lib_version = amdsmi_interface.amdsmi_get_lib_version() + amdsmi_lib_version_str = f"{amdsmi_lib_version['major']}.{amdsmi_lib_version['minor']}.{amdsmi_lib_version['release']}" + except amdsmi_exception.AmdSmiLibraryException as e: + amdsmi_lib_version_str = e.get_error_info() + + try: + rocm_lib_status, rocm_version_str = amdsmi_interface.amdsmi_get_rocm_version() + if rocm_lib_status is not True: + rocm_version_str = "N/A" + except amdsmi_exception.AmdSmiLibraryException as e: + rocm_version_str = e.get_error_info() + + self.logger.output['tool'] = 'AMDSMI Tool' + self.logger.output['version'] = f'{__version__}' + self.logger.output['amdsmi_library_version'] = f'{amdsmi_lib_version_str}' + self.logger.output['rocm_version'] = f'{rocm_version_str}' + + if args.gpu_version: + try: + gpus = amdsmi_interface.amdsmi_get_processor_handles() + if isinstance(gpus, list) and len(gpus) > 0: + 
gpu_version_info = amdsmi_interface.amdsmi_get_gpu_driver_info(gpus[0]) + gpu_version_str = gpu_version_info['driver_version'] + else: + gpu_version_str = "N/A" + except amdsmi_exception.AmdSmiLibraryException as e: + gpu_version_str = e.get_error_info() + self.logger.output['amdgpu_version'] = gpu_version_str + if args.cpu_version: + try: + cpus = amdsmi_interface.amdsmi_get_cpusocket_handles() + if isinstance(cpus, list) and len(cpus) > 0: + cpu_version_info = amdsmi_interface.amdsmi_get_cpu_hsmp_driver_version(cpus[0]) + cpu_version_str = str(cpu_version_info['hsmp_driver_major_ver_num']) + "." + str(cpu_version_info['hsmp_driver_minor_ver_num']) + else: + cpu_version_str = "N/A" + except amdsmi_exception.AmdSmiLibraryException as e: + cpu_version_str = e.get_error_info() + self.logger.output['amd_hsmp_driver_version'] = cpu_version_str + + if self.logger.is_human_readable_format(): + human_readable_output = f"AMDSMI Tool: {__version__} | " \ + f"AMDSMI Library version: {amdsmi_lib_version_str} | " \ + f"ROCm version: {rocm_version_str}" + if args.gpu_version: + human_readable_output = human_readable_output + f" | amdgpu version: {gpu_version_str}" + if args.cpu_version: + human_readable_output = human_readable_output + f" | hsmp version: {cpu_version_str}" + # Custom human readable handling for version + if self.logger.destination == 'stdout': + print(human_readable_output) + else: + with self.logger.destination.open('a', encoding="utf-8") as output_file: + output_file.write(human_readable_output + '\n') + elif self.logger.is_json_format() or self.logger.is_csv_format(): + self.logger.print_output() + + + def list(self, args, multiple_devices=False, gpu=None): + """List information for target gpu + + Args: + args (Namespace): Namespace containing the parsed CLI args + multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False. + gpu (device_handle, optional): device_handle for target device. Defaults to None. 
+ + Raises: + IndexError: Index error if gpu list is empty + + Returns: + None: Print output via AMDSMILogger to destination + """ + # Set args.* to passed in arguments + if gpu: + args.gpu = gpu + + # Handle No GPU passed + if args.gpu == None: + args.gpu = self.device_handles + + if not self.group_check_printed: + self.helpers.check_required_groups(check_render=True, check_video=False) + self.group_check_printed = True + + # Handle multiple GPUs + handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.list) + if handled_multiple_gpus: + return # This function is recursive + + args.gpu = device_handle + + # Get gpu_id for logging + gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu) + + # Always try to get BDF regardless of group check + try: + bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(args.gpu) + except amdsmi_exception.AmdSmiLibraryException as e: + bdf = "N/A" + + try: + uuid = amdsmi_interface.amdsmi_get_gpu_device_uuid(args.gpu) + except amdsmi_exception.AmdSmiLibraryException: + uuid = "N/A" + + try: + kfd_info = amdsmi_interface.amdsmi_get_gpu_kfd_info(args.gpu) + kfd_id = kfd_info['kfd_id'] + node_id = kfd_info['node_id'] + partition_id = kfd_info['current_partition_id'] + except amdsmi_exception.AmdSmiLibraryException as e: + kfd_id = node_id = partition_id = "N/A" + logging.debug("Failed to get kfd info for gpu %s | %s", gpu_id, e.get_error_info()) + + # CSV format is intentionally aligned with Host + if self.logger.is_csv_format(): + self.logger.store_output(args.gpu, 'gpu_bdf', bdf) + self.logger.store_output(args.gpu, 'gpu_uuid', uuid) + else: + self.logger.store_output(args.gpu, 'bdf', bdf) + self.logger.store_output(args.gpu, 'uuid', uuid) + + self.logger.store_output(args.gpu, 'kfd_id', kfd_id) + self.logger.store_output(args.gpu, 'node_id', node_id) + self.logger.store_output(args.gpu, 'partition_id', partition_id) + + if args.e: + try: + enumeration_info = 
def static_cpu(self, args, multiple_devices=False, cpu=None, interface_ver=None):
    """Get Static information for target cpu

    Collects the SMU firmware version (``args.smu``) and the HSMP protocol
    version (``args.interface_ver``) of the CPU. When neither option is
    requested, both are enabled. A value is reported as "N/A" whenever the
    library call that provides it raises ``AmdSmiLibraryException``.

    Args:
        args (Namespace): Namespace containing the parsed CLI args. Reads
            ``args.cpu``, ``args.smu`` and ``args.interface_ver``.
        multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False.
        cpu (device_handle, optional): device_handle for target device. Defaults to None.
        interface_ver (bool, optional): Value override for args.interface_ver. Defaults to None.

    Returns:
        None: Print output via AMDSMILogger to destination
    """

    if cpu:
        args.cpu = cpu
    if interface_ver:
        args.interface_ver = interface_ver

    # Store cpu args that are applicable to the current platform
    curr_platform_cpu_args = ("smu", "interface_ver")

    # If no cpu options are passed, return all available args
    if not (args.smu or args.interface_ver):
        for arg in curr_platform_cpu_args:
            setattr(args, arg, True)

    # handle_cpus receives this method as a callback. When it reports that it
    # already handled a multi-CPU selection there is nothing left to do here
    # (the original author notes that this function is recursive).
    handled_multiple_cpus, device_handle = self.helpers.handle_cpus(args,
                                                                    self.logger,
                                                                    self.static_cpu)
    if handled_multiple_cpus:
        return
    args.cpu = device_handle

    # Get cpu id for logging
    cpu_id = self.helpers.get_cpu_id_from_device_handle(args.cpu)
    logging.debug("Static Arg information for CPU %s on %s", cpu_id, self.helpers.os_info())

    json_format = self.logger.is_json_format()

    static_dict = {}
    if json_format:
        static_dict['cpu'] = int(cpu_id)

    if args.smu:
        try:
            smu = amdsmi_interface.amdsmi_get_cpu_smu_fw_version(args.cpu)
            # Rendered as "<major>.<minor>.<debug>"
            static_dict["smu"] = {"FW_VERSION": f"{smu['smu_fw_major_ver_num']}."
                                                f"{smu['smu_fw_minor_ver_num']}."
                                                f"{smu['smu_fw_debug_ver_num']}"}
        except amdsmi_exception.AmdSmiLibraryException as e:
            static_dict["smu"] = "N/A"
            logging.debug("Failed to get SMU FW for cpu %s | %s", cpu_id, e.get_error_info())

    if args.interface_ver:
        try:
            proto_version = amdsmi_interface.amdsmi_get_cpu_hsmp_proto_ver(args.cpu)
        except amdsmi_exception.AmdSmiLibraryException as e:
            proto_version = "N/A"
            logging.debug("Failed to get proto version for cpu %s | %s", cpu_id, e.get_error_info())
        static_dict["interface_version"] = {"proto version": proto_version}

    # JSON results are collected on the logger's store_cpu_json_output;
    # every other format goes through store_cpu_output.
    if json_format:
        self.logger.store_cpu_json_output.append(static_dict)
    else:
        self.logger.store_cpu_output(args.cpu, 'values', static_dict)

    if multiple_devices:
        self.logger.store_multiple_device_output()
        return  # Skip printing when there are multiple devices

    # This method does not print JSON output itself.
    if not json_format:
        self.logger.print_output(multiple_device_enabled=False)
Defaults to None. + vbios (bool, optional): Value override for args.vbios. Defaults to None. + limit (bool, optional): Value override for args.limit. Defaults to None. + driver (bool, optional): Value override for args.driver. Defaults to None. + ras (bool, optional): Value override for args.ras. Defaults to None. + board (bool, optional): Value override for args.board. Defaults to None. + numa (bool, optional): Value override for args.numa. Defaults to None. + vram (bool, optional): Value override for args.vram. Defaults to None. + cache (bool, optional): Value override for args.cache. Defaults to None. + partition (bool, optional): Value override for args.partition. Defaults to None. + dfc_ucode (bool, optional): Value override for args.dfc_ucode. Defaults to None. + fb_info (bool, optional): Value override for args.fb_info. Defaults to None. + num_vf (bool, optional): Value override for args.num_vf. Defaults to None. + soc_pstate (bool, optional): Value override for args.soc_pstate. Defaults to None. + xgmi_plpd (bool, optional): Value override for args.xgmi_plpd. Defaults to None. + process_isolation (bool, optional): Value override for args.process_isolation. Defaults to None. 
+ Returns: + None: Print output via AMDSMILogger to destination + """ + + if gpu: + args.gpu = gpu + if asic: + args.asic = asic + if bus: + args.bus = bus + if vbios: + args.vbios = vbios + if board: + args.board = board + if driver: + args.driver = driver + if ras: + args.ras = ras + if vram: + args.vram = vram + if cache: + args.cache = cache + if process_isolation: + args.process_isolation = process_isolation + if partition: + args.partition = partition + if clock: + args.clock = clock + + # args.clock defaults to False so if it was overwritten to empty list, that indicates that it was given as an arguments but with an empty list + if args.clock == []: + args.clock = True + + # Store args that are applicable to the current platform (default arguments) + current_platform_args = ["asic", "bus", "vbios", "driver", "ras", + "vram", "cache", "board", "process_isolation", + "clock"] + current_platform_values = [args.asic, args.bus, args.vbios, args.driver, args.ras, + args.vram, args.cache, args.board, args.process_isolation, + args.clock] + + # amd-smi static default arguments: + # Exclude args that are not applicable to the current platform, + # but allow output if argument is passed. + # + # Note: Partition is a special case, it is no longer an amd-smi static + # default argument. + # Reason: Reading current_compute_partition may momentarily wake the + # GPU up. This is due to reading XCD registers, which is expected + # behavior. Changing partitions is not a trivial operation, + # current_compute_partition SYSFS controls this action. 
+ if args.partition: + current_platform_args += ["partition"] + current_platform_values += [args.partition] + + if not self.group_check_printed: + self.helpers.check_required_groups(check_render=True, check_video=False) + self.group_check_printed = True + + if self.helpers.is_linux() and self.helpers.is_baremetal(): + if limit: + args.limit = limit + if soc_pstate: + args.soc_pstate = soc_pstate + if xgmi_plpd: + args.xgmi_plpd = xgmi_plpd + current_platform_args += ["ras", "limit", "soc_pstate", "xgmi_plpd"] + current_platform_values += [args.ras, args.limit, args.soc_pstate, args.xgmi_plpd] + + if self.helpers.is_linux() and not self.helpers.is_virtual_os(): + if numa: + args.numa = numa + current_platform_args += ["numa"] + current_platform_values += [args.numa] + + if self.helpers.is_hypervisor(): + if dfc_ucode: + args.dfc_ucode = dfc_ucode + if fb_info: + args.fb_info = fb_info + if num_vf: + args.num_vf = num_vf + current_platform_args += ["dfc_ucode", "fb_info", "num_vf"] + current_platform_values += [args.dfc_ucode, args.fb_info, args.num_vf] + + if not any(current_platform_values): + for arg in current_platform_args: + setattr(args, arg, True) + + handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.static_gpu) + if handled_multiple_gpus: + return # This function is recursive + args.gpu = device_handle + # Get gpu_id for logging + gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu) + + logging.debug("=====================================================================") + logging.debug(f"Static Arg information for GPU {gpu_id} on {self.helpers.os_info()}") + logging.debug(f"Function args: {args}") + logging.debug(f"Current platform args: {current_platform_args}") + logging.debug(f"Current platform values: {current_platform_values}") + logging.debug("=====================================================================") + + # Populate static dictionary for each enabled argument + static_dict = {} + if 
self.logger.is_json_format(): + static_dict['gpu'] = int(gpu_id) + if args.asic: + asic_dict = { + "market_name" : "N/A", + "vendor_id" : "N/A", + "vendor_name" : "N/A", + "subvendor_id" : "N/A", + "device_id" : "N/A", + "subsystem_id" : "N/A", + "rev_id" : "N/A", + "asic_serial" : "N/A", + "oam_id" : "N/A", + "num_compute_units" : "N/A", + "target_graphics_version" : "N/A" + } + + try: + asic_info = amdsmi_interface.amdsmi_get_gpu_asic_info(args.gpu) + for key, value in asic_info.items(): + asic_dict[key] = value + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get asic info for gpu %s | %s", gpu_id, e.get_error_info()) + + static_dict['asic'] = asic_dict + if args.bus: + bus_info = { + 'bdf': "N/A", + 'max_pcie_width': "N/A", + 'max_pcie_speed': "N/A", + 'pcie_levels': "N/A", + 'pcie_interface_version': "N/A", + 'slot_type': "N/A" + } + + try: + bus_info['bdf'] = amdsmi_interface.amdsmi_get_gpu_device_bdf(args.gpu) + except amdsmi_exception.AmdSmiLibraryException as e: + bus_info['bdf'] = "N/A" + logging.debug("Failed to get bdf for gpu %s | %s", gpu_id, e.get_error_info()) + + try: + pcie_static = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)['pcie_static'] + bus_info['max_pcie_width'] = pcie_static['max_pcie_width'] + bus_info['max_pcie_speed'] = pcie_static['max_pcie_speed'] + bus_info['pcie_interface_version'] = pcie_static['pcie_interface_version'] + bus_info['slot_type'] = pcie_static['slot_type'] + if bus_info['max_pcie_speed'] % 1000 != 0: + pcie_speed_GTs_value = round(bus_info['max_pcie_speed'] / 1000, 1) + else: + pcie_speed_GTs_value = round(bus_info['max_pcie_speed'] / 1000) + + bus_info['max_pcie_speed'] = pcie_speed_GTs_value + + if bus_info['pcie_interface_version'] > 0: + bus_info['pcie_interface_version'] = f"Gen {bus_info['pcie_interface_version']}" + + # Set the unit for pcie_speed + pcie_speed_unit ='GT/s' + if self.logger.is_human_readable_format(): + bus_info['max_pcie_speed'] = 
f"{bus_info['max_pcie_speed']} {pcie_speed_unit}" + + if self.logger.is_json_format(): + bus_info['max_pcie_speed'] = {"value" : bus_info['max_pcie_speed'], + "unit" : pcie_speed_unit} + + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get bus info for gpu %s | %s", gpu_id, e.get_error_info()) + + try: + pcie_info = amdsmi_interface.amdsmi_get_gpu_pci_bandwidth(args.gpu) + num_supported = pcie_info['transfer_rate']['num_supported'] + if num_supported != 0: + bus_info['pcie_levels'] = {} + for level in range(0, num_supported): + speed = str(self.helpers.convert_SI_unit(float(pcie_info['transfer_rate']['frequency'][level]), AMDSMIHelpers.SI_Unit.NANO)) + " GT/s" + width = str(pcie_info['lanes'][level]) + level_values = (speed, width) + bus_info['pcie_levels'].update({str(level): level_values}) + else: + bus_info['pcie_levels'] = "N/A" + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get pci bandwidth info for gpu %s | %s", gpu_id, e.get_error_info()) + + static_dict['bus'] = bus_info + if args.vbios: + try: + vbios_info = amdsmi_interface.amdsmi_get_gpu_vbios_info(args.gpu) + for key, value in vbios_info.items(): + if isinstance(value, str): + if value.strip() == '': + vbios_info[key] = "N/A" + static_dict['ifwi'] = vbios_info + # Remove boot_firmware since it's not used + del static_dict['ifwi']['boot_firmware'] + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict['ifwi'] = "N/A" + logging.debug("Failed to get vbios/ifwi info for gpu %s | %s", gpu_id, e.get_error_info()) + if 'limit' in current_platform_args: + if args.limit: + # Power limits + + power_limit_types = {} + for power_type in amdsmi_interface.AmdSmiPowerCapType: + # Strip 'AMDSMI_POWER_CAP_TYPE_' prefix and convert to lowercase + key = power_type.name.replace('AMDSMI_POWER_CAP_TYPE_', '').lower() + power_limit_types[key] = { + "max_power_limit" : "N/A", + "min_power_limit" : "N/A", + "socket_power_limit" : "N/A" + } + + 
try: + power_cap_types = amdsmi_interface.amdsmi_get_supported_power_cap(args.gpu) + for sensor in power_cap_types['sensor_inds']: + power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu, sensor) + max_power_limit = power_cap_info['max_power_cap'] + max_power_limit = self.helpers.convert_SI_unit(max_power_limit, AMDSMIHelpers.SI_Unit.MICRO) + min_power_limit = power_cap_info['min_power_cap'] + min_power_limit = self.helpers.convert_SI_unit(min_power_limit, AMDSMIHelpers.SI_Unit.MICRO) + socket_power_limit = power_cap_info['power_cap'] + socket_power_limit = self.helpers.convert_SI_unit(socket_power_limit, AMDSMIHelpers.SI_Unit.MICRO) + ppt = { + "max_power_limit" : self.helpers.unit_format(self.logger, max_power_limit, 'W'), + "min_power_limit" : self.helpers.unit_format(self.logger, min_power_limit, 'W'), + "socket_power_limit" : self.helpers.unit_format(self.logger, socket_power_limit, 'W') + } + + sensor_name = power_cap_types['sensor_types'][sensor] + # Strip 'AMDSMI_POWER_CAP_TYPE_' prefix and convert to lowercase + sensor_key = sensor_name.name.replace('AMDSMI_POWER_CAP_TYPE_', '').lower() + power_limit_types[sensor_key] = ppt + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get power cap info for gpu %s | %s", gpu_id, e.get_error_info()) + + # Edge temperature limits + try: + slowdown_temp_edge_limit_error = False + slowdown_temp_edge_limit = amdsmi_interface.amdsmi_get_temp_metric(args.gpu, + amdsmi_interface.AmdSmiTemperatureType.EDGE, amdsmi_interface.AmdSmiTemperatureMetric.CRITICAL) + except amdsmi_exception.AmdSmiLibraryException as e: + slowdown_temp_edge_limit_error = True + slowdown_temp_edge_limit = "N/A" + logging.debug("Failed to get edge temperature slowdown metric for gpu %s | %s", gpu_id, e.get_error_info()) + + if slowdown_temp_edge_limit == 0: + slowdown_temp_edge_limit_error = True + slowdown_temp_edge_limit = "N/A" + + try: + shutdown_temp_edge_limit_error = False + shutdown_temp_edge_limit = 
amdsmi_interface.amdsmi_get_temp_metric(args.gpu, + amdsmi_interface.AmdSmiTemperatureType.EDGE, amdsmi_interface.AmdSmiTemperatureMetric.EMERGENCY) + except amdsmi_exception.AmdSmiLibraryException as e: + shutdown_temp_edge_limit_error = True + shutdown_temp_edge_limit = "N/A" + logging.debug("Failed to get edge temperature shutdown metrics for gpu %s | %s", gpu_id, e.get_error_info()) + + if shutdown_temp_edge_limit == 0: + shutdown_temp_edge_limit_error = True + shutdown_temp_edge_limit = "N/A" + + # Hotspot/Junction temperature limits + try: + slowdown_temp_hotspot_limit_error = False + slowdown_temp_hotspot_limit = amdsmi_interface.amdsmi_get_temp_metric(args.gpu, + amdsmi_interface.AmdSmiTemperatureType.HOTSPOT, amdsmi_interface.AmdSmiTemperatureMetric.CRITICAL) + except amdsmi_exception.AmdSmiLibraryException as e: + slowdown_temp_hotspot_limit_error = True + slowdown_temp_hotspot_limit = "N/A" + logging.debug("Failed to get hotspot temperature slowdown metrics for gpu %s | %s", gpu_id, e.get_error_info()) + + try: + shutdown_temp_hotspot_limit_error = False + shutdown_temp_hotspot_limit = amdsmi_interface.amdsmi_get_temp_metric(args.gpu, + amdsmi_interface.AmdSmiTemperatureType.HOTSPOT, amdsmi_interface.AmdSmiTemperatureMetric.EMERGENCY) + except amdsmi_exception.AmdSmiLibraryException as e: + shutdown_temp_hotspot_limit_error = True + shutdown_temp_hotspot_limit = "N/A" + logging.debug("Failed to get hotspot temperature shutdown metrics for gpu %s | %s", gpu_id, e.get_error_info()) + + + # VRAM temperature limits + try: + slowdown_temp_vram_limit_error = False + slowdown_temp_vram_limit = amdsmi_interface.amdsmi_get_temp_metric(args.gpu, + amdsmi_interface.AmdSmiTemperatureType.VRAM, amdsmi_interface.AmdSmiTemperatureMetric.CRITICAL) + except amdsmi_exception.AmdSmiLibraryException as e: + slowdown_temp_vram_limit_error = True + slowdown_temp_vram_limit = "N/A" + logging.debug("Failed to get vram temperature slowdown metrics for gpu %s | %s", gpu_id, 
e.get_error_info()) + + try: + shutdown_temp_vram_limit_error = False + shutdown_temp_vram_limit = amdsmi_interface.amdsmi_get_temp_metric(args.gpu, + amdsmi_interface.AmdSmiTemperatureType.VRAM, amdsmi_interface.AmdSmiTemperatureMetric.EMERGENCY) + except amdsmi_exception.AmdSmiLibraryException as e: + shutdown_temp_vram_limit_error = True + shutdown_temp_vram_limit = "N/A" + logging.debug("Failed to get vram temperature shutdown metrics for gpu %s | %s", gpu_id, e.get_error_info()) + + + # Assign units + power_unit = 'W' + temp_unit_human_readable = '\N{DEGREE SIGN}C' + temp_unit_json = 'C' + + if self.logger.is_human_readable_format(): + if not slowdown_temp_edge_limit_error: + slowdown_temp_edge_limit = f"{slowdown_temp_edge_limit} {temp_unit_human_readable}" + if not slowdown_temp_hotspot_limit_error: + slowdown_temp_hotspot_limit = f"{slowdown_temp_hotspot_limit} {temp_unit_human_readable}" + if not slowdown_temp_vram_limit_error: + slowdown_temp_vram_limit = f"{slowdown_temp_vram_limit} {temp_unit_human_readable}" + if not shutdown_temp_edge_limit_error: + shutdown_temp_edge_limit = f"{shutdown_temp_edge_limit} {temp_unit_human_readable}" + if not shutdown_temp_hotspot_limit_error: + shutdown_temp_hotspot_limit = f"{shutdown_temp_hotspot_limit} {temp_unit_human_readable}" + if not shutdown_temp_vram_limit_error: + shutdown_temp_vram_limit = f"{shutdown_temp_vram_limit} {temp_unit_human_readable}" + + if self.logger.is_json_format(): + if not slowdown_temp_edge_limit_error: + slowdown_temp_edge_limit = {"value" : slowdown_temp_edge_limit, + "unit" : temp_unit_json} + if not slowdown_temp_hotspot_limit_error: + slowdown_temp_hotspot_limit = {"value" : slowdown_temp_hotspot_limit, + "unit" : temp_unit_json} + if not slowdown_temp_vram_limit_error: + slowdown_temp_vram_limit = {"value" : slowdown_temp_vram_limit, + "unit" : temp_unit_json} + if not shutdown_temp_edge_limit_error: + shutdown_temp_edge_limit = {"value" : shutdown_temp_edge_limit, + "unit" : 
temp_unit_json} + if not shutdown_temp_hotspot_limit_error: + shutdown_temp_hotspot_limit = {"value" : shutdown_temp_hotspot_limit, + "unit" : temp_unit_json} + if not shutdown_temp_vram_limit_error: + shutdown_temp_vram_limit = {"value" : shutdown_temp_vram_limit, + "unit" : temp_unit_json} + + limit_info = {} + # Power limits + limit_info['ppt0'] = power_limit_types['ppt0'] + limit_info['ppt1'] = power_limit_types['ppt1'] + + # Shutdown limits + limit_info['slowdown_edge_temperature'] = slowdown_temp_edge_limit + limit_info['slowdown_hotspot_temperature'] = slowdown_temp_hotspot_limit + limit_info['slowdown_vram_temperature'] = slowdown_temp_vram_limit + limit_info['shutdown_edge_temperature'] = shutdown_temp_edge_limit + limit_info['shutdown_hotspot_temperature'] = shutdown_temp_hotspot_limit + limit_info['shutdown_vram_temperature'] = shutdown_temp_vram_limit + static_dict['limit'] = limit_info + if args.driver: + driver_info_dict = {"name" : "N/A", + "version" : "N/A"} + + try: + driver_info = amdsmi_interface.amdsmi_get_gpu_driver_info(args.gpu) + driver_info_dict["name"] = driver_info["driver_name"] + driver_info_dict["version"] = driver_info["driver_version"] + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get driver info for gpu %s | %s", gpu_id, e.get_error_info()) + + static_dict['driver'] = driver_info_dict + if args.board: + static_dict['board'] = {"model_number": "N/A", + "product_serial": "N/A", + "fru_id": "N/A", + "product_name": "N/A", + "manufacturer_name": "N/A"} + try: + board_info = amdsmi_interface.amdsmi_get_gpu_board_info(args.gpu) + for key, value in board_info.items(): + if isinstance(value, str): + if value.strip() == '': + board_info[key] = "N/A" + static_dict['board'] = board_info + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get board info for gpu %s | %s", gpu_id, e.get_error_info()) + if 'ras' in current_platform_args: + if args.ras: + ras_dict = 
{"eeprom_version": "N/A", + "bad_page_threshold": "N/A", + "bad_page_threshold_exceeded": "N/A", + "parity_schema" : "N/A", + "single_bit_schema" : "N/A", + "double_bit_schema" : "N/A", + "poison_schema" : "N/A", + "ecc_block_state": "N/A"} + + try: + ras_info = amdsmi_interface.amdsmi_get_gpu_ras_feature_info(args.gpu) + for key, value in ras_info.items(): + if isinstance(value, int): + if value == 65535: + logging.debug(f"Failed to get ras {key} for gpu {gpu_id}") + ras_info[key] = "N/A" + continue + if key != "eeprom_version": + if value: + ras_info[key] = "ENABLED" + else: + ras_info[key] = "DISABLED" + + ras_dict.update(ras_info) + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get ras info for gpu %s | %s", gpu_id, e.get_error_info()) + try: + ras_dict["bad_page_threshold"] = amdsmi_interface.amdsmi_get_gpu_bad_page_threshold(args.gpu) + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get bad page threshold count for gpu %s | %s", gpu_id, e.get_error_info()) + try: + bad_page_info = amdsmi_interface.amdsmi_get_gpu_bad_page_info(args.gpu) + retired_pages = 0 + if bad_page_info: + for bad_page in bad_page_info: + if bad_page["status"] == amdsmi_interface.AmdSmiMemoryPageStatus.RESERVED: + retired_pages += 1 + # default to N/A + ras_dict["bad_page_threshold_exceeded"] = "N/A" + # If this is an int, then default to False + if isinstance(ras_dict["bad_page_threshold"], int): + ras_dict["bad_page_threshold_exceeded"] = "False" + if retired_pages > ras_dict["bad_page_threshold"]: + # If there are more retired pages then set to True + ras_dict["bad_page_threshold_exceeded"] = "True" + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get retired pages count for gpu %s | %s", gpu_id, e.get_error_info()) + + try: + ras_states = amdsmi_interface.amdsmi_get_gpu_ras_block_features_enabled(args.gpu) + ecc_block_state_dict = {} + for state in ras_states: + 
ecc_block_state_dict[state["block"]] = state["status"] + ras_dict["ecc_block_state"] = ecc_block_state_dict + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get ras block features for gpu %s | %s", gpu_id, e.get_error_info()) + + static_dict["ras"] = ras_dict + if args.partition: + try: + compute_partition = amdsmi_interface.amdsmi_get_gpu_compute_partition(args.gpu) + except amdsmi_exception.AmdSmiLibraryException as e: + compute_partition = "N/A" + logging.debug("Failed to get compute partition info for gpu %s | %s", gpu_id, e.get_error_info()) + try: + memory_partition = amdsmi_interface.amdsmi_get_gpu_memory_partition(args.gpu) + except amdsmi_exception.AmdSmiLibraryException as e: + memory_partition = "N/A" + logging.debug("Failed to get memory partition info for gpu %s | %s", gpu_id, e.get_error_info()) + try: + kfd_info = amdsmi_interface.amdsmi_get_gpu_kfd_info(args.gpu) + partition_id = kfd_info['current_partition_id'] + except amdsmi_exception.AmdSmiLibraryException as e: + partition_id = "N/A" + logging.debug("Failed to get partition ID for gpu %s | %s", gpu_id, e.get_error_info()) + static_dict['partition'] = {"accelerator_partition": compute_partition, + "memory_partition": memory_partition, + "partition_id": partition_id} + if 'soc_pstate' in current_platform_args: + if args.soc_pstate: + try: + policy_info = amdsmi_interface.amdsmi_get_soc_pstate(args.gpu) + except amdsmi_exception.AmdSmiLibraryException as e: + policy_info = "N/A" + logging.debug("Failed to get soc pstate policy info for gpu %s | %s", gpu_id, e.get_error_info()) + + static_dict['soc_pstate'] = policy_info + if 'xgmi_plpd' in current_platform_args: + if args.xgmi_plpd: + try: + policy_info = amdsmi_interface.amdsmi_get_xgmi_plpd(args.gpu) + except amdsmi_exception.AmdSmiLibraryException as e: + policy_info = "N/A" + logging.debug("Failed to get xgmi_plpd info for gpu %s | %s", gpu_id, e.get_error_info()) + + static_dict['xgmi_plpd'] = policy_info + if 
'process_isolation' in current_platform_args: + if args.process_isolation: + try: + status = amdsmi_interface.amdsmi_get_gpu_process_isolation(args.gpu) + status = "Enabled" if status else "Disabled" + except amdsmi_exception.AmdSmiLibraryException as e: + status = "N/A" + logging.debug("Failed to process isolation for gpu %s | %s", gpu_id, e.get_error_info()) + + static_dict['process_isolation'] = status + if 'numa' in current_platform_args: + if args.numa: + try: + numa_node_number = amdsmi_interface.amdsmi_topo_get_numa_node_number(args.gpu) + except amdsmi_exception.AmdSmiLibraryException as e: + numa_node_number = "N/A" + logging.debug("Failed to get numa node number for gpu %s | %s", gpu_id, e.get_error_info()) + + try: + numa_affinity = amdsmi_interface.amdsmi_get_gpu_topo_numa_affinity(args.gpu) + # -1 means No numa node is assigned to the GPU, so there is no numa affinity + if self.logger.is_human_readable_format() and numa_affinity == -1: + numa_affinity = "NONE" + except amdsmi_exception.AmdSmiLibraryException as e: + numa_affinity = "N/A" + logging.debug("Failed to get numa affinity for gpu %s | %s", gpu_id, e.get_error_info()) + + try: + cpu_set = amdsmi_interface.amdsmi_get_cpu_affinity_with_scope(args.gpu, amdsmi_interface.AmdSmiAffinityScope.NUMA_SCOPE) + cpu_set = [f"{cpus:016X}" for cpus in cpu_set] + cpu_set = {f'cpu_list_{i}': f"{cpus}" for i, cpus in enumerate(cpu_set)} + bitmask_ranges = self.helpers.get_bitmask_ranges(cpu_set) + cpu_affinity = {} + + for key in cpu_set: + cpu_affinity[key] = { + "bitmask": cpu_set[key], + "cpu_cores_affinity" : bitmask_ranges[key] + } + + except amdsmi_exception.AmdSmiLibraryException as e: + cpu_affinity = "N/A" + logging.debug("Failed to get cpu affinity for gpu %s | %s", gpu_id, e.get_error_info()) + + try: + socket_set = amdsmi_interface.amdsmi_get_cpu_affinity_with_scope(args.gpu, amdsmi_interface.AmdSmiAffinityScope.SOCKET_SCOPE) + socket_set = [f"{cpus:016X}" for cpus in socket_set] + socket_set = 
{f'cpu_list_{i}': f"{cpus}" for i, cpus in enumerate(socket_set)} + socket_bitmask_ranges = self.helpers.get_bitmask_ranges(socket_set) + socket_affinity = {} + for key in socket_set: + socket_affinity[key] = { + "bitmask": socket_set[key], + "cpu_cores_affinity": socket_bitmask_ranges.get(key, "N/A") + } + except amdsmi_exception.AmdSmiLibraryException as e: + socket_affinity = "N/A" + logging.debug("Failed to get socket affinity for gpu %s | %s", gpu_id, e.get_error_info()) + + static_dict['numa'] = { 'node' : numa_node_number, + 'affinity' : numa_affinity, + 'cpu_affinity' : cpu_affinity, + 'socket_affinity' : socket_affinity} + if args.vram: + vram_info_dict = {"type" : "N/A", + "vendor" : "N/A", + "size" : "N/A", + "bit_width" : "N/A", + "max_bandwidth" : "N/A"} + try: + vram_info = amdsmi_interface.amdsmi_get_gpu_vram_info(args.gpu) + + # Get vram type string + vram_type_enum = vram_info['vram_type'] + if vram_type_enum == amdsmi_interface.amdsmi_wrapper.AMDSMI_VRAM_TYPE__MAX: + vram_type = "GDDR7" + else: + vram_type = amdsmi_interface.amdsmi_wrapper.amdsmi_vram_type_t__enumvalues[vram_type_enum] + # Remove amdsmi enum prefix + vram_type = vram_type.replace('AMDSMI_VRAM_TYPE_', '').replace('_', '') + + # Get vram vendor string + vram_vendor = vram_info['vram_vendor'] + if "PLACEHOLDER" in vram_vendor: + vram_vendor = "N/A" + + # Assign cleaned values to vram_info_dict + vram_info_dict['type'] = vram_type + vram_info_dict['vendor'] = vram_vendor + + # Populate vram size with unit + vram_info_dict['size'] = vram_info['vram_size'] + vram_size_unit = "MB" + if self.logger.is_human_readable_format(): + vram_info_dict['size'] = f"{vram_info['vram_size']} {vram_size_unit}" + + if self.logger.is_json_format(): + vram_info_dict['size'] = {"value" : vram_info['vram_size'], + "unit" : vram_size_unit} + + # Populate bit width + vram_info_dict['bit_width'] = vram_info['vram_bit_width'] + + # Populate vram_max_bandwidth + vram_max_bw = vram_info['vram_max_bandwidth'] + 
vram_max_bw_unit = 'GB/s' + if self.logger.is_human_readable_format(): + vram_info_dict["max_bandwidth"] = f"{vram_max_bw} {vram_max_bw_unit if vram_max_bw != 'N/A' else ''}" + if self.logger.is_json_format(): + vram_info_dict["max_bandwidth"] = {"value" : vram_max_bw, + "unit" : vram_max_bw_unit} + + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get vram info for gpu %s | %s", gpu_id, e.get_error_info()) + + static_dict['vram'] = vram_info_dict + if args.cache: + try: + cache_info_list = amdsmi_interface.amdsmi_get_gpu_cache_info(args.gpu)['cache'] + logging.debug(f"cache_info dictionary = {cache_info_list}") + + for index, cache_info in enumerate(cache_info_list): + new_cache_info = {"cache" : index} + new_cache_info.update(cache_info) + cache_info_list[index] = new_cache_info + + logging.debug(f"[after update] cache_info_list = {cache_info_list}") + + cache_size_unit = "KB" + if self.logger.is_human_readable_format(): + cache_info_dict_format = {} + for cache_dict in cache_info_list: + cache_index = "cache_" + str(cache_dict["cache"]) + cache_info_dict_format[cache_index] = cache_dict + + # Remove cache index from new dictionary + cache_info_dict_format[cache_index].pop("cache") + + # Add cache_size unit + cache_size = f"{cache_info_dict_format[cache_index]['cache_size']} {cache_size_unit}" + cache_info_dict_format[cache_index]["cache_size"] = cache_size + + # take cache_properties out of list -> display as string, removing brackets + cache_info_dict_format[cache_index]["cache_properties"] = ", ".join(cache_info_dict_format[cache_index]["cache_properties"]) + + cache_info_list = cache_info_dict_format + logging.debug(f"[human readable] cache_info_list = {cache_info_list}") + + # Add cache_size_unit to json output + if self.logger.is_json_format(): + for cache_dict in cache_info_list: + cache_dict["cache_size"] = {"value" : cache_dict["cache_size"], + "unit" : cache_size_unit} + except amdsmi_exception.AmdSmiLibraryException as 
e: + cache_info_list = "N/A" + logging.debug("Failed to get cache info for gpu %s | %s", gpu_id, e.get_error_info()) + + static_dict['cache_info'] = cache_info_list + # default to printing all clocks, if in current_platform_args; otherwise print specific clocks + if 'clock' in current_platform_args and (args.clock == True or isinstance(args.clock, list)): + original_clock_args = args.clock #save original args.clock value, so we can reset for multiple devices + if isinstance(args.clock, bool): + args.clock = ['sys', 'mem', 'df', 'soc', 'dcef', 'vclk0', 'vclk1', 'dclk0', 'dclk1'] + + if isinstance(args.clock, list): + # remove potential duplicates from list + args.clock = list(set(args.clock)) + # check that clock is valid option + if "all" in args.clock or len(args.clock) == 0: + args.clock = ['sys', 'mem', 'df', 'soc', 'dcef', 'vclk0', 'vclk1', 'dclk0', 'dclk1'] + + clk_dict = { + 'sys': "N/A", + 'mem': "N/A", + 'df': "N/A", + 'soc': "N/A", + 'dcef': "N/A", + 'vclk0': "N/A", + 'vclk1': "N/A", + 'dclk0': "N/A", + 'dclk1': "N/A", + } + for clk in list(clk_dict.keys()): + if clk not in args.clock: + del clk_dict[clk] + for clk in args.clock: + if clk in self.convert_clock_type: + clk_type_conversion = self.convert_clock_type[clk] + else: + clk_type_conversion = "N/A" + output_format = self.helpers.get_output_format() + raise AmdSmiInvalidParameterException('static', clk_type, output_format) # clk type given is bad + + try: + frequencies = amdsmi_interface.amdsmi_get_clk_freq(args.gpu, clk_type_conversion) + # some clocks may have a sysfs file but no frequencies for whatever reason. 
+ if len(frequencies['frequency']) == 0: + freq_dict = "N/A" + continue + freq_dict = {} + current_level = frequencies['current'] + freq_dict.update({'current_level':current_level}) + current_frequency = str(self.helpers.convert_SI_unit(frequencies['frequency'][current_level], AMDSMIHelpers.SI_Unit.MICRO)) + "MHz" + freq_dict.update({'current_frequency':current_frequency}) + freq_dict.update({'frequency_levels':{}}) + if frequencies["num_supported"] != 0: + for level in range(len(frequencies['frequency'])): + if frequencies['frequency'][level] != "N/A": + freq = str(self.helpers.convert_SI_unit(frequencies['frequency'][level], AMDSMIHelpers.SI_Unit.MICRO)) + " MHz" + freq_dict['frequency_levels'].update({f"Level {level}":freq}) + else: + freq_dict['frequency_levels'].update({f"Level {level}":"N/A"}) + else: + freq_dict = "N/A" + except amdsmi_exception.AmdSmiLibraryException as e: + freq_dict = "N/A" + logging.debug("Failed to get clock info for gpu %s | %s", gpu_id, e.get_error_info()) + clk_dict[clk] = freq_dict + + static_dict['clock'] = clk_dict + else: + raise amdsmi_exception.AmdSmiParameterException(args.clock, 'list[str]') + # if original_clock_args is a boolean, set it back to the original value + if isinstance(original_clock_args, bool): + args.clock = original_clock_args + + # Convert and store output by pid for csv format + multiple_devices_csv_override = False + if self.logger.is_csv_format(): + # For NUMA data - flatten CPU affinity lists + if 'numa' in static_dict and isinstance(static_dict['numa'], dict): + numa_data = static_dict.pop('numa') + multiple_devices_csv_override = True + + # Get data + node = numa_data.get('node', 'N/A') + affinity = numa_data.get('affinity', 'N/A') + cpu_affinity = numa_data.get('cpu_affinity', {}) + socket_affinity = numa_data.get('socket_affinity', {}) + # Create a flattened row for list entry + row_dict = static_dict.copy() + + if cpu_affinity and isinstance(cpu_affinity, dict): + for cpu_list_key in 
cpu_affinity.keys(): + cpu_entry = cpu_affinity[cpu_list_key] + socket_entry = socket_affinity.get(cpu_list_key, {"bitmask": "N/A", "cpu_cores_affinity": "N/A"}) + row_dict.update({ + 'node': node, + 'affinity': affinity, + 'cpu_list': cpu_list_key, + 'bitmask': cpu_entry.get('bitmask'), + 'cpu_cores_affinity': cpu_entry.get('cpu_cores_affinity'), + 'socket_bitmask': socket_entry.get('bitmask'), + 'socket_cpu_cores_affinity': socket_entry.get('cpu_cores_affinity') + }) + self.logger.store_output(args.gpu, 'values', row_dict) + self.logger.store_gpu_json_output.append(row_dict) + self.logger.store_multiple_device_output() + else: + row_dict.update({ + 'node': node, + 'affinity': affinity, + 'cpu_list': 'N/A', + 'bitmask': 'N/A', + 'cpu_cores_affinity': 'N/A', + 'socket_bitmask': 'N/A', + 'socket_cpu_cores_affinity': 'N/A' + }) + self.logger.store_output(args.gpu, 'values', row_dict) + self.logger.store_gpu_json_output.append(row_dict) + # expand if ras blocks are populated + elif self.helpers.is_linux() and self.helpers.is_baremetal() and args.ras: + if isinstance(static_dict['ras']['ecc_block_state'], list): + ecc_block_dicts = static_dict['ras'].pop('ecc_block_state') + multiple_devices_csv_override = True + for ecc_block_dict in ecc_block_dicts: + for key, value in ecc_block_dict.items(): + self.logger.store_output(args.gpu, key, value) + self.logger.store_output(args.gpu, 'values', static_dict) + self.logger.store_gpu_json_output.append(static_dict) + self.logger.store_multiple_device_output() + else: + # Store values if ras has an error + self.logger.store_output(args.gpu, 'values', static_dict) + self.logger.store_gpu_json_output.append(static_dict) + if self.helpers.is_linux() and self.helpers.is_virtual_os(): + self.logger.store_output(args.gpu, 'values', static_dict) + self.logger.store_gpu_json_output.append(static_dict) + else: + self.logger.store_output(args.gpu, 'values', static_dict) + self.logger.store_gpu_json_output.append(static_dict) + elif 
def static(self, args, multiple_devices=False, gpu=None, asic=None,
           bus=None, vbios=None, limit=None, driver=None, ras=None,
           board=None, numa=None, vram=None, cache=None, partition=None,
           dfc_ucode=None, fb_info=None, num_vf=None, cpu=None,
           interface_ver=None, soc_pstate=None, xgmi_plpd=None, process_isolation=None,
           clock=None):
    """Get Static information for target gpu and cpu

    Dispatches to static_cpu and/or static_gpu depending on which drivers
    the helpers report as initialized and on which devices/arguments were
    requested.

    Args:
        args (Namespace): Namespace containing the parsed CLI args
        multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False.
        gpu (device_handle, optional): device_handle for target device. Defaults to None.
        asic (bool, optional): Value override for args.asic. Defaults to None.
        bus (bool, optional): Value override for args.bus. Defaults to None.
        vbios (bool, optional): Value override for args.vbios. Defaults to None.
        limit (bool, optional): Value override for args.limit. Defaults to None.
        driver (bool, optional): Value override for args.driver. Defaults to None.
        ras (bool, optional): Value override for args.ras. Defaults to None.
        board (bool, optional): Value override for args.board. Defaults to None.
        numa (bool, optional): Value override for args.numa. Defaults to None.
        vram (bool, optional): Value override for args.vram. Defaults to None.
        cache (bool, optional): Value override for args.cache. Defaults to None.
        partition (bool, optional): Value override for args.partition. Defaults to None.
        dfc_ucode (bool, optional): Value override for args.dfc_ucode. Defaults to None.
        fb_info (bool, optional): Value override for args.fb_info. Defaults to None.
        num_vf (bool, optional): Value override for args.num_vf. Defaults to None.
        cpu (cpu_handle, optional): cpu_handle for target device. Defaults to None.
        interface_ver (bool, optional): Value override for args.interface_ver. Defaults to None
        soc_pstate (bool, optional): Value override for args.soc_pstate. Defaults to None.
        xgmi_plpd (bool, optional): Value override for args.xgmi_plpd. Defaults to None.
        process_isolation (bool, optional): Value override for args.process_isolation. Defaults to None.
        clock (bool | list[str], optional): Value override for args.clock. Defaults to None.

    Raises:
        IndexError: Not raised here directly; presumably propagated from
            static_gpu/static_cpu when the device list is empty (confirm
            against those methods).

    Returns:
        None: Print output via AMDSMILogger to destination
    """
    # Mutually exclusive arguments
    if cpu:
        args.cpu = cpu
    if gpu:
        args.gpu = gpu

    # Check if a CPU argument has been set (missing attributes count as unset)
    cpu_attributes = ["smu", "interface_ver"]
    cpu_args_enabled = any(getattr(args, attr, False) for attr in cpu_attributes)

    # Check if a GPU argument has been set (missing attributes count as unset)
    gpu_attributes = ["asic", "bus", "vbios", "limit", "driver", "ras",
                      "board", "numa", "vram", "cache", "partition",
                      "dfc_ucode", "fb_info", "num_vf", "soc_pstate", "xgmi_plpd",
                      "process_isolation", "clock"]
    gpu_args_enabled = any(getattr(args, attr, False) for attr in gpu_attributes)

    # Positional overrides forwarded to static_gpu. They are built once so that
    # both GPU code paths pass the exact same list; previously the combined
    # CPU+GPU path dropped xgmi_plpd, shifting process_isolation and clock
    # into the wrong parameters.
    gpu_overrides = (gpu, asic, bus, vbios, limit, driver, ras,
                     board, numa, vram, cache, partition,
                     dfc_ucode, fb_info, num_vf, soc_pstate, xgmi_plpd,
                     process_isolation, clock)

    # Handle CPU and GPU initialization cases
    if self.helpers.is_amd_hsmp_initialized() and self.helpers.is_amdgpu_initialized():
        # Print out all CPU and all GPU static info only if no device was specified.
        # If a GPU or CPU argument is provided only print out the specified device.
        if args.cpu is None and args.gpu is None:
            if not cpu_args_enabled and not gpu_args_enabled:
                args.cpu = self.cpu_handles
                args.gpu = self.device_handles

        # Handle cases where the user has only specified an argument and no specific device
        if args.gpu is None and gpu_args_enabled:
            args.gpu = self.device_handles
        if args.cpu is None and cpu_args_enabled:
            args.cpu = self.cpu_handles

        if args.cpu:
            self.static_cpu(args, multiple_devices, cpu, interface_ver)
        if args.gpu:
            # Reset logger state so GPU output does not mix with the CPU output above
            self.logger.output = {}
            self.logger.clear_multiple_devices_output()
            self.static_gpu(args, multiple_devices, *gpu_overrides)
    elif self.helpers.is_amd_hsmp_initialized():  # Only CPU is initialized
        if args.cpu is None:
            args.cpu = self.cpu_handles

        self.static_cpu(args, multiple_devices, cpu, interface_ver)
    elif self.helpers.is_amdgpu_initialized():  # Only GPU is initialized
        if args.gpu is None:
            args.gpu = self.device_handles

        self.logger.clear_multiple_devices_output()
        self.static_gpu(args, multiple_devices, *gpu_overrides)

    if self.logger.is_json_format():
        self.logger.combine_arrays_to_json()
def firmware(self, args, multiple_devices=False, gpu=None, fw_list=True):
    """ Get Firmware information for target gpu

    Args:
        args (Namespace): Namespace containing the parsed CLI args
        multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False.
        gpu (device_handle, optional): device_handle for target device. Defaults to None.
        fw_list (bool, optional): True to get list of all firmware information

    Raises:
        IndexError: Not raised here directly; presumably propagated from
            helpers.handle_gpus when the gpu list is empty (confirm there).

    Returns:
        None: Print output via AMDSMILogger to destination
    """
    if gpu:
        args.gpu = gpu
    if fw_list:
        args.fw_list = fw_list

    # Handle No GPU passed
    if args.gpu is None:
        args.gpu = self.device_handles

    # Handle multiple GPUs
    handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.firmware)
    if handled_multiple_gpus:
        return  # This function is recursive

    args.gpu = device_handle

    # Collected output; kept separate from the fw_list parameter to avoid shadowing it
    fw_output = {}

    # Get gpu_id for logging
    gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu)

    if args.fw_list:
        try:
            fw_info = amdsmi_interface.amdsmi_get_fw_info(args.gpu)

            for fw_index, fw_entry in enumerate(fw_info['fw_list']):
                # Change fw_name to fw_id
                fw_entry['fw_id'] = fw_entry.pop('fw_name').name.replace("AMDSMI_FW_ID_", "")
                fw_entry['fw_version'] = fw_entry.pop('fw_version')  # popping to ensure order

                # Add custom human readable formatting
                if self.logger.is_human_readable_format():
                    fw_info['fw_list'][fw_index] = {f'FW {fw_index}': fw_entry}
                else:
                    fw_info['fw_list'][fw_index] = fw_entry

            fw_output.update(fw_info)
        except amdsmi_exception.AmdSmiLibraryException as e:
            fw_output['fw_list'] = "N/A"
            logging.debug("Failed to get firmware info for gpu %s | %s", gpu_id, e.get_error_info())

    multiple_devices_csv_override = False
    fw_entries = fw_output.get('fw_list')
    # Convert and store output per firmware entry for csv format.
    # Only expand when a real list of entries is available: after a library
    # failure the value is the string "N/A", and when args.fw_list is unset the
    # key is absent; both previously crashed the csv path.
    if self.logger.is_csv_format() and isinstance(fw_entries, list):
        for fw_info_dict in fw_entries:
            for key, value in fw_info_dict.items():
                multiple_devices_csv_override = True
                self.logger.store_output(args.gpu, key, value)
            self.logger.store_multiple_device_output()
    else:
        # Store values in logger.output
        self.logger.store_output(args.gpu, 'values', fw_output)

    if multiple_devices:
        self.logger.store_multiple_device_output()
        return  # Skip printing when there are multiple devices

    self.logger.print_output(multiple_device_enabled=multiple_devices_csv_override)
def bad_pages(self, args, multiple_devices=False, gpu=None, retired=None, pending=None, un_res=None):
    """ Get bad pages information for target gpu

    Args:
        args (Namespace): Namespace containing the parsed CLI args
        multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False.
        gpu (device_handle, optional): device_handle for target device. Defaults to None.
        retired (bool, optional): Value override for args.retired
        pending (bool, optional): Value override for args.pending
        un_res (bool, optional): Value override for args.un_res

    Raises:
        IndexError: Not raised here directly; presumably propagated from
            helpers.handle_gpus when the gpu list is empty (confirm there).

    Returns:
        None: Print output via AMDSMILogger to destination
    """
    # Copy truthy keyword overrides onto args
    overrides = (("gpu", gpu), ("retired", retired), ("pending", pending), ("un_res", un_res))
    for option_name, override in overrides:
        if override:
            setattr(args, option_name, override)

    # No GPU selected: fall back to every known device
    if args.gpu == None:
        args.gpu = self.device_handles

    # handle_gpus re-enters this method per device when several are selected
    handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.bad_pages)
    if handled_multiple_gpus:
        return  # This function is recursive

    args.gpu = device_handle

    # With no category requested, report all three
    if not any((args.retired, args.pending, args.un_res)):
        args.retired = True
        args.pending = True
        args.un_res = True

    # Get gpu_id for logging
    gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu)

    bad_pages_not_found = "No bad pages found."
    try:
        bad_page_info = amdsmi_interface.amdsmi_get_gpu_bad_page_info(args.gpu)
    except amdsmi_exception.AmdSmiLibraryException as e:
        bad_page_info = "N/A"
        bad_page_error = True
        logging.debug("Failed to get bad page info for gpu %s | %s", gpu_id, e.get_error_info())
    else:
        # An empty list is reported with the "not found" message instead of page entries
        bad_page_error = bad_page_info == []
        if bad_page_error:
            bad_page_info = bad_pages_not_found

    def pages_with_status(wanted_status):
        """Return the display value for all pages whose status equals wanted_status."""
        matches = []
        for page in bad_page_info:
            if page["status"] != wanted_status:
                continue
            entry = {"page_address": page["page_address"],
                     "page_size": page["page_size"]}
            status_name = amdsmi_interface.amdsmi_wrapper.amdsmi_memory_page_status_t__enumvalues[page["status"]]
            entry["status"] = status_name.replace("AMDSMI_MEM_PAGE_STATUS_", "")
            matches.append(entry)

        # Remove brackets if there is only one value
        if len(matches) == 1:
            return matches[0]
        if not matches:
            return bad_pages_not_found
        return matches

    values_dict = {}

    if args.retired:
        values_dict['retired'] = (
            bad_page_info if bad_page_error
            else pages_with_status(amdsmi_interface.AmdSmiMemoryPageStatus.RESERVED))

    if args.pending:
        values_dict['pending'] = (
            bad_page_info if bad_page_error
            else pages_with_status(amdsmi_interface.AmdSmiMemoryPageStatus.PENDING))

    if args.un_res:
        values_dict['un_res'] = (
            bad_page_info if bad_page_error
            else pages_with_status(amdsmi_interface.AmdSmiMemoryPageStatus.UNRESERVABLE))

    # Store values in logger.output
    self.logger.store_output(args.gpu, 'values', values_dict)

    if multiple_devices:
        self.logger.store_multiple_device_output()
        return  # Skip printing when there are multiple devices

    self.logger.print_output()
Args: + args (Namespace): Namespace containing the parsed CLI args + multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False. + watching_output (bool, optional): True if watch argument has been set. Defaults to False. + gpu (device_handle, optional): device_handle for target device. Defaults to None. + usage (bool, optional): Value override for args.usage. Defaults to None. + watch (Positive int, optional): Value override for args.watch. Defaults to None. + watch_time (Positive int, optional): Value override for args.watch_time. Defaults to None. + iterations (Positive int, optional): Value override for args.iterations. Defaults to None. + power (bool, optional): Value override for args.power. Defaults to None. + clock (bool, optional): Value override for args.clock. Defaults to None. + temperature (bool, optional): Value override for args.temperature. Defaults to None. + ecc (bool, optional): Value override for args.ecc. Defaults to None. + ecc_blocks (bool, optional): Value override for args.ecc. Defaults to None. + pcie (bool, optional): Value override for args.pcie. Defaults to None. + fan (bool, optional): Value override for args.fan. Defaults to None. + voltage_curve (bool, optional): Value override for args.voltage_curve. Defaults to None. + overdrive (bool, optional): Value override for args.overdrive. Defaults to None. + perf_level (bool, optional): Value override for args.perf_level. Defaults to None. + xgmi_err (bool, optional): Value override for args.xgmi_err. Defaults to None. + energy (bool, optional): Value override for args.energy. Defaults to None. + mem_usage (bool, optional): Value override for args.mem_usage. Defaults to None. + voltage (bool, optional): Value override for args.voltage. Defaults to None. + schedule (bool, optional): Value override for args.schedule. Defaults to None. + guard (bool, optional): Value override for args.guard. Defaults to None. 
+ guest_data (bool, optional): Value override for args.guest_data. Defaults to None. + fb_usage (bool, optional): Value override for args.fb_usage. Defaults to None. + xgmi (bool, optional): Value override for args.xgmi. Defaults to None. + throttle (bool, optional): Value override for args.throttle. Defaults to None. + + Raises: + IndexError: Index error if gpu list is empty + + Returns: + None: Print output via AMDSMILogger to destination + """ + # Set args.* to passed in arguments + if gpu: + args.gpu = gpu + if watch: + args.watch = watch + if watch_time: + args.watch_time = watch_time + if iterations: + args.iterations = iterations + + # Store args that are applicable to the current platform + current_platform_args = [] + current_platform_values = [] + + if not self.helpers.is_hypervisor() and not self.helpers.is_windows(): + if mem_usage: + args.mem_usage = mem_usage + current_platform_args += ["mem_usage"] + current_platform_values += [args.mem_usage] + + if self.helpers.is_hypervisor() or self.helpers.is_baremetal() or self.helpers.is_linux(): + if usage: + args.usage = usage + if base_board: + args.base_board = base_board + if gpu_board: + args.gpu_board = gpu_board + if power: + args.power = power + if clock: + args.clock = clock + if temperature: + args.temperature = temperature + if voltage: + args.voltage = voltage + if pcie: + args.pcie = pcie + if ecc: + args.ecc = ecc + if ecc_blocks: + args.ecc_blocks = ecc_blocks + current_platform_args += ["usage", "power", "clock", "temperature", "voltage", "pcie", "ecc", "ecc_blocks", "base_board","gpu_board"] + current_platform_values += [args.usage, args.power, args.clock, + args.temperature, args.voltage, args.pcie] + current_platform_values += [args.ecc, args.ecc_blocks, args.base_board, args.gpu_board] + + if self.helpers.is_baremetal() and self.helpers.is_linux(): + if fan: + args.fan = fan + if voltage_curve: + args.voltage_curve = voltage_curve + if overdrive: + args.overdrive = overdrive + if 
perf_level: + args.perf_level = perf_level + if xgmi_err: + args.xgmi_err = xgmi_err + if energy: + args.energy = energy + if throttle: + args.violation = throttle + args.throttle = throttle + current_platform_args += ["fan", "voltage_curve", "overdrive", "perf_level", + "xgmi_err", "energy", "throttle"] + current_platform_values += [args.fan, args.voltage_curve, args.overdrive, + args.perf_level, args.xgmi_err, args.energy, args.throttle, + ] + + if self.helpers.is_hypervisor(): + if schedule: + args.schedule = schedule + if guard: + args.guard = guard + if guest_data: + args.guest_data = guest_data + if fb_usage: + args.fb_usage = fb_usage + if xgmi: + args.xgmi = xgmi + current_platform_args += ["schedule", "guard", "guest_data", "fb_usage", "xgmi"] + current_platform_values += [args.schedule, args.guard, args.guest_data, + args.fb_usage, args.xgmi] + + # Handle No GPU passed + if args.gpu == None: + args.gpu = self.device_handles + + if not self.group_check_printed: + self.helpers.check_required_groups(check_render=True, check_video=False) + self.group_check_printed = True + + # Handle watch logic, will only enter this block once + if args.watch: + self.helpers.handle_watch(args=args, subcommand=self.metric_gpu, logger=self.logger) + return + + # Handle multiple GPUs + if isinstance(args.gpu, list): + if len(args.gpu) > 1: + # Deepcopy gpus as recursion will destroy the gpu list + stored_gpus = [] + for gpu in args.gpu: + stored_gpus.append(gpu) + + # Store output from multiple devices + for device_handle in args.gpu: + self.metric_gpu(args, multiple_devices=True, watching_output=watching_output, gpu=device_handle) + + # Reload original gpus + args.gpu = stored_gpus + + # Print multiple device output + if not self.logger.is_json_format(): + self.logger.print_output(multiple_device_enabled=True, watching_output=watching_output) + + # Add output to total watch output and clear multiple device output + if watching_output: + 
self.logger.store_watch_output(multiple_device_enabled=True) + + # Flush the watching output + self.logger.print_output(multiple_device_enabled=True, watching_output=watching_output) + + return + elif len(args.gpu) == 1: + args.gpu = args.gpu[0] + else: + raise IndexError("args.gpu should not be an empty list") + + # Get gpu_id for logging + gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu) + + if args.loglevel == "DEBUG": + try: + # Get GPU Metrics table version + gpu_metric_version_info = amdsmi_interface.amdsmi_get_gpu_metrics_header_info(args.gpu) + gpu_metric_version_str = json.dumps(gpu_metric_version_info, indent=4) + logging.debug("GPU Metrics table Version for GPU %s | %s", gpu_id, gpu_metric_version_str) + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("#1 - Unable to load GPU Metrics table version for %s | %s", gpu_id, e.get_error_info()) + + try: + # Get GPU Metrics table + gpu_metric_debug_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu) + gpu_metric_str = json.dumps(gpu_metric_debug_info, indent=4) + logging.debug("GPU Metrics table for GPU %s | %s", gpu_id, str(gpu_metric_str)) + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("#2 - Unable to load GPU Metrics table for %s | %s", gpu_id, e.get_error_info()) + + logging.debug(f"Metric Arg information for GPU {gpu_id} on {self.helpers.os_info()}") + logging.debug(f"Args: {current_platform_args}") + logging.debug(f"Values: {current_platform_values}") + + # Set the platform applicable args to True if no args are set + if not any(current_platform_values): + for arg in current_platform_args: + setattr(args, arg, True) + + # Add timestamp and store values for specified arguments + values_dict = {} + + is_partition_metrics = False # True if we get the metrics from xcp_metrics file (amdsmi_get_gpu_partition_metrics_info) + #get metric info only once per gpu, this will speed up data output + try: + # Get GPU Metrics table + gpu_metric = 
amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu) + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("#3 - Unable to load GPU Metrics table for %s | %s", gpu_id, e.get_error_info()) + gpu_metric = amdsmi_interface._NA_amdsmi_get_gpu_metrics_info() + + # Workaround for XCP (partition) metrics not providing num_partition in v1.9+/v1.1+ + # Provides original formatting for earlier metric versions + partition_metric_info = self.helpers._get_metric_version_and_partition_info(gpu_metric, is_partition_metrics, gpu_id, args.gpu) + num_partition = partition_metric_info['num_partition'] + + if self.logger.is_json_format(): + values_dict['gpu'] = int(gpu_id) + # Populate the pcie_dict first due to multiple gpu metrics calls incorrectly increasing bandwidth + if "pcie" in current_platform_args: + if args.pcie: + pcie_dict = {"width": "N/A", + "speed": "N/A", + "bandwidth": "N/A", + "replay_count" : "N/A", + "l0_to_recovery_count" : "N/A", + "replay_roll_over_count" : "N/A", + "nak_sent_count" : "N/A", + "nak_received_count" : "N/A", + "current_bandwidth_sent": "N/A", + "current_bandwidth_received": "N/A", + "max_packet_size": "N/A", + "lc_perf_other_end_recovery": "N/A"} + + try: + pcie_metric = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)['pcie_metric'] + logging.debug("PCIE Metric for %s | %s", gpu_id, pcie_metric) + + pcie_dict['width'] = pcie_metric['pcie_width'] + + if pcie_metric['pcie_speed'] != "N/A": + if pcie_metric['pcie_speed'] % 1000 != 0: + pcie_speed_GTs_value = round(pcie_metric['pcie_speed'] / 1000, 1) + else: + pcie_speed_GTs_value = round(pcie_metric['pcie_speed'] / 1000) + pcie_dict['speed'] = pcie_speed_GTs_value + + pcie_dict['bandwidth'] = pcie_metric['pcie_bandwidth'] + + pcie_dict['replay_count'] = pcie_metric['pcie_replay_count'] + if pcie_dict['replay_count'] == "N/A": + try: + pcie_replay = amdsmi_interface.amdsmi_get_gpu_pci_replay_counter(args.gpu) + pcie_dict['replay_count'] = pcie_replay + except 
amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get sysfs pcie replay counter on gpu %s | %s", gpu_id, e.get_error_info()) + + pcie_dict['l0_to_recovery_count'] = pcie_metric['pcie_l0_to_recovery_count'] + pcie_dict['replay_roll_over_count'] = pcie_metric['pcie_replay_roll_over_count'] + pcie_dict['nak_received_count'] = pcie_metric['pcie_nak_received_count'] + pcie_dict['nak_sent_count'] = pcie_metric['pcie_nak_sent_count'] + pcie_dict['lc_perf_other_end_recovery'] = pcie_metric['pcie_lc_perf_other_end_recovery_count'] + + pcie_speed_unit = 'GT/s' + pcie_bw_unit = 'Mb/s' + if self.logger.is_human_readable_format(): + if pcie_dict['speed'] != "N/A": + pcie_dict['speed'] = f"{pcie_dict['speed']} {pcie_speed_unit}" + if pcie_dict['bandwidth'] != "N/A": + pcie_dict['bandwidth'] = f"{pcie_dict['bandwidth']} {pcie_bw_unit}" + if self.logger.is_json_format(): + if pcie_dict['speed'] != "N/A": + pcie_dict['speed'] = {"value" : pcie_dict['speed'], + "unit" : pcie_speed_unit} + if pcie_dict['bandwidth'] != "N/A": + pcie_dict['bandwidth'] = {"value" : pcie_dict['bandwidth'], + "unit" : pcie_bw_unit} + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get pcie link status for gpu %s | %s", gpu_id, e.get_error_info()) + + try: + pcie_bw = amdsmi_interface.amdsmi_get_gpu_pci_throughput(args.gpu) + sent = pcie_bw['sent'] * pcie_bw['max_pkt_sz'] + received = pcie_bw['received'] * pcie_bw['max_pkt_sz'] + + bw_unit = "Mb/s" + packet_size_unit = "B" + if sent > 0: + sent = sent // 1024 // 1024 + if received > 0: + received = received // 1024 // 1024 + + if self.logger.is_human_readable_format(): + sent = f"{sent} {bw_unit}" + received = f"{received} {bw_unit}" + pcie_bw['max_pkt_sz'] = f"{pcie_bw['max_pkt_sz']} {packet_size_unit}" + if self.logger.is_json_format(): + sent = {"value" : sent, + "unit" : bw_unit} + received = {"value" : received, + "unit" : bw_unit} + pcie_bw['max_pkt_sz'] = {"value" : pcie_bw['max_pkt_sz'], + 
"unit" : packet_size_unit} + + pcie_dict['current_bandwidth_sent'] = sent + pcie_dict['current_bandwidth_received'] = received + pcie_dict['max_packet_size'] = pcie_bw['max_pkt_sz'] + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get pcie bandwidth for gpu %s | %s", gpu_id, e.get_error_info()) + + if "usage" in current_platform_args: + if args.usage: + try: + engine_usage = amdsmi_interface.amdsmi_get_gpu_activity(args.gpu) + logging.debug(f"engine_usage dictionary = {engine_usage}") + + # TODO: move vcn_activity and jpeg_activity into amdsmi_get_gpu_activity + engine_usage['vcn_activity'] = gpu_metric['vcn_activity'] + engine_usage['jpeg_activity'] = gpu_metric['jpeg_activity'] + engine_usage['gfx_busy_inst'] = "N/A" + engine_usage['jpeg_busy'] = "N/A" + engine_usage['vcn_busy'] = "N/A" + + if num_partition != "N/A": + # these are one after another, in order to display each in sub-sections + new_xcp_dict = {} + for current_xcp in range(num_partition): + new_xcp_dict[f"xcp_{current_xcp}"] = gpu_metric['xcp_stats.gfx_busy_inst'][current_xcp] + engine_usage['gfx_busy_inst'] = new_xcp_dict + + new_xcp_dict = {} + for current_xcp in range(num_partition): + new_xcp_dict[f"xcp_{current_xcp}"] = gpu_metric['xcp_stats.jpeg_busy'][current_xcp] + engine_usage['jpeg_busy'] = new_xcp_dict + + new_xcp_dict = {} + for current_xcp in range(num_partition): + new_xcp_dict[f"xcp_{current_xcp}"] = gpu_metric['xcp_stats.vcn_busy'][current_xcp] + engine_usage['vcn_busy'] = new_xcp_dict + + logging.debug(f"After updates to engine_usage dictionary = {engine_usage}") + + for key, value in engine_usage.items(): + activity_unit = '%' + if self.logger.is_human_readable_format(): + if isinstance(value, list): + for index, activity in enumerate(value): + if activity != "N/A": + engine_usage[key][index] = f"{activity} {activity_unit}" + # Convert list to a string for human readable format + engine_usage[key] = '[' + ", ".join(engine_usage[key]) + ']' + elif 
isinstance(value, dict): + for k, v in value.items(): + for index, activity in enumerate(v): + if activity != "N/A": + value[k][index] = f"{activity} {activity_unit}" + # Convert list to a string for human readable format + value[k] = '[' + ", ".join(value[k]) + ']' + elif value != "N/A": + engine_usage[key] = f"{value} {activity_unit}" + if self.logger.is_json_format(): + if isinstance(value, list): + for index, activity in enumerate(value): + if activity != "N/A": + engine_usage[key][index] = {"value" : activity, + "unit" : activity_unit} + elif isinstance(value, dict): + for k, v in value.items(): + for index, activity in enumerate(v): + if activity != "N/A": + value[k][index] = {"value" : activity, + "unit" : activity_unit} + elif value != "N/A": + engine_usage[key] = {"value" : value, + "unit" : activity_unit} + + values_dict['usage'] = engine_usage + except Exception as e: + values_dict['usage'] = "N/A" + logging.debug("Failed to get gpu activity for gpu %s | %s", gpu_id, e) + if "power" in current_platform_args: + if args.power: + power_dict = {'socket_power': "N/A", + 'gfx_voltage': "N/A", + 'soc_voltage': "N/A", + 'mem_voltage': "N/A", + 'throttle_status': "N/A", + 'power_management': "N/A"} + + try: + voltage_unit = "mV" + power_unit = "W" + power_info = amdsmi_interface.amdsmi_get_power_info(args.gpu) + for key, value in power_info.items(): + if "voltage" in key: + power_info[key] = self.helpers.unit_format(self.logger, + value, + voltage_unit) + elif key == "socket_power": + power_info[key] = self.helpers.unit_format(self.logger, + value, + power_unit) + + power_dict['socket_power'] = power_info['socket_power'] + power_dict['gfx_voltage'] = power_info['gfx_voltage'] + power_dict['soc_voltage'] = power_info['soc_voltage'] + power_dict['mem_voltage'] = power_info['mem_voltage'] + + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get power info for gpu %s | %s", gpu_id, e.get_error_info()) + + try: + 
is_power_management_enabled = amdsmi_interface.amdsmi_is_gpu_power_management_enabled(args.gpu) + if is_power_management_enabled: + power_dict['power_management'] = "ENABLED" + else: + power_dict['power_management'] = "DISABLED" + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get power management status for gpu %s | %s", gpu_id, e.get_error_info()) + + try: + power_dict['throttle_status'] = "N/A" + throttle_status = gpu_metric['throttle_status'] + if throttle_status != "N/A": + if throttle_status: + power_dict['throttle_status'] = "THROTTLED" + else: + power_dict['throttle_status'] = "UNTHROTTLED" + except Exception as e: + logging.debug("Failed to get throttle status for gpu %s | %s", gpu_id, e) + + values_dict['power'] = power_dict + if "clock" in current_platform_args: + if args.clock: + # Populate Skeleton output with N/A + clocks = {} + + for clock_index in range(amdsmi_interface.AMDSMI_MAX_NUM_GFX_CLKS): + gfx_index = f"gfx_{clock_index}" + clocks[gfx_index] = {"clk" : "N/A", + "min_clk" : "N/A", + "max_clk" : "N/A", + "clk_locked" : "N/A", + "deep_sleep" : "N/A"} + + clocks["mem_0"] = {"clk" : "N/A", + "min_clk" : "N/A", + "max_clk" : "N/A", + "clk_locked" : "N/A", + "deep_sleep" : "N/A"} + + for clock_index in range(amdsmi_interface.AMDSMI_MAX_NUM_CLKS): + vclk_index = f"vclk_{clock_index}" + clocks[vclk_index] = {"clk" : "N/A", + "min_clk" : "N/A", + "max_clk" : "N/A", + "clk_locked" : "N/A", + "deep_sleep" : "N/A"} + + for clock_index in range(amdsmi_interface.AMDSMI_MAX_NUM_CLKS): + dclk_index = f"dclk_{clock_index}" + clocks[dclk_index] = {"clk" : "N/A", + "min_clk" : "N/A", + "max_clk" : "N/A", + "clk_locked" : "N/A", + "deep_sleep" : "N/A"} + + clocks["fclk_0"] = {"clk" : "N/A", + "min_clk" : "N/A", + "max_clk" : "N/A", + "clk_locked" : "N/A", + "deep_sleep" : "N/A"} + + clocks["socclk_0"] = {"clk" : "N/A", + "min_clk" : "N/A", + "max_clk" : "N/A", + "clk_locked" : "N/A", + "deep_sleep" : "N/A"} + + clock_unit = "MHz" 
+ + # Populate clock values from gpu_metrics_info + # Populate GFX clock values + try: + current_gfx_clocks = gpu_metric["current_gfxclks"] + if current_gfx_clocks != "N/A": + for clock_index, current_gfx_clock in enumerate(current_gfx_clocks): + # If the current clock is N/A then nothing else applies + if current_gfx_clock == "N/A": + continue + gfx_index = f"gfx_{clock_index}" + clocks[gfx_index]["clk"] = self.helpers.unit_format(self.logger, + current_gfx_clock, + clock_unit) + # Populate clock locked status + if gpu_metric["gfxclk_lock_status"] != "N/A": + gfx_clock_lock_flag = 1 << clock_index # This is the position of the clock lock flag + if gpu_metric["gfxclk_lock_status"] & gfx_clock_lock_flag: + clocks[gfx_index]["clk_locked"] = "ENABLED" + else: + clocks[gfx_index]["clk_locked"] = "DISABLED" + except Exception as e: + logging.debug("Failed to get current_gfxclks for gpu %s | %s", gpu_id, e) + + # Populate MEM clock value + try: + current_mem_clock = gpu_metric["current_uclk"] # single value + if current_mem_clock != "N/A": + clocks["mem_0"]["clk"] = self.helpers.unit_format(self.logger, + current_mem_clock, + clock_unit) + except Exception as e: + logging.debug("Failed to get current_uclk for gpu %s | %s", gpu_id, e) + + # Populate VCLK clock values + try: + current_vclk_clocks = gpu_metric["current_vclk0s"] + # If the current vclk clocks are not available, we cannot proceed further + if current_vclk_clocks != "N/A": + for clock_index, current_vclk_clock in enumerate(current_vclk_clocks): + # If the current clock is N/A then nothing else applies + if current_vclk_clock == "N/A": + continue + vclk_index = f"vclk_{clock_index}" + clocks[vclk_index]["clk"] = self.helpers.unit_format(self.logger, + current_vclk_clock, + clock_unit) + except Exception as e: + logging.debug("Failed to get current_vclk0s for gpu %s | %s", gpu_id, e) + + # Populate DCLK clock values + try: + current_dclk_clocks = gpu_metric["current_dclk0s"] + # If the current dclk clocks are 
not available, we cannot proceed further + if current_dclk_clocks != "N/A": + for clock_index, current_dclk_clock in enumerate(current_dclk_clocks): + # If the current clock is N/A then nothing else applies + if current_dclk_clock == "N/A": + continue + dclk_index = f"dclk_{clock_index}" + clocks[dclk_index]["clk"] = self.helpers.unit_format(self.logger, + current_dclk_clock, + clock_unit) + except Exception as e: + logging.debug("Failed to get current_dclk0s for gpu %s | %s", gpu_id, e) + + # Populate FCLK clock value; fclk not present in gpu_metrics so use amdsmi_get_clk_freq + try: + frequency_dict = amdsmi_interface.amdsmi_get_clk_freq(args.gpu, amdsmi_interface.AmdSmiClkType.DF) + current_fclk_clock = frequency_dict['frequency'][frequency_dict['current']] + current_fclk_clock = self.helpers.convert_SI_unit(current_fclk_clock, self.helpers.SI_Unit.MICRO) + clocks["fclk_0"]["clk"] = self.helpers.unit_format(self.logger, + current_fclk_clock, + clock_unit) + except (KeyError, amdsmi_exception.AmdSmiLibraryException) as e: + logging.debug("Failed to get fclk info for gpu %s | %s", gpu_id, e) + + # Populate SOCCLK clock value + try: + current_socclk_clock = gpu_metric["current_socclk"] + # If the current socclk clocks are not available, we cannot proceed further + if current_socclk_clock != "N/A": + clocks["socclk_0"]["clk"] = self.helpers.unit_format(self.logger, + current_socclk_clock, + clock_unit) + except KeyError as e: + logging.debug("Failed to get current_socclk for gpu %s | %s", gpu_id, e) + + + # Populate the max and min clock values from sysfs. + # Min and Max values are per clock type, not per clock engine. 
+ # Populate the deep sleep value from amdsmi_get_clock_info + + # GFX min and max clocks + try: + gfx_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu, + amdsmi_interface.AmdSmiClkType.GFX) + for clock_index in range(amdsmi_interface.AMDSMI_MAX_NUM_GFX_CLKS): + gfx_index = f"gfx_{clock_index}" + + if clocks[gfx_index]["clk"] == "N/A": + # if the current clock is N/A then we shouldn't populate the max and min values + continue + clocks[gfx_index]["min_clk"] = self.helpers.unit_format(self.logger, + gfx_clock_info_dict["min_clk"], + clock_unit) + clocks[gfx_index]["max_clk"] = self.helpers.unit_format(self.logger, + gfx_clock_info_dict["max_clk"], + clock_unit) + # Add the clk_deep_sleep + clocks[gfx_index]["deep_sleep"] = gfx_clock_info_dict["clk_deep_sleep"] + except (KeyError, amdsmi_exception.AmdSmiLibraryException) as e: + logging.debug("Failed to get gfx clock info for gpu %s | %s", gpu_id, e) + + # MEM min and max clocks + try: + mem_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu, + amdsmi_interface.AmdSmiClkType.MEM) + # if the current clock is N/A then we shouldn't populate the max and min values + if clocks["mem_0"]["clk"] != "N/A": + clocks["mem_0"]["min_clk"] = self.helpers.unit_format(self.logger, + mem_clock_info_dict["min_clk"], + clock_unit) + clocks["mem_0"]["max_clk"] = self.helpers.unit_format(self.logger, + mem_clock_info_dict["max_clk"], + clock_unit) + # Add the clk_deep_sleep + clocks["mem_0"]["deep_sleep"] = mem_clock_info_dict["clk_deep_sleep"] + except (KeyError, amdsmi_exception.AmdSmiLibraryException) as e: + logging.debug("Failed to get mem clock info for gpu %s | %s", gpu_id, e) + + # VCLK min and max clocks + try: + # Retrieve clock information for VCLK0 (Video Clock 0) + vclk_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu, amdsmi_interface.AmdSmiClkType.VCLK0) + + # Iterate through the maximum number of VCLK clocks supported + for index in 
range(amdsmi_interface.AMDSMI_MAX_NUM_CLKS): + vclk_index = f"vclk_{index}" # Construct the index key for the clock + + # Check if the current clock value is not "N/A" + if clocks[vclk_index]["clk"] != "N/A": + # Format and assign the minimum clock value for the current VCLK + clocks[vclk_index]["min_clk"] = self.helpers.unit_format(self.logger, + vclk_clock_info_dict["min_clk"], + clock_unit) + # Format and assign the maximum clock value for the current VCLK + clocks[vclk_index]["max_clk"] = self.helpers.unit_format(self.logger, + vclk_clock_info_dict["max_clk"], + clock_unit) + # Add the clk_deep_sleep + clocks[vclk_index]["deep_sleep"] = vclk_clock_info_dict["clk_deep_sleep"] + except (KeyError, amdsmi_exception.AmdSmiLibraryException) as e: + # Log a debug message if retrieving VCLK clock information fails + logging.debug("Failed to get vclk clock info for gpu %s | %s", gpu_id, e) + + # DCLK min and max clocks + try: + # Retrieve clock information for DCLK0 (Display Clock 0) + dclk_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu, amdsmi_interface.AmdSmiClkType.DCLK0) + + # Iterate through the maximum number of DCLK clocks supported + for index in range(amdsmi_interface.AMDSMI_MAX_NUM_CLKS): + dclk_index = f"dclk_{index}" # Construct the index key for the clock + + # Check if the current clock value is not "N/A" + if clocks[dclk_index]["clk"] != "N/A": + # Format and assign the minimum clock value for the current DCLK + clocks[dclk_index]["min_clk"] = self.helpers.unit_format(self.logger, + dclk_clock_info_dict["min_clk"], + clock_unit) + # Format and assign the maximum clock value for the current DCLK + clocks[dclk_index]["max_clk"] = self.helpers.unit_format(self.logger, + dclk_clock_info_dict["max_clk"], + clock_unit) + # Add the clk_deep_sleep + clocks[dclk_index]["deep_sleep"] = dclk_clock_info_dict["clk_deep_sleep"] + except (KeyError, amdsmi_exception.AmdSmiLibraryException) as e: + logging.debug("Failed to get dclk clock info for gpu %s 
| %s", gpu_id, e) + + # FCLK min and max clocks + try: + fclk_clk_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu, + amdsmi_interface.AmdSmiClkType.DF) + # if the current clock is N/A then we shouldn't populate the max and min values + if clocks["fclk_0"]["clk"] != "N/A": + clocks["fclk_0"]["min_clk"] = self.helpers.unit_format(self.logger, + fclk_clk_info_dict["min_clk"], + clock_unit) + clocks["fclk_0"]["max_clk"] = self.helpers.unit_format(self.logger, + fclk_clk_info_dict["max_clk"], + clock_unit) + # Add the clk_deep_sleep + clocks["fclk_0"]["deep_sleep"] = fclk_clk_info_dict["clk_deep_sleep"] + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get fclk info for gpu %s | %s", gpu_id, e.get_error_info()) + + # SOCCLK min and max clocks + try: + socclk_clk_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu, + amdsmi_interface.AmdSmiClkType.SOC) + # if the current clock is N/A then we shouldn't populate the max and min values + if clocks["socclk_0"]["clk"] != "N/A": + clocks["socclk_0"]["min_clk"] = self.helpers.unit_format(self.logger, + socclk_clk_info_dict["min_clk"], + clock_unit) + clocks["socclk_0"]["max_clk"] = self.helpers.unit_format(self.logger, + socclk_clk_info_dict["max_clk"], + clock_unit) + # Add the clk_deep_sleep + clocks["socclk_0"]["deep_sleep"] = socclk_clk_info_dict["clk_deep_sleep"] + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get socclk info for gpu %s | %s", gpu_id, e.get_error_info()) + + # Iterate over each clock and its data to determine if deep sleep is enabled + # based on the comparison between the current clock value and the minimum clock value. 
+ for clock, clock_data in clocks.items(): + clk_value = 0 + min_clk_value = 0 + try: + clk = clock_data["clk"] + min_clk = clock_data["min_clk"] + if clk == "N/A" or min_clk == "N/A": + continue + # Extract numeric value if clk/min_clk is a dict, else use as is + if isinstance(clk, dict): + clk_value = int(clk.get("value", 0)) + else: + if isinstance(clk, str): + clk_value = int(str(clk).split()[0]) + else: + clk_value = int(clk) + if isinstance(min_clk, dict): + min_clk_value = int(min_clk.get("value", 0)) + else: + if isinstance(min_clk, str): + min_clk_value = int(str(min_clk).split()[0]) + else: + min_clk_value = int(min_clk) + # If the clk value is less than the min_clk value, then deep sleep is enabled + if clk_value < min_clk_value: + clock_data["deep_sleep"] = "ENABLED" + else: + clock_data["deep_sleep"] = "DISABLED" + except Exception as e: + logging.debug("Failed to get deep sleep status for gpu %s | %s", gpu_id, e) + + values_dict['clock'] = clocks + if "temperature" in current_platform_args: + if args.temperature: + try: + temperature_edge_current = amdsmi_interface.amdsmi_get_temp_metric( + args.gpu, amdsmi_interface.AmdSmiTemperatureType.EDGE, amdsmi_interface.AmdSmiTemperatureMetric.CURRENT) + except amdsmi_exception.AmdSmiLibraryException as e: + temperature_edge_current = "N/A" + logging.debug("Failed to get current edge temperature for gpu %s | %s", gpu_id, e.get_error_info()) + + try: + temperature_edge_limit = amdsmi_interface.amdsmi_get_temp_metric( + args.gpu, amdsmi_interface.AmdSmiTemperatureType.EDGE, amdsmi_interface.AmdSmiTemperatureMetric.CRITICAL) + except amdsmi_exception.AmdSmiLibraryException as e: + temperature_edge_limit = "N/A" + logging.debug("Failed to get edge temperature limit for gpu %s | %s", gpu_id, e.get_error_info()) + + # If edge limit is reporting 0 then set the current edge temp to N/A + if temperature_edge_limit == 0: + temperature_edge_current = "N/A" + + try: + temperature_hotspot_current = 
amdsmi_interface.amdsmi_get_temp_metric( + args.gpu, amdsmi_interface.AmdSmiTemperatureType.HOTSPOT, amdsmi_interface.AmdSmiTemperatureMetric.CURRENT) + except amdsmi_exception.AmdSmiLibraryException as e: + temperature_hotspot_current = "N/A" + logging.debug("Failed to get current hotspot temperature for gpu %s | %s", gpu_id, e.get_error_info()) + + try: + temperature_vram_current = amdsmi_interface.amdsmi_get_temp_metric( + args.gpu, amdsmi_interface.AmdSmiTemperatureType.VRAM, amdsmi_interface.AmdSmiTemperatureMetric.CURRENT) + except amdsmi_exception.AmdSmiLibraryException as e: + temperature_vram_current = "N/A" + logging.debug("Failed to get current vram temperature for gpu %s | %s", gpu_id, e.get_error_info()) + + temperatures = {'edge': temperature_edge_current, + 'hotspot': temperature_hotspot_current, + 'mem': temperature_vram_current} + + temp_unit_human_readable = '\N{DEGREE SIGN}C' + temp_unit_json = 'C' + for temperature_key, temperature_value in temperatures.items(): + if 'N/A' not in str(temperature_value): + if self.logger.is_human_readable_format(): + temperatures[temperature_key] = f"{temperature_value} {temp_unit_human_readable}" + if self.logger.is_json_format(): + temperatures[temperature_key] = {"value" : temperature_value, + "unit" : temp_unit_json} + + values_dict['temperature'] = temperatures + + # Since pcie bw may increase based on frequent metrics calls, we add it to the output here, but the populate the values first + if "pcie" in current_platform_args: + if args.pcie: + values_dict['pcie'] = pcie_dict + + if "gpu_board" in current_platform_args: + if args.gpu_board: + gpu_board_temp_dict = {} + gpu_board_temp_types = [ + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_NODE_RETIMER_X, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_NODE_OAM_X_IBC, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_NODE_OAM_X_IBC_2, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_NODE_OAM_X_VDD18_VR, + 
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_NODE_OAM_X_04_HBM_B_VR, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_NODE_OAM_X_04_HBM_D_VR, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_VDD0, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_VDD1, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_VDD2, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_VDD3, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_SOC_A, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_SOC_C, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_SOCIO_A, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_SOCIO_C, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDD_085_HBM, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_11_HBM_B, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_11_HBM_D, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDD_USR, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDIO_11_E32 + ] + for type in gpu_board_temp_types: + type_name = type.name.replace("GPUBOARD_", "") + try: + gpu_board_temp_holder = amdsmi_interface.amdsmi_get_temp_metric(args.gpu, type, amdsmi_interface.AmdSmiTemperatureMetric.CURRENT) + if gpu_board_temp_holder != "N/A": + gpu_board_temp_dict[f'{type_name}'] = self.helpers.unit_format(self.logger, + gpu_board_temp_holder, + '\N{DEGREE SIGN}C') + else: + gpu_board_temp_dict[f'{type_name}'] = "N/A" + except amdsmi_exception.AmdSmiLibraryException as e: + gpu_board_temp_dict[f'{type_name}'] = "N/A" + logging.debug("Failed to get gpu_board %s for gpu %s | %s", type_name, gpu_id, e.get_error_info()) + # if every value is N/A, then we don't want to display the values unless explicitly told to + # all args_list being True indicates that this gpu_board is not explicitly called itself + args_list = [getattr(args, arg) for arg in current_platform_args] + if all(value == "N/A" for value in gpu_board_temp_dict.values()) and all(arg == True for arg in args_list): + gpu_board_temp_dict = 
{} + if gpu_board_temp_dict: + values_dict['gpu_board'] = {'temperature':gpu_board_temp_dict} + if "base_board" in current_platform_args: + if args.base_board: + base_board_temp_dict = {} + base_board_temp_types = [ + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_FPGA, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_FRONT, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_BACK, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_OAM7, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_IBC, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_UFPGA, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_OAM1, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_OAM_0_1_HSC, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_OAM_2_3_HSC, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_OAM_4_5_HSC, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_OAM_6_7_HSC, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_FPGA_0V72_VR, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_FPGA_3V3_VR, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_RETIMER_0_1_2_3_1V2_VR, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_RETIMER_4_5_6_7_1V2_VR, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_RETIMER_0_1_0V9_VR, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_RETIMER_4_5_0V9_VR, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_RETIMER_2_3_0V9_VR, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_RETIMER_6_7_0V9_VR, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_OAM_0_1_2_3_3V3_VR, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_OAM_4_5_6_7_3V3_VR, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_IBC_HSC, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_IBC + ] + for type in base_board_temp_types: + type_name = type.name.replace("BASEBOARD_", "") + try: + base_board_temp_holder = amdsmi_interface.amdsmi_get_temp_metric(args.gpu, type, amdsmi_interface.AmdSmiTemperatureMetric.CURRENT) + if base_board_temp_holder != "N/A": + 
+ base_board_temp_dict[f'{type_name}'] = self.helpers.unit_format(self.logger, + base_board_temp_holder, + '\N{DEGREE SIGN}C') + else: + base_board_temp_dict[f'{type_name}'] = "N/A" + except amdsmi_exception.AmdSmiLibraryException as e: + base_board_temp_dict[f'{type_name}'] = "N/A" + logging.debug("Failed to get base_board %s for gpu %s | %s", type_name, gpu_id, e.get_error_info()) + # if every value is N/A, then we don't want to display the values unless explicitly told to + # all args_list being True indicates that this base_board is not explicitly called itself + args_list = [getattr(args, arg) for arg in current_platform_args] + if all(value == "N/A" for value in base_board_temp_dict.values()) and all(arg == True for arg in args_list): + base_board_temp_dict = {} + if base_board_temp_dict: + values_dict['base_board'] = {'temperature':base_board_temp_dict} + if "ecc" in current_platform_args: + if args.ecc: + ecc_count = {} + try: + ecc_count = amdsmi_interface.amdsmi_get_gpu_total_ecc_count(args.gpu) + ecc_count['total_correctable_count'] = ecc_count.pop('correctable_count') + ecc_count['total_uncorrectable_count'] = ecc_count.pop('uncorrectable_count') + ecc_count['total_deferred_count'] = ecc_count.pop('deferred_count') + except amdsmi_exception.AmdSmiLibraryException as e: + ecc_count['total_correctable_count'] = "N/A" + ecc_count['total_uncorrectable_count'] = "N/A" + ecc_count['cache_correctable_count'] = "N/A" + ecc_count['cache_uncorrectable_count'] = "N/A" + logging.debug("Failed to get total ecc count for gpu %s | %s", gpu_id, e.get_error_info()) + + if ecc_count['total_correctable_count'] != "N/A": + # Get the UMC error count for getting total cache correctable errors + umc_block = amdsmi_interface.AmdSmiGpuBlock['UMC'] + try: + umc_count = amdsmi_interface.amdsmi_get_gpu_ecc_count(args.gpu, umc_block) + ecc_count['cache_correctable_count'] = ecc_count['total_correctable_count'] - umc_count['correctable_count'] + 
ecc_count['cache_uncorrectable_count'] = ecc_count['total_uncorrectable_count'] - umc_count['uncorrectable_count'] + except amdsmi_exception.AmdSmiLibraryException as e: + ecc_count['cache_correctable_count'] = "N/A" + ecc_count['cache_uncorrectable_count'] = "N/A" + logging.debug("Failed to get cache ecc count for gpu %s at block %s | %s", gpu_id, umc_block, e.get_error_info()) + + values_dict['ecc'] = ecc_count + if "ecc_blocks" in current_platform_args: + if args.ecc_blocks: + ecc_dict = {} + sysfs_blocks = ["UMC", "SDMA", "GFX", "MMHUB", "PCIE_BIF", "HDP", "XGMI_WAFL"] + try: + ras_states = amdsmi_interface.amdsmi_get_gpu_ras_block_features_enabled(args.gpu) + for state in ras_states: + # Only add enabled blocks that are also in sysfs + if state['status'] == amdsmi_interface.AmdSmiRasErrState.ENABLED.name: + gpu_block = amdsmi_interface.AmdSmiGpuBlock[state['block']] + # if the blocks are uncountable do not add them at all. + if gpu_block.name in sysfs_blocks: + try: + ecc_count = amdsmi_interface.amdsmi_get_gpu_ecc_count(args.gpu, gpu_block) + ecc_dict[state['block']] = {'correctable_count' : ecc_count['correctable_count'], + 'uncorrectable_count' : ecc_count['uncorrectable_count'], + 'deferred_count' : ecc_count['deferred_count']} + except amdsmi_exception.AmdSmiLibraryException as e: + ecc_dict[state['block']] = {'correctable_count' : "N/A", + 'uncorrectable_count' : "N/A", + 'deferred_count' : "N/A"} + logging.debug("Failed to get ecc count for gpu %s at block %s | %s", gpu_id, gpu_block, e.get_error_info()) + + values_dict['ecc_blocks'] = ecc_dict + except amdsmi_exception.AmdSmiLibraryException as e: + values_dict['ecc_blocks'] = "N/A" + logging.debug("Failed to get ecc block features for gpu %s | %s", gpu_id, e.get_error_info()) + if "fan" in current_platform_args: + if args.fan: + fan_dict = {"speed" : "N/A", + "max" : "N/A", + "rpm" : "N/A", + "usage" : "N/A"} + + try: + fan_speed = amdsmi_interface.amdsmi_get_gpu_fan_speed(args.gpu, 0) + 
fan_dict["speed"] = fan_speed + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get fan speed for gpu %s | %s", args.gpu, e.get_error_info()) + + try: + fan_max = amdsmi_interface.amdsmi_get_gpu_fan_speed_max(args.gpu, 0) + fan_usage = "N/A" + if fan_max > 0 and fan_dict["speed"] != "N/A": + fan_usage = round((float(fan_speed) / float(fan_max)) * 100, 2) + fan_usage_unit = '%' + if self.logger.is_human_readable_format(): + fan_usage = f"{fan_usage} {fan_usage_unit}" + if self.logger.is_json_format(): + fan_usage = {"value" : fan_usage, + "unit" : fan_usage_unit} + fan_dict["max"] = fan_max + fan_dict["usage"] = fan_usage + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get fan max speed for gpu %s | %s", args.gpu, e.get_error_info()) + + try: + fan_rpm = amdsmi_interface.amdsmi_get_gpu_fan_rpms(args.gpu, 0) + fan_dict["rpm"] = fan_rpm + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get fan rpms for gpu %s | %s", args.gpu, e.get_error_info()) + + values_dict["fan"] = fan_dict + if "voltage_curve" in current_platform_args: + if args.voltage_curve: + # Populate N/A values per voltage point + voltage_point_dict = {} + for point in range(amdsmi_interface.AMDSMI_NUM_VOLTAGE_CURVE_POINTS): + voltage_point_dict[f'point_{point}_frequency'] = "N/A" + voltage_point_dict[f'point_{point}_voltage'] = "N/A" + + try: + od_volt = amdsmi_interface.amdsmi_get_gpu_od_volt_info(args.gpu) + logging.debug(f"OD Voltage info: {od_volt}") + except amdsmi_exception.AmdSmiLibraryException as e: + od_volt = "N/A" # Value not used, but needs to not be a dict + logging.debug("Failed to get voltage curve for gpu %s | %s", gpu_id, e.get_error_info()) + + # Populate voltage point values + for point in range(amdsmi_interface.AMDSMI_NUM_VOLTAGE_CURVE_POINTS): + if isinstance(od_volt, dict): + logging.debug(f"point_{point} frequency: {od_volt['curve.vc_points'][point]['frequency']}") + 
logging.debug(f"point_{point} voltage: {od_volt['curve.vc_points'][point]['voltage']}") + frequency = int(od_volt["curve.vc_points"][point]['frequency'] / 1000000) + voltage = int(od_volt["curve.vc_points"][point]['voltage']) + else: + frequency = "N/A" + voltage = "N/A" + + if frequency == 0: + frequency = "N/A" + + if voltage == 0: + voltage = "N/A" + + if frequency != "N/A": + frequency = self.helpers.unit_format(self.logger, frequency, "Mhz") + + if voltage != "N/A": + voltage = self.helpers.unit_format(self.logger, voltage, "mV") + + voltage_point_dict[f'point_{point}_frequency'] = frequency + voltage_point_dict[f'point_{point}_voltage'] = voltage + + values_dict['voltage_curve'] = voltage_point_dict + if "overdrive" in current_platform_args: + if args.overdrive: + try: + overdrive_level = amdsmi_interface.amdsmi_get_gpu_overdrive_level(args.gpu) + od_unit = '%' + values_dict['overdrive'] = self.helpers.unit_format(self.logger, overdrive_level, od_unit) + except amdsmi_exception.AmdSmiLibraryException as e: + values_dict['overdrive'] = "N/A" + logging.debug("Failed to get gpu overdrive level for gpu %s | %s", gpu_id, e.get_error_info()) + + try: + mem_overdrive_level = amdsmi_interface.amdsmi_get_gpu_mem_overdrive_level(args.gpu) + od_unit = '%' + values_dict['mem_overdrive'] = self.helpers.unit_format(self.logger, mem_overdrive_level, od_unit) + except amdsmi_exception.AmdSmiLibraryException as e: + values_dict['mem_overdrive'] = "N/A" + logging.debug("Failed to get mem overdrive level for gpu %s | %s", gpu_id, e.get_error_info()) + if "perf_level" in current_platform_args: + if args.perf_level: + try: + perf_level = amdsmi_interface.amdsmi_get_gpu_perf_level(args.gpu) + values_dict['perf_level'] = perf_level + except amdsmi_exception.AmdSmiLibraryException as e: + values_dict['perf_level'] = "N/A" + logging.debug("Failed to get perf level for gpu %s | %s", gpu_id, e.get_error_info()) + if "xgmi_err" in current_platform_args: + if args.xgmi_err: + try: + 
xgmi_err_status = amdsmi_interface.amdsmi_gpu_xgmi_error_status(args.gpu) + values_dict['xgmi_err'] = amdsmi_interface.amdsmi_wrapper.amdsmi_xgmi_status_t__enumvalues[xgmi_err_status] + except amdsmi_exception.AmdSmiLibraryException as e: + values_dict['xgmi_err'] = "N/A" + logging.debug("Failed to get xgmi error status for gpu %s | %s", gpu_id, e.get_error_info()) + if "voltage" in current_platform_args: + if args.voltage: + voltage_dict = {} + all_voltage = { + "vddboard": amdsmi_interface.AmdSmiVoltageType.VDDBOARD + } + for volt_type, volt_metric in all_voltage.items(): + try: + voltage = amdsmi_interface.amdsmi_get_gpu_volt_metric(args.gpu, volt_metric, amdsmi_interface.AmdSmiVoltageMetric.CURRENT) + if voltage == 0: + voltage = "N/A" + voltage_dict[volt_type] = self.helpers.unit_format(self.logger, voltage, "mV") + except amdsmi_exception.AmdSmiLibraryException as e: + voltage_dict[volt_type] = "N/A" + logging.debug("Failed to get voltage for gpu %s | %s", gpu_id, e.get_error_info()) + values_dict['voltage'] = voltage_dict + if "energy" in current_platform_args: + if args.energy: + try: + energy_dict = amdsmi_interface.amdsmi_get_energy_count(args.gpu) + + energy = round(energy_dict["energy_accumulator"] * energy_dict["counter_resolution"], 3) + energy /= 1000000 + energy = round(energy, 3) + + energy_unit = 'J' + if self.logger.is_human_readable_format(): + energy = f"{energy} {energy_unit}" + if self.logger.is_json_format(): + energy = {"value" : energy, + "unit" : energy_unit} + + values_dict['energy'] = {"total_energy_consumption" : energy} + except amdsmi_interface.AmdSmiLibraryException as e: + values_dict['energy'] = "N/A" + logging.debug("Failed to get energy usage for gpu %s | %s", args.gpu, e.get_error_info()) + if "mem_usage" in current_platform_args: + if args.mem_usage: + memory_usage = {'total_vram': "N/A", + 'used_vram': "N/A", + 'free_vram': "N/A", + 'total_visible_vram': "N/A", + 'used_visible_vram': "N/A", + 'free_visible_vram': "N/A", + 
'total_gtt': "N/A", + 'used_gtt': "N/A", + 'free_gtt': "N/A"} + + # Total VRAM + try: + total_vram = amdsmi_interface.amdsmi_get_gpu_memory_total(args.gpu, amdsmi_interface.AmdSmiMemoryType.VRAM) + memory_usage['total_vram'] = total_vram // (1024*1024) + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get total VRAM memory for gpu %s | %s", gpu_id, e.get_error_info()) + + try: + total_visible_vram = amdsmi_interface.amdsmi_get_gpu_memory_total(args.gpu, amdsmi_interface.AmdSmiMemoryType.VIS_VRAM) + memory_usage['total_visible_vram'] = total_visible_vram // (1024*1024) + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get total VIS VRAM memory for gpu %s | %s", gpu_id, e.get_error_info()) + + try: + total_gtt = amdsmi_interface.amdsmi_get_gpu_memory_total(args.gpu, amdsmi_interface.AmdSmiMemoryType.GTT) + memory_usage['total_gtt'] = total_gtt // (1024*1024) + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get total GTT memory for gpu %s | %s", gpu_id, e.get_error_info()) + + # Used VRAM + try: + used_vram = amdsmi_interface.amdsmi_get_gpu_memory_usage(args.gpu, amdsmi_interface.AmdSmiMemoryType.VRAM) + memory_usage['used_vram'] = used_vram // (1024*1024) + + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get used VRAM memory for gpu %s | %s", gpu_id, e.get_error_info()) + + try: + used_visible_vram = amdsmi_interface.amdsmi_get_gpu_memory_usage(args.gpu, amdsmi_interface.AmdSmiMemoryType.VIS_VRAM) + memory_usage['used_visible_vram'] = used_visible_vram // (1024*1024) + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get used VIS VRAM memory for gpu %s | %s", gpu_id, e.get_error_info()) + + try: + used_gtt = amdsmi_interface.amdsmi_get_gpu_memory_usage(args.gpu, amdsmi_interface.AmdSmiMemoryType.GTT) + memory_usage['used_gtt'] = used_gtt // (1024*1024) + except amdsmi_exception.AmdSmiLibraryException as e: + 
logging.debug("Failed to get used GTT memory for gpu %s | %s", gpu_id, e.get_error_info()) + + # Free VRAM + if memory_usage['total_vram'] != "N/A" and memory_usage['used_vram'] != "N/A": + memory_usage['free_vram'] = memory_usage['total_vram'] - memory_usage['used_vram'] + + if memory_usage['total_visible_vram'] != "N/A" and memory_usage['used_visible_vram'] != "N/A": + memory_usage['free_visible_vram'] = memory_usage['total_visible_vram'] - memory_usage['used_visible_vram'] + + if memory_usage['total_gtt'] != "N/A" and memory_usage['used_gtt'] != "N/A": + memory_usage['free_gtt'] = memory_usage['total_gtt'] - memory_usage['used_gtt'] + + memory_unit = 'MB' + for key, value in memory_usage.items(): + if value != "N/A": + if self.logger.is_human_readable_format(): + memory_usage[key] = f"{value} {memory_unit}" + if self.logger.is_json_format(): + memory_usage[key] = {"value" : value, + "unit" : memory_unit} + + values_dict['mem_usage'] = memory_usage + if "throttle" in current_platform_args: + if args.throttle: + throttle_status = { + # Current values - counter/accumulated + 'accumulation_counter': "N/A", + 'prochot_accumulated': "N/A", + 'ppt_accumulated': "N/A", + 'socket_thermal_accumulated': "N/A", + 'vr_thermal_accumulated': "N/A", + 'hbm_thermal_accumulated': "N/A", + 'gfx_clk_below_host_limit_accumulated': "N/A", # deprecated + 'gfx_clk_below_host_limit_power_accumulated': "N/A", + 'gfx_clk_below_host_limit_thermal_accumulated': "N/A", + 'total_gfx_clk_below_host_limit_accumulated': "N/A", + 'low_utilization_accumulated': "N/A", + + # violation status values - active/not active + 'prochot_violation_status': "N/A", + 'ppt_violation_status': "N/A", + 'socket_thermal_violation_status': "N/A", + 'vr_thermal_violation_status': "N/A", + 'hbm_thermal_violation_status': "N/A", + 'gfx_clk_below_host_limit_violation_status': "N/A", # deprecated + 'gfx_clk_below_host_limit_power_violation_status': "N/A", + 'gfx_clk_below_host_limit_thermal_violation_status': "N/A", + 
'total_gfx_clk_below_host_limit_violation_status': "N/A", + 'low_utilization_violation_status': "N/A", + + # violation activity values - percent + 'prochot_violation_activity': "N/A", + 'ppt_violation_activity': "N/A", + 'socket_thermal_violation_activity': "N/A", + 'vr_thermal_violation_activity': "N/A", + 'hbm_thermal_violation_activity': "N/A", + 'gfx_clk_below_host_limit_violation_activity': "N/A", # deprecated + 'gfx_clk_below_host_limit_power_violation_activity': "N/A", + 'gfx_clk_below_host_limit_thermal_violation_activity': "N/A", + 'total_gfx_clk_below_host_limit_violation_activity': "N/A", + 'low_utilization_violation_activity': "N/A", + } + + try: + violation_status = amdsmi_interface.amdsmi_get_violation_status(args.gpu) + throttle_status['accumulation_counter'] = violation_status['acc_counter'] + throttle_status['prochot_accumulated'] = violation_status['acc_prochot_thrm'] + throttle_status['ppt_accumulated'] = violation_status['acc_ppt_pwr'] + throttle_status['socket_thermal_accumulated'] = violation_status['acc_socket_thrm'] + throttle_status['vr_thermal_accumulated'] = violation_status['acc_vr_thrm'] + throttle_status['hbm_thermal_accumulated'] = violation_status['acc_hbm_thrm'] + throttle_status['gfx_clk_below_host_limit_accumulated'] = violation_status['acc_gfx_clk_below_host_limit'] #deprecated + throttle_status['gfx_clk_below_host_limit_power_accumulated'] = self.helpers.build_xcp_dict('acc_gfx_clk_below_host_limit_pwr', violation_status, num_partition) + throttle_status['gfx_clk_below_host_limit_thermal_accumulated'] = self.helpers.build_xcp_dict('acc_gfx_clk_below_host_limit_thm', violation_status, num_partition) + throttle_status['total_gfx_clk_below_host_limit_accumulated'] = self.helpers.build_xcp_dict('acc_gfx_clk_below_host_limit_total', violation_status, num_partition) + throttle_status['low_utilization_accumulated'] = self.helpers.build_xcp_dict('acc_low_utilization', violation_status, num_partition) + 
throttle_status['prochot_violation_status'] = self.helpers.build_xcp_dict('active_prochot_thrm', violation_status, num_partition) + throttle_status['ppt_violation_status'] = self.helpers.build_xcp_dict('active_ppt_pwr', violation_status, num_partition) + throttle_status['socket_thermal_violation_status'] = self.helpers.build_xcp_dict('active_socket_thrm', violation_status, num_partition) + throttle_status['vr_thermal_violation_status'] = self.helpers.build_xcp_dict('active_vr_thrm', violation_status, num_partition) + throttle_status['hbm_thermal_violation_status'] = self.helpers.build_xcp_dict('active_hbm_thrm', violation_status, num_partition) + throttle_status['gfx_clk_below_host_limit_violation_status'] = self.helpers.build_xcp_dict('active_gfx_clk_below_host_limit', violation_status, num_partition) # deprecated + throttle_status['gfx_clk_below_host_limit_power_violation_status'] = self.helpers.build_xcp_dict('active_gfx_clk_below_host_limit_pwr', violation_status, num_partition) + throttle_status['gfx_clk_below_host_limit_thermal_violation_status'] = self.helpers.build_xcp_dict('active_gfx_clk_below_host_limit_thm', violation_status, num_partition) + throttle_status['total_gfx_clk_below_host_limit_violation_status'] = self.helpers.build_xcp_dict('active_gfx_clk_below_host_limit_total', violation_status, num_partition) + throttle_status['low_utilization_violation_status'] = self.helpers.build_xcp_dict('active_low_utilization', violation_status, num_partition) + throttle_status['prochot_violation_activity'] = violation_status['per_prochot_thrm'] + throttle_status['ppt_violation_activity'] = violation_status['per_ppt_pwr'] + throttle_status['socket_thermal_violation_activity'] = violation_status['per_socket_thrm'] + throttle_status['vr_thermal_violation_activity'] = violation_status['per_vr_thrm'] + throttle_status['hbm_thermal_violation_activity'] = violation_status['per_hbm_thrm'] + throttle_status['gfx_clk_below_host_limit_violation_activity'] = 
violation_status['per_gfx_clk_below_host_limit'] # deprecated + throttle_status['gfx_clk_below_host_limit_power_violation_activity'] = self.helpers.build_xcp_dict('per_gfx_clk_below_host_limit_pwr', violation_status, num_partition) + throttle_status['gfx_clk_below_host_limit_thermal_violation_activity'] = self.helpers.build_xcp_dict('per_gfx_clk_below_host_limit_thm', violation_status, num_partition) + throttle_status['total_gfx_clk_below_host_limit_violation_activity'] = self.helpers.build_xcp_dict('per_gfx_clk_below_host_limit_total', violation_status, num_partition) + throttle_status['low_utilization_violation_activity'] = self.helpers.build_xcp_dict('per_low_utilization', violation_status, num_partition) + + except amdsmi_exception.AmdSmiLibraryException as e: + values_dict['throttle'] = throttle_status + logging.debug("Failed to get violation status' for gpu %s | %s", gpu_id, e.get_error_info()) + + for key, value in throttle_status.items(): + + activity_unit = '' + if "_activity" in key: + activity_unit = '%' + + if self.logger.is_human_readable_format(): + if isinstance(value, (list, dict)): + for k, v in value.items(): + for index, activity in enumerate(v): + value[k][index] = self.helpers.unit_format(self.logger, activity, activity_unit) + value[k] = '[' + ", ".join(value[k]) + ']' + elif value != "N/A": + throttle_status[key] = self.helpers.unit_format(self.logger, value, activity_unit) + if self.logger.is_json_format(): + if isinstance(value, (list, dict)): + for k, v in value.items(): + for index, activity in enumerate(v): + value[k][index] = self.helpers.unit_format(self.logger, activity, activity_unit) + elif value != "N/A": + throttle_status[key] = self.helpers.unit_format(self.logger, value, activity_unit) + values_dict['throttle'] = throttle_status + + # Store timestamp first if watching_output is enabled + if watching_output: + self.logger.store_output(args.gpu, 'timestamp', int(time.time())) + self.logger.store_output(args.gpu, 'values', 
def metric_cpu(self, args, multiple_devices=False, cpu=None, cpu_power_metrics=None, cpu_prochot=None,
               cpu_freq_metrics=None, cpu_c0_res=None, cpu_lclk_dpm_level=None,
               cpu_pwr_svi_telemetry_rails=None, cpu_io_bandwidth=None, cpu_xgmi_bandwidth=None,
               cpu_metrics_ver=None, cpu_metrics_table=None, cpu_socket_energy=None,
               cpu_ddr_bandwidth=None, cpu_temp=None, cpu_dimm_temp_range_rate=None,
               cpu_dimm_pow_consumption=None, cpu_dimm_thermal_sensor=None):
    """Get Metric information for target cpu

    Every requested metric group becomes one sub-dictionary of the output;
    a value the library fails to provide is reported as "N/A" and the
    failure is logged at debug level.

    Args:
        args (Namespace): Namespace containing the parsed CLI args
        multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False.
        cpu (cpu_handle, optional): device_handle for target device. Defaults to None.
        cpu_power_metrics (bool, optional): Value override for args.cpu_power_metrics. Defaults to None
        cpu_prochot (bool, optional): Value override for args.cpu_prochot. Defaults to None.
        cpu_freq_metrics (bool, optional): Value override for args.cpu_freq_metrics. Defaults to None.
        cpu_c0_res (bool, optional): Value override for args.cpu_c0_res. Defaults to None
        cpu_lclk_dpm_level (list, optional): Value override for args.cpu_lclk_dpm_level. Defaults to None
        cpu_pwr_svi_telemetry_rails (bool, optional): Value override for args.cpu_pwr_svi_telemetry_rails. Defaults to None
        cpu_io_bandwidth (list, optional): Value override for args.cpu_io_bandwidth. Defaults to None
        cpu_xgmi_bandwidth (list, optional): Value override for args.cpu_xgmi_bandwidth. Defaults to None
        cpu_metrics_ver (bool, optional): Value override for args.cpu_metrics_ver. Defaults to None
        cpu_metrics_table (bool, optional): Value override for args.cpu_metrics_table. Defaults to None
        cpu_socket_energy (bool, optional): Value override for args.cpu_socket_energy. Defaults to None
        cpu_ddr_bandwidth (bool, optional): Value override for args.cpu_ddr_bandwidth. Defaults to None
        cpu_temp (bool, optional): Value override for args.cpu_temp. Defaults to None
        cpu_dimm_temp_range_rate (list, optional): Dimm address. Value override for args.cpu_dimm_temp_range_rate. Defaults to None
        cpu_dimm_pow_consumption (list, optional): Dimm address. Value override for args.cpu_dimm_pow_consumption. Defaults to None
        cpu_dimm_thermal_sensor (list, optional): Dimm address. Value override for args.cpu_dimm_thermal_sensor. Defaults to None

    Returns:
        None: Print output via AMDSMILogger to destination
    """
    # Keyword overrides replace the parsed CLI value only when they are truthy,
    # so None/False/empty overrides leave args untouched.
    overrides = {
        "cpu": cpu,
        "cpu_power_metrics": cpu_power_metrics,
        "cpu_prochot": cpu_prochot,
        "cpu_freq_metrics": cpu_freq_metrics,
        "cpu_c0_res": cpu_c0_res,
        "cpu_lclk_dpm_level": cpu_lclk_dpm_level,
        "cpu_pwr_svi_telemetry_rails": cpu_pwr_svi_telemetry_rails,
        "cpu_io_bandwidth": cpu_io_bandwidth,
        "cpu_xgmi_bandwidth": cpu_xgmi_bandwidth,
        "cpu_metrics_ver": cpu_metrics_ver,
        "cpu_metrics_table": cpu_metrics_table,
        "cpu_socket_energy": cpu_socket_energy,
        "cpu_ddr_bandwidth": cpu_ddr_bandwidth,
        "cpu_temp": cpu_temp,
        "cpu_dimm_temp_range_rate": cpu_dimm_temp_range_rate,
        "cpu_dimm_pow_consumption": cpu_dimm_pow_consumption,
        "cpu_dimm_thermal_sensor": cpu_dimm_thermal_sensor,
    }
    for name, value in overrides.items():
        if value:
            setattr(args, name, value)

    # store cpu args that are applicable to the current platform
    curr_platform_cpu_args = ["cpu_power_metrics", "cpu_prochot", "cpu_freq_metrics",
                              "cpu_c0_res", "cpu_lclk_dpm_level", "cpu_pwr_svi_telemetry_rails",
                              "cpu_io_bandwidth", "cpu_xgmi_bandwidth", "cpu_metrics_ver",
                              "cpu_metrics_table", "cpu_socket_energy", "cpu_ddr_bandwidth",
                              "cpu_temp", "cpu_dimm_temp_range_rate", "cpu_dimm_pow_consumption",
                              "cpu_dimm_thermal_sensor"]
    # These options are indexed below for a user-supplied operand, so they
    # cannot simply be switched on when the user asked for nothing specific.
    operand_args = ("cpu_lclk_dpm_level", "cpu_io_bandwidth", "cpu_xgmi_bandwidth",
                    "cpu_dimm_temp_range_rate", "cpu_dimm_pow_consumption", "cpu_dimm_thermal_sensor")
    curr_platform_cpu_values = [getattr(args, name) for name in curr_platform_cpu_args]

    # Handle No CPU passed (fall back as this should be defined in metric())
    if args.cpu is None:
        args.cpu = self.cpu_handles

    # Nothing requested explicitly: enable every metric that needs no operand
    if not any(curr_platform_cpu_values):
        for arg in curr_platform_cpu_args:
            if arg not in operand_args:
                setattr(args, arg, True)

    handled_multiple_cpus, device_handle = self.helpers.handle_cpus(args,
                                                                    self.logger,
                                                                    self.metric_cpu)
    if handled_multiple_cpus:
        return # This function is recursive
    args.cpu = device_handle
    # get cpu id for logging
    cpu_id = self.helpers.get_cpu_id_from_device_handle(args.cpu)
    logging.debug(f"Metric Arg information for CPU {cpu_id} on {self.helpers.os_info()}")

    def _query(description, getter, *getter_args, per_cpu=True):
        # Call one amdsmi getter; on a library error log it and report "N/A"
        try:
            return getter(*getter_args)
        except amdsmi_exception.AmdSmiLibraryException as e:
            if per_cpu:
                logging.debug("Failed to get %s for cpu %s | %s", description, cpu_id, e.get_error_info())
            else:
                logging.debug("Failed to get %s | %s", description, e.get_error_info())
            return "N/A"

    static_dict = {}
    if self.logger.is_json_format():
        static_dict['cpu'] = int(cpu_id)
    if args.cpu_power_metrics:
        static_dict["power_metrics"] = {
            "socket power": _query("socket power",
                                   amdsmi_interface.amdsmi_get_cpu_socket_power, args.cpu),
            "socket power limit": _query("socket power limit",
                                         amdsmi_interface.amdsmi_get_cpu_socket_power_cap, args.cpu),
            "socket max power limit": _query("max socket power limit",
                                             amdsmi_interface.amdsmi_get_cpu_socket_power_cap_max, args.cpu),
        }
    if args.cpu_prochot:
        static_dict["prochot"] = {
            "prochot_status": _query("prochot status",
                                     amdsmi_interface.amdsmi_get_cpu_prochot_status, args.cpu),
        }
    if args.cpu_freq_metrics:
        static_dict["freq_metrics"] = {
            "fclkmemclk": _query("current fclkmemclk freq",
                                 amdsmi_interface.amdsmi_get_cpu_fclk_mclk, args.cpu),
            "cclkfreqlimit": _query("current cclk freq",
                                    amdsmi_interface.amdsmi_get_cpu_cclk_limit, args.cpu),
            "soc_current_active_freq_limit": _query("socket current freq limit",
                                                    amdsmi_interface.amdsmi_get_cpu_socket_current_active_freq_limit,
                                                    args.cpu),
            "soc_freq_range": _query("socket freq range",
                                     amdsmi_interface.amdsmi_get_cpu_socket_freq_range, args.cpu),
        }
    if args.cpu_c0_res:
        static_dict["c0_residency"] = {
            "residency": _query("C0 residency",
                                amdsmi_interface.amdsmi_get_cpu_socket_c0_residency, args.cpu),
        }
    if args.cpu_lclk_dpm_level:
        static_dict["socket_dpm"] = {
            "dpml_level_range": _query("socket dpm level range",
                                       amdsmi_interface.amdsmi_get_cpu_socket_lclk_dpm_level,
                                       args.cpu, args.cpu_lclk_dpm_level[0][0]),
        }
    if args.cpu_pwr_svi_telemetry_rails:
        # A failure is recorded under this group's own key. The previous code
        # wrote it to static_dict["c0_residency"], which raised KeyError when
        # C0 residency was not requested and clobbered it when it was.
        static_dict["svi_telemetry_all_rails"] = {
            "power": _query("svi telemetry all rails",
                            amdsmi_interface.amdsmi_get_cpu_pwr_svi_telemetry_all_rails, args.cpu),
        }
    if args.cpu_io_bandwidth:
        static_dict["io_bandwidth"] = {
            "band_width": _query("io bandwidth",
                                 amdsmi_interface.amdsmi_get_cpu_current_io_bandwidth,
                                 args.cpu,
                                 int(args.cpu_io_bandwidth[0][0]),
                                 args.cpu_io_bandwidth[0][1].upper()),
        }
    if args.cpu_xgmi_bandwidth:
        static_dict["xgmi_bandwidth"] = {
            "band_width": _query("xgmi bandwidth",
                                 amdsmi_interface.amdsmi_get_cpu_current_xgmi_bw,
                                 args.cpu,
                                 int(args.cpu_xgmi_bandwidth[0][0]),
                                 args.cpu_xgmi_bandwidth[0][1].upper()),
        }
    if args.cpu_metrics_ver:
        static_dict["metric_version"] = {
            "version": _query("metrics table version",
                              amdsmi_interface.amdsmi_get_hsmp_metrics_table_version, args.cpu),
        }
    if args.cpu_metrics_table:
        # Family and model are queried without a cpu handle, so their failure
        # messages carry no cpu id.
        static_dict["metrics_table"] = {
            "cpu_family": _query("cpu family",
                                 amdsmi_interface.amdsmi_get_cpu_family, per_cpu=False),
            "cpu_model": _query("cpu model",
                                amdsmi_interface.amdsmi_get_cpu_model, per_cpu=False),
            "response": _query("metrics table",
                               amdsmi_interface.amdsmi_get_hsmp_metrics_table, args.cpu),
        }
    if args.cpu_socket_energy:
        static_dict["socket_energy"] = {
            "response": _query("socket energy",
                               amdsmi_interface.amdsmi_get_cpu_socket_energy, args.cpu),
        }
    if args.cpu_ddr_bandwidth:
        static_dict["ddr_bandwidth"] = {
            "response": _query("ddr bandwidth",
                               amdsmi_interface.amdsmi_get_cpu_ddr_bw, args.cpu),
        }
    if args.cpu_temp:
        static_dict["cpu_temp"] = {
            "response": _query("cpu temperature",
                               amdsmi_interface.amdsmi_get_cpu_socket_temperature, args.cpu),
        }
    if args.cpu_dimm_temp_range_rate:
        static_dict["dimm_temp_range_rate"] = {
            "response": _query("dimm temperature range and refresh rate",
                               amdsmi_interface.amdsmi_get_cpu_dimm_temp_range_and_refresh_rate,
                               args.cpu, args.cpu_dimm_temp_range_rate[0][0]),
        }
    if args.cpu_dimm_pow_consumption:
        static_dict["dimm_pow_consumption"] = {
            "response": _query("dimm power consumption",
                               amdsmi_interface.amdsmi_get_cpu_dimm_power_consumption,
                               args.cpu, args.cpu_dimm_pow_consumption[0][0]),
        }
    if args.cpu_dimm_thermal_sensor:
        static_dict["dimm_thermal_sensor"] = {
            "response": _query("dimm thermal sensor",
                               amdsmi_interface.amdsmi_get_cpu_dimm_thermal_sensor,
                               args.cpu, args.cpu_dimm_thermal_sensor[0][0]),
        }

    multiple_devices_csv_override = False
    if not self.logger.is_json_format():
        self.logger.store_cpu_output(args.cpu, 'values', static_dict)
    else:
        self.logger.store_cpu_json_output.append(static_dict)
    if multiple_devices:
        self.logger.store_multiple_device_output()
        return # Skip printing when there are multiple devices
    if not self.logger.is_json_format():
        self.logger.print_output(multiple_device_enabled=multiple_devices_csv_override)
def metric_core(self, args, multiple_devices=False, core=None, core_boost_limit=None,
                core_curr_active_freq_core_limit=None, core_energy=None):
    """Get Metric information for target core

    Every requested metric becomes one sub-dictionary of the output with a
    single "value" entry; a value the library fails to provide is reported
    as "N/A" and the failure is logged at debug level.

    Args:
        args (Namespace): Namespace containing the parsed CLI args
        multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False.
        core (device_handle, optional): device_handle for target core. Defaults to None.
        core_boost_limit (bool, optional): Value override for args.core_boost_limit. Defaults to None
        core_curr_active_freq_core_limit (bool, optional): Value override for args.core_curr_active_freq_core_limit. Defaults to None
        core_energy (bool, optional): Value override for args.core_energy. Defaults to None

    Returns:
        None: Print output via AMDSMILogger to destination
    """
    # Keyword overrides replace the parsed CLI value only when they are truthy
    if core:
        args.core = core
    if core_boost_limit:
        args.core_boost_limit = core_boost_limit
    if core_curr_active_freq_core_limit:
        args.core_curr_active_freq_core_limit = core_curr_active_freq_core_limit
    if core_energy:
        args.core_energy = core_energy

    # store core args that are applicable to the current platform
    curr_platform_core_args = ["core_boost_limit", "core_curr_active_freq_core_limit", "core_energy"]
    curr_platform_core_values = [args.core_boost_limit, args.core_curr_active_freq_core_limit, args.core_energy]

    # Handle No cores passed
    if args.core is None:
        args.core = self.core_handles

    # Nothing requested explicitly: report every core metric
    if not any(curr_platform_core_values):
        for arg in curr_platform_core_args:
            setattr(args, arg, True)

    handled_multiple_cores, device_handle = self.helpers.handle_cores(args,
                                                                      self.logger,
                                                                      self.metric_core)
    if handled_multiple_cores:
        return # This function is recursive
    args.core = device_handle
    # get core id for logging
    core_id = self.helpers.get_core_id_from_device_handle(args.core)
    # This is the metric command, so label the debug line accordingly
    logging.debug(f"Metric Arg information for Core {core_id} on {self.helpers.os_info()}")

    def _query(description, getter):
        # Call one amdsmi core getter; on a library error log it and report "N/A"
        try:
            return getter(args.core)
        except amdsmi_exception.AmdSmiLibraryException as e:
            logging.debug("Failed to get %s for core %s | %s", description, core_id, e.get_error_info())
            return "N/A"

    static_dict = {}
    if self.logger.is_json_format():
        static_dict['core'] = int(core_id)
    if args.core_boost_limit:
        static_dict["boost_limit"] = {
            "value": _query("core boost limit",
                            amdsmi_interface.amdsmi_get_cpu_core_boostlimit),
        }
    if args.core_curr_active_freq_core_limit:
        static_dict["curr_active_freq_core_limit"] = {
            "value": _query("current active frequency core",
                            amdsmi_interface.amdsmi_get_cpu_core_current_freq_limit),
        }
    if args.core_energy:
        static_dict["core_energy"] = {
            "value": _query("core energy",
                            amdsmi_interface.amdsmi_get_cpu_core_energy),
        }

    multiple_devices_csv_override = False
    if not self.logger.is_json_format():
        self.logger.store_core_output(args.core, 'values', static_dict)
    else:
        self.logger.store_core_json_output.append(static_dict)
    if multiple_devices:
        self.logger.store_multiple_device_output()
        return # Skip printing when there are multiple devices
    if not self.logger.is_json_format():
        self.logger.print_output(multiple_device_enabled=multiple_devices_csv_override)
cpu_xgmi_bandwidth=None, cpu_metrics_ver=None, + cpu_metrics_table=None, cpu_socket_energy=None, cpu_ddr_bandwidth=None, + cpu_temp=None, cpu_dimm_temp_range_rate=None, cpu_dimm_pow_consumption=None, + cpu_dimm_thermal_sensor=None, + core=None, core_boost_limit=None, core_curr_active_freq_core_limit=None, + core_energy=None, throttle=None, base_board=None, gpu_board=None): + """Get Metric information for target gpu + + Args: + args (Namespace): Namespace containing the parsed CLI args + multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False. + watching_output (bool, optional): True if watch argument has been set. Defaults to False. + gpu (device_handle, optional): device_handle for target device. Defaults to None. + usage (bool, optional): Value override for args.usage. Defaults to None. + watch (Positive int, optional): Value override for args.watch. Defaults to None. + watch_time (Positive int, optional): Value override for args.watch_time. Defaults to None. + iterations (Positive int, optional): Value override for args.iterations. Defaults to None. + power (bool, optional): Value override for args.power. Defaults to None. + clock (bool, optional): Value override for args.clock. Defaults to None. + temperature (bool, optional): Value override for args.temperature. Defaults to None. + ecc (bool, optional): Value override for args.ecc. Defaults to None. + ecc_blocks (bool, optional): Value override for args.ecc. Defaults to None. + pcie (bool, optional): Value override for args.pcie. Defaults to None. + fan (bool, optional): Value override for args.fan. Defaults to None. + voltage_curve (bool, optional): Value override for args.voltage_curve. Defaults to None. + overdrive (bool, optional): Value override for args.overdrive. Defaults to None. + perf_level (bool, optional): Value override for args.perf_level. Defaults to None. + xgmi_err (bool, optional): Value override for args.xgmi_err. Defaults to None. 
+ energy (bool, optional): Value override for args.energy. Defaults to None. + mem_usage (bool, optional): Value override for args.mem_usage. Defaults to None. + voltage (bool, optional): Value override for args.voltage. Defaults to None. + schedule (bool, optional): Value override for args.schedule. Defaults to None. + guard (bool, optional): Value override for args.guard. Defaults to None. + guest_data (bool, optional): Value override for args.guest_data. Defaults to None. + fb_usage (bool, optional): Value override for args.fb_usage. Defaults to None. + xgmi (bool, optional): Value override for args.xgmi. Defaults to None. + + cpu (cpu_handle, optional): device_handle for target device. Defaults to None. + cpu_power_metrics (bool, optional): Value override for args.cpu_power_metrics. Defaults to None + cpu_prochot (bool, optional): Value override for args.cpu_prochot. Defaults to None. + cpu_freq_metrics (bool, optional): Value override for args.cpu_freq_metrics. Defaults to None. + cpu_c0_res (bool, optional): Value override for args.cpu_c0_res. Defaults to None + cpu_lclk_dpm_level (list, optional): Value override for args.cpu_lclk_dpm_level. Defaults to None + cpu_pwr_svi_telemetry_rails (list, optional): value override for args.cpu_pwr_svi_telemetry_rails. Defaults to None + cpu_io_bandwidth (list, optional): value override for args.cpu_io_bandwidth. Defaults to None + cpu_xgmi_bandwidth (list, optional): value override for args.cpu_xgmi_bandwidth. Defaults to None + cpu_metrics_ver (bool, optional): Value override for args.cpu_metrics_ver. Defaults to None + cpu_metrics_table (bool, optional): Value override for args.cpu_metrics_table. Defaults to None + cpu_socket_energy (bool, optional): Value override for args.cpu_socket_energy. Defaults to None + cpu_ddr_bandwidth (bool, optional): Value override for args.cpu_ddr_bandwidth. Defaults to None + cpu_temp (bool, optional): Value override for args.cpu_temp. 
Defaults to None + cpu_dimm_temp_range_rate (list, optional): Dimm address. Value override for args.cpu_dimm_temp_range_rate. Defaults to None + cpu_dimm_pow_consumption (list, optional): Dimm address. Value override for args.cpu_dimm_pow_consumption. Defaults to None + cpu_dimm_thermal_sensor (list, optional): Dimm address. Value override for args.cpu_dimm_thermal_sensor. Defaults to None + + core (device_handle, optional): device_handle for target core. Defaults to None. + core_boost_limit (bool, optional): Value override for args.core_boost_limit. Defaults to None + core_curr_active_freq_core_limit (bool, optional): Value override for args.core_curr_active_freq_core_limit. Defaults to None + core_energy (bool, optional): Value override for args.core_energy. Defaults to None + + Raises: + IndexError: Index error if gpu list is empty + + Returns: + None: Print output via AMDSMILogger to destination + """ + # TODO Move watch logic into here and make it driver agnostic or enable it for CPU arguments + # Mutually exclusive args + if gpu: + args.gpu = gpu + if cpu: + args.cpu = cpu + if core: + args.core = core + + # Check if a GPU argument has been set + gpu_args_enabled = False + gpu_attributes = ["usage", "watch", "watch_time", "iterations", "power", "clock", + "temperature", "ecc", "ecc_blocks", "pcie", "fan", "voltage_curve", + "overdrive", "perf_level", "xgmi_err", "energy", "mem_usage", "voltage", "schedule", + "guard", "guest_data", "fb_usage", "xgmi", "throttle", "base_board", "gpu_board"] + for attr in gpu_attributes: + if hasattr(args, attr): + if getattr(args, attr): + gpu_args_enabled = True + break + + # Check if a CPU argument has been set + cpu_args_enabled = False + cpu_attributes = ["cpu_power_metrics", "cpu_prochot", "cpu_freq_metrics", "cpu_c0_res", + "cpu_lclk_dpm_level", "cpu_pwr_svi_telemetry_rails", "cpu_io_bandwidth", + "cpu_xgmi_bandwidth", "cpu_metrics_ver", "cpu_metrics_table", + "cpu_socket_energy", "cpu_ddr_bandwidth", "cpu_temp", 
"cpu_dimm_temp_range_rate", + "cpu_dimm_pow_consumption", "cpu_dimm_thermal_sensor"] + for attr in cpu_attributes: + if hasattr(args, attr): + if getattr(args, attr): + cpu_args_enabled = True + break + + # Check if a Core argument has been set + core_args_enabled = False + core_attributes = ["core_boost_limit", "core_curr_active_freq_core_limit", "core_energy"] + for attr in core_attributes: + if hasattr(args, attr): + if getattr(args, attr): + core_args_enabled = True + break + + # Handle CPU and GPU driver intialization cases + if self.helpers.is_amd_hsmp_initialized() and self.helpers.is_amdgpu_initialized(): + + logging.debug("gpu_args_enabled: %s, cpu_args_enabled: %s, core_args_enabled: %s", + gpu_args_enabled, cpu_args_enabled, core_args_enabled) + logging.debug("args.gpu: %s, args.cpu: %s, args.core: %s", args.gpu, args.cpu, args.core) + + # If a GPU or CPU argument is provided only print out the specified device. + if args.cpu == None and args.gpu == None and args.core == None: + # If no args are set, print out all CPU, GPU, and Core metrics info + if not gpu_args_enabled and not cpu_args_enabled and not core_args_enabled: + args.cpu = self.cpu_handles + args.gpu = self.device_handles + args.core = self.core_handles + + # Handle cases where the user has only specified an argument and no specific device + if args.gpu == None and gpu_args_enabled: + args.gpu = self.device_handles + if args.cpu == None and cpu_args_enabled: + args.cpu = self.cpu_handles + if args.core == None and core_args_enabled: + args.core = self.core_handles + + # Print out CPU first + if args.cpu: + self.metric_cpu(args, multiple_devices, cpu, cpu_power_metrics, cpu_prochot, + cpu_freq_metrics, cpu_c0_res, cpu_lclk_dpm_level, + cpu_pwr_svi_telemetry_rails, cpu_io_bandwidth, cpu_xgmi_bandwidth, + cpu_metrics_ver, cpu_metrics_table, cpu_socket_energy, + cpu_ddr_bandwidth, cpu_temp, cpu_dimm_temp_range_rate, + cpu_dimm_pow_consumption, cpu_dimm_thermal_sensor) + if args.core: + 
self.logger.output = {} + self.logger.clear_multiple_devices_output() + self.metric_core(args, multiple_devices, core, core_boost_limit, + core_curr_active_freq_core_limit, core_energy) + if args.gpu: + self.logger.output = {} + self.logger.clear_multiple_devices_output() + self.metric_gpu(args, multiple_devices, watching_output, gpu, + usage, watch, watch_time, iterations, power, + clock, temperature, ecc, ecc_blocks, pcie, + fan, voltage_curve, overdrive, perf_level, + xgmi_err, energy, mem_usage, voltage, schedule, + guard, guest_data, fb_usage, xgmi, throttle, + base_board, gpu_board) + elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized + if args.cpu == None and args.core == None: + # If no args are set, print out all CPU and Core metrics info + if not cpu_args_enabled and not core_args_enabled: + args.cpu = self.cpu_handles + args.core = self.core_handles + + if args.cpu == None and cpu_args_enabled: + args.cpu = self.cpu_handles + if args.core == None and core_args_enabled: + args.core = self.core_handles + + if args.cpu: + self.metric_cpu(args, multiple_devices, cpu, cpu_power_metrics, cpu_prochot, + cpu_freq_metrics, cpu_c0_res, cpu_lclk_dpm_level, + cpu_pwr_svi_telemetry_rails, cpu_io_bandwidth, cpu_xgmi_bandwidth, + cpu_metrics_ver, cpu_metrics_table, cpu_socket_energy, + cpu_ddr_bandwidth, cpu_temp, cpu_dimm_temp_range_rate, + cpu_dimm_pow_consumption, cpu_dimm_thermal_sensor) + if args.core: + self.logger.output = {} + self.logger.clear_multiple_devices_output() + self.metric_core(args, multiple_devices, core, core_boost_limit, + core_curr_active_freq_core_limit, core_energy) + elif self.helpers.is_amdgpu_initialized(): # Only GPU is initialized + if args.gpu == None: + args.gpu = self.device_handles + + self.logger.clear_multiple_devices_output() + self.metric_gpu(args, multiple_devices, watching_output, gpu, + usage, watch, watch_time, iterations, power, + clock, temperature, ecc, ecc_blocks, pcie, + fan, voltage_curve, overdrive, 
perf_level, + xgmi_err, energy, mem_usage, voltage, schedule, throttle, + base_board, gpu_board) + if self.logger.is_json_format(): + self.logger.combine_arrays_to_json() + + + def process(self, args, multiple_devices=False, watching_output=False, + gpu=None, general=None, engine=None, pid=None, name=None, + watch=None, watch_time=None, iterations=None): + """Get Process Information from the target GPU + + Args: + args (Namespace): Namespace containing the parsed CLI args + multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False. + watching_output (bool, optional): True if watch argument has been set. Defaults to False. + gpu (device_handle, optional): device_handle for target device. Defaults to None. + general (bool, optional): Value override for args.general. Defaults to None. + engine (bool, optional): Value override for args.engine. Defaults to None. + pid (Positive int, optional): Value override for args.pid. Defaults to None. + name (str, optional): Value override for args.name. Defaults to None. + watch (Positive int, optional): Value override for args.watch. Defaults to None. + watch_time (Positive int, optional): Value override for args.watch_time. Defaults to None. + iterations (Positive int, optional): Value override for args.iterations. Defaults to None. 
+ + Raises: + IndexError: Index error if gpu list is empty + + Returns: + None: Print output via AMDSMILogger to destination + """ + # Set args.* to passed in arguments + if gpu: + args.gpu = gpu + if general: + args.general = general + if engine: + args.engine = engine + if pid: + args.pid = pid + if name: + args.name = name + if watch: + args.watch = watch + if watch_time: + args.watch_time = watch_time + if iterations: + args.iterations = iterations + + # Handle No GPU passed + if args.gpu == None: + args.gpu = self.device_handles + + # Handle watch logic, will only enter this block once + if args.watch: + self.helpers.handle_watch(args=args, subcommand=self.process, logger=self.logger) + return + + # Handle multiple GPUs + if isinstance(args.gpu, list): + if len(args.gpu) > 1: + # Deepcopy gpus as recursion will destroy the gpu list + stored_gpus = [] + for gpu in args.gpu: + stored_gpus.append(gpu) + + # Store output from multiple devices + for device_handle in args.gpu: + self.process(args, multiple_devices=True, watching_output=watching_output, gpu=device_handle) + + # Reload original gpus + args.gpu = stored_gpus + + # Print multiple device output + self.logger.print_output(multiple_device_enabled=True, watching_output=watching_output) + + # Add output to total watch output and clear multiple device output + if watching_output: + self.logger.store_watch_output(multiple_device_enabled=True) + + # Flush the watching output + self.logger.print_output(multiple_device_enabled=True, watching_output=watching_output) + + return + elif len(args.gpu) == 1: + args.gpu = args.gpu[0] + else: + raise IndexError("args.gpu should not be an empty list") + + # Get gpu_id for logging + gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu) + + # Populate initial processes + try: + process_list = amdsmi_interface.amdsmi_get_gpu_process_list(args.gpu) + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get process list for gpu %s | %s", 
gpu_id, e.get_error_info()) + raise e + + filtered_process_values = [] + for process_info in process_list: + process_info = { + "name": process_info["name"], + "pid": process_info["pid"], + "memory_usage": { + "gtt_mem": process_info["memory_usage"]["gtt_mem"], + "cpu_mem": process_info["memory_usage"]["cpu_mem"], + "vram_mem": process_info["memory_usage"]["vram_mem"], + }, + "mem_usage": process_info["mem"], + "usage": { + "gfx": process_info["engine_usage"]["gfx"], + "enc": process_info["engine_usage"]["enc"], + }, + "cu_occupancy": process_info["cu_occupancy"], + "evicted_time": process_info["evicted_time"] + } + + engine_usage_unit = "ns" + memory_usage_unit = "B" + evicted_time_unit = "ms" + + if self.logger.is_human_readable_format(): + process_info['mem_usage'] = self.helpers.convert_bytes_to_readable(process_info['mem_usage']) + for usage_metric in process_info['memory_usage']: + process_info["memory_usage"][usage_metric] = self.helpers.convert_bytes_to_readable(process_info["memory_usage"][usage_metric]) + memory_usage_unit = "" + + process_info['mem_usage'] = self.helpers.unit_format(self.logger, + process_info['mem_usage'], + memory_usage_unit) + + process_info['evicted_time'] = self.helpers.unit_format(self.logger, + process_info['evicted_time'], + evicted_time_unit) + + for usage_metric in process_info['usage']: + process_info['usage'][usage_metric] = self.helpers.unit_format(self.logger, + process_info['usage'][usage_metric], + engine_usage_unit) + + for usage_metric in process_info['memory_usage']: + process_info['memory_usage'][usage_metric] = self.helpers.unit_format(self.logger, + process_info['memory_usage'][usage_metric], + memory_usage_unit) + + filtered_process_values.append({'process_info': process_info}) + + if not filtered_process_values: + process_info = "N/A" + logging.debug("Failed to detect any process on gpu %s", gpu_id) + filtered_process_values.append({'process_info': process_info}) + + # Arguments will filter the populated processes 
+ # General and Engine to expose process_info values + if args.general or args.engine: + for process_info in filtered_process_values: + if not process_info['process_info'] == "N/A": + if args.general and args.engine: + del process_info['process_info']['memory_usage'] + elif args.general: + del process_info['process_info']['memory_usage'] + del process_info['process_info']['usage'] # Used in engine + elif args.engine: + del process_info['process_info']['memory_usage'] + del process_info['process_info']['mem_usage'] # Used in general + + # Filter out non specified pids + if args.pid: + process_pids = [] + for process_info in filtered_process_values: + if process_info['process_info'] == "N/A": + continue + pid = str(process_info['process_info']['pid']) + if str(args.pid) == pid: + process_pids.append(process_info) + filtered_process_values = process_pids + + # Filter out non specified process names + if args.name: + process_names = [] + for process_info in filtered_process_values: + if process_info['process_info'] == "N/A": + continue + process_name = str(process_info['process_info']['name']).lower() + if str(args.name).lower() == process_name: + process_names.append(process_info) + filtered_process_values = process_names + + # If the name or pid args filter processes out then insert an N/A placeholder + if not filtered_process_values: + filtered_process_values.append({'process_info': "N/A"}) + + logging.debug(f"Process Info for GPU {gpu_id} | {filtered_process_values}") + + for index, process in enumerate(filtered_process_values): + if process['process_info'] == "N/A": + filtered_process_values[index]['process_info'] = "No running processes detected" + + if self.logger.is_json_format(): + if watching_output: + self.logger.store_output(args.gpu, 'timestamp', int(time.time())) + self.logger.store_output(args.gpu, 'process_list', filtered_process_values) + + if self.logger.is_human_readable_format(): + if watching_output: + self.logger.store_output(args.gpu, 
'timestamp', int(time.time())) + # When we print out process_info we remove the index + # The removal is needed only for human readable process format to align with Host + for index, process in enumerate(filtered_process_values): + self.logger.store_output(args.gpu, f'process_info_{index}', process['process_info']) + + multiple_devices_csv_override = False + if self.logger.is_csv_format(): + multiple_devices_csv_override = True + for process in filtered_process_values: + if watching_output: + self.logger.store_output(args.gpu, 'timestamp', int(time.time())) + self.logger.store_output(args.gpu, 'process_info', process['process_info']) + self.logger.store_multiple_device_output() + + if multiple_devices: + self.logger.store_multiple_device_output() + return # Skip printing when there are multiple devices + + multiple_devices = multiple_devices or multiple_devices_csv_override + self.logger.print_output(multiple_device_enabled=multiple_devices, watching_output=watching_output) + + if watching_output: # End of single gpu add to watch_output + self.logger.store_watch_output(multiple_device_enabled=multiple_devices) + + + def profile(self, args): + """Not applicable to linux baremetal""" + print('Not applicable to linux baremetal') + + + def event(self, args, gpu=None): + """ Get event information for target gpus + + Args: + args (Namespace): argparser args to pass to subcommand + gpu (device_handle, optional): device_handle for target device. Defaults to None. 
+ + Return: + stdout event information for target gpus + """ + if args.gpu: + gpu = args.gpu + + if gpu == None: + args.gpu = self.device_handles + + if not isinstance(args.gpu, list): + args.gpu = [args.gpu] + + print('EVENT LISTENING:\n') + print('Press q and hit ENTER when you want to stop.') + self.stop = False + threads = [] + for device_handle in range(len(args.gpu)): + x = threading.Thread(target=self._event_thread, args=(self, device_handle)) + threads.append(x) + x.start() + + previous_sigterm_handler = signal.getsignal(signal.SIGTERM) + system_exit_exc = None + signal.signal(signal.SIGTERM, self._event_sigterm_handler) + try: + while True: + try: + user_input = input() + except EOFError: + self.stop = True + break + except KeyboardInterrupt: + self.stop = True + break + + if self.stop: + break + + if user_input == 'q': + print("Escape Sequence Detected; Exiting") + self.stop = True + break + except SystemExit as exc: + system_exit_exc = exc + finally: + self.stop = True + for thread in threads: + thread.join() + signal.signal(signal.SIGTERM, previous_sigterm_handler) + + if system_exit_exc is not None: + raise system_exit_exc + + + def _event_sigterm_handler(self, signum, frame): + self.stop = True + raise SystemExit(128 + signum) + + + def topology(self, args, multiple_devices=False, gpu=None, access=None, + weight=None, hops=None, link_type=None, numa_bw=None, + coherent=None, atomics=None, dma=None, bi_dir=None): + """ Get topology information for target gpus + params: + args - argparser args to pass to subcommand + multiple_devices (bool) - True if checking for multiple devices + gpu (device_handle) - device_handle for target device + access (bool) - Value override for args.access + weight (bool) - Value override for args.weight + hops (bool) - Value override for args.hops + type (bool) - Value override for args.type + numa_bw (bool) - Value override for args.numa_bw + coherent (bool) - Value override for args.coherent + atomics (bool) - Value 
override for args.atomics + dma (bool) - Value override for args.dma + bi_dir (bool) - Value override for args.bi_dir + return: + Nothing + """ + # Set args.* to passed in arguments + if gpu: + args.gpu = gpu + if access: + args.access = access + if weight: + args.weight = weight + if hops: + args.hops = hops + if link_type: + args.link_type = link_type + if numa_bw: + args.numa_bw = numa_bw + if coherent: + args.coherent = coherent + if atomics: + args.atomics = atomics + if dma: + args.dma = dma + if bi_dir: + args.bi_dir = bi_dir + + # Handle No GPU passed + if args.gpu == None: + args.gpu = self.device_handles + + if not isinstance(args.gpu, list): + args.gpu = [args.gpu] + + # Handle all args being false + if not any([args.access, args.weight, args.hops, args.link_type, args.numa_bw, + args.coherent, args.atomics, args.dma, args.bi_dir]): + args.access = args.weight = args.hops = args.link_type= args.numa_bw = \ + args.coherent = args.atomics = args.dma = args.bi_dir = True + + # Clear the table header + self.logger.table_header = ''.rjust(12) + + if not self.group_check_printed: + self.helpers.check_required_groups(check_render=True, check_video=False) + self.group_check_printed = True + + p2p_status_cache = {} + + def get_cached_p2p_status(src_gpu, dest_gpu): + #Get P2P status with caching to avoid duplicate calls + src_gpu_id = self.helpers.get_gpu_id_from_device_handle(src_gpu) + dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu) + key = (src_gpu_id, dest_gpu_id) + + if key not in p2p_status_cache: + try: + if src_gpu == dest_gpu: + p2p_status_cache[key] = {"cap": { + "is_iolink_coherent": -1, + "is_iolink_atomics_32bit": -1, + "is_iolink_atomics_64bit": -1, + "is_iolink_dma": -1, + "is_iolink_bi_directional": -1 + }} + else: + p2p_status_cache[key] = amdsmi_interface.amdsmi_topo_get_p2p_status(src_gpu, dest_gpu) + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get link status for %s to %s | %s", + 
src_gpu_id, + dest_gpu_id, + e.get_error_info()) + p2p_status_cache[key] ={ + "cap": + { + "is_iolink_coherent": -1, + "is_iolink_atomics_32bit": -1, + "is_iolink_atomics_64bit": -1, + "is_iolink_dma": -1, + "is_iolink_bi_directional": -1 + } + } + + return p2p_status_cache[key] + + # Populate the possible gpus + topo_values = [] + for src_gpu_index, src_gpu in enumerate(args.gpu): + src_gpu_id = self.helpers.get_gpu_id_from_device_handle(src_gpu) + topo_values.append({"gpu" : src_gpu_id}) + src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) + topo_values[src_gpu_index]['bdf'] = src_gpu_bdf + self.logger.table_header += src_gpu_bdf.rjust(13) + + if not self.logger.is_json_format(): + continue # below is for JSON format only + + ########################## + # JSON formatting start # + ########################## + links = [] + # create json obj for data alignment + # dest_gpu_links = { + # "gpu": GPU # + # "bdf": BDF identification + # "weight": 0 - self (current node); weight >= 0 correlated with hops (GPU-CPU, GPU-GPU, GPU-CPU-CPU-GPU, etc..) 
+ # "link_status": "ENABLED" - devices linked; "DISABLED" - devices not linked; Correlated to access + # "link_type": "SELF" - current node, "PCIE", "XGMI", "N/A" - no link,"UNKNOWN" - unidentified link type + # "num_hops": num_hops - # of hops between devices + # "bandwidth": numa_bw - The NUMA "minimum bandwidth-maximum bandwidth" beween src and dest nodes + # "N/A" - self node or not connected devices + # "coherent": coherent - Coherant / Non-Coherant io links + # "atomics": atomics - 32 and 64-bit atomic io link capability between nodes + # "dma": dma - P2P direct memory access (DMA) link capability between nodes + # "bi_dir": bi_dir - P2P bi-directional link capability between nodes + # } + + for dest_gpu_index, dest_gpu in enumerate(args.gpu): + link_type = "SELF" + if src_gpu != dest_gpu: + link_type = amdsmi_interface.amdsmi_topo_get_link_type(src_gpu, dest_gpu)['type'] + if isinstance(link_type, int): + if link_type == amdsmi_interface.amdsmi_wrapper.AMDSMI_LINK_TYPE_INTERNAL: + link_type = "UNKNOWN" + elif link_type == amdsmi_interface.amdsmi_wrapper.AMDSMI_LINK_TYPE_PCIE: + link_type = "PCIE" + elif link_type == amdsmi_interface.amdsmi_wrapper.AMDSMI_LINK_TYPE_XGMI: + link_type = "XGMI" + else: + link_type = "N/A" + + numa_bw = "N/A" + if src_gpu != dest_gpu: + try: + bw_dict = amdsmi_interface.amdsmi_get_minmax_bandwidth_between_processors(src_gpu, dest_gpu) + numa_bw = f"{bw_dict['min_bandwidth']}-{bw_dict['max_bandwidth']}" + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get min max bandwidth for %s to %s | %s", + self.helpers.get_gpu_id_from_device_handle(src_gpu), + self.helpers.get_gpu_id_from_device_handle(dest_gpu), + e.get_error_info()) + + weight = 0 + num_hops = 0 + if src_gpu != dest_gpu: + weight = amdsmi_interface.amdsmi_topo_get_link_weight(src_gpu, dest_gpu) + num_hops = amdsmi_interface.amdsmi_topo_get_link_type(src_gpu, dest_gpu)['hops'] + link_status = amdsmi_interface.amdsmi_is_P2P_accessible(src_gpu, 
dest_gpu) + if link_status: + link_status = "ENABLED" + else: + link_status = "DISABLED" + + link_coherent = "SELF" + link_atomics = "SELF" + link_dma = "SELF" + link_bi_dir = "SELF" + + if src_gpu != dest_gpu: + try: + cap = get_cached_p2p_status(src_gpu, dest_gpu)['cap'] + link_coherent = ( + "C" if cap['is_iolink_coherent'] == 1 else + "NC" if cap['is_iolink_coherent'] == 0 else + "N/A" + ) + link_atomics = ( + "64,32" if cap['is_iolink_atomics_32bit'] == 1 and cap['is_iolink_atomics_64bit'] == 1 else + "32" if cap['is_iolink_atomics_32bit'] == 1 else + "64" if cap['is_iolink_atomics_64bit'] == 1 else + "N/A" + ) + link_dma = ( + "T" if cap['is_iolink_dma'] == 1 else + "F" if cap['is_iolink_dma'] == 0 else + "N/A" + ) + link_bi_dir = ( + "T" if cap['is_iolink_bi_directional'] == 1 else + "F" if cap['is_iolink_bi_directional'] == 0 else + "N/A" + ) + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get link status for %s to %s | %s", + self.helpers.get_gpu_id_from_device_handle(src_gpu), + self.helpers.get_gpu_id_from_device_handle(dest_gpu), + e.get_error_info()) + + + # link_status = amdsmi_is_P2P_accessible(src,dest) + dest_gpu_links = { + "gpu": self.helpers.get_gpu_id_from_device_handle(dest_gpu), + "bdf": amdsmi_interface.amdsmi_get_gpu_device_bdf(dest_gpu), + "weight": weight, + "link_status": link_status, + "link_type": link_type, + "num_hops": num_hops, + "bandwidth": numa_bw, + "coherent": link_coherent, + "atomics": link_atomics, + "dma": link_dma, + "bi_dir": link_bi_dir + } + if not args.access: + del dest_gpu_links['link_status'] + if not args.weight: + del dest_gpu_links['weight'] + if not args.link_type: + del dest_gpu_links['link_type'] + if not args.hops: + del dest_gpu_links['num_hops'] + if not args.numa_bw: + del dest_gpu_links['bandwidth'] + if not args.coherent: + del dest_gpu_links['coherent'] + if not args.atomics: + del dest_gpu_links['atomics'] + if not args.dma: + del dest_gpu_links['dma'] + if not 
args.bi_dir: + del dest_gpu_links['bi_dir'] + links.append(dest_gpu_links) + dest_end = dest_gpu_index+1 == len(args.gpu) + isEndOfSrc = src_gpu_index+1 == len(args.gpu) + if dest_end: + topo_values[src_gpu_index]['links'] = links + continue + if isEndOfSrc: + self.logger.multiple_device_output = topo_values + self.logger.print_output(multiple_device_enabled=True, tabular=True) + return + ########################## + # JSON formatting end # + ########################## + + if args.access: + tabular_output = [] + for src_gpu_index, src_gpu in enumerate(args.gpu): + src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) + if self.logger.is_human_readable_format(): + tabular_output_dict = {'gpu' : f"{src_gpu_bdf} "} + else: + tabular_output_dict = {'gpu' : src_gpu_bdf} + src_gpu_links = {} + for dest_gpu in args.gpu: + dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu) + dest_gpu_key = f'gpu_{dest_gpu_id}' + + try: + dest_gpu_link_status = amdsmi_interface.amdsmi_is_P2P_accessible(src_gpu, dest_gpu) + if dest_gpu_link_status: + src_gpu_links[dest_gpu_key] = "ENABLED" + else: + src_gpu_links[dest_gpu_key] = "DISABLED" + except amdsmi_exception.AmdSmiLibraryException as e: + src_gpu_links[dest_gpu_key] = "N/A" + logging.debug("Failed to get link status for %s to %s | %s", + self.helpers.get_gpu_id_from_device_handle(src_gpu), + self.helpers.get_gpu_id_from_device_handle(dest_gpu), + e.get_error_info()) + + topo_values[src_gpu_index]['link_accessibility'] = src_gpu_links + + tabular_output_dict.update(src_gpu_links) + tabular_output.append(tabular_output_dict) + + if self.logger.is_human_readable_format(): + self.logger.multiple_device_output = tabular_output + self.logger.table_title = "ACCESS TABLE" + self.logger.print_output(multiple_device_enabled=True, tabular=True) + + if args.weight: + tabular_output = [] + for src_gpu_index, src_gpu in enumerate(args.gpu): + src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) + if 
self.logger.is_human_readable_format(): + tabular_output_dict = {'gpu' : f"{src_gpu_bdf} "} + else: + tabular_output_dict = {'gpu' : src_gpu_bdf} + src_gpu_weight = {} + for dest_gpu in args.gpu: + dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu) + dest_gpu_key = f'gpu_{dest_gpu_id}' + + if src_gpu == dest_gpu: + src_gpu_weight[dest_gpu_key] = 0 + continue + + try: + dest_gpu_link_weight = amdsmi_interface.amdsmi_topo_get_link_weight(src_gpu, dest_gpu) + src_gpu_weight[dest_gpu_key] = dest_gpu_link_weight + except amdsmi_exception.AmdSmiLibraryException as e: + src_gpu_weight[dest_gpu_key] = "N/A" + logging.debug("Failed to get link weight for %s to %s | %s", + self.helpers.get_gpu_id_from_device_handle(src_gpu), + self.helpers.get_gpu_id_from_device_handle(dest_gpu), + e.get_error_info()) + + topo_values[src_gpu_index]['weight'] = src_gpu_weight + + tabular_output_dict.update(src_gpu_weight) + tabular_output.append(tabular_output_dict) + + if self.logger.is_human_readable_format(): + self.logger.multiple_device_output = tabular_output + self.logger.table_title = "WEIGHT TABLE" + self.logger.print_output(multiple_device_enabled=True, tabular=True) + + if args.hops: + tabular_output = [] + for src_gpu_index, src_gpu in enumerate(args.gpu): + src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) + if self.logger.is_human_readable_format(): + tabular_output_dict = {'gpu' : f"{src_gpu_bdf} "} + else: + tabular_output_dict = {'gpu' : src_gpu_bdf} + src_gpu_hops = {} + for dest_gpu in args.gpu: + dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu) + dest_gpu_key = f'gpu_{dest_gpu_id}' + + if src_gpu == dest_gpu: + src_gpu_hops[dest_gpu_key] = 0 + continue + + try: + dest_gpu_hops = amdsmi_interface.amdsmi_topo_get_link_type(src_gpu, dest_gpu)['hops'] + src_gpu_hops[dest_gpu_key] = dest_gpu_hops + except amdsmi_exception.AmdSmiLibraryException as e: + src_gpu_hops[dest_gpu_key] = "N/A" + logging.debug("Failed to get link hops 
for %s to %s | %s", + self.helpers.get_gpu_id_from_device_handle(src_gpu), + self.helpers.get_gpu_id_from_device_handle(dest_gpu), + e.get_error_info()) + + topo_values[src_gpu_index]['hops'] = src_gpu_hops + + tabular_output_dict.update(src_gpu_hops) + tabular_output.append(tabular_output_dict) + + if self.logger.is_human_readable_format(): + self.logger.multiple_device_output = tabular_output + self.logger.table_title = "HOPS TABLE" + self.logger.print_output(multiple_device_enabled=True, tabular=True) + + if args.link_type: + tabular_output = [] + for src_gpu_index, src_gpu in enumerate(args.gpu): + src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) + if self.logger.is_human_readable_format(): + tabular_output_dict = {'gpu' : f"{src_gpu_bdf} "} + else: + tabular_output_dict = {'gpu' : src_gpu_bdf} + src_gpu_link_type = {} + for dest_gpu in args.gpu: + dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu) + dest_gpu_key = f'gpu_{dest_gpu_id}' + + if src_gpu == dest_gpu: + src_gpu_link_type[dest_gpu_key] = "SELF" + continue + try: + link_type = amdsmi_interface.amdsmi_topo_get_link_type(src_gpu, dest_gpu)['type'] + if isinstance(link_type, int): + if link_type == amdsmi_interface.amdsmi_wrapper.AMDSMI_LINK_TYPE_INTERNAL: + src_gpu_link_type[dest_gpu_key] = "UNKNOWN" + elif link_type == amdsmi_interface.amdsmi_wrapper.AMDSMI_LINK_TYPE_PCIE: + src_gpu_link_type[dest_gpu_key] = "PCIE" + elif link_type == amdsmi_interface.amdsmi_wrapper.AMDSMI_LINK_TYPE_XGMI: + src_gpu_link_type[dest_gpu_key] = "XGMI" + except amdsmi_exception.AmdSmiLibraryException as e: + src_gpu_link_type[dest_gpu_key] = "N/A" + logging.debug("Failed to get link type for %s to %s | %s", + self.helpers.get_gpu_id_from_device_handle(src_gpu), + self.helpers.get_gpu_id_from_device_handle(dest_gpu), + e.get_error_info()) + + topo_values[src_gpu_index]['link_type'] = src_gpu_link_type + + tabular_output_dict.update(src_gpu_link_type) + 
tabular_output.append(tabular_output_dict) + + if self.logger.is_human_readable_format(): + self.logger.multiple_device_output = tabular_output + self.logger.table_title = "LINK TYPE TABLE" + self.logger.print_output(multiple_device_enabled=True, tabular=True) + + if args.numa_bw: + tabular_output = [] + for src_gpu_index, src_gpu in enumerate(args.gpu): + src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) + if self.logger.is_human_readable_format(): + tabular_output_dict = {'gpu' : f"{src_gpu_bdf} "} + else: + tabular_output_dict = {'gpu' : src_gpu_bdf} + src_gpu_link_type = {} + for dest_gpu in args.gpu: + dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu) + dest_gpu_key = f'gpu_{dest_gpu_id}' + + if src_gpu == dest_gpu: + src_gpu_link_type[dest_gpu_key] = "N/A" + continue + + try: + link_type = amdsmi_interface.amdsmi_topo_get_link_type(src_gpu, dest_gpu)['type'] + if isinstance(link_type, int): + if link_type != amdsmi_interface.amdsmi_wrapper.AMDSMI_LINK_TYPE_XGMI: + # non_xgmi = True + src_gpu_link_type[dest_gpu_key] = "N/A" + continue + except amdsmi_exception.AmdSmiLibraryException as e: + src_gpu_link_type[dest_gpu_key] = "N/A" + logging.debug("Failed to get link type for %s to %s | %s", + self.helpers.get_gpu_id_from_device_handle(src_gpu), + self.helpers.get_gpu_id_from_device_handle(dest_gpu), + e.get_error_info()) + + try: + bw_dict = amdsmi_interface.amdsmi_get_minmax_bandwidth_between_processors(src_gpu, dest_gpu) + src_gpu_link_type[dest_gpu_key] = f"{bw_dict['min_bandwidth']}-{bw_dict['max_bandwidth']}" + except amdsmi_exception.AmdSmiLibraryException as e: + src_gpu_link_type[dest_gpu_key] = e.get_error_info() + logging.debug("Failed to get min max bandwidth for %s to %s | %s", + self.helpers.get_gpu_id_from_device_handle(src_gpu), + self.helpers.get_gpu_id_from_device_handle(dest_gpu), + e.get_error_info()) + + topo_values[src_gpu_index]['numa_bandwidth'] = src_gpu_link_type + + 
tabular_output_dict.update(src_gpu_link_type) + tabular_output.append(tabular_output_dict) + + if self.logger.is_human_readable_format(): + self.logger.multiple_device_output = tabular_output + self.logger.table_title = "NUMA BW TABLE" + self.logger.print_output(multiple_device_enabled=True, tabular=True) + + if args.coherent: + tabular_output = [] + for src_gpu_index, src_gpu in enumerate(args.gpu): + src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) + if self.logger.is_human_readable_format(): + tabular_output_dict = {'gpu' : f"{src_gpu_bdf} "} + else: + tabular_output_dict = {'gpu' : src_gpu_bdf} + src_gpu_coherent = {} + for dest_gpu in args.gpu: + dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu) + dest_gpu_key = f'gpu_{dest_gpu_id}' + + if src_gpu == dest_gpu: + src_gpu_coherent[dest_gpu_key] = "SELF" + continue + try: + iolink_coherent = get_cached_p2p_status(src_gpu, dest_gpu)['cap']['is_iolink_coherent'] + src_gpu_coherent[dest_gpu_key] = "C" if iolink_coherent == 1 else "NC" if iolink_coherent == 0 else "N/A" + except amdsmi_exception.AmdSmiLibraryException as e: + src_gpu_coherent[dest_gpu_key] = "N/A" + logging.debug("Failed to get link coherent for %s to %s | %s", + self.helpers.get_gpu_id_from_device_handle(src_gpu), + self.helpers.get_gpu_id_from_device_handle(dest_gpu), + e.get_error_info()) + + topo_values[src_gpu_index]['coherent'] = src_gpu_coherent + + tabular_output_dict.update(src_gpu_coherent) + tabular_output.append(tabular_output_dict) + + if self.logger.is_human_readable_format(): + self.logger.multiple_device_output = tabular_output + self.logger.table_title = "CACHE COHERANCY TABLE" + self.logger.print_output(multiple_device_enabled=True, tabular=True) + + if args.atomics: + tabular_output = [] + for src_gpu_index, src_gpu in enumerate(args.gpu): + src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) + if self.logger.is_human_readable_format(): + tabular_output_dict = {'gpu' : f"{src_gpu_bdf} 
"} + else: + tabular_output_dict = {'gpu' : src_gpu_bdf} + src_gpu_atomics = {} + for dest_gpu in args.gpu: + dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu) + dest_gpu_key = f'gpu_{dest_gpu_id}' + + if src_gpu == dest_gpu: + src_gpu_atomics[dest_gpu_key] = "SELF" + continue + try: + cap = get_cached_p2p_status(src_gpu, dest_gpu)['cap'] + src_gpu_atomics[dest_gpu_key] = ( + "64,32" if cap['is_iolink_atomics_32bit'] == 1 and cap['is_iolink_atomics_64bit'] == 1 else + "32" if cap['is_iolink_atomics_32bit'] == 1 else + "64" if cap['is_iolink_atomics_64bit'] == 1 else + "N/A" + ) + except amdsmi_exception.AmdSmiLibraryException as e: + src_gpu_atomics[dest_gpu_key] = "N/A" + logging.debug("Failed to get link atomics for %s to %s | %s", + self.helpers.get_gpu_id_from_device_handle(src_gpu), + self.helpers.get_gpu_id_from_device_handle(dest_gpu), + e.get_error_info()) + + topo_values[src_gpu_index]['atomics'] = src_gpu_atomics + + tabular_output_dict.update(src_gpu_atomics) + tabular_output.append(tabular_output_dict) + + if self.logger.is_human_readable_format(): + self.logger.multiple_device_output = tabular_output + self.logger.table_title = "ATOMICS TABLE" + self.logger.print_output(multiple_device_enabled=True, tabular=True) + + if args.dma: + tabular_output = [] + for src_gpu_index, src_gpu in enumerate(args.gpu): + src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) + if self.logger.is_human_readable_format(): + tabular_output_dict = {'gpu' : f"{src_gpu_bdf} "} + else: + tabular_output_dict = {'gpu' : src_gpu_bdf} + src_gpu_dma = {} + for dest_gpu in args.gpu: + dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu) + dest_gpu_key = f'gpu_{dest_gpu_id}' + + if src_gpu == dest_gpu: + src_gpu_dma[dest_gpu_key] = "SELF" + continue + try: + iolink_dma = get_cached_p2p_status(src_gpu, dest_gpu)['cap']['is_iolink_dma'] + src_gpu_dma[dest_gpu_key] = "T" if iolink_dma == 1 else "F" if iolink_dma == 0 else "N/A" + except 
amdsmi_exception.AmdSmiLibraryException as e: + src_gpu_dma[dest_gpu_key] = "N/A" + logging.debug("Failed to get link dma for %s to %s | %s", + self.helpers.get_gpu_id_from_device_handle(src_gpu), + self.helpers.get_gpu_id_from_device_handle(dest_gpu), + e.get_error_info()) + + topo_values[src_gpu_index]['dma'] = src_gpu_dma + + tabular_output_dict.update(src_gpu_dma) + tabular_output.append(tabular_output_dict) + + if self.logger.is_human_readable_format(): + self.logger.multiple_device_output = tabular_output + self.logger.table_title = "DMA TABLE" + self.logger.print_output(multiple_device_enabled=True, tabular=True) + + if args.bi_dir: + tabular_output = [] + for src_gpu_index, src_gpu in enumerate(args.gpu): + src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) + if self.logger.is_human_readable_format(): + tabular_output_dict = {'gpu' : f"{src_gpu_bdf} "} + else: + tabular_output_dict = {'gpu' : src_gpu_bdf} + src_gpu_bi_dir = {} + for dest_gpu in args.gpu: + dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu) + dest_gpu_key = f'gpu_{dest_gpu_id}' + + if src_gpu == dest_gpu: + src_gpu_bi_dir[dest_gpu_key] = "SELF" + continue + try: + iolink_bi_dir = get_cached_p2p_status(src_gpu, dest_gpu)['cap']['is_iolink_bi_directional'] + src_gpu_bi_dir[dest_gpu_key] = "T" if iolink_bi_dir == 1 else "F" if iolink_bi_dir == 0 else "N/A" + except amdsmi_exception.AmdSmiLibraryException as e: + src_gpu_bi_dir[dest_gpu_key] = "N/A" + logging.debug("Failed to get link bi-directional for %s to %s | %s", + self.helpers.get_gpu_id_from_device_handle(src_gpu), + self.helpers.get_gpu_id_from_device_handle(dest_gpu), + e.get_error_info()) + + topo_values[src_gpu_index]['bi_dir'] = src_gpu_bi_dir + + tabular_output_dict.update(src_gpu_bi_dir) + tabular_output.append(tabular_output_dict) + + if self.logger.is_human_readable_format(): + self.logger.multiple_device_output = tabular_output + self.logger.table_title = "BI-DIRECTIONAL TABLE" + 
def set_core(self, args, multiple_devices=False, core=None, core_boost_limit=None):
    """Issue set commands to target core(s).

    Applies the requested boost limit to one CPU core, reads the limit back,
    and stores a human-readable response describing what was applied.

    Args:
        args (Namespace): Namespace containing the parsed CLI args
        multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False.
        core (device_handle, optional): device_handle for target device. Defaults to None.
        core_boost_limit (list, optional): Value override for args.core_boost_limit;
            the value applied is read from ``[0][0]``. Defaults to None.

    Raises:
        ValueError: Value error if no core value is provided
        IndexError: Index error if core list is empty
        AmdSmiRequiredCommandException: If no set option was supplied

    Return:
        Nothing
    """
    # Explicit keyword overrides win over what the parser put on the namespace.
    if core:
        args.core = core
    if core_boost_limit:
        args.core_boost_limit = core_boost_limit

    # Identity check: a handle object with a permissive __eq__ must not be
    # mistaken for "no core given".
    if args.core is None:
        raise ValueError('No Core provided, specific Core targets(S) are needed')

    # Handle multiple cores
    handled_multiple_cores, device_handle = self.helpers.handle_cores(args, self.logger, self.set_core)
    if handled_multiple_cores:
        return  # This function is recursive

    # Error if no subcommand args are passed
    if not args.core_boost_limit:
        command = " ".join(sys.argv[1:])
        raise AmdSmiRequiredCommandException(command, self.logger.format)

    args.core = device_handle

    # Build core string for errors
    try:
        core_id = self.helpers.get_core_id_from_device_handle(args.core)
    except IndexError:
        core_id = f'ID Unavailable for {args.core}'

    static_dict = {}
    if args.core_boost_limit:
        requested_limit = args.core_boost_limit[0][0]
        result = static_dict["set_core_boost_limit"] = {}
        try:
            amdsmi_interface.amdsmi_set_cpu_core_boostlimit(args.core, requested_limit)

            # Read the limit back to verify what was actually applied.
            boost_limit = amdsmi_interface.amdsmi_get_cpu_core_boostlimit(args.core)
            if isinstance(boost_limit, str):
                # Keep only the leading number (handles "5000 MHz" as well as "5000").
                boost_limit = int(boost_limit.split()[0])
            else:
                boost_limit = int(boost_limit)

            # A read-back value different from the request is reported as the
            # bound that the request ran into.
            if boost_limit < requested_limit:
                result["Response"] = f"Max allowed boostlimit is {boost_limit} MHz"
            elif boost_limit > requested_limit:
                result["Response"] = f"Min allowed boostlimit is {boost_limit} MHz"
            else:
                result["Response"] = f"{boost_limit} MHz"
        except amdsmi_exception.AmdSmiLibraryException as e:
            result["Response"] = f"Error occurred for Core {core_id} - {e.get_error_info()}"
            logging.debug("Failed to set core boost limit for core %s | %s", core_id, e.get_error_info())

    self.logger.store_core_output(args.core, 'values', static_dict)
    if multiple_devices:
        self.logger.store_multiple_device_output()
        return  # Skip printing when there are multiple devices
    self.logger.print_output(multiple_device_enabled=False)
def set_cpu(self, args, multiple_devices=False, cpu=None, cpu_pwr_limit=None,
            cpu_xgmi_link_width=None, cpu_lclk_dpm_level=None, cpu_pwr_eff_mode=None,
            cpu_gmi3_link_width=None, cpu_pcie_link_rate=None, cpu_df_pstate_range=None,
            cpu_enable_apb=None, cpu_disable_apb=None, soc_boost_limit=None):
    """Issue set commands to target cpu(s).

    Each supplied option is applied independently: a library failure for one
    option is recorded in that option's output section and does not stop the
    remaining options from being attempted.

    Value options are read as nested sequences; the values applied come from
    ``option[0][0]`` (and ``[0][1]``, ``[0][2]`` where a setter takes more
    than one value).

    Args:
        args (Namespace): Namespace containing the parsed CLI args
        multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False.
        cpu (cpu_handle, optional): device_handle for target device. Defaults to None.
        cpu_pwr_limit (optional): Value override for args.cpu_pwr_limit. Defaults to None.
        cpu_xgmi_link_width (optional): Value override for args.cpu_xgmi_link_width. Defaults to None.
        cpu_lclk_dpm_level (optional): Value override for args.cpu_lclk_dpm_level. Defaults to None.
        cpu_pwr_eff_mode (optional): Value override for args.cpu_pwr_eff_mode. Defaults to None.
        cpu_gmi3_link_width (optional): Value override for args.cpu_gmi3_link_width. Defaults to None.
        cpu_pcie_link_rate (optional): Value override for args.cpu_pcie_link_rate. Defaults to None.
        cpu_df_pstate_range (optional): Value override for args.cpu_df_pstate_range. Defaults to None.
        cpu_enable_apb (optional): Value override for args.cpu_enable_apb. Defaults to None.
        cpu_disable_apb (optional): Value override for args.cpu_disable_apb. Defaults to None.
        soc_boost_limit (optional): Value override for args.soc_boost_limit. Defaults to None.

    Raises:
        ValueError: Value error if no cpu value is provided
        IndexError: Index error if cpu list is empty
        AmdSmiRequiredCommandException: If no set option was supplied

    Return:
        Nothing
    """
    # Explicit keyword overrides win over what the parser put on the namespace.
    # Falsy overrides are ignored, so the parsed value is kept in that case.
    overrides = {
        'cpu': cpu,
        'cpu_pwr_limit': cpu_pwr_limit,
        'cpu_xgmi_link_width': cpu_xgmi_link_width,
        'cpu_lclk_dpm_level': cpu_lclk_dpm_level,
        'cpu_pwr_eff_mode': cpu_pwr_eff_mode,
        'cpu_gmi3_link_width': cpu_gmi3_link_width,
        'cpu_pcie_link_rate': cpu_pcie_link_rate,
        'cpu_df_pstate_range': cpu_df_pstate_range,
        'cpu_enable_apb': cpu_enable_apb,
        'cpu_disable_apb': cpu_disable_apb,
        'soc_boost_limit': soc_boost_limit,
    }
    for option_name, option_value in overrides.items():
        if option_value:
            setattr(args, option_name, option_value)

    # Identity check: a handle object with a permissive __eq__ must not be
    # mistaken for "no CPU given".
    if args.cpu is None:
        raise ValueError('No CPU provided, specific CPU targets(S) are needed')

    # Handle multiple CPUs
    handled_multiple_cpus, device_handle = self.helpers.handle_cpus(args, self.logger, self.set_cpu)
    if handled_multiple_cpus:
        return  # This function is recursive

    args.cpu = device_handle

    # Error if no subcommand args are passed
    if not any((args.cpu_pwr_limit, args.cpu_xgmi_link_width, args.cpu_lclk_dpm_level,
                args.cpu_pwr_eff_mode, args.cpu_gmi3_link_width, args.cpu_pcie_link_rate,
                args.cpu_df_pstate_range, args.cpu_enable_apb, args.cpu_disable_apb,
                args.soc_boost_limit)):
        command = " ".join(sys.argv[1:])
        raise AmdSmiRequiredCommandException(command, self.logger.format)

    # Build CPU string for errors
    try:
        cpu_id = self.helpers.get_cpu_id_from_device_handle(args.cpu)
    except IndexError:
        cpu_id = f'ID Unavailable for {args.cpu}'

    static_dict = {}

    def record_failure(section, key, action, exc):
        """Store the library error under static_dict[section][key] and log it."""
        error_info = exc.get_error_info()
        static_dict[section][key] = f"Error occurred for CPU {cpu_id} - {error_info}"
        logging.debug("Failed to %s for cpu %s | %s", action, cpu_id, error_info)

    if args.cpu_pwr_limit:
        static_dict["set_pwr_limit"] = {}
        try:
            soc_max_pwr_limit = amdsmi_interface.amdsmi_get_cpu_socket_power_cap_max(args.cpu)
            # The maximum is parsed from the first whitespace-separated token.
            max_power = int(soc_max_pwr_limit.split()[0])

            requested_limit = args.cpu_pwr_limit[0][0]
            amdsmi_interface.amdsmi_set_cpu_socket_power_cap(args.cpu, requested_limit)
            # Report at most the socket maximum, but do NOT write the clamped
            # value back into args: the namespace is shared with the other
            # CPUs handled by the recursive calls, which must still see the
            # value the user asked for.
            reported_limit = min(requested_limit, max_power)
            # NOTE(review): the value is divided by 1000 but still labelled
            # "mW" - confirm the intended unit before changing the label.
            static_dict["set_pwr_limit"]["Response"] = f"{reported_limit / 1000:.3f} mW"
        except amdsmi_exception.AmdSmiLibraryException as e:
            record_failure("set_pwr_limit", "Response", "set power limit", e)

    if args.cpu_xgmi_link_width:
        static_dict["set_xgmi_link_width"] = {}
        try:
            amdsmi_interface.amdsmi_set_cpu_xgmi_width(args.cpu, args.cpu_xgmi_link_width[0][0],
                                                       args.cpu_xgmi_link_width[0][1])
            static_dict["set_xgmi_link_width"]["Response"] = f"{args.cpu_xgmi_link_width[0][0]} - {args.cpu_xgmi_link_width[0][1]}"
        except amdsmi_exception.AmdSmiLibraryException as e:
            record_failure("set_xgmi_link_width", "Response", "set xgmi link width", e)

    if args.cpu_lclk_dpm_level:
        static_dict["set_lclk_dpm_level"] = {}
        try:
            amdsmi_interface.amdsmi_set_cpu_socket_lclk_dpm_level(args.cpu, args.cpu_lclk_dpm_level[0][0],
                                                                  args.cpu_lclk_dpm_level[0][1],
                                                                  args.cpu_lclk_dpm_level[0][2])
            static_dict["set_lclk_dpm_level"]["Response"] = f"NBIO[{args.cpu_lclk_dpm_level[0][0]}]"
        except amdsmi_exception.AmdSmiLibraryException as e:
            record_failure("set_lclk_dpm_level", "Response", "set lclk dpm level", e)

    if args.cpu_pwr_eff_mode:
        static_dict["set_pwr_eff_mode"] = {}
        try:
            amdsmi_interface.amdsmi_set_cpu_pwr_efficiency_mode(args.cpu, args.cpu_pwr_eff_mode[0][0])
            static_dict["set_pwr_eff_mode"]["Response"] = f"{args.cpu_pwr_eff_mode[0][0]}"
        except amdsmi_exception.AmdSmiLibraryException as e:
            record_failure("set_pwr_eff_mode", "Response", "set power efficiency mode", e)

    # NOTE: the sections below use a lowercase "response" key (and
    # "prev_mode"/"state"); the keys are part of the emitted output and are
    # kept exactly as they were.
    if args.cpu_gmi3_link_width:
        static_dict["set_gmi3_link_width"] = {}
        try:
            amdsmi_interface.amdsmi_set_cpu_gmi3_link_width_range(args.cpu, args.cpu_gmi3_link_width[0][0],
                                                                  args.cpu_gmi3_link_width[0][1])
            static_dict["set_gmi3_link_width"]["response"] = f"{args.cpu_gmi3_link_width[0][0]} - {args.cpu_gmi3_link_width[0][1]}"
        except amdsmi_exception.AmdSmiLibraryException as e:
            record_failure("set_gmi3_link_width", "response", "set gmi3 link width", e)

    if args.cpu_pcie_link_rate:
        static_dict["set_pcie_link_rate"] = {}
        try:
            # The setter's return value is reported under "prev_mode".
            resp = amdsmi_interface.amdsmi_set_cpu_pcie_link_rate(args.cpu, args.cpu_pcie_link_rate[0][0])
            static_dict["set_pcie_link_rate"]["prev_mode"] = resp
        except amdsmi_exception.AmdSmiLibraryException as e:
            record_failure("set_pcie_link_rate", "prev_mode", "set pcie link rate", e)

    if args.cpu_df_pstate_range:
        static_dict["set_df_pstate_range"] = {}
        try:
            amdsmi_interface.amdsmi_set_cpu_df_pstate_range(args.cpu, args.cpu_df_pstate_range[0][0],
                                                            args.cpu_df_pstate_range[0][1])
            static_dict["set_df_pstate_range"]["response"] = "Set Operation successful"
        except amdsmi_exception.AmdSmiLibraryException as e:
            record_failure("set_df_pstate_range", "response", "set df pstate range", e)

    if args.cpu_enable_apb:
        static_dict["apbenable"] = {}
        try:
            amdsmi_interface.amdsmi_cpu_apb_enable(args.cpu)
            static_dict["apbenable"]["state"] = "Enabled DF - Pstate performance boost algorithm"
        except amdsmi_exception.AmdSmiLibraryException as e:
            record_failure("apbenable", "state", "enable APB", e)

    if args.cpu_disable_apb:
        static_dict["apbdisable"] = {}
        try:
            amdsmi_interface.amdsmi_cpu_apb_disable(args.cpu, args.cpu_disable_apb[0][0])
            static_dict["apbdisable"]["state"] = "Disabled DF - Pstate performance boost algorithm"
        except amdsmi_exception.AmdSmiLibraryException as e:
            # This is the disable path, so the debug log must say "disable".
            record_failure("apbdisable", "state", "disable APB", e)

    if args.soc_boost_limit:
        static_dict["set_soc_boost_limit"] = {}
        try:
            amdsmi_interface.amdsmi_set_cpu_socket_boostlimit(args.cpu, args.soc_boost_limit[0][0])
            static_dict["set_soc_boost_limit"]["Response"] = "Set Operation successful"
        except amdsmi_exception.AmdSmiLibraryException as e:
            record_failure("set_soc_boost_limit", "Response", "set socket boost limit", e)

    self.logger.store_cpu_output(args.cpu, 'values', static_dict)
    if multiple_devices:
        self.logger.store_multiple_device_output()
        return  # Skip printing when there are multiple devices
    self.logger.print_output(multiple_device_enabled=False)
process_isolation=None, clk_limit=None, clk_level=None): + """Issue reset commands to target gpu(s) + + Args: + args (Namespace): Namespace containing the parsed CLI args + multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False. + gpu (device_handle, optional): device_handle for target device. Defaults to None. + fan (int, optional): Value override for args.fan. Defaults to None. + perf_level (amdsmi_interface.AmdSmiDevPerfLevel, optional): Value override for args.perf_level. Defaults to None. + profile (bool, optional): Value override for args.profile. Defaults to None. + perf_determinism (int, optional): Value override for args.perf_determinism. Defaults to None. + compute_partition (amdsmi_interface.AmdSmiComputePartitionType, optional): Value override for args.compute_partition. Defaults to None. + memory_partition (amdsmi_interface.AmdSmiMemoryPartitionType, optional): Value override for args.memory_partition. Defaults to None. + power_cap (int, optional): Value override for args.power_cap. Defaults to None. + soc_pstate (int, optional): Value override for args.soc_pstate. Defaults to None. + xgmi_plpd (int, optional): Value override for args.xgmi_plpd. Defaults to None. + process_isolation (int, optional): Value override for args.process_isolation. Defaults to None. 
+ Raises: + ValueError: Value error if no gpu value is provided + IndexError: Index error if gpu list is empty + + Return: + Nothing + """ + # Set args.* to passed in arguments + if gpu: + args.gpu = gpu + if fan is not None: + args.fan = fan + if perf_level: + args.perf_level = perf_level + if profile: + args.profile = profile + if perf_determinism is not None: + args.perf_determinism = perf_determinism + if compute_partition: + args.compute_partition = compute_partition + if memory_partition: + args.memory_partition = memory_partition + if power_cap: + args.power_cap = power_cap + if soc_pstate: + args.soc_pstate = soc_pstate + if xgmi_plpd: + args.xgmi_plpd = xgmi_plpd + if process_isolation: + args.process_isolation = process_isolation + if clk_limit: + args.clk_limit = clk_limit + if clk_level: + args.clk_level = clk_level + + # Handle No GPU passed + if args.gpu == None: + args.gpu = self.device_handles + + if not self.group_check_printed: + self.helpers.check_required_groups(check_render=True, check_video=False) + self.group_check_printed = True + + # Handle multiple GPUs + handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.set_gpu) + if handled_multiple_gpus: + return # This function is recursive + + args.gpu = device_handle + + # Error if no subcommand args are passed + if self.helpers.is_baremetal(): + if not any([args.fan is not None, + args.perf_level, + args.profile, + args.compute_partition, + args.memory_partition, + args.perf_determinism is not None, + args.power_cap is not None, + args.soc_pstate is not None, + args.xgmi_plpd is not None, + args.clk_level is not None, + args.clk_limit is not None, + args.process_isolation is not None]): + command = " ".join(sys.argv[1:]) + raise AmdSmiRequiredCommandException(command, self.logger.format) + else: + if not any([args.power_cap is not None, + args.clk_limit is not None, + args.process_isolation is not None]): + command = " ".join(sys.argv[1:]) + raise 
AmdSmiRequiredCommandException(command, self.logger.format) + + # Build GPU string for errors + try: + gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(args.gpu) + except amdsmi_exception.AmdSmiLibraryException: + gpu_bdf = f'BDF Unavailable for {args.gpu}' + try: + gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu) + except IndexError: + gpu_id = f'ID Unavailable for {args.gpu}' + gpu_string = f"GPU ID: {gpu_id} BDF:{gpu_bdf}" + + # Handle args + if self.helpers.is_baremetal(): + if isinstance(args.fan, int): + # Convert fan speed to percentage + # Note: amdsmi_set_gpu_fan_speed expects fan speed in RPM, so + # we convert the value to a percentage based on the maximum fan speed of 255 RPM. + # We need to round down the user's passed fan speed % to the nearest whole number. + # This allows us to match the float -> int conversion when converting from percentage to RPM (as previously passed by the parser). + fan_percentage = int((int(args.fan) / 255) * 100 // 1) # round down (aka floor) to nearest whole number + try: + amdsmi_interface.amdsmi_set_gpu_fan_speed(args.gpu, 0, args.fan) + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + result = f"[{e.get_error_info(detailed=False)}] Unable to set fan speed to {args.fan} RPM ({fan_percentage}%)" + self.logger.store_output(args.gpu, 'fan', result) + self.logger.print_output() + self.logger.clear_multiple_devices_output() + return + + self.logger.store_output(args.gpu, 'fan', f"Successfully set fan speed to {args.fan} RPM ({fan_percentage}%)") + self.logger.print_output() + self.logger.clear_multiple_devices_output() + return + if args.perf_level: + perf_level = amdsmi_interface.AmdSmiDevPerfLevel[args.perf_level] + try: + amdsmi_interface.amdsmi_set_gpu_perf_level(args.gpu, perf_level) + except amdsmi_exception.AmdSmiLibraryException as e: + if 
e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + self.logger.store_output(args.gpu, 'perflevel', f"[{e.get_error_info(detailed=False)}] Unable to set performance level to {args.perf_level}") + perf_options = str(self.helpers.get_perf_levels()[0][0:-1]).replace("[", "").replace("]", "").replace("'", "").replace(" ", "") + print(f"\nPerformance Level Options:\n\t{perf_options}\n") + self.logger.print_output() + self.logger.clear_multiple_devices_output() + return + + self.logger.store_output(args.gpu, 'perflevel', f"Successfully set performance level {args.perf_level}") + self.logger.print_output() + self.logger.clear_multiple_devices_output() + return + if args.profile: + self.logger.store_output(args.gpu, 'profile', "Not Yet Implemented") + self.logger.print_output() + self.logger.clear_multiple_devices_output() + return + if isinstance(args.perf_determinism, int): + try: + amdsmi_interface.amdsmi_set_gpu_perf_determinism_mode(args.gpu, args.perf_determinism) + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + self.logger.store_output(args.gpu, 'perfdeterminism', f"[{e.get_error_info(detailed=False)}] Unable to enable performance determinism and set GFX clock frequency to {args.perf_determinism} MHz") + self.logger.print_output() + self.logger.clear_multiple_devices_output() + return + + self.logger.store_output(args.gpu, 'perfdeterminism', f"Successfully enabled performance determinism and set GFX clock frequency to {args.perf_determinism} MHz") + self.logger.print_output() + self.logger.clear_multiple_devices_output() + return + if args.compute_partition: + current_set_count = self.helpers.get_set_count() + future_set_count = 0 + attempted_to_set = "N/A" + user_requested_partition_args = "N/A" + try: + (accelerator_set_choices, 
accelerator_profiles) = self.helpers.get_accelerator_choices_types_indices() + logging.debug("args.compute_partition: %s; Accelerator_set_choices: %s", str(args.compute_partition), str(json.dumps(accelerator_set_choices, indent=4))) + if args.compute_partition in accelerator_profiles['profile_types']: + compute_partition = amdsmi_interface.AmdSmiComputePartitionType[args.compute_partition] + index = accelerator_profiles['profile_types'].index(args.compute_partition) + attempted_to_set = f"Attempted to set accelerator partition to {args.compute_partition} (profile #{accelerator_profiles['profile_indices'][int(index)]}) on {gpu_string}" + user_requested_partition_args = f"{args.compute_partition} (profile #{accelerator_profiles['profile_indices'][int(index)]})" + amdsmi_interface.amdsmi_set_gpu_compute_partition(args.gpu, compute_partition) + elif args.compute_partition in accelerator_profiles['profile_indices']: + compute_partition = int(args.compute_partition) + index = accelerator_profiles['profile_indices'].index(args.compute_partition) + attempted_to_set = f"Attempted to set accelerator partition to {accelerator_profiles['profile_types'][int(index)]} (profile #{args.compute_partition}) on {gpu_string}" + user_requested_partition_args = f"{accelerator_profiles['profile_types'][int(index)]} (profile #{args.compute_partition})" + amdsmi_interface.amdsmi_set_gpu_accelerator_partition_profile(args.gpu, compute_partition) + else: + raise ValueError(f"Invalid accelerator configuration {args.compute_partition} on {gpu_string}") + self.helpers.increment_set_count() + future_set_count = self.helpers.get_set_count() + if current_set_count == future_set_count-1: + self.logger.store_output(args.gpu, 'accelerator_partition', f"Successfully set accelerator partition to {user_requested_partition_args}") + self.logger.print_output() + self.logger.clear_multiple_devices_output() + return + + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == 
amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + elif e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED: + self.helpers.increment_set_count() + future_set_count = self.helpers.get_set_count() + if current_set_count == future_set_count-1: + out = f"[AMDSMI_STATUS_NOT_SUPPORTED] Unable to set compute partition to {user_requested_partition_args}" + self.logger.store_output(args.gpu, 'accelerator_partition', out) + elif e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_SETTING_UNAVAILABLE: + print(f"\n{attempted_to_set}\n" + f"\n[AMDSMI_STATUS_SETTING_UNAVAILABLE] Please check amd-smi partition --memory --accelerator for available profiles.\n" + "Users may need to switch memory partition to another mode in order to enable the desired accelerator partition.\n") + raise ValueError(f"[AMDSMI_STATUS_SETTING_UNAVAILABLE] Unable to set accelerator partition to {args.compute_partition} on {gpu_string}") from e + else: + raise ValueError(f"Unable to set accelerator partition to {args.compute_partition} on {gpu_string}") from e + self.logger.print_output() + self.logger.clear_multiple_devices_output() + return + if args.memory_partition: + #################################################################### + # Get current and available memory partition modes # + # Info used if AMDSMI_STATUS_INVAL is caught & to set progress bar # + #################################################################### + self.helpers.increment_set_count() + set_count = self.helpers.get_set_count() + if set_count == 1: # only show reload warning on 1st set + self.helpers.confirm_changing_memory_partition_gpu_reload_warning() + try: + memory_dict = {'caps': "N/A", 'current': "N/A"} + memory_partition_config = amdsmi_interface.amdsmi_get_gpu_memory_partition_config(args.gpu) + memory_dict['caps'] = str(memory_partition_config['partition_caps']).replace("]", "").replace("[", 
"").replace("\'", "").replace(" ", "") + memory_dict['current'] = memory_partition_config['mp_mode'] + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get current memory partition for GPU %s | %s", gpu_id, e.get_error_info()) + try: + memory_partition = amdsmi_interface.AmdSmiMemoryPartitionType[args.memory_partition] + amdsmi_interface.amdsmi_set_gpu_memory_partition(args.gpu, memory_partition) + out = f"Successfully set memory partition to {args.memory_partition}, reload driver when ready" + except amdsmi_exception.AmdSmiLibraryException as e: + out = f"[{e.get_error_info(detailed=False)}] Unable to set memory partition to {args.memory_partition}" + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + out = f"[AMDSMI_STATUS_NO_PERM] Command requires elevation" + self.logger.store_output(args.gpu, 'memory_partition', out) + self.logger.print_output() + self.logger.clear_multiple_devices_output() + raise PermissionError('Command requires elevation') from e + elif e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_INVAL: + print(f"Valid Memory partition Modes: {memory_dict['caps']}\n") + self.logger.store_output(args.gpu, 'memory_partition', out) + self.logger.print_output() + self.logger.clear_multiple_devices_output() + return + else: + self.logger.store_output(args.gpu, 'memory_partition', out) + self.logger.print_output() + self.logger.clear_multiple_devices_output() + return + self.logger.store_output(args.gpu, 'memory_partition', out) + self.logger.print_output() + self.logger.clear_multiple_devices_output() + return + if isinstance(args.soc_pstate, int): + try: + amdsmi_interface.amdsmi_set_soc_pstate(args.gpu, args.soc_pstate) + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + self.logger.store_output(args.gpu, 'socpstate', 
f"[{e.get_error_info(detailed=False)}] Unable to set soc pstate dpm policy to {args.soc_pstate}") + self.logger.print_output() + self.logger.clear_multiple_devices_output() + return + self.logger.store_output(args.gpu, 'socpstate', f"Successfully set soc pstate dpm policy to {args.soc_pstate}") + self.logger.print_output() + self.logger.clear_multiple_devices_output() + return + if isinstance(args.xgmi_plpd, int): + try: + amdsmi_interface.amdsmi_set_xgmi_plpd(args.gpu, args.xgmi_plpd) + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + self.logger.store_output(args.gpu, 'xgmiplpd', f"[{e.get_error_info(detailed=False)}] Unable to set XGMI per-link power down policy to {args.xgmi_plpd}") + self.logger.print_output() + self.logger.clear_multiple_devices_output() + return + self.logger.store_output(args.gpu, 'xgmiplpd', f"Successfully set XGMI per-link power down policy to {args.xgmi_plpd}") + self.logger.print_output() + self.logger.clear_multiple_devices_output() + return + if isinstance(args.clk_level, tuple): + + clk_type = args.clk_level.clk_type + perf_levels = args.clk_level.perf_levels + perf_levels_str = str(perf_levels).strip('[]').replace(" ", "") + smi_clk_type_mapping = { + "sclk": amdsmi_interface.AmdSmiClkType.SYS, + "mclk": amdsmi_interface.AmdSmiClkType.MEM, + "pcie": amdsmi_interface.AmdSmiClkType.PCIE, + "fclk": amdsmi_interface.AmdSmiClkType.DF, + "socclk": amdsmi_interface.AmdSmiClkType.SOC + } + results_clk_lvl = {'perf_level': f"Unable to set performance level to MANUAL", + 'get_clock_freq': f"Unable to retrieve {clk_type} frequency levels", + 'set_clock': f"Unable to set {clk_type} perf level(s) to {perf_levels_str}"} + if clk_type not in smi_clk_type_mapping: + raise ValueError(f"Invalid clock type {clk_type}. 
Valid options are: {', '.join(smi_clk_type_mapping.keys())}") + + # Set perf level to manual if not already set + try: + amdsmi_interface.amdsmi_set_gpu_perf_level(args.gpu, amdsmi_interface.AmdSmiDevPerfLevel.MANUAL) + results_clk_lvl['perf_level'] = f"Successfully set performance level to MANUAL" + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + results_clk_lvl['perf_level'] = f"[{e.get_error_info(detailed=False)}] Unable to set performance level to MANUAL" + self.logger.store_output(args.gpu, 'clk_level', results_clk_lvl) + self.logger.print_output() + self.logger.clear_multiple_devices_output() + return + + if clk_type.lower() == "pcie": + # Get PCIe bandwidth levels + try: + pcie_bandwidth_levels = amdsmi_interface.amdsmi_get_gpu_pci_bandwidth(args.gpu) + num_supported = pcie_bandwidth_levels['transfer_rate']['num_supported'] + results_clk_lvl['get_clock_freq'] = f"Successfully retrieved {clk_type} frequency levels" + except amdsmi_exception.AmdSmiLibraryException as e: + results_clk_lvl['get_clock_freq'] = f"[{e.get_error_info(detailed=False)}] Unable to retrieve {clk_type} frequency levels" + self.logger.store_output(args.gpu, 'clk_level', results_clk_lvl) + self.logger.print_output() + self.logger.clear_multiple_devices_output() + return + else: + # Get clock frequency levels + try: + frequencies = amdsmi_interface.amdsmi_get_clk_freq(args.gpu, smi_clk_type_mapping[clk_type]) + num_supported = frequencies['num_supported'] + results_clk_lvl['get_clock_freq'] = f"Successfully retrieved {clk_type} frequency levels" + except amdsmi_exception.AmdSmiLibraryException as e: + results_clk_lvl['get_clock_freq'] = f"[{e.get_error_info(detailed=False)}] Unable to retrieve {clk_type} frequency levels" + self.logger.store_output(args.gpu, 'clk_level', results_clk_lvl) + self.logger.print_output() + 
self.logger.clear_multiple_devices_output() + return + + # Validate bandwidth bitmask + freq_bitmask = 0 + invalid_levels = [] + for level in perf_levels: + if level < num_supported: + freq_bitmask |= (1 << level) + else: + invalid_levels.append(level) + + if invalid_levels: + # Handle/report invalid levels + invalid_levels_str = str(invalid_levels).strip('[]').replace(" ", "") + valid_levels_str = f"Valid levels for {clk_type}: 0" + if num_supported > 1: + valid_levels_str = f"Valid levels for {clk_type}: 0-{num_supported-1}" + print(f"\n{valid_levels_str}\n") + results_clk_lvl['set_clock'] = f"Invalid level(s) {invalid_levels_str} are not within the range of supported levels for {clk_type}" + self.logger.store_output(args.gpu, 'clk_level', results_clk_lvl) + self.logger.print_output() + self.logger.clear_multiple_devices_output() + return + else: + # Proceed with freq_bitmask + pass + + if clk_type.lower() == "pcie": + try: + amdsmi_interface.amdsmi_set_gpu_pci_bandwidth(args.gpu, freq_bitmask) + results_clk_lvl['set_clock'] = f"Successfully set {clk_type} perf level(s) to {perf_levels_str}" + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + + results_clk_lvl['set_clock'] = f"[{e.get_error_info(detailed=False)}] Unable to set {clk_type} perf level(s) to {perf_levels_str}" + self.logger.store_output(args.gpu, 'clk_level', results_clk_lvl) + self.logger.print_output() + self.logger.clear_multiple_devices_output() + return + else: + # For non-pcie clocks + if clk_type in self.convert_clock_type: + clk_type_conversion = self.convert_clock_type[clk_type] + else: + clk_type_conversion = "N/A" + + try: + amdsmi_interface.amdsmi_set_clk_freq(args.gpu, clk_type, freq_bitmask) + results_clk_lvl['set_clock'] = f"Successfully set {clk_type} perf level(s) to {perf_levels_str}" + except amdsmi_exception.AmdSmiLibraryException as 
e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + results_clk_lvl['set_clock'] = f"[{e.get_error_info(detailed=False)}] Unable to set {clk_type} perf level(s) to {perf_levels_str}" + self.logger.store_output(args.gpu, 'clk_level', results_clk_lvl) + self.logger.print_output() + self.logger.clear_multiple_devices_output() + return + self.logger.store_output(args.gpu, 'clk_level', results_clk_lvl) + self.logger.print_output() + self.logger.clear_multiple_devices_output() + return + # Universal args + if isinstance(args.power_cap, tuple): + pwr_type = args.power_cap.pwr_type + pwr_type_as_int = (0 if pwr_type == "ppt0" else 1 if pwr_type == "ppt1" else None) + pwr_type = pwr_type.upper() + requested_power_cap = args.power_cap.watts + try: + power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu, pwr_type_as_int) + logging.debug(f"Power cap info for gpu {gpu_id} | {power_cap_info}") + min_power_cap = power_cap_info["min_power_cap"] + min_power_cap = self.helpers.convert_SI_unit(min_power_cap, AMDSMIHelpers.SI_Unit.MICRO) + max_power_cap = power_cap_info["max_power_cap"] + max_power_cap = self.helpers.convert_SI_unit(max_power_cap, AMDSMIHelpers.SI_Unit.MICRO) + current_power_cap = power_cap_info["power_cap"] + current_power_cap = self.helpers.convert_SI_unit(current_power_cap, AMDSMIHelpers.SI_Unit.MICRO) + except amdsmi_exception.AmdSmiLibraryException as e: + min_power_cap = "N/A" + max_power_cap = "N/A" + current_power_cap = "N/A" + self.logger.store_output(args.gpu, 'powercap', f"[{e.get_error_info(detailed=False)}] Unable to set {pwr_type} power cap to {requested_power_cap}W") + self.logger.print_output() + self.logger.clear_multiple_devices_output() + return + + if requested_power_cap == current_power_cap: + self.logger.store_output(args.gpu, 'powercap', f"{pwr_type} power cap is already set to {requested_power_cap}W") + elif current_power_cap == 0: + 
self.logger.store_output(args.gpu, 'powercap', f"Unable to set {pwr_type} power cap to {requested_power_cap}W, current value is {current_power_cap}W") + elif requested_power_cap >= min_power_cap and requested_power_cap <= max_power_cap and requested_power_cap > 0: + try: + new_power_cap = self.helpers.convert_SI_unit(requested_power_cap, AMDSMIHelpers.SI_Unit.BASE, + AMDSMIHelpers.SI_Unit.MICRO) + amdsmi_interface.amdsmi_set_power_cap(args.gpu, pwr_type_as_int, new_power_cap) + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + self.logger.store_output(args.gpu, 'powercap', f"[{e.get_error_info(detailed=False)}] Unable to set {pwr_type} power cap to {requested_power_cap}W") + self.logger.print_output() + self.logger.clear_multiple_devices_output() + return + + self.logger.store_output(args.gpu, 'powercap', f"Successfully set {pwr_type} power cap to {requested_power_cap}W") + else: + # setting power cap to 0 will return the current power cap so the technical minimum value is 1 + if min_power_cap == 0: + min_power_cap = 1 + self.logger.store_output(args.gpu, 'powercap', f"Power cap must be between {min_power_cap}W and {max_power_cap}W") + self.logger.print_output() + self.logger.clear_multiple_devices_output() + return + if isinstance(args.clk_limit, tuple): + clk_type = args.clk_limit.clk_type + lim_type = args.clk_limit.lim_type + val = args.clk_limit.val + val_changed = True # Assume Clock limit value is changed + + # Validate the value against the extremum + try: + # Parser only allows two options sclk or mclk + if clk_type == "sclk": + amdsmi_clk_type = amdsmi_interface.AmdSmiClkType.GFX + elif clk_type == "mclk": + amdsmi_clk_type = amdsmi_interface.AmdSmiClkType.MEM + else: + print(f"Valid clock types are: sclk, mclk\n") + self.logger.store_output(args.gpu, 'clk_limit', f"Invalid clock type 
{args.clk_limit.clk_type}") + self.logger.print_output() + self.logger.clear_multiple_devices_output() + return + + clk_tuple = amdsmi_interface.amdsmi_get_clock_info(args.gpu, amdsmi_clk_type) + + if lim_type == "min": + amdsmi_lim_type = amdsmi_interface.AmdSmiClkLimitType.MIN + if val > clk_tuple['max_clk']: + self.logger.store_output(args.gpu, 'clk_limit', f"Cannot set {args.clk_limit.clk_type} min value greater than max ({clk_tuple['max_clk']}MHz)") + self.logger.print_output() + self.logger.clear_multiple_devices_output() + return + + if val == clk_tuple['min_clk']: + val_changed = False # Clock limit value did not changed + elif lim_type == "max": + amdsmi_lim_type = amdsmi_interface.AmdSmiClkLimitType.MAX + if val < clk_tuple['min_clk']: + self.logger.store_output(args.gpu, 'clk_limit', f"Cannot set {args.clk_limit.clk_type} max value less than min ({clk_tuple['min_clk']}MHz)") + self.logger.print_output() + self.logger.clear_multiple_devices_output() + return + if val == clk_tuple['max_clk']: + val_changed = False # Clock limit value did not changed + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED and lim_type == "min" and clk_type == "mclk": + logging.debug("Setting mclk min is not supported") + self.logger.store_output(args.gpu, 'clk_limit', f"Setting mclk min is not supported") + else: + logging.debug("Failed to get clock extremum info for gpu %s | %s", gpu_id, e.get_error_info()) + self.logger.store_output(args.gpu, 'clk_limit', f"[{e.get_error_info(detailed=False)}] Unable to change {args.clk_limit.lim_type} of {args.clk_limit.clk_type} to {args.clk_limit.val}MHz") + self.logger.print_output() + self.logger.clear_multiple_devices_output() + return + + # Set the value + try: + if val_changed: + amdsmi_interface.amdsmi_set_gpu_clk_limit(args.gpu, clk_type, lim_type, val) + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == 
amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + elif e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED and lim_type == "min" and clk_type == "mclk": + logging.debug("Setting mclk min is not supported") + self.logger.store_output(args.gpu, 'clk_limit', f"Setting mclk min is not supported") + else: + self.logger.store_output(args.gpu, 'clk_limit', f"[{e.get_error_info(detailed=False)}] Unable to set {args.clk_limit.lim_type} of {args.clk_limit.clk_type} to {args.clk_limit.val}MHz") + self.logger.print_output() + self.logger.clear_multiple_devices_output() + return + + if val_changed: + self.logger.store_output(args.gpu, 'clk_limit', f"Successfully changed {args.clk_limit.lim_type} of {args.clk_limit.clk_type} to {args.clk_limit.val}MHz") + else: + self.logger.store_output(args.gpu, 'clk_limit', f"Clock limit is already set to {args.clk_limit.val}MHz") + self.logger.print_output() + self.logger.clear_multiple_devices_output() + return + if isinstance(args.process_isolation, int): + status_string = "Enabled" if args.process_isolation else "Disabled" + result = f"Requested process isolation to {status_string}" # This should not print out + try: + current_status = amdsmi_interface.amdsmi_get_gpu_process_isolation(args.gpu) + if current_status == args.process_isolation: + result = f"Process isolation is already {status_string}" + else: + amdsmi_interface.amdsmi_set_gpu_process_isolation(args.gpu, args.process_isolation) + result = f"Successfully set process isolation to {status_string}" + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + self.logger.store_output(args.gpu, 'process_isolation', f"[{e.get_error_info(detailed=False)}] Unable to set process isolation to {status_string}") + self.logger.print_output() + 
def set_value(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=None,
              profile=None, perf_determinism=None, compute_partition=None,
              memory_partition=None, power_cap=None,
              cpu=None, cpu_pwr_limit=None, cpu_xgmi_link_width=None, cpu_lclk_dpm_level=None,
              cpu_pwr_eff_mode=None, cpu_gmi3_link_width=None, cpu_pcie_link_rate=None,
              cpu_df_pstate_range=None, cpu_enable_apb=None, cpu_disable_apb=None,
              soc_boost_limit=None, core=None, core_boost_limit=None, soc_pstate=None, xgmi_plpd=None,
              process_isolation=None, clk_limit=None, clk_level=None):
    """Validate the requested ``set`` options and dispatch to set_cpu / set_core / set_gpu.

    Only ``gpu``, ``cpu`` and ``core`` are copied onto ``args`` here; every other
    override is forwarded unchanged to the device-specific setter.

    Args:
        args (Namespace): Namespace containing the parsed CLI args
        multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False.
        gpu (device_handle, optional): device_handle for target device. Defaults to None.
        fan (int, optional): Value override for args.fan. Defaults to None.
        perf_level (amdsmi_interface.AmdSmiDevPerfLevel, optional): Value override for args.perf_level. Defaults to None.
        profile (bool, optional): Value override for args.profile. Defaults to None.
        perf_determinism (int, optional): Value override for args.perf_determinism. Defaults to None.
        compute_partition (amdsmi_interface.AmdSmiComputePartitionType, optional): Value override for args.compute_partition. Defaults to None.
        memory_partition (amdsmi_interface.AmdSmiMemoryPartitionType, optional): Value override for args.memory_partition. Defaults to None.
        power_cap (optional): Value override for args.power_cap. Defaults to None.

        cpu (cpu_handle, optional): device_handle for target device. Defaults to None.
        cpu_pwr_limit (int, optional): Value override for args.cpu_pwr_limit. Defaults to None.
        cpu_xgmi_link_width (List[int], optional): Value override for args.cpu_xgmi_link_width. Defaults to None.
        cpu_lclk_dpm_level (List[int], optional): Value override for args.cpu_lclk_dpm_level. Defaults to None.
        cpu_pwr_eff_mode (int, optional): Value override for args.cpu_pwr_eff_mode. Defaults to None.
        cpu_gmi3_link_width (List[int], optional): Value override for args.cpu_gmi3_link_width. Defaults to None.
        cpu_pcie_link_rate (int, optional): Value override for args.cpu_pcie_link_rate. Defaults to None.
        cpu_df_pstate_range (List[int], optional): Value override for args.cpu_df_pstate_range. Defaults to None.
        cpu_enable_apb (bool, optional): Value override for args.cpu_enable_apb. Defaults to None.
        cpu_disable_apb (int, optional): Value override for args.cpu_disable_apb. Defaults to None.
        soc_boost_limit (int, optional): Value override for args.soc_boost_limit. Defaults to None.

        core (device_handle, optional): device_handle for target core. Defaults to None.
        core_boost_limit (int, optional): Value override for args.core_boost_limit. Defaults to None.
        soc_pstate (int, optional): Value override for args.soc_pstate. Defaults to None.
        xgmi_plpd (int, optional): Value override for args.xgmi_plpd. Defaults to None.
        process_isolation (int, optional): Value override for args.process_isolation. Defaults to None.
        clk_limit (optional): Value override for args.clk_limit. Defaults to None.
        clk_level (optional): Value override for args.clk_level. Defaults to None.

    Raises:
        AmdSmiRequiredCommandException: No settable option was supplied on the command line.
        ValueError: Options for more than one device class (GPU / CPU / CORE) were
            combined, or no target device could be determined.

    Return:
        Nothing
    """
    # These are the only args checked at this point; the other overrides are passed
    # through to set_gpu, set_cpu, or set_core.
    if gpu:
        args.gpu = gpu
    if cpu:
        args.cpu = cpu
    if core:
        args.core = core

    gpu_attributes = ("fan", "perf_level", "profile", "perf_determinism", "compute_partition",
                      "memory_partition", "power_cap", "soc_pstate", "xgmi_plpd",
                      "process_isolation", "clk_limit", "clk_level")
    cpu_attributes = ("cpu_pwr_limit", "cpu_xgmi_link_width", "cpu_lclk_dpm_level", "cpu_pwr_eff_mode",
                      "cpu_gmi3_link_width", "cpu_pcie_link_rate", "cpu_df_pstate_range",
                      "cpu_enable_apb", "cpu_disable_apb", "soc_boost_limit")
    core_attributes = ("core_boost_limit",)

    # An option counts as "enabled" when the attribute exists on args and carries a
    # value. A missing attribute is treated like None. CPU flags additionally treat
    # False as "not given".
    gpu_args_enabled = any(getattr(args, name, None) is not None for name in gpu_attributes)
    cpu_args_enabled = any(getattr(args, name, None) not in [None, False] for name in cpu_attributes)
    core_args_enabled = any(getattr(args, name, None) is not None for name in core_attributes)

    def explicitly_requested(names, truthy_names=()):
        """Return True if any named attribute is set; False if any attribute is missing.

        Every attribute is read before the result is computed, so a namespace that
        lacks even one of them (i.e. it belongs to a different subcommand) yields False.
        """
        try:
            flags = [
                bool(getattr(args, name)) if name in truthy_names
                else getattr(args, name) is not None
                for name in names
            ]
        except AttributeError:
            return False
        return any(flags)

    # Error if no subcommand args are passed
    if self.helpers.is_baremetal():
        is_gpu_set = explicitly_requested(("gpu",) + gpu_attributes)
        is_cpu_set = explicitly_requested(("cpu",) + cpu_attributes,
                                          truthy_names=("cpu_enable_apb",))
        # Core is checked by truthiness only, and a missing attribute means "not set".
        is_core_set = bool(getattr(args, "core_boost_limit", None))

        if not (is_gpu_set or is_cpu_set or is_core_set):
            # if neither GPU / CPU / or Core args are provided, then raise error message
            command = " ".join(sys.argv[1:])
            raise AmdSmiRequiredCommandException(command, self.logger.format)
    else:
        if not any([args.process_isolation is not None,
                    args.clk_limit is not None,
                    args.power_cap is not None]):
            command = " ".join(sys.argv[1:])
            raise AmdSmiRequiredCommandException(command, self.logger.format)

    # Only allow one device class's arguments to be set at a time
    enabled_count = sum((gpu_args_enabled, cpu_args_enabled, core_args_enabled))
    if enabled_count == 0:
        raise ValueError('No GPU, CPU, or CORE arguments provided, specific arguments are needed')
    if enabled_count == 3:
        raise ValueError('Cannot set GPU, CPU, and CORE arguments at the same time')
    if enabled_count == 2:
        raise ValueError('Cannot set GPU, CPU, or CORE arguments at the same time')

    gpu_ready = self.helpers.is_amdgpu_initialized()
    hsmp_ready = self.helpers.is_amd_hsmp_initialized()

    # Default to every known handle of the selected device class when no explicit
    # target was given. Identity comparison is used so that handle objects with a
    # custom __eq__ are never mistaken for "no target".
    if gpu_ready and gpu_args_enabled and args.gpu is None:
        args.gpu = self.device_handles
    if hsmp_ready and cpu_args_enabled and args.cpu is None:
        args.cpu = self.cpu_handles
    if hsmp_ready and core_args_enabled and args.core is None:
        args.core = self.core_handles

    def run_cpu():
        self.set_cpu(args, multiple_devices, cpu, cpu_pwr_limit,
                     cpu_xgmi_link_width, cpu_lclk_dpm_level, cpu_pwr_eff_mode,
                     cpu_gmi3_link_width, cpu_pcie_link_rate, cpu_df_pstate_range,
                     cpu_enable_apb, cpu_disable_apb, soc_boost_limit)

    def run_core():
        # Drop anything a previous setter left in the logger before reporting cores.
        self.logger.output = {}
        self.logger.clear_multiple_devices_output()
        self.set_core(args, multiple_devices, core, core_boost_limit)

    def run_gpu():
        self.logger.clear_multiple_devices_output()
        self.set_gpu(args, multiple_devices, gpu, fan, perf_level,
                     profile, perf_determinism, compute_partition,
                     memory_partition, power_cap, soc_pstate, xgmi_plpd,
                     process_isolation, clk_limit, clk_level)

    # Handle CPU and GPU initialization cases
    if hsmp_ready and gpu_ready:
        if args.cpu is None and args.gpu is None and args.core is None:
            raise ValueError('No GPU, CPU, or CORE provided, specific target(s) are needed')
        if args.cpu:
            run_cpu()
        if args.core:
            run_core()
        if args.gpu:
            self.logger.output = {}
            run_gpu()
    elif hsmp_ready:  # Only CPU is initialized
        if args.cpu is None and args.core is None:
            raise ValueError('No CPU or CORE provided, specific target(s) are needed')
        if args.cpu:
            run_cpu()
        if args.core:
            run_core()
    elif gpu_ready:  # Only GPU is initialized
        if args.gpu is None:
            args.gpu = self.device_handles
        run_gpu()
def reset(self, args, multiple_devices=False, gpu=None, gpureset=None,
          clocks=None, fans=None, profile=None, xgmierr=None, perf_determinism=None,
          power_cap=None, reload_driver=None, clean_local_data=None):
    """Issue one reset command to the target gpu(s).

    When args.gpu resolves to several devices, helpers.handle_gpus re-invokes this
    method per device and this call returns immediately. For a single device the
    first requested action is executed, its result is printed, and the method
    returns. Order of precedence: gpureset, clocks, fans, profile, xgmierr,
    perf_determinism, power_cap (bare metal only), then clean_local_data and
    reload_driver (any platform).

    Args:
        args (Namespace): Namespace containing the parsed CLI args
        multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False.
        gpu (device_handle, optional): device_handle for target device. Defaults to None.
        gpureset (bool, optional): Value override for args.gpureset. Defaults to None.
        clocks (bool, optional): Value override for args.clocks. Defaults to None.
        fans (bool, optional): Value override for args.fans. Defaults to None.
        profile (bool, optional): Value override for args.profile. Defaults to None.
        xgmierr (bool, optional): Value override for args.xgmierr. Defaults to None.
        perf_determinism (bool, optional): Value override for args.perf_determinism. Defaults to None.
        power_cap (bool, optional): Value override for args.power_cap. Defaults to None.
        reload_driver (bool, optional): Value override for args.reload_driver. Defaults to None.
        clean_local_data (bool, optional): Value override for args.clean_local_data. Defaults to None.

    Raises:
        AmdSmiRequiredCommandException: No reset action was requested.
        PermissionError: The library reported AMDSMI_STATUS_NO_PERM, or a driver
            reload was requested without an effective uid of 0.

    Return:
        Nothing
    """
    # Set args.* to passed in arguments (only truthy overrides replace parsed values)
    overrides = {
        "gpu": gpu,
        "gpureset": gpureset,
        "clocks": clocks,
        "fans": fans,
        "profile": profile,
        "xgmierr": xgmierr,
        "perf_determinism": perf_determinism,
        "power_cap": power_cap,
        "reload_driver": reload_driver,
        "clean_local_data": clean_local_data,
    }
    for name, value in overrides.items():
        if value:
            setattr(args, name, value)

    # Handle No GPU passed
    if args.gpu is None:
        args.gpu = self.device_handles

    if not self.group_check_printed:
        self.helpers.check_required_groups(check_render=True, check_video=False)
        self.group_check_printed = True

    # Handle multiple GPUs
    handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.reset)
    if handled_multiple_gpus:
        return  # This function is recursive

    args.gpu = device_handle

    # Get gpu_id for logging
    gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu)

    baremetal = self.helpers.is_baremetal()

    # Error if no subcommand args are passed
    if baremetal:
        requested = [args.gpureset, args.clocks, args.fans, args.profile, args.xgmierr,
                     args.perf_determinism, args.power_cap, args.reload_driver,
                     args.clean_local_data]
    else:
        requested = [args.clean_local_data, args.reload_driver]
    if not any(requested):
        command = " ".join(sys.argv[1:])
        raise AmdSmiRequiredCommandException(command, self.logger.format)

    def emit(key, payload):
        """Store payload under key for the current GPU, print it, and reset the logger."""
        self.logger.store_output(args.gpu, key, payload)
        self.logger.print_output()
        self.logger.clear_multiple_devices_output()

    def raise_if_no_perm(exc):
        """Convert a NO_PERM library error into PermissionError; otherwise do nothing."""
        if exc.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
            raise PermissionError('Command requires elevation') from exc

    def failure(exc, text):
        """Prefix text with the short library error name in square brackets."""
        return f"[{exc.get_error_info(detailed=False)}] {text}"

    def stop_progress(process):
        """Terminate the progress-bar process if it is still running."""
        if process and process.is_alive():
            process.terminate()
            process.join(timeout=0.1)  # Wait max 0.1 second
            if process.is_alive():
                process.kill()  # Force kill if needed
            # NOTE(review): assumes the newline is only printed when a bar was
            # actually running -- confirm against the original indentation.
            print("\n")  # Clean up progress bar line

    #######################
    # BM commands - START #
    #######################

    if baremetal:
        if args.gpureset:
            if self.helpers.is_amd_device(args.gpu):
                try:
                    amdsmi_interface.amdsmi_reset_gpu(args.gpu)
                    result = 'Successfully reset GPU'
                except amdsmi_exception.AmdSmiLibraryException as e:
                    raise_if_no_perm(e)
                    result = failure(e, "Unable to reset GPU")
            else:
                result = 'Unable to reset non-amd GPU'
            emit('gpu_reset', result)
            return

        if args.clocks:
            reset_clocks_results = {'overdrive': '',
                                    'clocks': '',
                                    'performance': ''}
            try:
                amdsmi_interface.amdsmi_set_gpu_overdrive_level(args.gpu, 0)
                reset_clocks_results['overdrive'] = 'Overdrive set to 0'
            except amdsmi_exception.AmdSmiLibraryException as e:
                raise_if_no_perm(e)
                logging.debug("Failed to reset overdrive on gpu %s | %s", gpu_id, e.get_error_info())
                reset_clocks_results['overdrive'] = failure(e, "Unable to reset overdrive to 0")

            # Continue to reset clocks and performance level even if overdrive failed.
            # TODO: Check why the perf level is set to auto twice; both result keys
            # are kept because they are part of the printed output.
            for key in ('clocks', 'performance'):
                try:
                    level_auto = amdsmi_interface.AmdSmiDevPerfLevel.AUTO
                    amdsmi_interface.amdsmi_set_gpu_perf_level(args.gpu, level_auto)
                    reset_clocks_results[key] = 'Successfully reset performance level to auto'
                except amdsmi_exception.AmdSmiLibraryException as e:
                    raise_if_no_perm(e)
                    reset_clocks_results[key] = failure(e, "Unable to reset performance level to auto")
                    logging.debug("Failed to reset perf level on gpu %s | %s", gpu_id, e.get_error_info())

            emit('reset_clocks', reset_clocks_results)
            return

        if args.fans:
            try:
                amdsmi_interface.amdsmi_reset_gpu_fan(args.gpu, 0)
                result = 'Successfully reset fan speed to driver control'
            except amdsmi_exception.AmdSmiLibraryException as e:
                raise_if_no_perm(e)
                result = failure(e, "Unable to reset fan speed to driver control")
                logging.debug("Failed to reset fans on gpu %s | %s", gpu_id, e.get_error_info())
            emit('reset_fans', result)
            return

        if args.profile:
            reset_profile_results = {'power_profile' : 'N/A'}
            try:
                power_profile_mask = amdsmi_interface.AmdSmiPowerProfilePresetMasks.BOOTUP_DEFAULT
                amdsmi_interface.amdsmi_set_gpu_power_profile(args.gpu, 0, power_profile_mask)
                reset_profile_results['power_profile'] = 'Successfully reset Power Profile to default (bootup default)'
            except amdsmi_exception.AmdSmiLibraryException as e:
                raise_if_no_perm(e)
                reset_profile_results['power_profile'] = failure(e, "Unable to reset Power Profile to default (bootup default)")
                logging.debug("Failed to reset power profile on gpu %s | %s", gpu_id, e.get_error_info())
            emit('reset_profile', reset_profile_results)
            return

        if args.xgmierr:
            try:
                amdsmi_interface.amdsmi_reset_gpu_xgmi_error(args.gpu)
                result = 'Successfully reset XGMI Error count'
            except amdsmi_exception.AmdSmiLibraryException as e:
                raise_if_no_perm(e)
                logging.debug("Failed to reset xgmi error count on gpu %s | %s", gpu_id, e.get_error_info())
                result = failure(e, "Unable to reset XGMI Error count")
            emit('reset_xgmi_err', result)
            return

        if args.perf_determinism:
            try:
                level_auto = amdsmi_interface.AmdSmiDevPerfLevel.AUTO
                amdsmi_interface.amdsmi_set_gpu_perf_level(args.gpu, level_auto)
                result = 'Successfully reset Performance Level to default (auto)'
            except amdsmi_exception.AmdSmiLibraryException as e:
                raise_if_no_perm(e)
                logging.debug("Failed to set perf level on gpu %s | %s", gpu_id, e.get_error_info())
                result = failure(e, "Unable to reset Performance Level to default (auto)")
            emit('reset_perf_determinism', result)
            return

        if args.power_cap:
            # Both known sensors start out as "not supported"; entries are
            # overwritten as each reported sensor is processed.
            final_output = {"ppt0": "[AMDSMI_STATUS_NOT_SUPPORTED] Unable to reset to default power cap",
                            "ppt1": "[AMDSMI_STATUS_NOT_SUPPORTED] Unable to reset to default power cap"}
            current_sensor_num = 0
            try:
                power_cap_types = amdsmi_interface.amdsmi_get_supported_power_cap(args.gpu)
                for sensor in power_cap_types['sensor_inds']:
                    current_sensor_num = sensor
                    power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu, sensor)
                    logging.debug(f"Power cap info for gpu {gpu_id} ppt{sensor} | {power_cap_info}")
                    # The library value is written back unchanged; it is only scaled
                    # (SI_Unit.MICRO -> base unit) for the human-readable message.
                    default_power_cap_raw = power_cap_info["default_power_cap"]
                    default_power_cap_in_w = self.helpers.convert_SI_unit(default_power_cap_raw,
                                                                          AMDSMIHelpers.SI_Unit.MICRO)
                    amdsmi_interface.amdsmi_set_power_cap(args.gpu, sensor, default_power_cap_raw)
                    final_output[f"ppt{current_sensor_num}"] = f"Successfully reset power cap to {default_power_cap_in_w}W"
            except amdsmi_exception.AmdSmiLibraryException as e:
                raise_if_no_perm(e)
                # The first failing sensor aborts the loop; later sensors keep
                # their "not supported" placeholder.
                final_output[f"ppt{current_sensor_num}"] = failure(e, "Unable to reset cap to default power cap")
            emit('powercap', final_output)
            return

    #######################
    # BM commands - END   #
    #######################

    if args.clean_local_data:
        try:
            amdsmi_interface.amdsmi_clean_gpu_local_data(args.gpu)
            result = 'Successfully clean GPU local data'
        except amdsmi_exception.AmdSmiLibraryException as e:
            raise_if_no_perm(e)
            result = failure(e, "Unable to clean local data")
        emit('clean_local_data', result)
        return

    # Adding to VMs since, they should also support same reload as baremetal
    if args.reload_driver:
        # Check permissions BEFORE starting any processes, so the progress bar is
        # never started for a reload that cannot succeed. os.geteuid does not exist
        # on Windows; only that lookup is guarded, so unrelated AttributeErrors
        # raised while reporting are no longer swallowed.
        geteuid = getattr(os, "geteuid", None)
        if geteuid is not None and geteuid() != 0:
            emit('reload_driver', "[AMDSMI_STATUS_NO_PERM] Command requires elevation")
            raise PermissionError('Command requires elevation')

        status = amdsmi_interface.amdsmi_wrapper
        # NOTE(review): this lock is created per call and never shared, so it does
        # not serialize anything across calls or processes -- confirm intent.
        lock = multiprocessing.Lock()
        lock.acquire()
        progress_process = None
        try:
            self.helpers.increment_set_count()
            set_count = self.helpers.get_set_count()
            if set_count == 1:
                # First device: perform the reload and remember its outcome.
                self.helpers.confirm_gpu_driver_reload_warning()
                # Start progress bar in separate process
                string_out = "Reloading driver for all AMD GPUs:"
                progress_process = multiprocessing.Process(
                    target=self.helpers.showProgressbar,
                    args=(string_out, 140, True)
                )
                progress_process.start()
                amdsmi_interface.amdsmi_gpu_driver_reload()
                self.helpers.assign_previous_set_success_check(status.AMDSMI_STATUS_SUCCESS)
                result = "Successfully reloaded driver"
            else:
                # Later devices: report the remembered outcome of the first reload.
                previous_check = self.helpers.get_previous_set_success_check()
                if previous_check == status.AMDSMI_STATUS_SUCCESS:
                    result = "Successfully reloaded driver"
                elif previous_check == status.AMDSMI_STATUS_NO_PERM:
                    raise PermissionError('Command requires elevation')
                else:
                    temp_exception = amdsmi_exception.AmdSmiLibraryException(previous_check)
                    str_out = temp_exception.get_error_info(detailed=False)
                    result = f"[{str_out}] Unable to successfully restart driver"
        except amdsmi_exception.AmdSmiLibraryException as e:
            self.helpers.assign_previous_set_success_check(e.get_error_code())
            if e.get_error_code() == status.AMDSMI_STATUS_NO_PERM:
                # Stop the bar first so the error report is not interleaved with it.
                stop_progress(progress_process)
                emit('reload_driver', failure(e, "Command requires elevation"))
                raise PermissionError('Command requires elevation') from e
            result = failure(e, "Unable to successfully restart driver")
        finally:
            # Always clean up the progress bar process and release the lock,
            # including when an exception is propagating.
            stop_progress(progress_process)
            lock.release()

        emit('reload_driver', result)
        return
Defaults to None. + temperature (bool, optional): Value override for args.temperature. Defaults to None. + gfx (bool, optional): Value override for args.gfx. Defaults to None. + mem_util (bool, optional): Value override for args.mem. Defaults to None. + encoder (bool, optional): Value override for args.encoder. Defaults to None. + decoder (bool, optional): Value override for args.decoder. Defaults to None. + ecc (bool, optional): Value override for args.ecc. Defaults to None. + vram_usage (bool, optional): Value override for args.vram_usage. Defaults to None. + pcie (bool, optional): Value override for args.pcie. Defaults to None. + process (bool, optional): Value override for args.process. Defaults to None. + violation (bool, optional): Value override for args.violation. Defaults to None. + + Raises: + ValueError: Value error if no gpu value is provided + IndexError: Index error if gpu list is empty + + Return: + Nothing + """ + # Set args.* to passed in arguments + if gpu: + args.gpu = gpu + if watch: + args.watch = watch + if watch_time: + args.watch_time = watch_time + if iterations: + args.iterations = iterations + + # monitor args + if power_usage: + args.power_usage = power_usage + if temperature: + args.temperature = temperature + if gfx_util: + args.gfx = gfx_util + if mem_util: + args.mem = mem_util + if encoder: + args.encoder = encoder + if decoder: + args.decoder = decoder + if ecc: + args.ecc = ecc + if vram_usage: + args.vram_usage = vram_usage + if pcie: + args.pcie = pcie + if process: + args.process = process + if not self.helpers.is_virtual_os(): + if violation: + args.violation = violation + else: + args.violation = False # Disable violation for virtual OS + + # Handle No GPU passed + if args.gpu == None: + args.gpu = self.device_handles + + if not self.group_check_printed: + self.helpers.check_required_groups(check_render=True, check_video=False) + self.group_check_printed = True + + # If all arguments are False, the print all values + # Don't 
include process in this logic as it's an optional edge case + if not any([args.power_usage, args.temperature, args.gfx, args.mem, + args.encoder, args.decoder, args.ecc, args.vram_usage, + args.pcie, args.violation]): + args.power_usage = args.temperature = args.gfx = args.mem = \ + args.encoder = args.decoder = args.vram_usage = True + # set extra args for default output filtering + args.default_output = True + else: + if not hasattr(args, 'default_output'): + args.default_output = False + + # Handle watch logic, will only enter this block once + if args.watch: + self.helpers.handle_watch(args=args, subcommand=self.monitor, logger=self.logger) + return + + # Handle multiple GPUs + if isinstance(args.gpu, list): + if len(args.gpu) > 1: + # Deepcopy gpus as recursion will destroy the gpu list + stored_gpus = [] + for gpu in args.gpu: + stored_gpus.append(gpu) + + # Store output from multiple devices without printing to console + for device_handle in args.gpu: + self.monitor(args, multiple_devices=True, watching_output=watching_output, gpu=device_handle) + + # Reload original gpus + args.gpu = stored_gpus + + dual_csv_output = False + if args.process: + if self.logger.is_csv_format(): + dual_csv_output = True + + # Flush the output + self.logger.print_output(multiple_device_enabled=True, + watching_output=watching_output, + tabular=True, + dual_csv_output=dual_csv_output) + + # Add output to total watch output and clear multiple device output + if watching_output: + self.logger.store_watch_output(multiple_device_enabled=True) + + return + elif len(args.gpu) == 1: + args.gpu = args.gpu[0] + else: + raise IndexError("args.gpu should not be an empty list") + + monitor_values = {} + + # Get gpu_id for logging + gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu) + + # Reset the table header and store the timestamp if watch output is enabled + self.logger.table_header = 'GPU' + if watching_output: + self.logger.store_output(args.gpu, 'timestamp', 
int(time.time())) + self.logger.table_header = 'TIMESTAMP'.rjust(10) + ' ' + self.logger.table_header + + if args.loglevel == "DEBUG": + try: + # Get GPU Metrics table version + gpu_metric_version_info = amdsmi_interface.amdsmi_get_gpu_metrics_header_info(args.gpu) + gpu_metric_version_str = json.dumps(gpu_metric_version_info, indent=4) + logging.debug("GPU Metrics table Version for GPU %s | %s", gpu_id, gpu_metric_version_str) + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("#4 - Unable to load GPU Metrics table version for %s | %s", gpu_id, e.get_error_info()) + + try: + # Get GPU Metrics table + gpu_metric_debug_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu) + + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("#5 - Unable to load GPU Metrics table for %s | %s", gpu_id, e.get_error_info()) + + is_partition_metrics = False # True if we get the metrics from xcp_metrics file (amdsmi_get_gpu_partition_metrics_info) + #get metric info only once per gpu, this will speed up data output + try: + # Get GPU Metrics table + gpu_metrics_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu) + if args.loglevel == "DEBUG": + gpu_metric_debug_info = json.dumps(gpu_metrics_info, indent=4) + logging.debug("GPU Metrics table for GPU %s | %s", gpu_id, gpu_metric_debug_info) + except amdsmi_exception.AmdSmiLibraryException as e: + gpu_metrics_info = amdsmi_interface._NA_amdsmi_get_gpu_metrics_info() + logging.debug("Unable to load GPU Metrics table for %s | %s", gpu_id, e.get_error_info()) + + # Workaround for XCP (partition) metrics not providing num_partition in v1.9+/v1.1+ + # Provides original formatting for earlier metric versions + partition_metric_info = self.helpers._get_metric_version_and_partition_info(gpu_metrics_info, is_partition_metrics, gpu_id, args.gpu) + partition_id = partition_metric_info['partition_id'] + num_partition = partition_metric_info['num_partition'] + + # Update logger for XCP display 
(only if applicable) + self.logger.table_header += 'XCP'.rjust(5, ' ') + self.logger.store_output(args.gpu, 'xcp', partition_id) # Store partition_id initially; can be updated via num_xcp + + # Store the pcie_bw values due to possible increase in bandwidth due to repeated gpu_metrics calls + if args.pcie: + try: + pcie_info = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)['pcie_metric'] + except amdsmi_exception.AmdSmiLibraryException as e: + pcie_info = "N/A" + logging.debug("Failed to get pci bandwidth on gpu %s | %s", gpu_id, e.get_error_info()) + + power_unit = 'W' + + # Resume regular ordering of values + if args.power_usage: + try: + if gpu_metrics_info['current_socket_power'] != "N/A": + monitor_values['power_usage'] = gpu_metrics_info['current_socket_power'] + else: # Fallback to average_socket_power for older gpu_metrics versions + monitor_values['power_usage'] = gpu_metrics_info['average_socket_power'] + + if self.logger.is_human_readable_format() and monitor_values['power_usage'] != "N/A": + monitor_values['power_usage'] = f"{monitor_values['power_usage']} {power_unit}" + if self.logger.is_json_format() and monitor_values['power_usage'] != "N/A": + monitor_values['power_usage'] = {"value" : monitor_values['power_usage'], + "unit" : power_unit} + + except (KeyError, amdsmi_exception.AmdSmiLibraryException) as e: + monitor_values['power_usage'] = "N/A" + logging.debug("Failed to get power usage on gpu %s | %s", gpu_id, e) + + self.logger.table_header += 'POWER'.rjust(7) + + if args.power_usage and not args.default_output: + # Get Current Power Cap + try: + # assume that we're always asking for ppt0 for quick checks like this + power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu, 0) + monitor_values['max_power'] = power_cap_info['power_cap'] # Get current power cap (`power_cap`) socket is set to + # `max_power_cap`, is the maximum value it can be set to + monitor_values['max_power'] = 
self.helpers.convert_SI_unit(monitor_values['max_power'], AMDSMIHelpers.SI_Unit.MICRO) + + if self.logger.is_human_readable_format() and monitor_values['max_power'] != "N/A": + monitor_values['max_power'] = f"{monitor_values['max_power']} {power_unit}" + if self.logger.is_json_format() and monitor_values['max_power'] != "N/A": + monitor_values['max_power'] = {"value" : monitor_values['max_power'], + "unit" : power_unit} + except amdsmi_exception.AmdSmiLibraryException as e: + monitor_values['max_power'] = "N/A" + logging.debug("Failed to get power cap info for gpu %s | %s", gpu_id, e.get_error_info()) + + self.logger.table_header += 'PWR_CAP'.rjust(9) + + if args.temperature: + try: + temperature = gpu_metrics_info['temperature_hotspot'] + monitor_values['hotspot_temperature'] = temperature + except (KeyError, amdsmi_exception.AmdSmiLibraryException) as e: + monitor_values['hotspot_temperature'] = "N/A" + logging.debug("Failed to get hotspot temperature on gpu %s | %s", gpu_id, e) + + try: + temperature = gpu_metrics_info['temperature_mem'] + monitor_values['memory_temperature'] = temperature + except (KeyError, amdsmi_exception.AmdSmiLibraryException) as e: + monitor_values['memory_temperature'] = "N/A" + logging.debug("Failed to get memory temperature on gpu %s | %s", gpu_id, e) + + temp_unit_human_readable = '\N{DEGREE SIGN}C' + temp_unit_json = 'C' + if monitor_values['hotspot_temperature'] != "N/A": + if self.logger.is_human_readable_format(): + monitor_values['hotspot_temperature'] = f"{monitor_values['hotspot_temperature']} {temp_unit_human_readable}" + if self.logger.is_json_format(): + monitor_values['hotspot_temperature'] = {"value" : monitor_values['hotspot_temperature'], + "unit" : temp_unit_json} + if monitor_values['memory_temperature'] != "N/A": + if self.logger.is_human_readable_format(): + monitor_values['memory_temperature'] = f"{monitor_values['memory_temperature']} {temp_unit_human_readable}" + if self.logger.is_json_format(): + 
monitor_values['memory_temperature'] = {"value" : monitor_values['memory_temperature'], + "unit" : temp_unit_json} + + self.logger.table_header += 'GPU_T'.rjust(8) + self.logger.table_header += 'MEM_T'.rjust(8) + + if args.gfx: + try: + gfx_clk = gpu_metrics_info['current_gfxclk'] + monitor_values['gfx_clk'] = gfx_clk + freq_unit = 'MHz' + if gfx_clk != "N/A": + if self.logger.is_human_readable_format(): + monitor_values['gfx_clk'] = f"{monitor_values['gfx_clk']} {freq_unit}" + if self.logger.is_json_format(): + monitor_values['gfx_clk'] = {"value" : monitor_values['gfx_clk'], + "unit" : freq_unit} + + except (KeyError, amdsmi_exception.AmdSmiLibraryException) as e: + monitor_values['gfx_clk'] = "N/A" + logging.debug("Failed to get gfx clock on gpu %s | %s", gpu_id, e) + + self.logger.table_header += 'GFX_CLK'.rjust(10) + + try: + gfx_util = gpu_metrics_info['average_gfx_activity'] + activity_unit = '%' + if gfx_util != "N/A": + monitor_values['gfx'] = gfx_util + if self.logger.is_human_readable_format(): + monitor_values['gfx'] = f"{monitor_values['gfx']} {activity_unit}" + if self.logger.is_json_format(): + monitor_values['gfx'] = {"value" : monitor_values['gfx'], + "unit" : activity_unit} + except (KeyError, amdsmi_exception.AmdSmiLibraryException) as e: + monitor_values['gfx'] = "N/A" + logging.debug("Failed to get gfx utilization on gpu %s | %s", gpu_id, e) + + self.logger.table_header += 'GFX%'.rjust(7) + + if args.mem: + try: + mem_util = gpu_metrics_info['average_umc_activity'] + activity_unit = '%' + if mem_util != "N/A": + monitor_values['mem'] = mem_util + if self.logger.is_human_readable_format(): + monitor_values['mem'] = f"{monitor_values['mem']} {activity_unit}" + if self.logger.is_json_format(): + monitor_values['mem'] = {"value" : monitor_values['mem'], + "unit" : activity_unit} + except (KeyError, amdsmi_exception.AmdSmiLibraryException) as e: + monitor_values['mem'] = "N/A" + logging.debug("Failed to get mem utilization on gpu %s | %s", gpu_id, 
e) + + self.logger.table_header += 'MEM%'.rjust(7) + + # don't populate mem clock on default output + if not args.default_output: + try: + mem_clock = gpu_metrics_info['current_uclk'] + monitor_values['mem_clock'] = mem_clock + freq_unit = 'MHz' + if mem_clock != "N/A": + if self.logger.is_human_readable_format(): + monitor_values['mem_clock'] = f"{monitor_values['mem_clock']} {freq_unit}" + if self.logger.is_json_format(): + monitor_values['mem_clock'] = {"value" : monitor_values['mem_clock'], + "unit" : freq_unit} + except (KeyError, amdsmi_exception.AmdSmiLibraryException) as e: + monitor_values['mem_clock'] = "N/A" + logging.debug("Failed to get mem clock on gpu %s | %s", gpu_id, e) + + self.logger.table_header += 'MEM_CLOCK'.rjust(11) + + if args.encoder: + # TODO: The encoding utilization is in progress for Navi. Note: MI3x ASICs only support decoding. + try: + # Get List of vcn activity values + encoder_util = "N/A" # Not yet implemented + encoding_activity_avg = [] + for value in encoder_util: + if isinstance(value, int): + encoding_activity_avg.append(value) + + # Averaging the possible encoding activity values + if encoding_activity_avg: + encoding_activity_avg = round(sum(encoding_activity_avg) / len(encoding_activity_avg)) + else: + encoding_activity_avg = "N/A" + + monitor_values['encoder'] = encoding_activity_avg + + activity_unit = '%' + if monitor_values['encoder'] != "N/A": + if self.logger.is_human_readable_format(): + monitor_values['encoder'] = f"{monitor_values['encoder']} {activity_unit}" + if self.logger.is_json_format(): + monitor_values['encoder'] = {"value" : monitor_values['encoder'], + "unit" : activity_unit} + except amdsmi_exception.AmdSmiLibraryException as e: + monitor_values['encoder'] = "N/A" + logging.debug("Failed to get encoder utilization on gpu %s | %s", gpu_id, e.get_error_info()) + + self.logger.table_header += 'ENC%'.rjust(7) + + if args.decoder: + try: + # Get List of vcn activity values + # Note: MI3x ASICs only support 
decoding, so the vcn_activity/vcn_busy + # is used for decoding activity. + decoder_util = gpu_metrics_info['vcn_activity'] + if (gpu_metrics_info['vcn_activity'][0] == "N/A" and + gpu_metrics_info['xcp_stats.vcn_busy'][partition_id][0] != "N/A"): + decoder_util = gpu_metrics_info['xcp_stats.vcn_busy'][partition_id] + decoding_activity_avg = self.helpers.average_flattened_ints(decoder_util, context="decoder_util") + monitor_values['decoder'] = decoding_activity_avg + + activity_unit = '%' + if monitor_values['decoder'] != "N/A": + if self.logger.is_human_readable_format(): + monitor_values['decoder'] = f"{monitor_values['decoder']} {activity_unit}" + if self.logger.is_json_format(): + monitor_values['decoder'] = {"value" : monitor_values['decoder'], + "unit" : activity_unit} + except (KeyError, amdsmi_exception.AmdSmiLibraryException) as e: + monitor_values['decoder'] = "N/A" + logging.debug("Failed to get decoder utilization on gpu %s | %s", gpu_id, e) + + self.logger.table_header += 'DEC%'.rjust(7) + + if (args.encoder or args.decoder) and not args.default_output: + try: + vclock = gpu_metrics_info['current_vclk0'] + monitor_values['vclock'] = vclock + + freq_unit = 'MHz' + if vclock != "N/A": + if self.logger.is_human_readable_format(): + monitor_values['vclock'] = f"{monitor_values['vclock']} {freq_unit}" + if self.logger.is_json_format(): + monitor_values['vclock'] = {"value" : monitor_values['vclock'], + "unit" : freq_unit} + except (KeyError, amdsmi_exception.AmdSmiLibraryException) as e: + monitor_values['vclock'] = "N/A" + logging.debug("Failed to get dclock on gpu %s | %s", gpu_id, e) + + self.logger.table_header += 'VCLOCK'.rjust(10) + + try: + dclock = gpu_metrics_info['current_dclk0'] + monitor_values['dclock'] = dclock + + freq_unit = 'MHz' + if dclock != "N/A": + if self.logger.is_human_readable_format(): + monitor_values['dclock'] = f"{monitor_values['dclock']} {freq_unit}" + if self.logger.is_json_format(): + monitor_values['dclock'] = {"value" : 
monitor_values['dclock'], + "unit" : freq_unit} + except (KeyError, amdsmi_exception.AmdSmiLibraryException) as e: + monitor_values['dclock'] = "N/A" + logging.debug("Failed to get dclock on gpu %s | %s", gpu_id, e) + + self.logger.table_header += 'DCLOCK'.rjust(10) + + if args.ecc: + try: + ecc = amdsmi_interface.amdsmi_get_gpu_total_ecc_count(args.gpu) + monitor_values['single_bit_ecc'] = ecc['correctable_count'] + monitor_values['double_bit_ecc'] = ecc['uncorrectable_count'] + except amdsmi_exception.AmdSmiLibraryException as e: + monitor_values['ecc'] = "N/A" + logging.debug("Failed to get ecc on gpu %s | %s", gpu_id, e.get_error_info()) + + self.logger.table_header += 'SINGLE_ECC'.rjust(12) + self.logger.table_header += 'DOUBLE_ECC'.rjust(12) + + try: + pcie_metric = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)['pcie_metric'] + logging.debug("PCIE Metric for %s | %s", gpu_id, pcie_metric) + monitor_values['pcie_replay'] = pcie_metric['pcie_replay_count'] + except amdsmi_exception.AmdSmiLibraryException as e: + monitor_values['pcie_replay'] = "N/A" + logging.debug("Failed to get gpu_metrics pcie replay counter on gpu %s | %s", gpu_id, e.get_error_info()) + + if monitor_values['pcie_replay'] == "N/A": + try: + pcie_replay = amdsmi_interface.amdsmi_get_gpu_pci_replay_counter(args.gpu) + monitor_values['pcie_replay'] = pcie_replay + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get sysfs pcie replay counter on gpu %s | %s", gpu_id, e.get_error_info()) + + self.logger.table_header += 'PCIE_REPLAY'.rjust(13) + + if args.vram_usage and not args.default_output: + try: + vram_used = amdsmi_interface.amdsmi_get_gpu_memory_usage(args.gpu, amdsmi_interface.AmdSmiMemoryType.VRAM) // (1024*1024) + vram_total = amdsmi_interface.amdsmi_get_gpu_memory_total(args.gpu, amdsmi_interface.AmdSmiMemoryType.VRAM) // (1024*1024) + monitor_values['vram_used'] = vram_used + monitor_values['vram_free'] = vram_total - vram_used + 
monitor_values['vram_total'] = vram_total + if vram_total != 0: + monitor_values['vram_percent'] = round ((vram_used / vram_total) * 100, 2) + else: + monitor_values['vram_percent'] = "N/A" + + vram_usage_unit = "MB" + vram_percent_unit = "%" + if self.logger.is_human_readable_format(): + monitor_values['vram_used'] = f"{monitor_values['vram_used']} {vram_usage_unit}" + monitor_values['vram_free'] = f"{monitor_values['vram_free']} {vram_usage_unit}" + monitor_values['vram_total'] = f"{monitor_values['vram_total']} {vram_usage_unit}" + monitor_values['vram_percent'] = f"{monitor_values['vram_percent']} {vram_percent_unit}" + if self.logger.is_json_format(): + monitor_values['vram_used'] = {"value" : monitor_values['vram_used'], + "unit" : vram_usage_unit} + monitor_values['vram_free'] = {"value" : monitor_values['vram_free'], + "unit" : vram_usage_unit} + monitor_values['vram_total'] = {"value" : monitor_values['vram_total'], + "unit" : vram_usage_unit} + monitor_values['vram_percent'] = {"value" : monitor_values['vram_percent'], + "unit" : vram_percent_unit} + except amdsmi_exception.AmdSmiLibraryException as e: + monitor_values['vram_used'] = "N/A" + monitor_values['vram_free'] = "N/A" + monitor_values['vram_total'] = "N/A" + monitor_values['vram_percent'] = "N/A" + logging.debug("Failed to get vram memory usage on gpu %s | %s", gpu_id, e.get_error_info()) + + self.logger.table_header += 'VRAM_USED'.rjust(11) + self.logger.table_header += 'VRAM_FREE'.rjust(12) + self.logger.table_header += 'VRAM_TOTAL'.rjust(12) + self.logger.table_header += 'VRAM%'.rjust(9) + + if args.vram_usage and args.default_output: + try: + vram_used = amdsmi_interface.amdsmi_get_gpu_memory_usage(args.gpu, amdsmi_interface.AmdSmiMemoryType.VRAM) // (1024*1024) + vram_total = amdsmi_interface.amdsmi_get_gpu_memory_total(args.gpu, amdsmi_interface.AmdSmiMemoryType.VRAM) // (1024*1024) + vram_usage_unit = "GB" + if self.logger.is_json_format(): + monitor_values['vram_used'] = {"value" : 
round(vram_used/1024,1), + "unit" : vram_usage_unit} + monitor_values['vram_total'] = {"value" : round(vram_total/1024,1), + "unit" : vram_usage_unit} + elif self.logger.is_csv_format(): + monitor_values['vram_used'] = round(vram_used/1024,1) + monitor_values['vram_total'] = round(vram_total/1024,1) + else: + monitor_values['vram_usage'] = f"{vram_used/1024:5.1f}/{vram_total/1024:5.1f} {vram_usage_unit}".rjust(16,' ') + except amdsmi_exception.AmdSmiLibraryException as e: + if self.logger.is_json_format(): + monitor_values['vram_used'] = "N/A" + monitor_values['vram_total'] = "N/A" + else: + monitor_values['vram_usage'] = "N/A" + logging.debug("Failed to get vram memory usage on gpu %s | %s", gpu_id, e.get_error_info()) + + self.logger.table_header += 'VRAM_USAGE'.rjust(16) + + if args.pcie: + if pcie_info != "N/A": + pcie_bw_unit = 'Mb/s' + monitor_values['pcie_bw'] = self.helpers.unit_format(self.logger, pcie_info['pcie_bandwidth'], pcie_bw_unit) + else: + monitor_values['pcie_bw'] = pcie_info + + self.logger.table_header += 'PCIE_BW'.rjust(12) + + # initialize dual_csv_format; applicable to process only + dual_csv_output = False + + # Store process list separately + if args.process: + # Populate initial processes + try: + process_list = amdsmi_interface.amdsmi_get_gpu_process_list(args.gpu) + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + logging.debug("Failed to get process list for gpu %s | %s", gpu_id, e.get_error_info()) + raise e + + try: + num_compute_units = amdsmi_interface.amdsmi_get_gpu_asic_info(args.gpu)['num_compute_units'] + except (KeyError, amdsmi_exception.AmdSmiLibraryException) as e: + num_compute_units = "N/A" + logging.debug("Failed to get num compute units for gpu %s | %s", gpu_id, e.get_error_info()) + + # Clean processes dictionary + filtered_process_values = [] + for process_info in 
process_list: + process_info.pop('engine_usage') # Remove 'engine_usage' value + process_info['mem_usage'] = process_info.pop('mem') + process_info['cu_occupancy'] = process_info.pop('cu_occupancy') + process_info['evicted_time'] = process_info.pop('evicted_time') + + memory_usage_unit = "B" + evicted_time_unit = "ms" + + if self.logger.is_human_readable_format(): + process_info['mem_usage'] = self.helpers.convert_bytes_to_readable(process_info['mem_usage']) + for usage_metric in process_info['memory_usage']: + process_info["memory_usage"][usage_metric] = self.helpers.convert_bytes_to_readable(process_info["memory_usage"][usage_metric]) + memory_usage_unit = "" + + process_info['mem_usage'] = self.helpers.unit_format(self.logger, + process_info['mem_usage'], + memory_usage_unit) + + process_info['evicted_time'] = self.helpers.unit_format(self.logger, + process_info['evicted_time'], + evicted_time_unit) + + for usage_metric in process_info['memory_usage']: + process_info['memory_usage'][usage_metric] = self.helpers.unit_format(self.logger, + process_info['memory_usage'][usage_metric], + memory_usage_unit) + + if 'cu_occupancy' in process_info: + try: + cu_occupancy = process_info['cu_occupancy'] + if num_compute_units != "N/A" and num_compute_units > 0 and cu_occupancy != "N/A": + cu_percentage = round((cu_occupancy / num_compute_units) * 100, 1) + process_info['cu_occupancy'] = self.helpers.unit_format(self.logger, + cu_percentage, + '%') + else: + process_info['cu_occupancy'] = "N/A" + except Exception as e: + process_info['cu_occupancy'] = "N/A" + logging.debug("Failed to calculate cu_occupancy percentage for GPU %s | %s", gpu_id, str(e)) + + filtered_process_values.append({'process_info': process_info}) + + # If no processes are populated then we populate an N/A placeholder + if not filtered_process_values: + logging.debug("Monitor - Failed to detect any process on gpu %s", gpu_id) + filtered_process_values.append({'process_info': "N/A"}) + + for index, process 
in enumerate(filtered_process_values): + if process['process_info'] == "N/A": + filtered_process_values[index]['process_info'] = "No running processes detected" + + # Build the process table's title and header + self.logger.secondary_table_title = "PROCESS INFO" + self.logger.secondary_table_header = 'GPU'.rjust(3) + "NAME".rjust(19) + "PID".rjust(9) + "GTT_MEM".rjust(10) + \ + "CPU_MEM".rjust(10) + "VRAM_MEM".rjust(10) + "MEM_USG".rjust(10) + "CU%".rjust(9) + "EVICT".rjust(10) + + if watching_output: + self.logger.secondary_table_header = 'TIMESTAMP'.rjust(10) + ' ' + self.logger.secondary_table_header + + logging.debug(f"Monitor - Process Info for GPU {gpu_id} | {filtered_process_values}") + + if self.logger.is_json_format(): + self.logger.store_output(args.gpu, 'process_list', filtered_process_values) + + if self.logger.is_human_readable_format(): + # Print out process in flattened format + # The logger detects if process list is present and pulls it out and prints + # that table with timestamp, gpu, and prints headers separately + self.logger.store_output(args.gpu, 'process_list', filtered_process_values) + + if self.logger.is_csv_format(): + dual_csv_output = True + # The logger detects if process list is present and pulls it out and prints + # that table with timestamp, gpu, and prints headers separately + self.logger.store_output(args.gpu, 'process_list', filtered_process_values) + + ################### + ### XCP Metrics ### + ################### + # Must come after process list - XCP detail is a multi-dimensional array, which is displayed + # in tabular format with XCP values for same gpu shown on muliple lines. 
+ if args.violation: + violation_status = { + "pviol": "N/A", + "tviol": "N/A", + "tviol_active": "N/A", + "phot_tviol": "N/A", + "vr_tviol": "N/A", + "hbm_tviol": "N/A", + "gfx_clkviol": "N/A", + "gfxclk_pviol": "N/A", + "gfxclk_tviol": "N/A", + "gfxclk_totalviol": "N/A", + "low_utilviol": "N/A" + } + try: + violations = amdsmi_interface.amdsmi_get_violation_status(args.gpu) + violation_status['pviol'] = violations['per_ppt_pwr'] + violation_status['tviol'] = violations['per_socket_thrm'] + violation_status['tviol_active'] = violations['active_socket_thrm'] + violation_status['phot_tviol'] = violations['per_prochot_thrm'] + violation_status['vr_tviol'] = violations['per_vr_thrm'] + violation_status['hbm_tviol'] = violations['per_hbm_thrm'] + violation_status['gfx_clkviol'] = violations['per_gfx_clk_below_host_limit'] + violation_status['gfxclk_pviol'] = violations['per_gfx_clk_below_host_limit_pwr'] + violation_status['gfxclk_tviol'] = violations['per_gfx_clk_below_host_limit_thm'] + violation_status['gfxclk_totalviol'] = violations['per_gfx_clk_below_host_limit_total'] + violation_status['low_utilviol'] = violations['per_low_utilization'] + except amdsmi_exception.AmdSmiLibraryException as e: + monitor_values['pviol'] = violation_status['pviol'] + monitor_values['tviol'] = violation_status['tviol'] + monitor_values['tviol_active'] = violation_status['tviol_active'] + monitor_values['phot_tviol'] = violation_status['phot_tviol'] + monitor_values['vr_tviol'] = violation_status['vr_tviol'] + monitor_values['hbm_tviol'] = violation_status['hbm_tviol'] + monitor_values['gfx_clkviol'] = violation_status['gfx_clkviol'] + monitor_values['gfxclk_pviol'] = violation_status['gfxclk_pviol'] + monitor_values['gfxclk_tviol'] = violation_status['gfxclk_tviol'] + monitor_values['gfxclk_totalviol'] = violation_status['gfxclk_totalviol'] + monitor_values['low_utilviol'] = violation_status['low_utilviol'] + logging.debug("Failed to get violation status on gpu %s | %s", gpu_id, 
e.get_error_info()) + violation_status_unit = "%" + kPVIOL_MAX_WIDTH = 7 + kTVIOL_MAX_WIDTH = 7 + kTVIOL_ACTIVE_MAX_WIDTH = 14 + kPHOT_MAX_WIDTH = 12 + kVR_MAX_WIDTH = 10 + kHBM_MAX_WIDTH = 11 + kGFXC_MAX_WIDTH = 13 + kGFXC_PVIOL_MAX_WIDTH = 58 + kGFXC_TVIOL_MAX_WIDTH = kGFXC_PVIOL_MAX_WIDTH + kGFXC_TOTALVIOL_MAX_WIDTH = kGFXC_PVIOL_MAX_WIDTH + kLOW_UTILVIOL_MAX_WIDTH = kGFXC_PVIOL_MAX_WIDTH + + for key, value in violation_status.items(): + if not isinstance(value, list): + if value != "N/A": + if key == 'tviol_active' or key == 'xcp': + monitor_values[key] = value + else: + monitor_values[key] = self.helpers.unit_format(self.logger, violation_status[key], violation_status_unit) + else: + monitor_values[key] = violation_status[key] + else: + if num_partition != "N/A": + # these are one after another, in order to display each in sub-sections + new_xcp_dict = {} + for current_xcp in range(num_partition): + new_xcp_dict[f"xcp_{current_xcp}"] = self.helpers.unit_format(self.logger, value[current_xcp], "%") + monitor_values[key] = new_xcp_dict + else: + monitor_values[key] = value[0] if value else "N/A" + # save deep copy of monitor values, used later to grab xcp specific values + monitor_values_deepcopy = copy.deepcopy(monitor_values) + + self.logger.table_header += 'PVIOL'.rjust(kPVIOL_MAX_WIDTH, ' ') + self.logger.table_header += 'TVIOL'.rjust(kTVIOL_MAX_WIDTH, ' ') + self.logger.table_header += 'TVIOL_ACTIVE'.rjust(kTVIOL_ACTIVE_MAX_WIDTH, ' ') + self.logger.table_header += 'PHOT_TVIOL'.rjust(kPHOT_MAX_WIDTH, ' ') + self.logger.table_header += 'VR_TVIOL'.rjust(kVR_MAX_WIDTH, ' ') + self.logger.table_header += 'HBM_TVIOL'.rjust(kHBM_MAX_WIDTH, ' ') + self.logger.table_header += 'GFX_CLKVIOL'.rjust(kGFXC_MAX_WIDTH, ' ') + self.logger.table_header += 'GFXCLK_PVIOL'.rjust(kGFXC_PVIOL_MAX_WIDTH, ' ') + self.logger.table_header += 'GFXCLK_TVIOL'.rjust(kGFXC_TVIOL_MAX_WIDTH, ' ') + self.logger.table_header += 'GFXCLK_TOTALVIOL'.rjust(kGFXC_TOTALVIOL_MAX_WIDTH, ' ') + 
self.logger.table_header += 'LOW_UTILVIOL'.rjust(kLOW_UTILVIOL_MAX_WIDTH, ' ') + + # Print/capture by XCPs + if num_partition != "N/A" and partition_id == 0: + current_xcp = 0 + while (current_xcp in range(num_partition) or current_xcp == 0): + if not multiple_devices and watching_output and current_xcp == 0: + # Need to clear output for single device, otherwise while watching output + # XCP detail will continue stacking on top of each other + self.logger.clear_multiple_devices_output() + + if watching_output: + self.logger.store_output(args.gpu, 'timestamp', int(time.time())) + + if current_xcp != 0: # set all other values without XCP stats to N/A + self.logger.store_output(args.gpu, 'xcp', current_xcp) + monitor_values['pviol'] = "N/A" + monitor_values['tviol'] = "N/A" + monitor_values['tviol_active'] = "N/A" + monitor_values['phot_tviol'] = "N/A" + monitor_values['vr_tviol'] = "N/A" + monitor_values['hbm_tviol'] = "N/A" + monitor_values['gfx_clkviol'] = "N/A" + for k, _ in monitor_values.items(): # change other keys to "N/A" since we should have all applicable XCP stats + # eg. 
amd-smi monitor -p -t -V should only show XCP info for violations + # below primary device + if k != 'xcp' and k not in ['gfxclk_pviol', 'gfxclk_tviol', 'gfxclk_totalviol', 'low_utilviol']: + monitor_values[k] = "N/A" + + if isinstance(monitor_values_deepcopy['gfxclk_pviol'], dict): + monitor_values['gfxclk_pviol'] = monitor_values_deepcopy['gfxclk_pviol'][f"xcp_{current_xcp}"] + if isinstance(monitor_values_deepcopy['gfxclk_tviol'], dict): + monitor_values['gfxclk_tviol'] = monitor_values_deepcopy['gfxclk_tviol'][f"xcp_{current_xcp}"] + if isinstance(monitor_values_deepcopy['gfxclk_totalviol'], dict): + monitor_values['gfxclk_totalviol'] = monitor_values_deepcopy['gfxclk_totalviol'][f"xcp_{current_xcp}"] + if isinstance(monitor_values_deepcopy['low_utilviol'], dict): + monitor_values['low_utilviol'] = monitor_values_deepcopy['low_utilviol'][f"xcp_{current_xcp}"] + + if self.logger.is_human_readable_format(): + monitor_values['pviol'] = monitor_values['pviol'] + monitor_values['tviol'] = monitor_values['tviol'] + monitor_values['phot_tviol'] = monitor_values['phot_tviol'] + monitor_values['vr_tviol'] = monitor_values['vr_tviol'] + monitor_values['hbm_tviol'] = monitor_values['hbm_tviol'] + monitor_values['gfx_clkviol'] = monitor_values['gfx_clkviol'] + monitor_values['gfxclk_pviol'] = str(monitor_values['gfxclk_pviol']).replace('\'', '') + monitor_values['gfxclk_tviol'] = str(monitor_values['gfxclk_tviol']).replace('\'', '') + monitor_values['gfxclk_totalviol'] = str(monitor_values['gfxclk_totalviol']).replace('\'', '') + monitor_values['low_utilviol'] = str(monitor_values['low_utilviol']).replace('\'', '') + self.logger.store_output(args.gpu, 'values', monitor_values) + self.logger.store_multiple_device_output() + current_xcp += 1 + else: + self.logger.store_output(args.gpu, 'xcp', partition_id) + self.logger.store_output(args.gpu, 'values', monitor_values) + + # Store typical output for all commands (XCP data will be handled separately, eg. 
def xgmi(self, args, multiple_devices=False, gpu=None, metric=None, xgmi_source_status=None, xgmi_link_status=None):
    """ Display XGMI link metrics, per-GPU link port status and the XGMI link status matrix

        Only primary partitions are reported; devices for which
        helpers.is_primary_partition() is false are skipped.

        params:
            args - argparser args to pass to subcommand
            multiple_devices (bool) - True if checking for multiple devices
                (kept for signature parity with the other subcommands; not read here)
            gpu (device_handle) - device_handle for target device
            metric (bool) - Value override for args.metric
            xgmi_source_status (bool) - Value override for args.source_status
            xgmi_link_status (bool) - Value override for args.link_status

        return:
            Nothing
    """
    # Not supported with partitions

    # Set args.* to passed in arguments
    if gpu:
        args.gpu = gpu
    if metric:
        args.metric = metric
    if xgmi_link_status:
        args.link_status = xgmi_link_status
    if xgmi_source_status:
        args.source_status = xgmi_source_status

    # Handle No GPU passed
    if args.gpu is None:
        args.gpu = self.device_handles

    if not isinstance(args.gpu, list):
        args.gpu = [args.gpu]

    # Handle all args being false: show every section
    if not any([args.metric, args.link_status, args.source_status]):
        args.metric = True
        args.link_status = True
        args.source_status = True

    # Clear the table header
    self.logger.table_header = ''.rjust(7)

    if not self.group_check_printed:
        self.helpers.check_required_groups(check_render=True, check_video=False)
        self.group_check_printed = True

    # Populate the possible gpus and their bdfs
    xgmi_values = []
    for device_handle in args.gpu:
        if not self.helpers.is_primary_partition(device_handle):
            logging.debug(f"Skipping xgmi command due to non zero partition {device_handle}")
            continue

        logging.debug("check1 device_handle: %s", device_handle)
        gpu_id = self.helpers.get_gpu_id_from_device_handle(device_handle)
        gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(device_handle)
        xgmi_values.append({"gpu" : gpu_id,
                            "bdf" : gpu_bdf})
        # Populate header with just its gpu_id
        self.logger.table_header += f"GPU{gpu_id}".rjust(13)

    # Cache processor handles for each BDF; a failed lookup is stored as None
    # so later sections can still iterate every entry of xgmi_values.
    src_gpu_handles = {}
    for entry in xgmi_values:
        try:
            src_gpu_handles[entry['bdf']] = amdsmi_interface.amdsmi_get_processor_handle_from_bdf(entry['bdf'])
        except amdsmi_exception.AmdSmiLibraryException as e:
            logging.debug("Failed to get processor handle for %s | %s", entry['bdf'], e.get_error_info())
            src_gpu_handles[entry['bdf']] = None

    if args.metric:
        # prepend link metrics header to the table header
        link_metrics_header = " " + "bdf".ljust(14) + \
            "bit_rate".ljust(10) + "max_bandwidth".ljust(15) + \
            "link_type".ljust(11)
        self.logger.table_header = link_metrics_header + self.logger.table_header.strip()

        # Populate dictionary according to format
        for xgmi_dict in xgmi_values:
            src_gpu_id = xgmi_dict['gpu']
            src_gpu_bdf = xgmi_dict['bdf']
            src_gpu = src_gpu_handles.get(src_gpu_bdf)
            logging.debug("check2 device_handle: %s", src_gpu)
            # This should be the same order as the check1

            xgmi_dict['link_metrics'] = {
                "bit_rate" : "N/A",
                "max_bandwidth" : "N/A",
                "link_type" : "N/A",
                "links" : []
            }
            xgmi_metrics_info = {"links": []}

            try:
                xgmi_metrics_info = amdsmi_interface.amdsmi_get_link_metrics(src_gpu)
                bitrate = xgmi_metrics_info['links'][0]['bit_rate']
                max_bandwidth = xgmi_metrics_info['links'][0]['max_bandwidth']
            except amdsmi_exception.AmdSmiLibraryException as e:
                bitrate = "N/A"
                max_bandwidth = "N/A"
                logging.debug("Failed to get bitrate and bandwidth for GPU %s | %s", src_gpu_id,
                              e.get_error_info())
            except (IndexError, KeyError) as e:
                # The call succeeded but reported no usable first link; report
                # N/A instead of aborting the whole command.
                bitrate = "N/A"
                max_bandwidth = "N/A"
                logging.debug("No bitrate and bandwidth reported for GPU %s | %s", src_gpu_id, e)

            # Populate bitrate and max_bandwidth with units logic
            bw_unit = 'Gb/s'
            if self.logger.is_human_readable_format():
                xgmi_dict['link_metrics']['bit_rate'] = f"{bitrate} {bw_unit}"
                xgmi_dict['link_metrics']['max_bandwidth'] = f"{max_bandwidth} {bw_unit}"
            elif self.logger.is_json_format():
                xgmi_dict['link_metrics']['bit_rate'] = {"value" : bitrate,
                                                         "unit" : bw_unit}
                xgmi_dict['link_metrics']['max_bandwidth'] = {"value" : max_bandwidth,
                                                              "unit" : bw_unit}
            elif self.logger.is_csv_format():
                xgmi_dict['link_metrics']['bit_rate'] = bitrate
                xgmi_dict['link_metrics']['max_bandwidth'] = max_bandwidth

            # Populate link metrics
            for dest_gpu in args.gpu:
                if not self.helpers.is_primary_partition(dest_gpu):
                    continue

                dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu)
                dest_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(dest_gpu)
                dest_link_dict = {
                    "gpu" : dest_gpu_id,
                    "bdf" : dest_gpu_bdf,
                    "read" : 0,
                    "write" : 0,
                }

                found = False
                for link in xgmi_metrics_info['links']:
                    if link['bdf'] == dest_gpu_bdf:
                        # Accumulate read/write if multiple links have the same bdf
                        dest_link_dict['read'] += link['read']
                        dest_link_dict['write'] += link['write']
                        found = True
                if not found:
                    dest_link_dict['read'] = "N/A"
                    dest_link_dict['write'] = "N/A"
                else:
                    data_unit = 'KB'
                    if self.logger.is_human_readable_format():
                        # counters are multiplied by 1024 before the byte formatter is applied
                        dest_link_dict['read'] = self.helpers.convert_bytes_to_readable(dest_link_dict['read'] * 1024, True)
                        dest_link_dict['write'] = self.helpers.convert_bytes_to_readable(dest_link_dict['write'] * 1024, True)
                    elif self.logger.is_json_format():
                        dest_link_dict['read'] = {"value" : dest_link_dict['read'],
                                                  "unit" : data_unit}
                        dest_link_dict['write'] = {"value" : dest_link_dict['write'],
                                                   "unit" : data_unit}

                try:
                    link_type = amdsmi_interface.amdsmi_topo_get_link_type(src_gpu, dest_gpu)['type']
                    # Once any destination reports XGMI the source stays "XGMI"
                    if xgmi_dict['link_metrics']['link_type'] != "XGMI" and isinstance(link_type, int):
                        if link_type == amdsmi_interface.amdsmi_wrapper.AMDSMI_LINK_TYPE_INTERNAL:
                            xgmi_dict['link_metrics']['link_type'] = "UNKNOWN"
                        elif link_type == amdsmi_interface.amdsmi_wrapper.AMDSMI_LINK_TYPE_PCIE:
                            xgmi_dict['link_metrics']['link_type'] = "PCIE"
                        elif link_type == amdsmi_interface.amdsmi_wrapper.AMDSMI_LINK_TYPE_XGMI:
                            xgmi_dict['link_metrics']['link_type'] = "XGMI"
                except amdsmi_exception.AmdSmiLibraryException as e:
                    logging.debug("Failed to get link type for %s to %s | %s",
                                  self.helpers.get_gpu_id_from_device_handle(src_gpu),
                                  self.helpers.get_gpu_id_from_device_handle(dest_gpu),
                                  e.get_error_info())

                xgmi_dict['link_metrics']['links'].append(dest_link_dict)

        # Handle printing for tabular format
        if self.logger.is_human_readable_format():
            # Populate tabular output: one GPU row followed by a Read row and a Write row
            tabular_output = []
            for xgmi_dict in xgmi_values:
                link_metrics = xgmi_dict['link_metrics']
                tabular_output.append({"gpu#": f"GPU{xgmi_dict['gpu']}",
                                       "bdf": xgmi_dict['bdf'],
                                       "bit_rate": link_metrics['bit_rate'],
                                       "max_bandwidth": link_metrics['max_bandwidth'],
                                       "link_type": link_metrics['link_type']})

                read_output_dict = {"RW" : " Read"}
                write_output_dict = {"RW" : " Write"}
                for link in link_metrics['links']:
                    read_output_dict[f"bdf_{link['gpu']}"] = link["read"]
                    write_output_dict[f"bdf_{link['gpu']}"] = link["write"]
                tabular_output.append(read_output_dict)
                tabular_output.append(write_output_dict)

            # Print out the tabular output
            self.logger.multiple_device_output = tabular_output
            self.logger.table_title = "\nLINK METRIC TABLE"
            self.logger.print_output(multiple_device_enabled=True, tabular=True)

        self.logger.multiple_device_output = xgmi_values

        if self.logger.is_csv_format():
            self.logger.multiple_device_output = [
                self.logger.flatten_dict(elem, topology_override=True)
                for elem in self.logger.multiple_device_output
            ]

        if self.logger.is_json_format():
            self.logger.store_xgmi_metric_json_output.append(xgmi_values)
            # Only flush the JSON document here when no later section will add to it
            if not any([args.link_status, args.source_status]):
                self.logger.combine_arrays_to_json()
        elif not self.logger.is_human_readable_format():
            self.logger.print_output(multiple_device_enabled=True)

    if args.source_status:
        # Header modification
        self.logger.table_header = ''.rjust(7)
        current_header = " ".ljust(7) + \
            "bdf".ljust(14) + \
            "port_num".ljust(20)
        self.logger.table_header = current_header + self.logger.table_header.strip()
        # Process each GPU
        tabular_output = []
        for xgmi_dict in xgmi_values:
            src_gpu_id = xgmi_dict['gpu']
            src_gpu_bdf = xgmi_dict['bdf']
            src_gpu = src_gpu_handles.get(src_gpu_bdf)

            # Populate link statuses
            tabular_output_dict = {"gpu#": f"GPU{src_gpu_id}",
                                   "gpu": src_gpu_id,
                                   "bdf": src_gpu_bdf,
                                   "link_status": "N/A"}
            try:
                link_status = amdsmi_interface.amdsmi_get_gpu_xgmi_link_status(src_gpu)
                tabular_output_dict['link_status'] = link_status['status']
                # Human readable rows use the "GPU<n>" label, other formats the raw id
                if self.logger.is_human_readable_format():
                    del tabular_output_dict['gpu']
                else:
                    del tabular_output_dict['gpu#']
                tabular_output.append(tabular_output_dict)
                if self.logger.is_json_format():
                    self.logger.store_xgmi_source_status_json_output.append(tabular_output_dict)
            except amdsmi_exception.AmdSmiLibraryException as e:
                # 'link_metrics' only exists when the metric section ran, so
                # create it on demand instead of raising KeyError.
                xgmi_dict.setdefault('link_metrics', {})['link_status'] = {"status": "failed"}
                logging.debug("Failed to get XGMI link status for GPU %s | %s", src_gpu_id, e.get_error_info())

        # populate link status data for output
        if self.logger.is_human_readable_format() and xgmi_values:
            xgmi_values[-1]['link_status'] = tabular_output
        self.logger.multiple_device_output = tabular_output
        self.logger.table_title = "\nGPU LINK PORT STATUS"
        if not self.logger.is_json_format():
            self.logger.print_output(multiple_device_enabled=True, tabular=True)
        self.logger.clear_multiple_devices_output()
        if self.logger.is_json_format():
            if not args.link_status:
                self.logger.combine_arrays_to_json()

    if args.link_status:
        # XGMI LINK STATUS for src_gpu to dest_gpu
        header = [" ".ljust(8), "bdf".ljust(15)] + [f"GPU{d['gpu']}".ljust(14) for d in xgmi_values]
        self.logger.table_header = "".join(header)
        self.logger.table_title = "\nXGMI LINK STATUS"

        # Per-source list of link statuses, indexed by link number
        src_link_status_map = {}
        for gpu_dict in xgmi_values:
            src_gpu_bdf = gpu_dict['bdf']
            src_gpu = src_gpu_handles.get(src_gpu_bdf)
            try:
                link_status = amdsmi_interface.amdsmi_get_gpu_xgmi_link_status(src_gpu)
                src_link_status_map[src_gpu_bdf] = link_status['status']
            except amdsmi_exception.AmdSmiLibraryException:
                src_link_status_map[src_gpu_bdf] = ["N/A"] * amdsmi_interface.AMDSMI_MAX_NUM_XGMI_LINKS

        tabular_output = []
        for src_xgmi_dict in xgmi_values:
            src_gpu_id = src_xgmi_dict['gpu']
            src_gpu_bdf = src_xgmi_dict['bdf']
            src_gpu = src_gpu_handles.get(src_gpu_bdf)
            try:
                xgmi_metrics_info = amdsmi_interface.amdsmi_get_link_metrics(src_gpu)
            except amdsmi_exception.AmdSmiLibraryException:
                xgmi_metrics_info = {"links": []}
            # First column: GPU# + tab + bdf, then status for each dest bdf
            if self.logger.is_human_readable_format():
                row_dict = {"": f"GPU{src_gpu_id}\t{src_gpu_bdf}".ljust(20)}
            else:
                row_dict = {"gpu": f"GPU{src_gpu_id}", "bdf": src_gpu_bdf}
            json_status = []
            src_statuses = src_link_status_map.get(src_gpu_bdf, [])
            for dest_xgmi_dict in xgmi_values:
                dest_gpu_bdf = dest_xgmi_dict['bdf']

                # Find all link indexes in xgmi_metrics_info for this destination
                link_indexes = [idx for idx, link in enumerate(xgmi_metrics_info['links'])
                                if link['bdf'] == dest_gpu_bdf]

                # Use the found link indexes to get the status if valid
                if link_indexes and len(link_indexes) <= len(src_statuses):
                    statuses = [str(src_statuses[link_idx])
                                for link_idx in link_indexes
                                if link_idx < len(src_statuses)]
                    # Join multiple statuses with "/"
                    status = "/".join(statuses) if statuses else "N/A"
                elif dest_gpu_bdf == src_gpu_bdf:
                    status = "SELF"
                else:
                    status = "N/A"

                if self.logger.is_human_readable_format():
                    row_dict[dest_gpu_bdf.ljust(14)] = str(status).ljust(14)
                else:
                    row_dict[dest_gpu_bdf] = status
                json_status.append(status)
            tabular_output.append(row_dict)
            if self.logger.is_json_format():
                self.logger.store_xgmi_link_status_json_output.append({
                    "gpu": src_gpu_id,
                    "bdf": src_gpu_bdf,
                    "link_status": json_status
                })

        if not self.logger.is_json_format():
            self.logger.multiple_device_output = tabular_output
            self.logger.print_output(multiple_device_enabled=True, tabular=True)

        self.logger.clear_multiple_devices_output()

        if self.logger.is_json_format():
            self.logger.combine_arrays_to_json()

    if self.logger.is_human_readable_format():
        # Populate the legend output
        legend_parts = [
            "\n\nLegend:",
            " SELF = Current GPU",
            " N/A = Not supported",
            " U / D / X = Link is Up / Down / Disabled",
            " Read / Write = GPU Metric Accumulated Read / Write"
        ]
        legend_output = "\n".join(legend_parts)

        if self.logger.destination == 'stdout':
            print(legend_output)
        else:
            with self.logger.destination.open('a', encoding="utf-8") as output_file:
                output_file.write(legend_output + '\n')
def partition(self, args, multiple_devices=False, gpu=None, current=None, memory=None, accelerator=None):
    """ Display partition information for the target GPU
        param:
            args - argparser args to pass to subcommand
            multiple_devices (bool) - True if checking for multiple devices
                (kept for signature parity with the other subcommands; not read here)
            gpu (device_handle) - device_handle for target device
            current - boolean which dictates whether the current partition information is shown
            memory - boolean which dictates whether the memory partition information is shown
            accelerator - boolean which dictates whether the accelerator partition information is shown
        returns:
            nothing
    """

    if gpu:
        args.gpu = gpu
    if args.gpu is None:
        args.gpu = self.device_handles
    if not isinstance(args.gpu, list):
        args.gpu = [args.gpu]
    if current:
        args.current = current
    if memory:
        args.memory = memory
    if accelerator:
        args.accelerator = accelerator

    if not self.group_check_printed:
        self.helpers.check_required_groups(check_render=True, check_video=False)
        self.group_check_printed = True

    ###########################################
    # amd-smi partition (no args)             #
    ###########################################
    # if no args are present, then everything should be displayed
    if not args.current and not args.memory and not args.accelerator:
        args.current = True
        args.memory = True
        args.accelerator = True

    ###########################################
    # amd-smi partition --current             #
    ###########################################
    if args.current:
        self.logger.table_header = ''.rjust(7)
        current_header = "GPU_ID".ljust(8) + \
            "MEMORY".ljust(8) + \
            "ACCELERATOR_TYPE".ljust(18) + \
            "ACCELERATOR_PROFILE_INDEX".ljust(27) + \
            "PARTITION_ID".ljust(14)
        self.logger.table_header = current_header + self.logger.table_header.strip()

        tabular_output = []
        for gpu in args.gpu:
            gpu_id = self.helpers.get_gpu_id_from_device_handle(gpu)
            try:
                partition_dict = amdsmi_interface.amdsmi_get_gpu_accelerator_partition_profile(gpu)
                partition_id = str(partition_dict['partition_id']).replace("[", "").replace("]", "").replace(" ", "")
                profile_type = partition_dict['partition_profile']['profile_type']
                profile_index = partition_dict['partition_profile']['profile_index']
            except amdsmi_exception.AmdSmiLibraryException as e:
                # The query failed, so there is no partition_dict for this GPU.
                # Report N/A rather than reading an unbound name or the
                # previous GPU's stale result.
                profile_type = "N/A"
                profile_index = "N/A"
                partition_id = "N/A"
                logging.debug("Failed to get accelerator partition profile for GPU %s | %s", gpu_id, e.get_error_info())
            try:
                current_mem_cap = amdsmi_interface.amdsmi_get_gpu_memory_partition(gpu)
            except amdsmi_exception.AmdSmiLibraryException as e:
                current_mem_cap = "N/A"
                logging.debug("Failed to get current memory partition capabilties for GPU %s | %s", gpu_id, e.get_error_info())

            # a profile_type of 0 is displayed as N/A
            if profile_type == 0:
                profile_type = "N/A"

            tabular_output_dict = {"gpu_id": gpu_id,
                                   "memory": current_mem_cap,
                                   "accelerator_type": profile_type,
                                   "accelerator_profile_index": profile_index,
                                   "partition_id": partition_id}
            tabular_output.append(tabular_output_dict)

        self.logger.multiple_device_output = tabular_output
        self.logger.table_title = "CURRENT_PARTITION"
        if self.logger.is_json_format():
            self.logger.store_current_partition_json_output.extend(tabular_output)
            # flush the JSON document only if no later section will add to it
            if not (args.memory or args.accelerator):
                self.logger.combine_arrays_to_json()
        else:
            self.logger.print_output(multiple_device_enabled=True, tabular=True, dynamic=True)
        self.logger.clear_multiple_devices_output()

    ###########################################
    # amd-smi partition --memory              #
    ###########################################
    if args.memory:
        tabular_output = []
        self.logger.table_header = ''.rjust(7)
        current_header = "GPU_ID".ljust(8) + \
            "MEMORY_PARTITION_CAPS".ljust(23) + \
            "CURRENT_MEMORY_PARTITION".ljust(26)
        self.logger.table_header = current_header + self.logger.table_header.strip()

        for gpu in args.gpu:
            gpu_id = self.helpers.get_gpu_id_from_device_handle(gpu)
            mem_caps_str = "N/A"
            current_memory_partition = "N/A"
            try:
                memory_partition_config = amdsmi_interface.amdsmi_get_gpu_memory_partition_config(gpu)
                mem_caps_str = str(memory_partition_config['partition_caps']).replace("]", "").replace("[", "").replace("\'", "").replace(" ", "")
                current_memory_partition = memory_partition_config['mp_mode']
            except amdsmi_exception.AmdSmiLibraryException as e:
                logging.debug("Failed to get current memory partition for GPU %s | %s", gpu_id, e.get_error_info())

            tabular_output_dict = {"gpu_id": gpu_id,
                                   "memory_partition_caps": mem_caps_str,
                                   "current_memory_partition": current_memory_partition}
            tabular_output.append(tabular_output_dict)

        self.logger.multiple_device_output = tabular_output
        self.logger.table_title = "\nMEMORY_PARTITION"
        if self.logger.is_json_format():
            self.logger.store_memory_partition_json_output.extend(tabular_output)
            if not args.accelerator:
                self.logger.combine_arrays_to_json()
        else:
            self.logger.print_output(multiple_device_enabled=True, tabular=True, dynamic=True)
        self.logger.clear_multiple_devices_output()

    ###########################################
    # amd-smi partition --accelerator         #
    ###########################################
    if args.accelerator:
        self.logger.table_header = ''.rjust(7)
        current_header = "GPU_ID".ljust(8) + \
            "PROFILE_INDEX".ljust(15) + \
            "MEMORY_PARTITION_CAPS".ljust(23) + \
            "ACCELERATOR_TYPE".ljust(18) + \
            "PARTITION_ID".ljust(17) + \
            "NUM_PARTITIONS".ljust(16) + \
            "NUM_RESOURCES".ljust(15) + \
            "RESOURCE_INDEX".ljust(16) + \
            "RESOURCE_TYPE".ljust(15) + \
            "RESOURCE_INSTANCES".ljust(20) + \
            "RESOURCES_SHARED".ljust(18)
        self.logger.table_header = current_header + self.logger.table_header.strip()

        tabular_output = []
        prev_gpu_id = "N/A"
        # Defined up front so the profile loop can never read it unassigned
        # when the first GPU's profile query fails.
        current_partition_id = "N/A"
        for gpu in args.gpu:
            gpu_id = self.helpers.get_gpu_id_from_device_handle(gpu)
            # Fallback row, emitted as-is when the profile config query fails
            tabular_output_dict = {"gpu_id": gpu_id,
                                   "profile_index": "N/A",
                                   "memory_partition_caps": "N/A",
                                   "accelerator_type": "N/A",
                                   "partition_id": "0",
                                   "num_partitions": "N/A",
                                   "num_resources": "N/A",
                                   "resource_index": "N/A",
                                   "resource_type": "N/A",
                                   "resource_instances": "N/A",
                                   "resources_shared": "N/A"}
            try:
                partition_dict = amdsmi_interface.amdsmi_get_gpu_accelerator_partition_profile(gpu)
                partition_id = str(partition_dict['partition_id']).replace("[", "").replace("]", "").replace(" ", "")
                current_accelerator_type = partition_dict['partition_profile']['profile_type']
                tabular_output_dict["partition_id"] = partition_id

                # save only the primary GPU node's partition_id (the 1st listed device; non N/A one)
                # else keep current_partition_id unchanged for displaying in accelerator resource's output
                if partition_id != "N/A":
                    current_partition_id = partition_id

            except amdsmi_exception.AmdSmiLibraryException as e:
                current_accelerator_type = "N/A"
                logging.debug("Failed to get accelerator partition profile for GPU %s | %s", gpu_id, e.get_error_info())

            try:
                partition_config_dict = amdsmi_interface.amdsmi_get_gpu_accelerator_partition_profile_config(gpu)
                logging.debug("amdsmi_commands.py | partition_config_dict: " + str(json.dumps(partition_config_dict, indent=4)))
                num_profiles = partition_config_dict['num_profiles']
                num_resource_profiles = partition_config_dict['num_resource_profiles']

                resource_index = 0
                prev_accelerator_type = "N/A"
                for p in range(0, num_profiles):
                    profile = partition_config_dict['profiles'][p]
                    accelerator_type = profile['profile_type']
                    profile_index = profile['profile_index']
                    num_partitions = profile['num_partitions']
                    mem_caps_str = str(profile['memory_caps']).replace("]", "").replace("[", "").replace("\'", "").replace(" ", "")
                    # 2 modifications based on the current accelerator type:
                    # 1) display a * for the current accelerator type, otherwise display as normal
                    # 2) display partition id only for the current accelerator profile (the *'d one)
                    if current_accelerator_type == accelerator_type:
                        accelerator_type = accelerator_type + "*"
                        partition_id = current_partition_id
                    else:
                        partition_id = "N/A"
                    # only display the first instance of the gpu_id, rest are empty strings
                    if prev_gpu_id != gpu_id:
                        tabular_gpu_id = gpu_id
                        prev_gpu_id = gpu_id
                    else:
                        tabular_gpu_id = ""
                    logging.debug("amdsmi_commands.py | tabular_gpu_id: " + str(tabular_gpu_id))

                    if num_resource_profiles == 0:
                        if prev_accelerator_type != accelerator_type:  # only print the first instance of the resources
                            tabular_output_dict = {"gpu_id": tabular_gpu_id,
                                                   "profile_index": profile_index,
                                                   "memory_partition_caps": mem_caps_str,
                                                   "accelerator_type": accelerator_type,
                                                   "partition_id": partition_id,
                                                   "num_partitions": num_partitions,
                                                   "num_resources": num_resource_profiles,
                                                   "resource_index": "N/A",
                                                   "resource_type": "N/A",
                                                   "resource_instances": "N/A",
                                                   "resources_shared": "N/A"}
                            prev_accelerator_type = accelerator_type
                        tabular_output.append(tabular_output_dict)
                        continue

                    for r in range(0, num_resource_profiles):
                        logging.debug("amdsmi_commands.py | p: " + str(p) + "; r: " + str(r)
                                      + "; accelerator_type: " + str(accelerator_type))
                        resource = profile['resources'][r]
                        resource_type = resource['resource_type']
                        resource_instances = resource['partition_resource']
                        resources_shared = resource['num_partitions_share_resource']
                        if prev_accelerator_type != accelerator_type:  # only print the first instance of the resources
                            tabular_output_dict = {"gpu_id": tabular_gpu_id,
                                                   "profile_index": profile_index,
                                                   "memory_partition_caps": mem_caps_str,
                                                   "accelerator_type": accelerator_type,
                                                   "partition_id": partition_id,
                                                   "num_partitions": num_partitions,
                                                   "num_resources": num_resource_profiles,
                                                   "resource_index": resource_index,
                                                   "resource_type": resource_type,
                                                   "resource_instances": resource_instances,
                                                   "resources_shared": resources_shared}
                            prev_accelerator_type = accelerator_type
                        else:
                            # continuation row: profile columns left blank
                            tabular_output_dict = {"gpu_id": "",
                                                   "profile_index": "",
                                                   "memory_partition_caps": "",
                                                   "accelerator_type": "",
                                                   "partition_id": "",
                                                   "num_partitions": "",
                                                   "num_resources": "",
                                                   "resource_index": resource_index,
                                                   "resource_type": resource_type,
                                                   "resource_instances": resource_instances,
                                                   "resources_shared": resources_shared}
                        resource_index += 1
                        tabular_output.append(tabular_output_dict)
            except amdsmi_exception.AmdSmiLibraryException:
                # config query failed: emit the most recent row dict (the N/A
                # fallback unless the loop above already replaced it)
                tabular_output.append(tabular_output_dict)

        self.logger.multiple_device_output = tabular_output
        self.logger.table_title = "\nACCELERATOR_PARTITION_PROFILES"
        # only display warning message if not running as root or with sudo
        if os.geteuid() != 0:
            self.logger.warning_message = """
***************************************************************************
** WARNING: **
** ACCELERATOR_PARTITION_PROFILES requires sudo/root permissions to run. **
** Please run the command with sudo permissions to get accurate results. **
***************************************************************************
"""
        if self.logger.is_json_format():
            self.logger.store_partition_profiles_json_output.extend(tabular_output)
        else:
            self.logger.print_output(multiple_device_enabled=True, tabular=True, dynamic=True)
        self.logger.clear_multiple_devices_output()
        self.logger.warning_message = ""  # clear the warning message

        #########################################
        # print accelerator partition resources #
        #########################################
        self.logger.table_header = ''.rjust(7)
        current_header = "RESOURCE_INDEX".ljust(16) + \
            "RESOURCE_TYPE".ljust(15) + \
            "RESOURCE_INSTANCES".ljust(20) + \
            "RESOURCES_SHARED".ljust(18)
        self.logger.table_header = current_header + self.logger.table_header.strip()

        tabular_output = []
        for gpu in args.gpu:
            tabular_output_dict = {"resource_index": "N/A",
                                   "resource_type": "N/A",
                                   "resource_instances": "N/A",
                                   "resources_shared": "N/A"}
            try:
                partition_config_dict = amdsmi_interface.amdsmi_get_gpu_accelerator_partition_profile_config(gpu)
                logging.debug("amdsmi_commands.py | partition_config_dict: " + str(json.dumps(partition_config_dict, indent=4)))
                num_profiles = partition_config_dict['num_profiles']
                num_resource_profiles = partition_config_dict['num_resource_profiles']

                if num_resource_profiles == 0:
                    tabular_output.append(tabular_output_dict)
                    continue

                resource_index = 0
                for p in range(0, num_profiles):
                    for r in range(0, num_resource_profiles):
                        resource = partition_config_dict['profiles'][p]['resources'][r]
                        tabular_output_dict = {
                            "resource_index": resource_index,
                            "resource_type": resource['resource_type'],
                            "resource_instances": resource['partition_resource'],
                            "resources_shared": resource['num_partitions_share_resource']}
                        resource_index += 1
                        tabular_output.append(tabular_output_dict)
            except amdsmi_exception.AmdSmiLibraryException:
                tabular_output.append(tabular_output_dict)

        self.logger.multiple_device_output = tabular_output
        self.logger.table_title = "\nACCELERATOR_PARTITION_RESOURCES"
        if self.logger.is_json_format():
            self.logger.store_partition_resources_json_output.extend(tabular_output)
        else:
            self.logger.print_output(multiple_device_enabled=True, tabular=True, dynamic=True)
        if self.logger.is_json_format():
            self.logger.combine_arrays_to_json()
        self.logger.clear_multiple_devices_output()

        if self.logger.is_human_readable_format():
            # print legend
            legend_parts = [
                "\n\nLegend:",
                " * = Current mode"]
            legend_output = "\n".join(legend_parts)
            if self.logger.destination == 'stdout':
                print(legend_output)
            else:
                with self.logger.destination.open('a', encoding="utf-8") as output_file:
                    output_file.write(legend_output + '\n')
def ras(self, args, multiple_devices=False, gpu=None, cper=None, afid=None,
        severity=None, folder=None, file_limit=None, cper_file=None, follow=None):
    """
    Retrieve and process CPER (RAS) entries for the target GPU(s).

    Expected command (all options only):
        amd-smi ras --cper --severity=nonfatal-uncorrected,fatal --folder --file-limit=1000 --follow

    Every keyword argument other than ``multiple_devices`` is a truthy
    override for the attribute of the same name on ``args``;
    ``multiple_devices`` is accepted for signature parity and not read here.

    Behaviour:
        * ``args.afid``: print the AFIDs decoded from ``args.cper_file`` and
          return; raises AmdSmiInvalidCommandException when ``--cper-file``
          was not given.
        * ``args.cper``: each device gets a cursor starting at 0
          (``args.cursor``) and ``helpers.ras_cper`` is called once per
          device; with ``args.follow`` this repeats once a second until
          interrupted.
        * Nothing is done when ``args.cper`` is false or no GPU is selected.

    A warning is printed when none of the selected devices is a primary
    partition (partition id 0).
    """

    # Apply keyword overrides onto args.
    if gpu:
        args.gpu = gpu
    if cper:
        args.cper = cper
    if afid:
        args.afid = afid
    if severity:
        args.severity = severity
    if folder:
        args.folder = folder
    if file_limit:
        args.file_limit = file_limit
    if cper_file:
        args.cper_file = cper_file
    if follow:
        args.follow = follow
    if args.gpu is None:
        args.gpu = self.device_handles

    if args.afid:
        if not args.cper_file:
            command = " ".join(sys.argv[1:])
            message = f"Command '{command}' requires '--cper-file'. Run '--help' for more info."
            raise AmdSmiInvalidCommandException(command,
                                                self.logger.format,
                                                message)
        afids = self.helpers.pvtDumpAfids(args.cper_file)
        print(' '.join(map(str, afids)))
        return

    if not self.group_check_printed:
        self.helpers.check_required_groups(check_render=True, check_video=True)
        self.group_check_printed = True

    if not args.cper:
        return

    if not args.gpu:
        return

    if not isinstance(args.gpu, list):
        args.gpu = [args.gpu]

    # one read cursor per device, all starting at 0
    args.cursor = [0] * len(args.gpu)

    # Warn only when no selected device is a primary partition; in that case
    # collect the primary GPU ids that own the selected partitions.
    partition_warning_flag = True
    primary_partition_gpu_ids = set()
    for device_handle in args.gpu:
        if self.helpers.get_partition_id(device_handle) == 0:
            # a single primary partition is enough to suppress the warning
            partition_warning_flag = False
            break
        primary_partition_gpu_id = self.helpers.get_primary_partition_gpu_id(device_handle)
        if primary_partition_gpu_id is not None:
            primary_partition_gpu_ids.add(primary_partition_gpu_id)

    if partition_warning_flag:
        print("WARNING: CPER files are only available on primary partitions")
        # Only suggest devices when at least one primary id was resolved;
        # sorted so the hint is stable across runs (set order is unspecified).
        if primary_partition_gpu_ids:
            primary_partitions_str = " ".join(f"GPU{gpu_id}" for gpu_id in sorted(primary_partition_gpu_ids))
            if len(primary_partition_gpu_ids) > 1:
                print(f"Try with primary partitions {primary_partitions_str}", end="")
            else:
                print(f"Try with primary partition {primary_partitions_str}", end="")
            print()

    while True:
        for idx, device_handle in enumerate(args.gpu):
            self.helpers.ras_cper(args, device_handle, self.logger, idx)
        if not args.follow:
            break
        time.sleep(1)
".join(f"GPU{gpu_id}" for gpu_id in primary_partition_gpu_ids) + + print("WARNING: CPER files are only available on primary partitions") + if len(primary_partition_gpu_ids) > 1: + print(f"Try with primary partitions {primary_partitions_str}",end="") + else: + print(f"Try with primary partition {primary_partitions_str}",end="") + + print() + + while True: + for idx, device_handle in enumerate(args.gpu): + self.helpers.ras_cper(args, device_handle, self.logger, idx) + if not args.follow: + break + time.sleep(1) + + + def node(self, args, multiple_devices=False, nodes=None, power_management=None): + """List node informations + + Args: + args (Namespace): Namespace containing the parsed CLI args + multiple_devices (bool, optional): True if checking for multiple devices. + Defaults to False. + + Returns: + None: Print output via AMDSMILogger to destination + """ + # Set args.* to passed in arguments + if nodes: + args.nodes = nodes + if power_management: + args.power_management = power_management + if getattr(args, 'nodes', None) is None: + args.nodes = self.node_handle + + if not self.group_check_printed: + self.helpers.check_required_groups(check_render=True, check_video=False) + self.group_check_printed = True + + # Get NPM info + if args.nodes is not None: + try: + npm_info = amdsmi_interface.amdsmi_get_npm_info(args.nodes) + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("amdsmi_get_npm_info failed: %s", e.get_error_info()) + npm_info = "N/A" + else: + logging.debug('No node handle available to query NPM info') + npm_info = "N/A" + + # Log outputs + npm_dict = {"limit": "N/A", "status": "N/A"} + power_unit ="W" + + limit = "N/A" + if isinstance(npm_info, dict): + limit = npm_info.get('limit', "N/A") + status = npm_info.get('status', npm_info.get('current', "N/A")) + + if limit !="N/A": + npm_dict['limit'] = limit + status = "DISABLED" if status == amdsmi_interface.amdsmi_wrapper.AMDSMI_NPM_STATUS_DISABLED else "ENABLED" + 
npm_dict.update({"status": status}) + if self.logger.is_human_readable_format() and self.logger.destination == 'stdout': + print(f"NODE:\n POWER_MANAGEMENT:\n LIMIT: {npm_dict.get('limit', 'N/A')} {power_unit}\n STATUS: {npm_dict.get('status', 'N/A')}") + else: + if self.logger.is_csv_format(): + csv_dict = {} + csv_dict['limit'] = npm_dict.get('limit', "N/A") + csv_dict['status'] = npm_dict.get('status', "N/A") + self.logger.output = csv_dict + else: + # For JSON and human readable format with file output + npm_dict["limit"] = self.helpers.unit_format(self.logger, limit, power_unit) + self.logger.output = {'node': {'power_management': npm_dict}} + if multiple_devices: + self.logger.store_multiple_device_output() + return + self.logger.print_output() + + + def default(self, args): + """Display the default amdsmi view when no args are given.""" + + # check groups first + if not self.group_check_printed: + self.helpers.check_required_groups(check_render=True, check_video=False) + self.group_check_printed = True + + processors = amdsmi_interface.amdsmi_get_processor_handles() + version_info = {"amd-smi": "N/A", + "amdgpu version": "N/A", + "fw pldm version": "N/A", + "vbios version": "N/A", + "rocm version": (False, "N/A")} + version_info['rocm version'] = amdsmi_interface.amdsmi_get_rocm_version() + try: + version_info["amdgpu version"] = amdsmi_interface.amdsmi_get_gpu_driver_info(processors[0]) + except amdsmi_exception.AmdSmiLibraryException as e: + version_info["amdgpu version"] = "N/A" + logging.debug("Failed to get driver info for gpu: %s", e.get_error_info()) + try: + fw_info = amdsmi_interface.amdsmi_get_fw_info(processors[0]) + for fw in fw_info['fw_list']: + if "pldm" in fw.keys(): + version_info['fw pldm version'] = fw['pldm'] + # we only need to find one of them + break + except amdsmi_exception.AmdSmiLibraryException as e: + version_info['fw pldm version'] = "N/A" + logging.debug("Failed to get fw pldm info for gpu: %s", e.get_error_info()) + try: + 
version_info['vbios version'] = amdsmi_interface.amdsmi_get_gpu_vbios_info(processors[0])["version"] + if version_info['vbios version'] == "": + version_info['vbios version'] = "N/A" + except amdsmi_exception.AmdSmiLibraryException as e: + version_info['vbios version'] = "N/A" + logging.debug("Failed to get vbios info for gpu: %s", e.get_error_info()) + + version_info["amd-smi"] = f'{__version__}' + + default_table_info_dict = {} + default_table_info_dict.update({"version_info": version_info}) + + gpu_info_list = [] + all_process_list = [] + + # get info for each processor to display in default output + for processor in processors: + gpu_info_dict = {} + + gpu_id = self.helpers.get_gpu_id_from_device_handle(processor) + gpu_info_dict.update({"gpu_id": gpu_id}) + # get common gpu_metrics first + try: + gpu_metrics = amdsmi_interface.amdsmi_get_gpu_metrics_info(processor) + except amdsmi_exception.AmdSmiLibraryException as e: + gpu_metrics = amdsmi_interface._NA_amdsmi_get_gpu_metrics_info() + + # partition info + try: + current_mem = amdsmi_interface.amdsmi_get_gpu_memory_partition(processor) + except amdsmi_exception.AmdSmiLibraryException as e: + current_mem = "N/A" + try: + current_comp = amdsmi_interface.amdsmi_get_gpu_compute_partition(processor) + except amdsmi_exception.AmdSmiLibraryException as e: + current_comp = "N/A" + if current_comp == "N/A" or current_mem == "N/A": + partition_mode = "N/A" + else: + partition_mode = f"{current_comp}/{current_mem}" + gpu_info_dict.update({"partition_mode": partition_mode}) + + # GPU name market name and OAM ID + try: + asic_info = amdsmi_interface.amdsmi_get_gpu_asic_info(processor) + market_name = asic_info['market_name'] + oam_id = asic_info['oam_id'] + # get num_cu now for use later + total_num_cu = float(asic_info['num_compute_units']) + except amdsmi_exception.AmdSmiLibraryException as e: + market_name = "N/A" + oam_id = "N/A" + total_num_cu = "N/A" + gpu_info_dict.update({"market_name": market_name}) + 
gpu_info_dict.update({"oam_id": oam_id}) + + # bdf + try: + bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(processor) + # if the len of the bdf is not 12, then invalid values are being populated. + if len(bdf) != 12: + bdf = "N/A" + except amdsmi_exception.AmdSmiLibraryException as e: + bdf = "N/A" + gpu_info_dict.update({"bdf": bdf}) + + # HIP ID + try: + enum_info = amdsmi_interface.amdsmi_get_gpu_enumeration_info(processor) + hip_id = enum_info['hip_id'] + except amdsmi_exception.AmdSmiLibraryException as e: + hip_id = "N/A" + gpu_info_dict.update({"hip_id": hip_id}) + + # mem utilization, GPU utilization, power usage, and temperature from gpu_metrics + if gpu_metrics != "N/A": + mem_util = gpu_metrics['average_umc_activity'] + gfx_util = gpu_metrics['average_gfx_activity'] + if gpu_metrics['current_socket_power'] != "N/A": + current_power = gpu_metrics['current_socket_power'] + else: + current_power = gpu_metrics['average_socket_power'] + temperature = gpu_metrics['temperature_hotspot'] + else: + mem_util = "N/A" + gfx_util = "N/A" + current_power = "N/A" + temperature = "N/A" + gpu_info_dict.update({"mem_util": mem_util}) + gpu_info_dict.update({"gfx_util": gfx_util}) + gpu_info_dict.update({"temp": temperature}) + + + # rest of power usage info; Will assume we're always trying to get PPT0 for now + try: + power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(processor, 0) + socket_power_limit = self.helpers.convert_SI_unit(power_cap_info['power_cap'], AMDSMIHelpers.SI_Unit.MICRO) + power_usage = {"current_power": current_power, "power_limit": socket_power_limit} + except amdsmi_exception.AmdSmiLibraryException as e: + power_usage = "N/A" + gpu_info_dict.update({"power_usage": power_usage}) + + # memory usage + try: + total_vram = amdsmi_interface.amdsmi_get_gpu_memory_total(processor, amdsmi_interface.AmdSmiMemoryType.VRAM) // (1024*1024) + used_vram = amdsmi_interface.amdsmi_get_gpu_memory_usage(processor, amdsmi_interface.AmdSmiMemoryType.VRAM) // 
(1024*1024) + mem_usage = {"used_vram": used_vram, "total_vram": total_vram} + except amdsmi_exception.AmdSmiLibraryException as e: + mem_usage = "N/A" + gpu_info_dict.update({"mem_usage": mem_usage}) + + # uncorrectable ECC errors + try: + ecc_count = amdsmi_interface.amdsmi_get_gpu_total_ecc_count(processor) + uncorrectable = ecc_count.pop('uncorrectable_count') + except amdsmi_exception.AmdSmiLibraryException as e: + uncorrectable = "N/A" + gpu_info_dict.update({"uncorr_ecc": uncorrectable}) + + # Fan usage + try: + fan_speed = amdsmi_interface.amdsmi_get_gpu_fan_speed(processor, 0) + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get fan speed for gpu %s | %s", processor, e.get_error_info()) + fan_speed = "N/A" + try: + fan_max = amdsmi_interface.amdsmi_get_gpu_fan_speed_max(processor, 0) + fan_usage = "N/A" + if fan_max > 0 and fan_speed != "N/A": + fan_usage = round((float(fan_speed) / float(fan_max)) * 100, 2) + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get max fan speed for gpu %s | %s", processor, e.get_error_info()) + fan_usage = "N/A" + gpu_info_dict.update({"fan": fan_usage}) + + gpu_info_list.append(gpu_info_dict) + + # Running Processes + try: + raw_process_list = amdsmi_interface.amdsmi_get_gpu_process_list(processor) + for proc in raw_process_list: + proc_info_dict = {"gpu": "N/A", "pid": "N/A", "name": "N/A","gtt": "N/A", "vram": "N/A", "mem_usage": "N/A", "cu_occupancy": "N/A"} + proc_info_dict['gpu'] = gpu_id + proc_info_dict['pid'] = proc['pid'] + proc_info_dict['name'] = proc['name'] + proc_info_dict['gtt'] = self.helpers.convert_bytes_to_readable(proc['memory_usage']['gtt_mem']) + proc_info_dict['vram'] = self.helpers.convert_bytes_to_readable(proc['memory_usage']['vram_mem']) + proc_info_dict['mem_usage'] = self.helpers.convert_bytes_to_readable(proc['mem']) + # Handle cu_occupancy conversion safely + try: + if proc['cu_occupancy'] != "N/A" and total_num_cu != "N/A": + 
num_cu = float(proc['cu_occupancy']) + proc_info_dict['cu_occupancy'] = {"current_cu": num_cu, "total_num_cu": total_num_cu} + else: + proc_info_dict['cu_occupancy'] = {"current_cu": "N/A", "total_num_cu": total_num_cu} + except (ValueError, TypeError): + proc_info_dict['cu_occupancy'] = {"current_cu": "N/A", "total_num_cu": total_num_cu} + + all_process_list.append(proc_info_dict) + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get process list for gpu %s | %s", gpu_id, e.get_error_info()) + + default_table_info_dict.update({f"gpu_info_list": gpu_info_list}) + default_table_info_dict.update({"processes": all_process_list}) + + if self.logger.is_json_format(): + self.logger.output = default_table_info_dict + self.logger.print_output() + elif self.logger.is_csv_format(): + self.logger.multiple_device_output = default_table_info_dict + self.logger.print_output(multiple_device_enabled=True, tabular=True, dynamic=True) + else: + self.logger.print_default_output(default_table_info_dict) + + + def _event_thread(self, commands, i): + devices = commands.device_handles + if len(devices) == 0: + print("No GPUs on machine") + return + + # Check that KFD permissions are available + if not self.group_check_printed: + self.helpers.check_required_groups(check_render=True, check_video=False) + self.group_check_printed = True + + device = devices[i] + listener = amdsmi_interface.AmdSmiEventReader(device, + amdsmi_interface.AmdSmiEvtNotificationType) + values_dict = {} + + while not self.stop: + try: + events = listener.read(2000) + for event in events: + values_dict["event"] = event["event"] + # parse message as it's own dictionary + message_list = event["message"].split(" ") + message_dict = {} + for item in message_list: + if not item == "": + item_list = item.split(": ") + message_dict.update({item_list[0]: item_list[1]}) + values_dict["message"] = message_dict + commands.logger.store_output(event['processor_handle'], 'values', values_dict) + 
commands.logger.print_output() + except amdsmi_exception.AmdSmiLibraryException as e: + if e.err_code != amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_DATA: + print(e) + except Exception as e: + print(e) + + listener.stop() diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py new file mode 100755 index 0000000000..e30e433a57 --- /dev/null +++ b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py @@ -0,0 +1,1934 @@ +#!/usr/bin/env python3 +# +# Copyright (C) Advanced Micro Devices. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 

import grp
import json
import logging
import math
import multiprocessing
import os
import platform
import re
import sys
import time
import glob
import errno
import pwd
import stat
from typing import Tuple, Optional, Union

from enum import Enum
from pathlib import Path
from typing import List, Set, Union
from functools import lru_cache

# Import amdsmi library
from amdsmi_init import *
from BDF import BDF


class AMDSMIHelpers():
    """Helper functions that aren't a part of the AMDSMI API
    Useful for determining platform and device identifiers

    Functions:
        os_info: tuple ()
    """

    def __init__(self) -> None:
        """Detect the host platform and initialise the bookkeeping counters.

        On Linux the presence of the word 'hypervisor' in /proc/cpuinfo marks
        the system as a virtual OS; a virtual OS is then re-classified as
        passthrough (and baremetal) when a known passthrough PCI device id is
        found or when the library reports PASSTHROUGH virtualization mode.
        """
        self.operating_system = platform.system()

        self._is_hypervisor = False
        self._is_virtual_os = False
        self._is_baremetal = False
        self._is_passthrough = False

        self._is_linux = False
        self._is_windows = False

        # Counts and Tracking variables
        self._count_of_sets_called = 0
        self._count_of_cper_files = 0
        self._previous_set_success_check = amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_UNKNOWN_ERROR


        # Check if the system is a virtual OS
        if self.operating_system.startswith("Linux"):
            self._is_linux = True
            logging.debug(f"AMDSMIHelpers: Platform is linux:{self._is_linux}")

            try:
                with open('/proc/cpuinfo', 'r') as f:
                    if 'hypervisor' in f.read():
                        self._is_virtual_os = True
            except IOError:
                # Best effort: an unreadable cpuinfo leaves the host classified as baremetal
                pass

            self._is_baremetal = not self._is_virtual_os

        if self._is_virtual_os:
            # If hard coded passthrough device ids exist on Virtual OS,
            # then it is a passthrough system
            output = self.get_pci_device_ids()
            passthrough_device_ids = ["7460", "73c8", "74a0", "74a1", "74a2"]
            if any(('0x' + device_id) in output for device_id in passthrough_device_ids):
                self._is_baremetal = True
                self._is_virtual_os = False
                self._is_passthrough = True

            # Check for passthrough system dynamically via drm querying id_flags
            try:
                if self.is_amdgpu_initialized() and not self._is_passthrough:
                    device_handles = amdsmi_interface.amdsmi_get_processor_handles()
                    for dev in device_handles:
                        virtualization_info = amdsmi_interface.amdsmi_get_gpu_virtualization_mode(dev)
                        if virtualization_info['mode'] == amdsmi_interface.AmdSmiVirtualizationMode.PASSTHROUGH:
                            self._is_baremetal = True
                            self._is_virtual_os = False
                            self._is_passthrough = True
                            break # Once passthrough is determined, we can immediately break
            except amdsmi_exception.AmdSmiLibraryException as e:
                logging.debug("Unable to determine virtualization status: " + str(e.get_error_code()))


    def increment_set_count(self):
        """Record that one more 'set' operation has been issued."""
        self._count_of_sets_called += 1


    def get_set_count(self):
        """Return the number of 'set' operations recorded so far."""
        return self._count_of_sets_called

    def assign_previous_set_success_check(self, status):
        """Assigns the previous set success check to the status provided.
        This is used to determine if the last set was successful or not.
        """
        self._previous_set_success_check = status

    def get_previous_set_success_check(self):
        """Returns the previous set success check.
        This is used to determine if the last set was successful or not.
        """
        return self._previous_set_success_check

    def increment_cper_count(self):
        """Record that one more CPER file has been counted."""
        self._count_of_cper_files += 1


    def get_cper_count(self):
        """Return the number of CPER files counted so far."""
        return self._count_of_cper_files


    def is_virtual_os(self):
        """Return the virtual-OS flag computed in __init__."""
        return self._is_virtual_os


    def is_hypervisor(self):
        # Returns True if hypervisor is enabled on the system
        return self._is_hypervisor


    def is_baremetal(self):
        # Returns True if system is baremetal, if system is hypervisor this should return False
        return self._is_baremetal


    def is_passthrough(self):
        """Return the passthrough flag computed in __init__."""
        return self._is_passthrough


    def is_linux(self):
        """Return True when platform.system() reported a Linux host."""
        return self._is_linux


    def is_windows(self):
        """Return the Windows flag (never set to True in __init__)."""
        return self._is_windows


    def os_info(self, string_format=True):
        """Return operating_system and type information ex. (Linux, Baremetal)
        params:
            string_format (bool) True to return in string format, False to return Tuple
        returns:
            str or (str, str)
        """
        if self.is_linux():
            operating_system = "Linux"
        elif self.is_windows():
            operating_system = "Windows"
        else:
            operating_system = "Unknown"

        if self.is_baremetal():
            operating_system_type = "Baremetal"
        elif self.is_virtual_os():
            operating_system_type = "Guest"
        elif self.is_hypervisor():
            operating_system_type = "Hypervisor"
        else:
            operating_system_type = "Unknown"

        # Passthrough Override
        if self.is_passthrough():
            operating_system_type = "Guest (Passthrough)"

        if string_format:
            return f"{operating_system} {operating_system_type}"

        return (operating_system, operating_system_type)


    def get_amdsmi_init_flag(self):
        """Return the AMDSMI_INIT_FLAG value the library was initialised with."""
        return AMDSMI_INIT_FLAG


    def is_amdgpu_initialized(self):
        """Return the AMD_GPUS bit of the init flag (truthy when GPUs were initialised)."""
        return AMDSMI_INIT_FLAG & amdsmi_interface.amdsmi_wrapper.AMDSMI_INIT_AMD_GPUS


    def is_amd_hsmp_initialized(self):
        """Return the AMD_CPUS bit of the init flag (truthy when CPUs were initialised)."""
        return AMDSMI_INIT_FLAG & amdsmi_interface.amdsmi_wrapper.AMDSMI_INIT_AMD_CPUS


    def get_rocm_version(self):
        """Return the ROCm version string, or "N/A" when it cannot be determined."""
        try:
            rocm_lib_status, rocm_version = amdsmi_interface.amdsmi_get_rocm_version()
            if rocm_lib_status is not True:
                return "N/A"
            return rocm_version
        except amdsmi_interface.AmdSmiLibraryException:
            return "N/A"


    def get_cpu_choices(self):
        """Return dictionary of possible CPU choices and string of the output:
            Dictionary will be in format: cpus[ID]: {"Device Handle": handle},
            plus an "all": "all" sentinel entry when at least one CPU exists
            String output will be in format:
                "ID: 0 "
        params:
            None
        return:
            (dict, str) : (cpu_choices, cpu_choices_str)
        """
        cpu_choices = {}
        cpu_choices_str = ""
        cpu_handles = []

        try:
            # amdsmi_get_cpusocket_handles() returns the cpu socket handles stored for cpu_id
            cpu_handles = amdsmi_interface.amdsmi_get_cpusocket_handles()
        except amdsmi_interface.AmdSmiLibraryException as e:
            if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT,
                              amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED):
                logging.info('Unable to get device choices, driver not initialized (amd_hsmp or hsmp_acpi not found in modules)')
            else:
                raise
        if len(cpu_handles) == 0:
            logging.info('Unable to find any devices, check if driver is initialized (amd_hsmp or hsmp_acpi not found in modules)')
        else:
            # Handle spacing for the cpu_choices_str
            max_padding = int(math.log10(len(cpu_handles))) + 1

            for cpu_id, device_handle in enumerate(cpu_handles):
                cpu_choices[str(cpu_id)] = {
                    "Device Handle": device_handle
                }
                cpu_choices_str += f"ID: {cpu_id}\n"

            # Add the all option to the cpu_choices
            cpu_choices["all"] = "all"
            cpu_choices_str += f" all{' ' * max_padding}| Selects all devices\n"

        return (cpu_choices, cpu_choices_str)


    def get_core_choices(self):
        """Return dictionary of possible Core choices and string of the output:
            Dictionary will be in format: cores[ID]: {"Device Handle": handle},
            plus an "all": "all" sentinel entry when at least one core exists
            String output will be in format:
                "ID: 0 - <last core id>"
        params:
            None
        return:
            (dict, str) : (core_choices, core_choices_str)
        """
        core_choices = {}
        core_choices_str = ""
        core_handles = []

        try:
            # amdsmi_get_cpucore_handles() returns the core handles stored for core_id
            core_handles = amdsmi_interface.amdsmi_get_cpucore_handles()
        except amdsmi_interface.AmdSmiLibraryException as e:
            if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT,
                              amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED):
                logging.info('Unable to get device choices, driver not initialized (amd_hsmp or hsmp_acpi not found in modules)')
            else:
                raise
        if len(core_handles) == 0:
            logging.info('Unable to find any devices, check if driver is initialized (amd_hsmp or hsmp_acpi not found in modules)')
        else:
            # Handle spacing for the core_choices_str
            max_padding = int(math.log10(len(core_handles))) + 1

            for core_id, device_handle in enumerate(core_handles):
                core_choices[str(core_id)] = {
                    "Device Handle": device_handle
                }
            # NOTE(review): the range summary is emitted once, after the loop; the
            # flattened source does not show the original nesting - confirm.
            core_choices_str += f"ID: 0 - {len(core_handles) - 1}\n"

            # Add the all option to the core_choices
            core_choices["all"] = "all"
            core_choices_str += f" all{' ' * max_padding}| Selects all devices\n"

        return (core_choices, core_choices_str)


    def get_output_format(self):
        """Returns the output format read from sys.argv
        Returns:
            str: outputformat ("json", "csv" or the default "human")
        """
        args = sys.argv[1:]
        outputformat = "human"
        if "--json" in args or "--j" in args:
            outputformat = "json"
        elif "--csv" in args or "--c" in args:
            outputformat = "csv"
        return outputformat


    def get_gpu_choices(self):
        """Return dictionary of possible GPU choices and string of the output:
            Dictionary will be in format: gpus[ID] : {"BDF", "UUID", "Device Handle"},
            plus an "all": "all" sentinel entry when at least one GPU exists
            String output will be in format:
                "ID: 0 | BDF: 0000:23:00.0 | UUID: ffffffff-0000-1000-0000-000000000000"
        params:
            None
        return:
            (dict, str) : (gpu_choices, gpu_choices_str)
        """
        gpu_choices = {}
        gpu_choices_str = ""
        device_handles = []

        try:
            # amdsmi_get_processor_handles returns the device_handles sorted for gpu_id
            device_handles = amdsmi_interface.amdsmi_get_processor_handles()
        except amdsmi_interface.AmdSmiLibraryException as e:
            if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT,
                              amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED):
                logging.info('Unable to get device choices, driver not initialized (amdgpu not found in modules)')
            else:
                raise

        if len(device_handles) == 0:
            logging.info('Unable to find any devices, check if driver is initialized (amdgpu not found in modules)')
        else:
            # Handle spacing for the gpu_choices_str
            max_padding = int(math.log10(len(device_handles))) + 1

            for gpu_id, device_handle in enumerate(device_handles):
                bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(device_handle)
                uuid = amdsmi_interface.amdsmi_get_gpu_device_uuid(device_handle)
                gpu_choices[str(gpu_id)] = {
                    "BDF": bdf,
                    "UUID": uuid,
                    "Device Handle": device_handle,
                }

                # Shorter ids get more padding so the '|' column lines up
                if gpu_id == 0:
                    id_padding = max_padding
                else:
                    id_padding = max_padding - int(math.log10(gpu_id))
                gpu_choices_str += f"ID: {gpu_id}{' ' * id_padding}| BDF: {bdf} | UUID: {uuid}\n"

            # Add the all option to the gpu_choices
            gpu_choices["all"] = "all"
            gpu_choices_str += f" all{' ' * max_padding}| Selects all devices\n"

        return (gpu_choices, gpu_choices_str)


    @staticmethod
    def is_UUID(uuid_question: str) -> bool:
        """Determine if given string is of valid UUID format
        Args:
            uuid_question (str): the given string to be evaluated.
        Returns:
            True or False: whether the UUID given matches the UUID format.
        """
        UUID_pattern = re.compile("^[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}$", flags=re.IGNORECASE)
        return re.match(UUID_pattern, uuid_question) is not None


    def get_device_handles_from_gpu_selections(self, gpu_selections: List[str], gpu_choices=None) -> tuple:
        """Convert provided gpu_selections to device_handles

        Args:
            gpu_selections (list[str]): Selected GPU ID(s), BDF(s), or UUID(s):
                    ex: ID:0 | BDF:0000:23:00.0 | UUID:ffffffff-0000-1000-0000-000000000000
            gpu_choices (dict{gpu_choices}): This is a dictionary of the possible gpu_choices
        Returns:
            (True, True, list[device_handles]): Returns a list of all the gpu_selections converted to
                amdsmi device_handles
            (False, valid_gpu_format, str): Return False, whether the format of the GPU input is valid, and the first input that failed to be converted
        """
        if 'all' in gpu_selections:
            return True, True, amdsmi_interface.amdsmi_get_processor_handles()

        if isinstance(gpu_selections, str):
            gpu_selections = [gpu_selections]

        if gpu_choices is None:
            # obtains dictionary of possible gpu choices
            gpu_choices = self.get_gpu_choices()[0]

        selected_device_handles = []
        for gpu_selection in gpu_selections:
            valid_gpu_choice = False

            # Parse the selection as a BDF once, up front, so the format verdict
            # below is defined even when there are no choices to compare against.
            try:
                selected_bdf = BDF(gpu_selection)
            except Exception:
                selected_bdf = None

            for gpu_id, gpu_info in gpu_choices.items():
                # get_gpu_choices() adds an "all": "all" sentinel; it carries no
                # BDF/UUID/handle and must not be indexed like a device entry.
                if not isinstance(gpu_info, dict):
                    continue

                # Check if passed gpu is a gpu ID or UUID
                matched = gpu_selection == gpu_id or gpu_selection.lower() == gpu_info['UUID']
                if not matched and selected_bdf is not None:
                    # Check if gpu passed is a BDF object
                    try:
                        matched = bool(selected_bdf == BDF(gpu_info['BDF']))
                    except Exception:
                        matched = False

                if matched:
                    selected_device_handles.append(gpu_info['Device Handle'])
                    valid_gpu_choice = True
                    break

            if not valid_gpu_choice:
                logging.debug(f"AMDSMIHelpers.get_device_handles_from_gpu_selections - Unable to convert {gpu_selection}")
                valid_gpu_format = (self.is_UUID(gpu_selection)
                                    or gpu_selection.isdigit()
                                    or selected_bdf is not None)
                return False, valid_gpu_format, gpu_selection
        return True, True, selected_device_handles


    def get_device_handles_from_cpu_selections(self, cpu_selections: List[str], cpu_choices=None):
        """Convert provided cpu_selections to device_handles

        Args:
            cpu_selections (list[str]): Selected CPU ID(s):
                    ex: ID:0
            cpu_choices (dict{cpu_choices}): This is a dictionary of the possible cpu_choices
        Returns:
            (True, True, list[device_handles]): Returns a list of all the cpu_selections converted to
                amdsmi device_handles
            (False, valid_cpu_format, str): Return False, whether the input is a digit string, and the first input that failed to be converted
        """
        if 'all' in cpu_selections:
            return True, True, amdsmi_interface.amdsmi_get_cpusocket_handles()

        if isinstance(cpu_selections, str):
            cpu_selections = [cpu_selections]

        if cpu_choices is None:
            cpu_choices = self.get_cpu_choices()[0]

        selected_device_handles = []
        for cpu_selection in cpu_selections:
            valid_cpu_choice = False
            for cpu_id, cpu_info in cpu_choices.items():
                # Skip the "all": "all" sentinel added by get_cpu_choices()
                if not isinstance(cpu_info, dict):
                    continue

                # Check if passed cpu is a cpu ID
                if cpu_selection == cpu_id:
                    selected_device_handles.append(cpu_info['Device Handle'])
                    valid_cpu_choice = True
                    break
            if not valid_cpu_choice:
                logging.debug(f"AMDSMIHelpers.get_device_handles_from_cpu_selections - Unable to convert {cpu_selection}")
                valid_cpu_format = cpu_selection.isdigit()
                return False, valid_cpu_format, cpu_selection
        return True, True, selected_device_handles


    def get_device_handles_from_core_selections(self, core_selections: List[str], core_choices=None):
        """Convert provided core_selections to device_handles

        Args:
            core_selections (list[str]): Selected CORE ID(s):
                    ex: ID:0
            core_choices (dict{core_choices}): This is a dictionary of the possible core_choices
        Returns:
            (True, True, list[device_handles]): Returns a list of all the core_selections converted to
                amdsmi device_handles
            (False, valid_core_format, str): Return False, whether the input is a digit string, and the first input that failed to be converted
        """
        if 'all' in core_selections:
            return True, True, amdsmi_interface.amdsmi_get_cpucore_handles()

        if isinstance(core_selections, str):
            core_selections = [core_selections]

        if core_choices is None:
            core_choices = self.get_core_choices()[0]

        selected_device_handles = []
        for core_selection in core_selections:
            valid_core_choice = False
            for core_id, core_info in core_choices.items():
                # Skip the "all": "all" sentinel added by get_core_choices()
                if not isinstance(core_info, dict):
                    continue

                # Check if passed core is a core ID
                if core_selection == core_id:
                    selected_device_handles.append(core_info['Device Handle'])
                    valid_core_choice = True
                    break
            if not valid_core_choice:
                logging.debug(f"AMDSMIHelpers.get_device_handles_from_core_selections - Unable to convert {core_selection}")
                valid_core_format = core_selection.isdigit()
                return False, valid_core_format, core_selection
        return True, True, selected_device_handles


    def handle_gpus(self, args, logger, subcommand):
        """This function will run execute the subcommands based on the number
            of gpus passed in via args.
        params:
            args - argparser args to pass to subcommand
            logger (AMDSMILogger) - Logger to print out output
            subcommand (AMDSMICommands) - Function that can handle multiple gpus

        return:
            tuple(bool, device_handle) :
                bool - True if executed subcommand for multiple devices
                device_handle - Return the device_handle if the list of devices is a length of 1
            (handled_multiple_gpus, device_handle)

        """
        if not isinstance(args.gpu, list):
            return False, args.gpu

        if len(args.gpu) > 1:
            for device_handle in args.gpu:
                # Handle multiple_devices to print all output at once
                subcommand(args, multiple_devices=True, gpu=device_handle)
            logger.print_output(multiple_device_enabled=True)
            return True, args.gpu

        if len(args.gpu) == 1:
            args.gpu = args.gpu[0]
            return False, args.gpu

        logging.debug("args.gpu has an empty list")
        # Keep the documented tuple contract so callers can still unpack the result
        return False, args.gpu


    def handle_cpus(self, args, logger, subcommand):
        """This function will run execute the subcommands based on the number
            of cpus passed in via args.
        params:
            args - argparser args to pass to subcommand
            logger (AMDSMILogger) - Logger to print out output
            subcommand (AMDSMICommands) - Function that can handle multiple cpus

        return:
            tuple(bool, device_handle) :
                bool - True if executed subcommand for multiple devices
                device_handle - Return the device_handle if the list of devices is a length of 1
            (handled_multiple_cpus, device_handle)

        """
        if not isinstance(args.cpu, list):
            return False, args.cpu

        if len(args.cpu) > 1:
            for device_handle in args.cpu:
                # Handle multiple_devices to print all output at once
                subcommand(args, multiple_devices=True, cpu=device_handle)
            logger.print_output(multiple_device_enabled=True)
            return True, args.cpu

        if len(args.cpu) == 1:
            args.cpu = args.cpu[0]
            return False, args.cpu

        logging.debug("args.cpu has empty list")
        # Keep the documented tuple contract so callers can still unpack the result
        return False, args.cpu


    def handle_cores(self, args, logger, subcommand):
        """This function will run execute the subcommands based on the number
            of cores passed in via args.
        params:
            args - argparser args to pass to subcommand
            logger (AMDSMILogger) - Logger to print out output
            subcommand (AMDSMICommands) - Function that can handle multiple cores

        return:
            tuple(bool, device_handle) :
                bool - True if executed subcommand for multiple devices
                device_handle - Return the device_handle if the list of devices is a length of 1
            (handled_multiple_cores, device_handle)

        """
        if not isinstance(args.core, list):
            return False, args.core

        if len(args.core) > 1:
            for device_handle in args.core:
                # Handle multiple_devices to print all output at once
                subcommand(args, multiple_devices=True, core=device_handle)
            logger.print_output(multiple_device_enabled=True)
            return True, args.core

        if len(args.core) == 1:
            args.core = args.core[0]
            return False, args.core

        logging.debug("args.core has empty list")
        # Keep the documented tuple contract so callers can still unpack the result
        return False, args.core


    # The below handle_nodes function is currently unused as only node 0 is supported.
def handle_watch(self, args, subcommand, logger):
    """Repeatedly invoke the subcommand according to the watch settings.

    params:
        args - argparser args to pass to subcommand; args.watch (sleep
            interval), args.watch_time and args.iterations are read and
            then reset to None on the object
        subcommand (AMDSMICommands) - Function that can handle
            watching output; called as subcommand(args, watching_output=True)
        logger (AMDSMILogger) - not used by this function
    return:
        1 once a bounded watch finishes (the unbounded mode never returns)
    """
    # Capture the watch settings, then clear them on args so the subcommand
    # does not start watching again when it is called from here.
    interval, duration, max_runs = args.watch, args.watch_time, args.iterations
    args.watch = args.watch_time = args.iterations = None

    print("'CTRL' + 'C' to stop watching output:")

    if duration:
        # Time-boxed mode, optionally capped by an iteration limit.
        deadline = time.time() + duration
        completed = 0
        while time.time() <= deadline:
            subcommand(args, watching_output=True)
            completed += 1
            if max_runs is not None and max_runs <= completed:
                break
            time.sleep(interval)
    elif max_runs is not None:
        # Fixed number of runs; no sleep after the final one.
        for remaining in range(max_runs, 0, -1):
            subcommand(args, watching_output=True)
            if remaining > 1:
                time.sleep(interval)
    else:
        # Neither a time limit nor an iteration limit: run until interrupted.
        while True:
            subcommand(args, watching_output=True)
            time.sleep(interval)

    return 1
def get_cpu_id_from_device_handle(self, input_device_handle):
    """Return the position of input_device_handle among the cpu socket handles.

    The handle list comes from amdsmi_interface.amdsmi_get_cpusocket_handles();
    handles are matched by comparing their .value attributes and the index of
    the first match is returned.

    Raises AmdSmiParameterException when no handle matches.
    """
    socket_handles = amdsmi_interface.amdsmi_get_cpusocket_handles()
    match = next(
        (position for position, handle in enumerate(socket_handles)
         if input_device_handle.value == handle.value),
        None,
    )
    if match is not None:
        return match
    raise amdsmi_exception.AmdSmiParameterException(input_device_handle,
                                                    amdsmi_interface.amdsmi_wrapper.amdsmi_processor_handle,
                                                    "Unable to find cpu ID from device_handle")
def is_amd_device(self, device_handle):
    """Return whether the specified device is an AMD device or not.

    param device_handle: handle passed to amdsmi_get_gpu_asic_info

    return: True when the reported 'vendor_id' parses as a base-16 integer
        equal to AMD_VENDOR_ID; False when the field is missing or cannot
        be parsed.
    """
    # Get card vendor id
    asic_info = amdsmi_interface.amdsmi_get_gpu_asic_info(device_handle)
    try:
        vendor_value = int(asic_info['vendor_id'], 16)
    except (KeyError, TypeError, ValueError):
        # Only a missing or unparsable vendor id means "not AMD"; a bare
        # except here would also swallow KeyboardInterrupt and SystemExit.
        return False
    return vendor_value == AMD_VENDOR_ID
def get_accelerator_choices_types_indices(self):
    """Return the accelerator partition choices and the underlying profile data.

    return:
        tuple(choices, profiles) :
            choices - profile types followed by profile indices, or "N/A"
                when not running as root or when no profile types exist
            profiles - dict from get_accelerator_partition_profile_config(),
                or a dict with empty 'profile_indices'/'profile_types' lists
                in the "N/A" case
    """
    fallback = ("N/A", {'profile_indices':[], 'profile_types':[]})

    if os.geteuid() != 0:
        # If not root, we can't get the accelerator partition profiles
        logging.debug("AMDSMIHelpers.get_accelerator_choices_types_indices - Not root, unable to get accelerator partition profiles")
        return fallback

    logging.debug("AMDSMIHelpers.get_accelerator_choices_types_indices - Root, getting accelerator partition profiles")
    profiles = self.get_accelerator_partition_profile_config()
    if len(profiles['profile_types']) == 0:
        return fallback

    choices = profiles['profile_types'] + profiles['profile_indices']
    return (choices, profiles)
def get_clock_types(self):
    """Return the names and the de-duplicated values of AmdSmiClkType.

    return:
        tuple(list, list) : member names in enum iteration order, and the
            distinct member values (passed through a set, so their order is
            whatever set iteration yields)
    """
    clock_enum = amdsmi_interface.AmdSmiClkType
    names = [member.name for member in clock_enum]
    distinct_values = list({member.value for member in clock_enum})
    return names, distinct_values
def get_soc_pstates(self):
    """Collect the distinct soc pstate policies reported across all devices.

    Each policy is rendered as "<policy_id>: <policy_description>". Devices
    whose query raises AmdSmiLibraryException are skipped; a KeyError while
    reading a device's policies is logged at debug level and that device is
    abandoned (entries already collected are kept).

    return:
        list[str] : unique policy strings in first-seen order, or ["N/A"]
            when nothing was collected
    """
    collected = []
    for handle in amdsmi_interface.amdsmi_get_processor_handles():
        try:
            info = amdsmi_interface.amdsmi_get_soc_pstate(handle)
            # Check if 'policies' key exists (and is non-empty) before using it
            if 'policies' not in info or not info['policies']:
                continue
            for policy in info['policies']:
                label = f"{policy['policy_id']}: {policy['policy_description']}"
                if label not in collected:
                    collected.append(label)
        except amdsmi_interface.AmdSmiLibraryException:
            continue
        except KeyError as missing:
            logging.debug(f"AMDSMIHelpers.get_soc_pstates - Missing key in soc_pstate_info: {missing}")
            continue

    if not collected:
        collected.append("N/A")
    return collected
def validate_clock_type(self, input_clock_type):
    """Check a user-supplied clock type against self.get_clock_types().

    A string is matched case-insensitively against the known clock names
    and, on a match, replaced by the name as spelled in that list. An int
    that is one of the known values is converted to
    amdsmi_interface.AmdSmiClkType. Anything else is returned untouched.

    return:
        tuple(bool, value) : whether the input was valid, and the
            normalized (or original) clock type
    """
    known_names, known_values = self.get_clock_types()

    if isinstance(input_clock_type, str):
        wanted = input_clock_type.lower()
        for candidate in known_names:
            if wanted == candidate.lower():
                return True, candidate
        return False, input_clock_type

    if isinstance(input_clock_type, int) and input_clock_type in known_values:
        return True, amdsmi_interface.AmdSmiClkType(input_clock_type)

    return False, input_clock_type
+ + @param auto_respond: Response to automatically provide for all prompts + """ + print(''' + ******WARNING******\n + Operating your AMD GPU outside of official AMD specifications or outside of + factory settings, including but not limited to the conducting of overclocking, + over-volting or under-volting (including use of this interface software, + even if such software has been directly or indirectly provided by AMD or otherwise + affiliated in any way with AMD), may cause damage to your AMD GPU, system components + and/or result in system failure, as well as cause other problems. + DAMAGES CAUSED BY USE OF YOUR AMD GPU OUTSIDE OF OFFICIAL AMD SPECIFICATIONS OR + OUTSIDE OF FACTORY SETTINGS ARE NOT COVERED UNDER ANY AMD PRODUCT WARRANTY AND + MAY NOT BE COVERED BY YOUR BOARD OR SYSTEM MANUFACTURER'S WARRANTY. + Please use this utility with caution. + ''') + if not auto_respond: + user_input = input('Do you accept these terms? [y/n] ') + else: + user_input = auto_respond + if user_input in ['y', 'Y', 'yes', 'Yes', 'YES']: + return + else: + sys.exit('Confirmation not given. Exiting without setting value') + + + def confirm_changing_memory_partition_gpu_reload_warning(self, auto_respond=False): + """ Print the warning for running outside of specification and prompt user to accept the terms. + + :param autoRespond: Response to automatically provide for all prompts + """ + + print(''' + ******WARNING******\n + After changing memory (NPS) partition modes, users MUST restart + (reload) the AMD GPU driver. This command NO LONGER AUTOMATICALLY + reloads the driver, see `amd-smi reset -h` and + `sudo amd-smi reset -r` for more information. + + This change is intended to allow users the ability to control when is + the best time to restart the AMD GPU driver, as it may not be desired + to restart the AMD GPU driver immediately after changing the + memory (NPS) partition mode. 
+ + Please use `sudo amd-smi reset -r` AFTER successfully + changing the memory (NPS) partition mode. A successful driver reload + is REQUIRED in order to complete updating ALL GPUs in the hive to + the requested partition mode. + + ******REMINDER****** + In order to reload the AMD GPU driver, users MUST quit all GPU + workloads across all devices. + ''') + + if not auto_respond: + user_input = input('Do you accept these terms? [Y/N] ') + else: + user_input = auto_respond + if user_input in ['Yes', 'yes', 'y', 'Y', 'YES']: + print('') + return + else: + print('Confirmation not given. Exiting without setting value') + sys.exit(1) + + def confirm_gpu_driver_reload_warning(self, auto_respond=False): + """ Print the warning for running outside of specification and prompt user to accept the terms. + + :param autoRespond: Response to automatically provide for all prompts + """ + print(''' + ****** WARNING ******\n + AMD SMI is about to initiate an AMD GPU driver restart (module reload). + + Reloading the AMD GPU driver REQUIRES users to quit all GPU activity across all + devices. + + If user is initiating a driver reload AFTER changing memory (NPS) partition + modes (`sudo amd-smi set -M `), a AMD GPU driver reload is REQUIRED + to complete updating the partition mode. This change will effect ALL GPUs in + the hive. Advise using `amd-smi list -e` and `amd-smi partition -c -m` + afterwards to ensure changes were applied as expected. + + Please use this utility with caution. + ''') + if not auto_respond: + user_input = input('Do you accept these terms? [Y/N] ') + else: + user_input = auto_respond + if user_input in ['Yes', 'yes', 'y', 'Y', 'YES']: + print('') + return + else: + print('Confirmation not given. 
def convert_bytes_to_readable(self, bytes_input, format_length=None):
    """Render a byte count using 1024-based units (B, KB, ... ZB, YB).

    params:
        bytes_input - numeric byte count; any str input yields "N/A"
        format_length - when not None, the number of decimals shrinks as
            the scaled value grows (3 below 10, 2 below 100, 1 below 1000,
            otherwise 0); when None, one decimal is always used
    return:
        str : "<value> <unit>"
    """
    if isinstance(bytes_input, str):
        return "N/A"

    amount = bytes_input
    for suffix in ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB"):
        if abs(amount) >= 1024:
            # Too large for this unit: scale down and try the next one.
            amount /= 1024
            continue

        if format_length is None:
            return f"{amount:3.1f} {suffix}"

        if amount < 10:
            decimals = 3
        elif amount < 100:
            decimals = 2
        elif amount < 1000:
            decimals = 1
        else:
            decimals = 0
        return f"{amount:4.{decimals}f} {suffix}"

    # Anything that survived every division is reported in yottabytes.
    return f"{amount:.1f} YB"
class SI_Unit(float, Enum):
    """SI prefix multipliers expressed relative to the base unit.

    Members are floats (float mixin), so they can be used directly in
    arithmetic such as value * unit_in / unit_out.
    """
    GIGA = 1e9     # 10^9
    MEGA = 1e6     # 10^6
    KILO = 1e3     # 10^3
    HECTO = 1e2    # 10^2
    DEKA = 1e1     # 10^1
    BASE = 1e0     # 10^0
    DECI = 1e-1    # 10^-1
    CENTI = 1e-2   # 10^-2
    MILLI = 1e-3   # 10^-3
    MICRO = 1e-6   # 10^-6
    NANO = 1e-9    # 10^-9
def progressbar(self, it, prefix="", size=60, out=sys.stdout, add_newline=False):
    """Yield the items of ``it`` while drawing a text progress bar on ``out``.

    params:
        it - sized iterable to walk (len(it) is taken up front)
        prefix - text printed once before the bar
        size - width of the bar in characters
        out - stream the bar is written to
        add_newline - when True, a newline is emitted after the prefix
    yields:
        each item of ``it`` in order; the bar is redrawn after every item
    """
    count = len(it)
    if add_newline:
        print("{}\n".format(prefix), end='\r', file=out, flush=False)
    else:
        print("{}".format(prefix), end='\r', file=out, flush=False)

    def show(j):
        # An empty iterable has no work left, so draw the bar as complete
        # instead of dividing by a zero count.
        x = int(size * j / count) if count else size
        print("[{}{}] {}/{} secs remain".format(u"█"*x, "."*(size-x), j, count),
              end='\r', file=out, flush=True)

    show(0)
    for i, item in enumerate(it):
        yield item
        show(i + 1)
    print("\n\n", end='\r', flush=True, file=out)
self._cached_user_name(st.st_uid), + "group": self._cached_group_name(st.st_gid), + } + except Exception as e: + return {"error": str(e)} + + def _has_read_access(self, path: str) -> Tuple[bool, Optional[int], Optional[str]]: + """ + Check whether the current (real/effective) user can read the given path + without opening it. Returns (ok:bool, errno_or_None, message_or_None) + """ + try: + st = os.stat(path) + except OSError as e: + return False, e.errno, e.strerror + + # root can always read + if os.geteuid() == 0: + return True, None, None + + mode = st.st_mode + uid = st.st_uid + gid = st.st_gid + + euid = os.geteuid() + egid = os.getegid() + groups = os.getgroups() + + # owner + if euid == uid: + if mode & stat.S_IRUSR: + return True, None, None + return False, errno.EACCES, "Permission denied (owner)" + + # group + if gid == egid or gid in groups: + if mode & stat.S_IRGRP: + return True, None, None + return False, errno.EACCES, "Permission denied (group)" + + # other + if mode & stat.S_IROTH: + return True, None, None + + return False, errno.EACCES, "Permission denied (other)" + + def check_required_groups(self, check_render=True, check_video=True): + """ + Check if the current user can access kfd and dri + Specifically, only care for EACCES/EPERM + + Args: + check_render (bool): Whether to check /dev/kfd & /dev/dri/renderD* devices. Defaults to True. + check_video (bool): Whether to check /dev/dri/card* devices. Defaults to True. + + Returns: + bool: True if all checked devices are accessible, False if any permission errors found + """ + + # Skip check if running as root. 
+ if os.geteuid() == 0: + return True + + paths_to_check = [] + + # Only add paths for device types that are flagged for checking + if check_render and os.path.exists("/dev/kfd"): + paths_to_check.append("/dev/kfd") + paths_to_check += [p for p in sorted(glob.glob("/dev/dri/renderD*"))] + + # Video group corresponds to /dev/dri/card* + if check_video: + paths_to_check += [p for p in sorted(glob.glob("/dev/dri/card*"))] + + if not paths_to_check: + return True + + denied = [] + + for path in paths_to_check: + # Do not try to open all paths, may cause driver issues. + # Read access is sufficient to check permissions. + # + # Reason: GPUs which support partitioning (memory/compute), + # logical devices will not be valid until configured. + # See `sudo amd-smi set -h` or applicable APIs + # to configure on supported hardware. + # + # Example error dmesg output: + # [965358.883112] amdgpu 0000:15:00.0: amdgpu: renderD153 partition 1 not valid! + # [965358.883283] amdgpu 0000:15:00.0: amdgpu: renderD154 partition 2 not valid! + # [965358.883438] amdgpu 0000:15:00.0: amdgpu: renderD155 partition 3 not valid! + # [965358.883594] amdgpu 0000:15:00.0: amdgpu: renderD156 partition 4 not valid! + # [965358.883749] amdgpu 0000:15:00.0: amdgpu: renderD157 partition 5 not valid! + # [965358.883904] amdgpu 0000:15:00.0: amdgpu: renderD158 partition 6 not valid! + # [965358.884060] amdgpu 0000:15:00.0: amdgpu: renderD159 partition 7 not valid! 
+ ok, err, msg = self._has_read_access(path) + if ok: + continue + # if permission denied or operation not permitted + if err in (errno.EACCES, errno.EPERM): + denied.append((path, err, msg, self._stat_info(path))) + + if denied: + # Collect unique group info from denied devices + required_groups = {"kfd": [], "renderD": [], "card": []} + device_types = {"kfd": [], "renderD": [], "card": []} + + for path, err, msg, si in denied: + if "error" not in si: + # Categorize devices and collect unique group info + if "/dev/kfd" in path: + device_types["kfd"].append(path) + required_groups["kfd"].append(si) + elif "/dev/dri/renderD" in path: + device_types["renderD"].append(path) + required_groups["renderD"].append(si) + elif "/dev/dri/card" in path: + device_types["card"].append(path) + required_groups["card"].append(si) + + # Deduplicate group info by converting to tuple for hashing + for device_type in required_groups: + unique_groups = list(dict.fromkeys( + tuple(sorted(d.items())) for d in required_groups[device_type] + )) + required_groups[device_type] = [dict(item) for item in unique_groups] + + lines = [] + lines.append("Permission needed to access required GPU device node(s):") + + # Collect all unique groups for usermod command + all_groups = set() + + # Show summary of denied devices by type with ownership info + if device_types["kfd"]: + lines.append(" • /dev/kfd: Permission denied") + if len(required_groups["kfd"]) > 1: + lines.append(" - Required group(s):") + else: + lines.append(" - Required group:") + for group_info in required_groups["kfd"]: + lines.append( + " - User: {user} (UID={uid}) | Group: {group} (GID={gid})".format( + user=group_info["user"], + uid=group_info["uid"], + group=group_info["group"], + gid=group_info["gid"], + ) + ) + all_groups.add(group_info["group"]) + + if device_types["renderD"]: + lines.append(f" • /dev/dri/renderD*: {len(device_types['renderD'])} device(s) denied") + if len(required_groups["renderD"]) > 1: + lines.append(" - 
Required group(s):") + else: + lines.append(" - Required group:") + for group_info in required_groups["renderD"]: + lines.append( + " - User: {user} (UID={uid}) | Group: {group} (GID={gid})".format( + user=group_info["user"], + uid=group_info["uid"], + group=group_info["group"], + gid=group_info["gid"], + ) + ) + all_groups.add(group_info["group"]) + + if device_types["card"]: + lines.append(f" • /dev/dri/card*: {len(device_types['card'])} device(s) denied") + if len(required_groups["card"]) > 1: + lines.append(" - Required group(s):") + else: + lines.append(" - Required group:") + for group_info in required_groups["card"]: + lines.append( + " - User: {user} (UID={uid}) | Group: {group} (GID={gid})".format( + user=group_info["user"], + uid=group_info["uid"], + group=group_info["group"], + gid=group_info["gid"], + ) + ) + all_groups.add(group_info["group"]) + + # Generate usermod command with all unique groups + groups_for_usermod = ",".join(sorted(all_groups)) + + lines.extend([ + "", + "To resolve this issue, try the following:", + " • Add your user to the required group(s):", + f" sudo usermod -aG {groups_for_usermod} \"$USER\"", + " • Log out and log back in for the group changes to take effect", + " • Alternatively, run this command with sudo/admin privileges", + "" + ]) + print("\n".join(lines)) + return False + + return True + + def _severity_as_string(self, error_severity, notify_type, for_filename): + if error_severity == "non_fatal_uncorrected": + if(for_filename): + return "uncorrected" + return "NONFATAL-UNCORRECTED" + elif error_severity == "non_fatal_corrected": + if(for_filename): + return "corrected" + return "NONFATAL-CORRECTED" + elif error_severity == "fatal": + if notify_type == "BOOT": + if(for_filename): + return "boot" + return "BOOT" + if(for_filename): + return "fatal" + return "FATAL" + if(for_filename): + return "unknown" + return "UNKNOWN" + + def display_cper_files_generated(self, entries, device_handle, folder): + # One‐time 
initialization: print warning & header only once + if not getattr(self, "_cper_display_initialized", False): + # Warning if no folder was specified elsewhere + if not getattr(self, "_cper_warning_printed", False): + print(f"WARNING: No CPER files will be dumped unless --folder= is specified and cper entries exist.") + self._cper_warning_printed = True + + self._print_header(folder) + self._cper_display_initialized = True + + # Loop through all entries in the dictionary. + for entry_index, entry in enumerate(entries.values()): + # Assume 'entry' is a dictionary with keys: "error_severity" and "notify_type". + timestamp = entry.get("timestamp", "unknown") + gpu_id = self.get_gpu_id_from_device_handle(device_handle) + prefix = self._severity_as_string(entry.get("error_severity", "Unknown"), + entry.get("notify_type", "Unknown"), + False) + output = f"{timestamp:<20} {gpu_id:<7} {prefix:<20}" + if folder: + prefix = self._severity_as_string(entry.get("error_severity", "Unknown"), + entry.get("notify_type", "Unknown"), + True) + cper_data_file = f"{prefix}_{self.get_cper_count() + 1}.cper" + afids = self.pvtDumpAfids(cper_data_file) + afids_str = ' '.join(map(str, afids)) + output += f" {cper_data_file:<17} {afids_str}" + + print(output) + self.increment_cper_count() + + def _print_header(self, folder): + print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<20}", end="") + if folder: + print(f" {'file_name':<17} {'list of afids'}", end="") + print("") + + def dump_cper_entries(self, folder, entries, cper_data, device_handle, file_limit=None): + """ + Dump CPER entries to files in the specified folder. Handles batch deletion if file limit is exceeded. + + Parameters: + folder (str): Path to the folder where CPER files will be dumped. + entries (dict): Dictionary containing CPER entry metadata. + cper_data (list): List of CPER data objects with 'bytes' and 'size' keys. + device_handle: Device handle for GPU identification. 
+ file_limit (int, optional): Maximum number of files to retain in the folder. + """ + # Initialize header display + if not getattr(self, "_cper_display_initialized", False): + self._print_header(folder) + self._cper_display_initialized = True + + if folder: + folder = Path(folder) + folder.mkdir(parents=True, exist_ok=True) + + output_rows = {} + + for entry_index, entry in enumerate(entries.values()): + # Determine prefix/severity + error_severity = entry.get("error_severity", "").lower() + notify_type = entry.get("notify_type", "") + prefix = self._severity_as_string(error_severity, notify_type, True) + + # Generate filenames + count = self.get_cper_count() + 1 + cper_name = f"{prefix}-{count}.cper" + json_name = f"{prefix}-{count}.json" + cper_path = folder / cper_name + json_path = folder / json_name + + # Write CPER binary file + try: + self.write_binary( + cper_data[entry_index]["bytes"], + cper_data[entry_index]["size"], + cper_path + ) + except Exception as e: + logging.debug(f"Failed to write CPER file {cper_path}: {e}") + + # Write JSON metadata file + try: + with json_path.open("w") as cper_json_file: + json.dump( + obj=entry, + fp=cper_json_file, + indent=2, + default=lambda o: o.decode('utf-8') if isinstance(o, bytes) else o + ) + except Exception as e: + logging.debug(f"Failed to write JSON file {json_path}: {e}") + + # Collect data for printing + timestamp = entry.get("timestamp", "unknown") + gpu_id = self.get_gpu_id_from_device_handle(device_handle) + severity = self._severity_as_string(error_severity, notify_type, False) + output_rows[cper_path] = [timestamp, gpu_id, severity, cper_name] + self.increment_cper_count() + + # Batch deletion if file limit is exceeded (AFTER writing ALL new files) + if file_limit: + folder_files = list(sorted(folder.glob("*.cper"), key=lambda p: p.stat().st_mtime)) + if len(folder_files) > file_limit: + files_to_delete = len(folder_files) - file_limit + for old_file in folder_files[:files_to_delete]: + try: + 
def binary_to_hexdump_string(self, data: Union[bytes, List[int]]) -> str:
    """Render binary data as a multi-line hexdump string.

    Args:
        data: A ``bytes`` object, or an iterable whose items are integers
            or single-character strings (converted with ``ord``).

    Returns:
        One line per 16 input bytes, joined with newlines. Each line holds
        the offset as 8 hex digits, the hex bytes padded to 47 characters,
        and the printable-ASCII rendering between ``|`` markers. Empty input
        yields an empty string.

    Raises:
        ValueError: If a non-bytes input contains an item that is neither an
            int nor a single-character string.
    """
    def _to_int(item):
        # Accept ints as-is and single characters via their code point.
        if isinstance(item, int):
            return item
        if isinstance(item, str) and len(item) == 1:
            return ord(item)
        raise ValueError(f"Invalid type in data: {type(item)}")

    if isinstance(data, bytes):
        byte_values = list(data)
    else:
        byte_values = [_to_int(item) for item in data]

    # 16 bytes per row: two hex digits each plus a separating space.
    hex_width = 16 * 3 - 1
    rendered: List[str] = []
    for row_start in range(0, len(byte_values), 16):
        row = byte_values[row_start:row_start + 16]
        hex_part = " ".join(f"{value:02x}" for value in row).ljust(hex_width)
        # Anything outside printable ASCII (32..126) is shown as a dot.
        text_part = "".join(chr(value) if 32 <= value <= 126 else "." for value in row)
        rendered.append(f"{row_start:08x} {hex_part} |{text_part}|")

    return "\n".join(rendered)
def ras_cper(self, args, device_handle, logger, gpu_idx):
    """Fetch CPER entries for one GPU and dump or list them.

    Entries are fetched repeatedly from ``args.cursor[gpu_idx]`` until the
    library returns an empty batch or a non-fatal error. Each non-empty
    batch is written with ``dump_cper_entries`` when ``args.folder`` is set,
    otherwise listed with ``display_cper_files_generated``. If nothing was
    retrieved and ``--follow`` is off, the same output path is run once
    with the (empty) batch so the header is still printed.

    Args:
        args: Parsed CLI arguments. Reads ``severity``, ``follow``,
            ``folder``, ``file_limit`` and ``cursor``; ``cursor[gpu_idx]``
            is updated in place after each successful fetch.
        device_handle: Handle of the GPU to query.
        logger: Logger object; its CPER exit message is switched off.
        gpu_idx: Index of this GPU in ``args.cursor``.

    Raises:
        PermissionError: The library reports ``AMDSMI_STATUS_NO_PERM``.
        FileNotFoundError: The library reports ``AMDSMI_STATUS_NOT_SUPPORTED``
            or ``AMDSMI_STATUS_FILE_NOT_FOUND``.
        FileExistsError: The library reports ``AMDSMI_STATUS_FILE_ERROR``.
    """
    logging.debug(args)

    # Bit positions follow the severity values noted in the CLI:
    # NON_FATAL_UNCORRECTED = 0, FATAL = 1, NON_FATAL_CORRECTED = 2.
    severity_bits = {
        "all": (1 << 0) | (1 << 1) | (1 << 2),
        "fatal": 1 << 1,
        "nonfatal": 1 << 0,
        "nonfatal-uncorrected": 1 << 0,
        "nonfatal-corrected": 1 << 2,
        "corrected": 1 << 2,
    }
    severity_mask = 0
    # Duplicates are harmless for OR, the set only avoids repeated work.
    for sev in set(args.severity):
        severity_mask |= severity_bits.get(sev, 0)

    buffer_size = 1048576

    gpu_id = self.get_gpu_id_from_device_handle(device_handle)
    if args.follow and not getattr(self, "_cper_follow_prompted", False):
        print("Press CTRL + C to stop.")
        self._cper_follow_prompted = True

    # CPER data is only read through the primary (id 0) partition.
    if not self.is_primary_partition(device_handle, gpu_id):
        return

    if args.folder and not getattr(self, "_cper_folder_prompted", False):
        self._cper_folder_prompted = True

    logger.set_cper_exit_message(False)
    self.stop = False

    # Bound up front so the "nothing retrieved" path below still has valid
    # (empty) inputs when the very first fetch fails with a logged error.
    entries = {}
    cper_data = []
    num_entries = 0
    status = amdsmi_interface.amdsmi_wrapper

    while True:
        try:
            entries, new_cursor, cper_data, status_code = amdsmi_interface.amdsmi_get_gpu_cper_entries(
                device_handle, severity_mask, buffer_size, args.cursor[gpu_idx])
            logging.debug(f"cper_entries | entries: {entries}")
            num_entries = num_entries + len(entries)
        except amdsmi_exception.AmdSmiLibraryException as e:
            error_code = e.get_error_code()
            if error_code == status.AMDSMI_STATUS_NO_PERM:
                raise PermissionError('Error opening CPER file. This command requires elevation') from e
            if error_code in (status.AMDSMI_STATUS_NOT_SUPPORTED, status.AMDSMI_STATUS_FILE_NOT_FOUND):
                raise FileNotFoundError('Error accessing CPER files. This command requires CPER to be enabled.') from e
            if error_code == status.AMDSMI_STATUS_FILE_ERROR:
                raise FileExistsError('Error opening CPER file. Unable to read CPER File') from e
            # Any other library error is not fatal for the CLI: log and stop polling.
            logging.debug(f"Cannot retrieve CPER entries: {e}")
            break

        args.cursor[gpu_idx] = new_cursor
        if len(entries) == 0:
            break
        if args.folder:
            self.dump_cper_entries(args.folder, entries, cper_data, device_handle, args.file_limit)
        else:
            self.display_cper_files_generated(entries, device_handle, args.folder)

    # Nothing was retrieved at all: run the output path once so the header is shown.
    if num_entries == 0 and not args.follow:
        if args.folder:
            self.dump_cper_entries(args.folder, entries, cper_data, device_handle, args.file_limit)
        else:
            self.display_cper_files_generated(entries, device_handle, args.folder)
isinstance(violation_status[key], list): + for row in violation_status[key]: + for element in row: + if element != "N/A": + if "active_" in key: + if element is True: + row[row.index(element)] = "ACTIVE" + elif element is False: + row[row.index(element)] = "NOT ACTIVE" + elif ("per_" in key) or ("acc_" in key): + row[row.index(element)] = element + else: + continue + ret = {f"xcp_{i}": violation_status[key][i] for i in range(num_partition)} + return ret + + @staticmethod + def average_flattened_ints(data, context="data"): + """Calculate the average of flattened integers from a list or tuple + Args: + data (list or tuple): Data to calculate the average from + context (str, optional): Context for logging. Defaults to "data". + Returns: + float or str: Average of integers if available, otherwise "N/A" + """ + # Type validation - ensure data is list or tuple + # Note: Data can be nested list of lists and will filter out N/A values + if not isinstance(data, (list, tuple)): + logging.debug(f"Invalid data type for {context}: expected list/tuple, got {type(data)}") + return "N/A" + + # Flatten nested lists and filter integers + flat = [v for value in data for v in (value if isinstance(value, list) else [value]) if isinstance(v, int)] + return round(sum(flat) / len(flat)) if flat else "N/A" + + def _get_metric_version_and_partition_info(self, gpu_metrics_info, is_partition_metrics, gpu_id, gpu_handle): + """ + Helper method to compute metric version, partition ID, and num_partition for dynamic metrics. + Handles logging updates internally for reusability. + + Args: + gpu_metrics_info (dict): GPU metrics info from amdsmi_get_gpu_metrics_info. + is_partition_metrics (bool): Whether this is for partition metrics. + gpu_id (int): GPU ID for logging. + gpu_handle: GPU device handle for KFD info retrieval. 
+ + Returns: + dict: { + 'metric_version': float or "N/A", + 'partition_id': int or "N/A", + 'num_partition': int or "N/A", + 'num_xcp': int or "N/A" # Alias for num_partition + } + """ + # Compute metric version from header revisions + metric_version = "N/A" + format_rev = gpu_metrics_info.get('common_header.format_revision', "N/A") + content_rev = gpu_metrics_info.get('common_header.content_revision', "N/A") + if format_rev != "N/A" and content_rev != "N/A": + try: + metric_version = float(f"{format_rev}.{content_rev}") + except ValueError: + metric_version = "N/A" # Fallback if conversion fails + + # Retrieve partition ID from KFD info + partition_id = "N/A" + try: + kfd_info = amdsmi_interface.amdsmi_get_gpu_kfd_info(gpu_handle) + partition_id = kfd_info.get('current_partition_id', "N/A") + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get current partition ID for GPU %s | %s", gpu_id, e.get_error_info()) + + # Determine num_partition with fallback logic for dynamic metrics + num_partition = gpu_metrics_info.get('num_partition', "N/A") + if metric_version != "N/A" and num_partition == "N/A": + # Workaround: Default to 1 for newer metric versions if num_partition is missing + # (Confirmed with driver team; applies to GPU and partition metrics) + if not is_partition_metrics and metric_version >= 1.9: + num_partition = 1 + elif is_partition_metrics and metric_version >= 1.1: + num_partition = 1 + elif partition_id != "N/A" and partition_id > 0: + # Fallback to partition_id if partitions exist but num_partition is unavailable + num_partition = partition_id + # Else: Remains "N/A" if no conditions match + + # Alias num_xcp for XCP metrics usage + num_xcp = num_partition + + # Debug logging + logging.debug( + "GPU %s | Metric version: %s, num_partition: %s, partition_id: %s, num_xcp: %s", + gpu_id, metric_version, num_partition, partition_id, num_xcp + ) + + return { + 'metric_version': metric_version, + 'partition_id': partition_id, 
+ 'num_partition': num_partition, + 'num_xcp': num_xcp + } diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_init.py b/projects/amdsmi/amdsmi_cli/amdsmi_init.py new file mode 100644 index 0000000000..7271dae941 --- /dev/null +++ b/projects/amdsmi/amdsmi_cli/amdsmi_init.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 +# +# Copyright (C) Advanced Micro Devices. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +### Handle safe initialization for amdsmi + +import atexit +import logging +import signal +import sys +import os + +from pathlib import Path + +current_path = os.path.dirname(os.path.abspath(__file__)) +python_lib_path = f"{current_path}/../../share/amd_smi" +sys.path.append(python_lib_path) +# If the python library is installed, it will overwrite the path above + +try: + from amdsmi import amdsmi_interface, amdsmi_exception +except ImportError as e: + print(f"Unhandled import error: {e}") + print("Failed to import the amdsmi Python library. 
def _amdsmi_init_or_exit(init_flag, not_loaded_message):
    """Call ``amdsmi_init(init_flag)``; exit with -1 if the driver is not loaded.

    Any other library or parameter exception is re-raised unchanged.
    """
    try:
        amdsmi_interface.amdsmi_init(init_flag)
    except (amdsmi_interface.AmdSmiLibraryException, amdsmi_interface.AmdSmiParameterException) as e:
        if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT,
                          amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED):
            logging.error(not_loaded_message)
            sys.exit(-1)
        raise


def amdsmi_cli_init():
    """ Initializes AMDSMI Library for the CLI

    Checks for the presence of the amdgpu, amd_hsmp or hsmp_acpi drivers and initializes the
    AMD SMI library based on the live drivers found. Each driver is probed exactly once so the
    chosen init flag is based on one consistent view of the system. When neither driver is
    found, the library is not initialized and the default INIT_ALL_PROCESSORS flag is returned.

    Return:
        init_flag: the flag used to initialize the AMD SMI library without error

    Raises:
        err: AmdSmiLibraryException or AmdSmiParameterException if initialization fails for a
            reason other than the driver not being loaded (that case exits the process with -1)
    """
    init_flag = amdsmi_interface.AmdSmiInitFlags.INIT_ALL_PROCESSORS
    gpu_driver_live = check_amdgpu_driver()
    cpu_driver_live = check_amd_hsmp_driver()

    if gpu_driver_live and cpu_driver_live:
        init_flag = amdsmi_interface.AmdSmiInitFlags.INIT_AMD_APUS
        logging.debug("Both amdgpu , amd_hsmp or hsmp_acpi driver's initstate is live")
        _amdsmi_init_or_exit(
            init_flag,
            "Drivers not loaded (amdgpu, amd_hsmp or hsmp_acpi drivers not found in modules)")
    elif gpu_driver_live:
        init_flag = amdsmi_interface.AmdSmiInitFlags.INIT_AMD_GPUS
        logging.debug("amdgpu driver initstate is live")
        _amdsmi_init_or_exit(init_flag, "Driver not loaded (amdgpu not found in modules)")
        logging.debug("amdgpu driver initialized successfully, but amd_hsmp or hsmp_acpi initstate was not live")
    elif cpu_driver_live:
        init_flag = amdsmi_interface.AmdSmiInitFlags.INIT_AMD_CPUS
        logging.debug("amd_hsmp or hsmp_acpi driver initstate is live")
        _amdsmi_init_or_exit(init_flag, "Driver not loaded (amd_hsmp or hsmp_acpi not found in modules)")
        logging.debug("amd_hsmp or hsmp_acpi driver initialized successfully, but amdgpu initstate was not live")
    else:
        # No driver was detected, so amdsmi_init was never called; do not claim success.
        logging.debug(f"No amdgpu, amd_hsmp or hsmp_acpi driver found; AMDSMI not initialized | init flag: {init_flag}")
        return init_flag

    logging.debug(f"AMDSMI initialized with atleast one driver successfully | init flag: {init_flag}")

    return init_flag
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +import csv +import json +import re +import time +from typing import Dict +from enum import Enum +from amdsmi_helpers import AMDSMIHelpers + +class AMDSMILogger(): + def __init__(self, format='human_readable', destination='stdout', helpers=None) -> None: + self.output = {} + self.multiple_device_output = [] + self.watch_output = [] + self.format = format # csv, json, or human_readable + self.destination = destination # stdout, path to a file (append) + self.table_title = "" + self.table_header = "" + self.secondary_table_title = "" + self.secondary_table_header = "" + self.warning_message = "" + if helpers is None: + # If helpers is not provided, create a new instance + self.helpers = AMDSMIHelpers() + else: + self.helpers = helpers + self._cper_exit_message = True + self.store_cpu_json_output = [] + self.store_core_json_output = [] + self.store_gpu_json_output = [] + self.store_xgmi_metric_json_output = [] + self.store_xgmi_source_status_json_output = [] + self.store_xgmi_link_status_json_output = [] + self.store_current_partition_json_output = [] + self.store_memory_partition_json_output = [] + self.store_partition_profiles_json_output = [] + self.store_partition_resources_json_output = [] + + + class LoggerFormat(Enum): + """Enum for logger formats""" + json = 'json' + csv = 'csv' + human_readable = 'human_readable' + + + class CsvStdoutBuilder(object): + def __init__(self): + self.csv_string = [] + + def write(self, row): + self.csv_string.append(row) + + def __str__(self): + return 
def _convert_json_to_tabular(self, json_object: Dict[str, any], dynamic=False):
    """Format one record as a single fixed-width table row.

    Each value is converted with ``str`` and padded to a width chosen by its
    key. Column widths are looked up by exact key; keys containing ``gpu_``
    or ``bdf_`` use the topology/xgmi widths, and unknown keys are
    right-justified to 10 characters.

    Args:
        json_object: Mapping of column key to value. ``link_status`` is
            iterated item by item; ``process_list`` is a list of dicts with
            a ``process_info`` entry that is either a dict of process fields
            or the string "No running processes detected".
        dynamic: When True, every non-empty key is left-justified to
            ``len(key) + 2`` instead of using the fixed widths
            (``partition_id`` keeps its fixed width of 17).

    Returns:
        str: The row with trailing whitespace removed. A ``process_list``
        produces one line per process, continuation lines being prefixed
        with the stored timestamp and gpu values.
    """
    # TODO make dynamic - convert other python CLI outputs to use (as needed)
    # Update: using dynamic=true provides dynamic re-sizing based on key name length

    # Exact-key widths. A lookup table avoids the `key in ('name')` pitfall,
    # which is a substring test on a string rather than tuple membership.
    right_widths = {
        'power_usage': 7,
        'max_power': 9,
        'hotspot_temperature': 8, 'memory_temperature': 8,
        'gfx': 7, 'mem': 7,
        'gfx_clk': 10,
        'vram_usage': 16,
        'mem_clock': 11, 'vram_used': 11,
        'vram_total': 12, 'vram_free': 12,
        'vram_percent': 9,
        'encoder': 7, 'decoder': 7,
        'vclock': 10, 'dclock': 10,
        'single_bit_ecc': 12, 'double_bit_ecc': 12, 'pcie_bw': 12,
        'pcie_replay': 13,
        'pviol': 7, 'tviol': 7,
        'tviol_active': 14,
        'phot_tviol': 12,
        'vr_tviol': 10,
        'hbm_tviol': 11,
        'gfx_clkviol': 13,
        'gfxclk_pviol': 58, 'gfxclk_tviol': 58, 'gfxclk_totalviol': 58, 'low_utilviol': 58,
    }
    # Only for handling xgmi tables
    left_widths = {
        'gpu#': 7,
        'bdf': 14,
        'bit_rate': 10,
        'max_bandwidth': 15,
        'link_type': 11,
        'RW': 57,
    }
    # Widths of the simple per-process columns.
    process_widths = {'pid': 9, 'mem_usage': 10, 'cu_occupancy': 9, 'evicted_time': 9}

    table_values = ''
    stored_gpu = ''
    stored_timestamp = ''
    for key, value in json_object.items():
        string_value = str(value)
        if key == 'partition_id':
            # Special case for partition_id: 8 partitions + 7 comma + 2 spaces = 17
            table_values += string_value.ljust(17)
            continue
        key_length = len(key) + 2
        if dynamic and len(key) > 0:
            stored_gpu = string_value
            table_values += string_value.ljust(key_length)
        elif key == 'gpu':
            stored_gpu = string_value
            table_values += string_value.rjust(3)
        elif key == 'xcp':
            stored_gpu = string_value
            table_values += string_value.rjust(5)
        elif key == 'timestamp':
            stored_timestamp = string_value
            table_values += string_value.rjust(10) + ' '
        elif key in right_widths:
            table_values += string_value.rjust(right_widths[key])
        elif key in left_widths:
            table_values += string_value.ljust(left_widths[key])
        # Only for handling topology tables
        elif 'gpu_' in key:
            table_values += string_value.ljust(13)
        elif "bdf_" in key:
            table_values += string_value.ljust(13)
        elif key == "link_status":
            for i in value:
                table_values += str(i).ljust(3)
        elif key == "process_list":
            # Add an additional padding between the first instance of GPU and NAME
            table_values += ' '
            for process_dict in value:
                process_info = process_dict['process_info']
                if process_info == "No running processes detected":
                    # Add N/A for empty process_info
                    table_values += "N/A".rjust(17) + "N/A".rjust(9) + "N/A".rjust(10) + \
                                    "N/A".rjust(10) + "N/A".rjust(10) + "N/A".rjust(10) + \
                                    "N/A".rjust(9) + "N/A".rjust(10) + '\n'
                    continue
                for process_key, process_value in process_info.items():
                    string_process_value = str(process_value)
                    if process_key == "name":
                        # Keep only the executable name, truncated to the column width
                        if string_process_value == "" or string_process_value == "N/A":
                            process_name = "N/A"
                        else:
                            process_name = string_process_value.split('/')[-1][:17]
                        table_values += process_name.rjust(17)
                    elif process_key == "memory_usage":
                        for memory_value in process_value.values():
                            table_values += str(memory_value).rjust(10)
                    elif process_key in process_widths:
                        table_values += string_process_value.rjust(process_widths[process_key])
                # Add the stored gpu and stored timestamp to the next line
                # NOTE(review): this prefix is emitted only after real process rows — confirm
                table_values += '\n'
                if stored_timestamp:
                    table_values += stored_timestamp.ljust(10) + ' '
                table_values += stored_gpu.rjust(3) + ' '

            # Remove excess two values after a new line in table_values
            table_values = table_values[:table_values.rfind('\n')]
            table_values += '\n'
        # Default spacing
        else:
            table_values += string_value.rjust(10)
    return table_values.rstrip()
def flatten_dict(self, target_dict, topology_override=False):
    """Flatten a nested dictionary into a single level of key/value pairs.

    Keys whose values are dictionaries are removed and their contents are
    lifted into the result, for example:
        {
            'usage': {
                'gfx_usage': 0,
                'mem_usage': 0,
                'mm_usage_list': [22,0,0]
            }
        }
    becomes:
        {
            'gfx_usage': 0,
            'mem_usage': 0,
            'mm_usage_list': [22,0,0]
        }

    When a dictionary value has more than one entry (or ``topology_override``
    is set), its nested dictionaries are merged with the parent key as a
    prefix (``parent_child``). Children of the exact key ``gfx`` keep their
    own names. With ``topology_override``, non-dict children are prefixed
    with the outer key instead.

    Args:
        target_dict (dict): Dictionary to flatten
        topology_override (bool): Apply the prefixing rules even to
            single-entry dictionaries at this level. It is not passed on to
            the recursive calls.

    Returns:
        dict: The flattened dictionary; later keys overwrite earlier ones on
        a name clash.
    """
    output_dict = {}
    for key, value in target_dict.items():
        if not isinstance(value, dict):
            output_dict[key] = value
            continue

        if len(value) > 1 or topology_override:
            value_with_parent_key = {}
            for parent_key, child in value.items():
                if isinstance(child, dict):
                    # Exact match on 'gfx'; `in ('gfx')` would be a substring
                    # test and would also match 'g', 'x', 'fx' or ''.
                    prefix = '' if parent_key == 'gfx' else parent_key + '_'
                    for child_key, child_value in child.items():
                        value_with_parent_key[prefix + child_key] = child_value
                elif topology_override:
                    value_with_parent_key[key + '_' + parent_key] = child
                else:
                    value_with_parent_key[parent_key] = child
            value = value_with_parent_key

        # Anything still nested is flattened by the recursive call.
        output_dict.update(self.flatten_dict(value))
    return output_dict
data=data) + + + def _store_core_output_amdsmi(self, core_id, argument, data): + if argument == 'timestamp': # Make sure timestamp is the first element in the output + self.output['timestamp'] = int(time.time()) + + if self.is_json_format() or self.is_human_readable_format(): + self.output['core'] = int(core_id) + if argument == 'values' and isinstance(data, dict): + self.output.update(data) + else: + self.output[argument] = data + elif self.is_csv_format(): + self.output['core'] = int(core_id) + + if argument == 'values' or isinstance(data, dict): + flat_dict = self.flatten_dict(data) + self.output.update(flat_dict) + else: + self.output[argument] = data + else: + raise ValueError("Invalid output format: expected json, csv, or human_readable") + + + def _store_cpu_output_amdsmi(self, cpu_id, argument, data): + if argument == 'timestamp': # Make sure timestamp is the first element in the output + self.output['timestamp'] = int(time.time()) + + if self.is_json_format() or self.is_human_readable_format(): + self.output['cpu'] = int(cpu_id) + if argument == 'values' and isinstance(data, dict): + self.output.update(data) + else: + self.output[argument] = data + elif self.is_csv_format(): + self.output['cpu'] = int(cpu_id) + + if argument == 'values' or isinstance(data, dict): + flat_dict = self.flatten_dict(data) + self.output.update(flat_dict) + else: + self.output[argument] = data + else: + raise ValueError("Invalid output format: expected json, csv, or human_readable") + + + def _store_output_amdsmi(self, gpu_id, argument, data): + if argument == 'timestamp': # Make sure timestamp is the first element in the output + self.output['timestamp'] = int(time.time()) + + if self.is_json_format() or self.is_human_readable_format(): + self.output['gpu'] = int(gpu_id) + if argument == 'values' and isinstance(data, dict): + self.output.update(data) + else: + self.output[argument] = data + elif self.is_csv_format(): + self.output['gpu'] = int(gpu_id) + + if argument == 'values' 
or isinstance(data, dict): + flat_dict = self.flatten_dict(data) + self.output.update(flat_dict) + else: + self.output[argument] = data + else: + raise ValueError("Invalid output format: expected json, csv, or human_readable") + + + def store_multiple_device_output(self): + """ Store the current output into the multiple_device_output + then clear the current output + params: + None + return: + Nothing + """ + if not self.output: + return + output = {} + for key, value in self.output.items(): + output[key] = value + + self.multiple_device_output.append(output) + self.output = {} + + + def store_watch_output(self, multiple_device_enabled=False): + """ Add the current output or multiple_devices_output + params: + multiple_device_enabled (bool) - True if watching multiple devices + return: + Nothing + """ + if multiple_device_enabled: + for output in self.multiple_device_output: + self.watch_output.append(output) + + self.multiple_device_output = [] + else: + output = {} + + for key, value in self.output.items(): + output[key] = value + self.watch_output.append(output) + + self.output = {} + + + def print_output(self, multiple_device_enabled=False, watching_output=False, tabular=False, dual_csv_output=False, dynamic=False): + """ Print current output acording to format and then destination + params: + multiple_device_enabled (bool) - True if printing output from + multiple devices + watching_output (bool) - True if printing watch output + dynamic (bool) - Defaults to False. 
True turns on dynamic resizing for + left justified table output + return: + Nothing + """ + if self.is_json_format(): + self._print_json_output(multiple_device_enabled=multiple_device_enabled, + watching_output=watching_output) + elif self.is_csv_format(): + if dual_csv_output: + self._print_dual_csv_output(multiple_device_enabled=multiple_device_enabled, + watching_output=watching_output) + else: + self._print_csv_output(multiple_device_enabled=multiple_device_enabled, + watching_output=watching_output) + elif self.is_human_readable_format(): + # If tabular output is enabled, redirect to _print_tabular_output + if tabular: + self._print_tabular_output(multiple_device_enabled=multiple_device_enabled, watching_output=watching_output, dynamic=dynamic) + else: + self._print_human_readable_output(multiple_device_enabled=multiple_device_enabled, + watching_output=watching_output) + + + def _print_json_output(self, multiple_device_enabled=False, watching_output=False): + if multiple_device_enabled: + json_output = self.multiple_device_output + else: + json_output = [self.output] + + if self.destination == 'stdout': + if json_output: + json_std_output = json.dumps(json_output, indent=4) + print(json_std_output) + else: # Write output to file + if watching_output: # Flush the full JSON output to the file on watch command completion + with self.destination.open('w', encoding="utf-8") as output_file: + json.dump(self.watch_output, output_file, indent=4) + else: + with self.destination.open('a', encoding="utf-8") as output_file: + json.dump(json_output, output_file, indent=4) + + + def combine_arrays_to_json(self): + combined_json = {} + if self.store_cpu_json_output: + combined_json["cpu_data"] = self.store_cpu_json_output + if self.store_core_json_output: + combined_json["core_data"] = self.store_core_json_output + if self.store_gpu_json_output: + combined_json["gpu_data"] = self.store_gpu_json_output + if self.store_xgmi_metric_json_output: + combined_json["xgmi_metric"] 
= self.store_xgmi_metric_json_output + if self.store_xgmi_source_status_json_output: + combined_json["link_port_status"] = self.store_xgmi_source_status_json_output + if self.store_xgmi_link_status_json_output: + combined_json["link_status"] = self.store_xgmi_link_status_json_output + if self.store_current_partition_json_output: + combined_json["current_partition"] = self.store_current_partition_json_output + if self.store_memory_partition_json_output: + combined_json["memory_partition"] = self.store_memory_partition_json_output + if self.store_partition_profiles_json_output: + combined_json["partition_profiles"] = self.store_partition_profiles_json_output + if self.store_partition_resources_json_output: + combined_json["partition_resources"] = self.store_partition_resources_json_output + + self.destination == 'stdout' + json_std_output = json.dumps(combined_json, indent=4) + print(json_std_output) + + + def _print_csv_output(self, multiple_device_enabled=False, watching_output=False): + if multiple_device_enabled: + stored_csv_output = self.multiple_device_output + else: + if not isinstance(self.output, list): + stored_csv_output = [self.output] + + if stored_csv_output: + csv_keys = set() + for output in stored_csv_output: + for key in output: + csv_keys.add(key) + + for index, output_dict in enumerate(stored_csv_output): + remaining_keys = csv_keys - set(output_dict.keys()) + for key in remaining_keys: + stored_csv_output[index][key] = "N/A" + + if self.destination == 'stdout': + if stored_csv_output: + # Get the header as a list of the first element to maintain order + csv_header = stored_csv_output[0].keys() + csv_stdout_output = self.CsvStdoutBuilder() + writer = csv.DictWriter(csv_stdout_output, csv_header) + writer.writeheader() + writer.writerows(stored_csv_output) + print(str(csv_stdout_output)) + else: + if watching_output: + with self.destination.open('w', newline = '', encoding="utf-8") as output_file: + if self.watch_output: + csv_keys = set() + for 
output in self.watch_output: + for key in output: + csv_keys.add(key) + + for index, output_dict in enumerate(self.watch_output): + remaining_keys = csv_keys - set(output_dict.keys()) + for key in remaining_keys: + self.watch_output[index][key] = "N/A" + + # Get the header as a list of the first element to maintain order + csv_header = self.watch_output[0].keys() + writer = csv.DictWriter(output_file, csv_header) + writer.writeheader() + writer.writerows(self.watch_output) + else: + with self.destination.open('a', newline = '', encoding="utf-8") as output_file: + # Get the header as a list of the first element to maintain order + csv_header = stored_csv_output[0].keys() + writer = csv.DictWriter(output_file, csv_header) + writer.writeheader() + writer.writerows(stored_csv_output) + + + def _print_dual_csv_output(self, multiple_device_enabled=False, watching_output=False): + if multiple_device_enabled: + stored_csv_output = self.multiple_device_output + else: + if not isinstance(self.output, list): + stored_csv_output = [self.output] + + primary_csv_output = [] + secondary_csv_output = [] + + if stored_csv_output: + # Split stored_csv_output into primary_csv and secondary_csv + for output_dict in stored_csv_output: + if 'process_list' in output_dict: + # Add a new entry for each process_info + for process_info_dict in output_dict['process_list']: + secondary_output_dict = {} + if watching_output: + secondary_output_dict['timestamp'] = output_dict['timestamp'] + secondary_output_dict['gpu'] = output_dict['gpu'] + if isinstance(process_info_dict["process_info"], dict): + for process_field, process_value in process_info_dict["process_info"].items(): + if isinstance(process_value, dict): + for key, value in process_value.items(): + secondary_output_dict[key] = value + else: + secondary_output_dict[process_field] = process_value + else: + # Handle no process found case + secondary_output_dict["process_info"] = process_info_dict["process_info"] + 
secondary_csv_output.append(secondary_output_dict) + primary_output_dict = {} + for key, value in output_dict.items(): + if key != 'process_list': + primary_output_dict[key] = value + primary_csv_output.append(primary_output_dict) + + # Ensure uniform data within primary and secondary csv outputs + if primary_csv_output: + primary_keys = set() + for output in primary_csv_output: + for key in output: + primary_keys.add(key) + # insert empty data to align with keys that may not exist + for index, output_dict in enumerate(primary_csv_output): + remaining_keys = primary_keys - set(output_dict.keys()) + for key in remaining_keys: + primary_csv_output[index][key] = "N/A" + if secondary_csv_output: + secondary_keys = set() + for output in secondary_csv_output: + for key in output: + secondary_keys.add(key) + # insert empty data to align with keys that may not exist + for index, output_dict in enumerate(secondary_csv_output): + remaining_keys = secondary_keys - set(output_dict.keys()) + for key in remaining_keys: + secondary_csv_output[index][key] = "N/A" + + if self.destination == 'stdout': + if primary_csv_output: + # Get the header as a list of the first element to maintain order + csv_header = primary_csv_output[0].keys() + csv_stdout_output = self.CsvStdoutBuilder() + writer = csv.DictWriter(csv_stdout_output, csv_header) + writer.writeheader() + writer.writerows(primary_csv_output) + print(str(csv_stdout_output)) + if secondary_csv_output: + # Get the header as a list of the first element to maintain order + csv_header = secondary_csv_output[0].keys() + csv_stdout_output = self.CsvStdoutBuilder() + writer = csv.DictWriter(csv_stdout_output, csv_header) + writer.writeheader() + writer.writerows(secondary_csv_output) + print(str(csv_stdout_output)) + if watching_output: + print() + else: + if watching_output: + with self.destination.open('w', newline = '', encoding="utf-8") as output_file: + primary_csv_output = [] + secondary_csv_output = [] + if self.watch_output: + 
# Split watch_output into primary_csv and secondary_csv + for output_dict in self.watch_output: + if 'process_list' in output_dict: + # Add a new entry for each process_info + for process_info_dict in output_dict['process_list']: + secondary_output_dict = {} + if watching_output: + secondary_output_dict['timestamp'] = output_dict['timestamp'] + secondary_output_dict['gpu'] = output_dict['gpu'] + if isinstance(process_info_dict["process_info"], dict): + for process_field, process_value in process_info_dict["process_info"].items(): + if isinstance(process_value, dict): + for key, value in process_value.items(): + secondary_output_dict[key] = value + else: + secondary_output_dict[process_field] = process_value + else: + # Handle no process found case + secondary_output_dict["process_info"] = process_info_dict["process_info"] + secondary_csv_output.append(secondary_output_dict) + primary_output_dict = {} + for key, value in output_dict.items(): + if key != 'process_list': + primary_output_dict[key] = value + primary_csv_output.append(primary_output_dict) + + # Ensure uniform data within primary and secondary csv outputs + if primary_csv_output: + primary_keys = set() + for output in primary_csv_output: + for key in output: + primary_keys.add(key) + # insert empty data to align with keys that may not exist + for index, output_dict in enumerate(primary_csv_output): + remaining_keys = primary_keys - set(output_dict.keys()) + for key in remaining_keys: + primary_csv_output[index][key] = "N/A" + if secondary_csv_output: + secondary_keys = set() + for output in secondary_csv_output: + for key in output: + secondary_keys.add(key) + # insert empty data to align with keys that may not exist + for index, output_dict in enumerate(secondary_csv_output): + remaining_keys = secondary_keys - set(output_dict.keys()) + for key in remaining_keys: + secondary_csv_output[index][key] = "N/A" + + if primary_csv_output: + # Get the header as a list of the first element to maintain order + 
csv_header = primary_csv_output[0].keys() + writer = csv.DictWriter(output_file, csv_header) + writer.writeheader() + writer.writerows(primary_csv_output) + if secondary_csv_output: + output_file.write("\n") + csv_header = secondary_csv_output[0].keys() + writer = csv.DictWriter(output_file, csv_header) + writer.writeheader() + writer.writerows(secondary_csv_output) + else: + with self.destination.open('a', newline = '', encoding="utf-8") as output_file: + if primary_csv_output: + # Get the header as a list of the first element to maintain order + csv_header = primary_csv_output[0].keys() + writer = csv.DictWriter(output_file, csv_header) + writer.writeheader() + writer.writerows(primary_csv_output) + if secondary_csv_output: + output_file.write("\n") + csv_header = secondary_csv_output[0].keys() + writer = csv.DictWriter(output_file, csv_header) + writer.writeheader() + writer.writerows(secondary_csv_output) + + def _print_human_readable_output(self, multiple_device_enabled=False, watching_output=False, tabular=False): + # If tabular output is enabled, redirect to _print_tabular_output + if tabular: + self._print_tabular_output(multiple_device_enabled=multiple_device_enabled, watching_output=watching_output) + return + + human_readable_output = '' + + if multiple_device_enabled: + for device_output in self.multiple_device_output: + human_readable_output += self._convert_json_to_human_readable(device_output) + '\n' + else: + human_readable_output += self._convert_json_to_human_readable(self.output) + + if self.destination == 'stdout': + try: + # printing as unicode may fail if locale is not set properly + print(human_readable_output) + except UnicodeEncodeError: + # print as ascii, ignore incompatible characters + print(human_readable_output.encode('ascii', 'ignore').decode('ascii')) + else: + if watching_output: + with self.destination.open('w', encoding="utf-8") as output_file: + human_readable_output = '' + for output in self.watch_output: + 
human_readable_output += self._convert_json_to_human_readable(output) + output_file.write(human_readable_output + '\n') + else: + with self.destination.open('a', encoding="utf-8") as output_file: + output_file.write(human_readable_output + '\n') + + + def _print_tabular_output(self, multiple_device_enabled=False, watching_output=False, dynamic=False): + primary_table = '' + secondary_table = '' + + # Populate primary table without process_list + # Populate secondary table with process_list if exists + if multiple_device_enabled and self.multiple_device_output: + for device_output in self.multiple_device_output: + if 'process_list' in device_output: + process_table_dict = {} + if watching_output: + process_table_dict['timestamp'] = device_output['timestamp'] + process_table_dict['gpu'] = device_output['gpu'] + process_table_dict['process_list'] = device_output['process_list'] + secondary_table += self._convert_json_to_tabular(process_table_dict) + '\n' + # Add primary table keys without process_list + primary_table_output = {} + for key, value in device_output.items(): + if key != 'process_list': + primary_table_output[key] = value + primary_table += self._convert_json_to_tabular(primary_table_output, dynamic=dynamic) + '\n' + else: # Single device output + if 'process_list' in self.output: + process_table_dict = {} + if watching_output: + process_table_dict['timestamp'] = self.output['timestamp'] + process_table_dict['gpu'] = self.output['gpu'] + process_table_dict['process_list'] = self.output['process_list'] + secondary_table += self._convert_json_to_tabular(process_table_dict) + '\n' + # Add primary table keys without process_list + primary_table_output = {} + for key, value in self.output.items(): + if key != 'process_list': + primary_table_output[key] = value + primary_table += self._convert_json_to_tabular(primary_table_output, dynamic=dynamic) + '\n' + primary_table = primary_table.rstrip() + secondary_table = secondary_table.rstrip() + + # Add primary table 
title and header to primary_table + if primary_table: + primary_table_heading = '' + if self.table_title: + primary_table_heading = self.table_title + ':\n' + if self.warning_message: # Add warning message below the table title + primary_table_heading += self.warning_message + '\n' + primary_table_heading += self.table_header + '\n' + primary_table = primary_table_heading + primary_table + + # Add secondary table title and header to secondary_table + # Currently just process_info uses this logic + if secondary_table: + secondary_table_heading = '' + if self.secondary_table_title: + secondary_table_heading = '\n' + self.secondary_table_title + ':\n' + secondary_table_heading += self.secondary_table_header + '\n' + secondary_table = secondary_table_heading + secondary_table + + if self.destination == 'stdout': + try: + # printing as unicode may fail if locale is not set properly + print(primary_table) + if secondary_table: + print(secondary_table) + if watching_output: + print("\n") + except UnicodeEncodeError: + # print as ascii, ignore incompatible characters + print(primary_table.encode('ascii', 'ignore').decode('ascii')) + if secondary_table: + print(secondary_table.encode('ascii', 'ignore').decode('ascii')) + if watching_output: + print("\n") + else: + if watching_output: # Write all stored watched output to a file + with self.destination.open('w', encoding="utf-8") as output_file: + primary_table = '' + secondary_table = '' + # Add process_list to the secondary_table + # Add remaining watch_output to the primary_table + for device_output in self.watch_output: + # if process_list is detected in device_output store in secondary_table + if 'process_list' in device_output: + process_table_dict = { + 'timestamp': device_output['timestamp'], + 'gpu': device_output['gpu'], + 'process_list': device_output['process_list'] + } + secondary_table += self._convert_json_to_tabular(process_table_dict) + '\n' + # Add primary table keys without process_list + 
primary_table_output = {} + for key, value in device_output.items(): + if key != 'process_list': + primary_table_output[key] = value + primary_table += self._convert_json_to_tabular(primary_table_output, dynamic=dynamic) + '\n' + primary_table = primary_table.rstrip() # Remove trailing new line + secondary_table = secondary_table.rstrip() + + # Add primary table title and header to primary_table + if primary_table: + primary_table_heading = '' + if self.table_title: + primary_table_heading = self.table_title + ':\n' + if self.warning_message: # Add warning message below the table title + primary_table_heading += self.warning_message + '\n' + primary_table_heading += self.table_header + '\n' + primary_table = primary_table_heading + primary_table + + # Add secondary table title and header to secondary_table + # Currently just process_info uses this logic + if secondary_table: + secondary_table_heading = '' + if self.secondary_table_title: + secondary_table_heading = '\n' + self.secondary_table_title + ':\n' + secondary_table_heading += self.secondary_table_header + '\n' + secondary_table = secondary_table_heading + secondary_table + + # Write both full tables to the file + output_file.write(primary_table) + if secondary_table: + output_file.write("\n" + secondary_table) + else: # Write all singular output to a file + with self.destination.open('a', encoding="utf-8") as output_file: + output_file.write(primary_table + '\n') + output_file.write(secondary_table) + + + def print_default_output(self, output: Dict): + # some template lines + default_line_1 = "+------------------------------------------------------------------------------+" + default_line_2 = "|-------------------------------------+----------------------------------------|" + default_line_3 = "|=====================================+========================================|" + default_line_4 = "+-------------------------------------+----------------------------------------+" + default_line_5 = 
"|==============================================================================|" + + # print the version information first + amd_smi_version = str(output['version_info']['amd-smi']) + if len(amd_smi_version) > 20: + amd_smi_version = amd_smi_version[:17] + "..." + rocm_version = "N/A" + if output['version_info']['rocm version'][0]: + rocm_version = str(output['version_info']['rocm version'][1]).ljust(8) + driver_version = output['version_info']['amdgpu version'] + if driver_version == "N/A": + amdgpu_version = "N/A".ljust(8) + else: + # Example driver version string for amdgpu: 6.8.0-60 : 'Linuxversion6.8.0-60-generic(buildd@lcy02-amd64-098)(x86_64-linux-gnu-gcc-12(Ubuntu12.3.0-1ubuntu1~22.04)12.3.0,GNUld(GNUBinutilsforUbuntu)2.38)#63~22.04.1-UbuntuSMPPREEMPT_DYNAMICTueApr2219:00:15UTC2' + # Extract version before "-generic" if it exists + if '-generic' in driver_version['driver_version']: + # Extract version using regex to find pattern like "6.8.0-60" + match = re.search(r'(\d+\.\d+\.\d+-\d+)', driver_version['driver_version']) + if match: + amdgpu_version = match.group(1).ljust(8) + else: + amdgpu_version = "N/A".ljust(8) + else: + amdgpu_version = str(driver_version['driver_version'])[:8].ljust(8) + fw_pldm_version = str(output['version_info']['fw pldm version']) + vbios_version = str(output['version_info']['vbios version']) + + # print GPU info + print(default_line_1) + print("| AMD-SMI {0:20s} amdgpu version: {1:8s} ROCm version: {2:8s} |".format(amd_smi_version.ljust(20), amdgpu_version, rocm_version)) + + # adjust format depending on whether vbios or fw pldm version is present + if vbios_version != "N/A" and fw_pldm_version != "N/A": + print("| VBIOS version: {0:22s} {1:12s} FW PLDM: {2:15s}|".format(vbios_version, "", fw_pldm_version)) + elif vbios_version != "N/A" and fw_pldm_version == "N/A": + print("| VBIOS version: {0:22s} {1:37s} |".format(vbios_version, "")) + elif fw_pldm_version != "N/A" and vbios_version == "N/A": + print("| FW PLDM: {0:15s} 
{1:50s} |".format(fw_pldm_version, "")) + else: + pass # Both VBIOS and FW PLDM versions are "N/A" so skip this line + + print("| Platform: {0:25.25s} {1:41s}|".format(str(self.helpers.os_info()), "")) + print(default_line_2) + print("| BDF GPU-Name | Mem-Uti Temp UEC Power-Usage |") + print("| GPU HIP-ID OAM-ID Partition-Mode | GFX-Uti Fan Mem-Usage |") + print(default_line_3) + + line_count = 0 + end = len(output['gpu_info_list']) - 1 + + for gpu_info in output['gpu_info_list']: + bdf = str(gpu_info['bdf']).ljust(12) + + market_name = str(gpu_info['market_name']) + if len(market_name) > 22: + market_name = ("..." + market_name[-19:]) + market_name = market_name.rjust(22) + + mem_util = gpu_info['mem_util'] + if mem_util != "N/A": + mem_util = str(mem_util) + " %" + mem_util = mem_util.ljust(5) + + temp = gpu_info['temp'] + if temp != "N/A": + temp = str(temp) + " \u00b0C" + temp = temp.rjust(6) + + u_ecc = str(gpu_info['uncorr_ecc']).ljust(5) + + power_usage = gpu_info['power_usage'] + if power_usage != "N/A": + power_usage = f"{gpu_info['power_usage']['current_power']}/{gpu_info['power_usage']['power_limit']} W" + power_usage = str(power_usage).rjust(13) + + gpu_id = str(gpu_info['gpu_id']).rjust(3) + hip_id = str(gpu_info['hip_id']).rjust(6) + oam_id = str(gpu_info['oam_id']).rjust(6) + partition_modes = str(gpu_info['partition_mode']).rjust(14) + + gfx_util = gpu_info['gfx_util'] + if gfx_util != "N/A": + gfx_util = str(gfx_util) + " %" + gfx_util = gfx_util.ljust(5) + + fan = gpu_info['fan'] + if fan != "N/A": + fan = str(fan) + " %" + fan = fan.rjust(6) + + mem_usage = gpu_info['mem_usage'] + if mem_usage != "N/A": + mem_usage = f"{gpu_info['mem_usage']['used_vram']}/{gpu_info['mem_usage']['total_vram']} MB" + mem_usage = mem_usage.rjust(21) + + print("| {0:12.12s} {1:22.22s} | {2:5.5s} {3:6.6s} {4:5.5s} {5:13.13s} |".format(bdf, market_name, mem_util, temp, u_ecc, power_usage)) + print("| {0:3.3s} {1:6.6s} {2:6.6s} {3:14.14s} | {4:5.5s} {5:6.6s} {6:21.21s} 
|".format(gpu_id, hip_id, oam_id, partition_modes, gfx_util, fan, mem_usage)) + + if line_count < end: + print(default_line_2) + line_count += 1 + + print(default_line_4) + + # print process list of all GPUs last + print(default_line_1) + print("| Processes: |") + print("| GPU PID Process Name GTT_MEM VRAM_MEM MEM_USAGE CU % |") + print(default_line_5) + elevated_permission_error = False + if len(output['processes']) != 0: + for process in output['processes']: + gpu_id = str(process['gpu']).rjust(4) + pid = str(process['pid']).rjust(9) + if str(process['name']) == "N/A": + process_name = "N/A".ljust(19) + else: + process_name = str(process['name']).split('/')[-1].ljust(19) + gtt_mem = str(process['gtt']).rjust(8) + vram_mem = str(process['vram']).rjust(8) + mem_usage = str(process['mem_usage']).rjust(9) + if process['cu_occupancy']['total_num_cu'] != "N/A" and process['cu_occupancy']['current_cu'] != "N/A": + cu_occupancy = (str(round(process['cu_occupancy']['current_cu'] / process['cu_occupancy']['total_num_cu'] * 100, 1)) + " %").rjust(7) + else: + cu_occupancy = "N/A" + print("| {0:4.4s} {1:9.9s} {2:19.19s} {3:8.8s} {4:8.8s} {5:9.9s} {6:7.7s} |".format( + gpu_id, pid, process_name, gtt_mem, vram_mem, mem_usage, cu_occupancy)) + if process['name'] == "N/A": + elevated_permission_error = True + else: + print("| No running processes found |") + print(default_line_1) + if elevated_permission_error: + print("Process Name may require elevated permissions.") diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py new file mode 100644 index 0000000000..98370bbd7d --- /dev/null +++ b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py @@ -0,0 +1,1613 @@ +#!/usr/bin/env python3 +# +# Copyright (C) Advanced Micro Devices. All rights reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +import argparse +import errno +import os +import sys +import time +import collections +from amdsmi import amdsmi_interface +from typing import Optional +from typing import Union + +from pathlib import Path + +from _version import __version__ +from amdsmi_helpers import AMDSMIHelpers +import amdsmi_cli_exceptions + + +# Custom Help Formatter for increasing the action max length +class AMDSMIParserHelpFormatter(argparse.HelpFormatter): + def __init__(self, prog): + super().__init__(prog=prog, + indent_increment=2, + max_help_position=24, + width=90) + self._action_max_length = 20 + + +# Custom Help Formatter for not duplicating the metavar in the subparsers +class AMDSMISubparserHelpFormatter(argparse.RawTextHelpFormatter): + def __init__(self, prog): + super().__init__(prog, indent_increment=2, max_help_position=80, width=90) + self._action_max_length = 20 + + def _format_action_invocation(self, action): + if not action.option_strings or action.nargs == 0: + return super()._format_action_invocation(action) + default = self._get_default_metavar_for_optional(action) + args_string = self._format_args(action, default) + return ', '.join(action.option_strings) + ' ' + args_string + + +class AMDSMIParser(argparse.ArgumentParser): + """Unified Parser for AMDSMI CLI. + This parser doesn't access amdsmi's lib directly,but via AMDSMIHelpers, + this allows for us to use this parser with future OS & Platform integration. 
def __init__(self, version, list, static, firmware, bad_pages, metric,
             process, profile, event, topology, set_value, reset, monitor,
             xgmi, partition, ras, node, default, sys_argv=None,
             helpers=None):
    """Create the top-level ``amd-smi`` parser and register the needed subparser(s).

    Args:
        version ... default: one object per subcommand; each is passed unchanged
            as the second argument of the matching ``_add_*_parser`` method.
        sys_argv: command-line tokens used to decide which subparser(s) to
            build. When None, no subparser is registered at all.
        helpers: helper object to use; a new ``AMDSMIHelpers`` is created
            when None.
    """
    # Helper variables
    self.helpers = AMDSMIHelpers() if helpers is None else helpers

    # Device choices depend on which drivers are initialized
    if self.helpers.is_amdgpu_initialized():
        self.gpu_choices, self.gpu_choices_str = self.helpers.get_gpu_choices()
    else:
        self.gpu_choices, self.gpu_choices_str = {}, ""

    if self.helpers.is_amd_hsmp_initialized():
        self.cpu_choices, self.cpu_choices_str = self.helpers.get_cpu_choices()
        self.core_choices, self.core_choices_str = self.helpers.get_core_choices()
    else:
        self.cpu_choices, self.cpu_choices_str = {}, ""
        self.core_choices, self.core_choices_str = {}, ""

    self.vf_choices = ['3', '2', '1']

    # Banner pieces shown in the parser description
    self.version_string = f"Version: {__version__}"
    self.platform_string = f"Platform: {self.helpers.os_info()}"
    self.rocm_version = self.helpers.get_rocm_version()
    self.rocm_version_string = f"ROCm version: {self.rocm_version}"
    self.program_name = 'amd-smi'
    self.description = f"AMD System Management Interface | {self.version_string} | {self.rocm_version_string} | {self.platform_string}"

    # Adjust argument parser options
    super().__init__(
        formatter_class=lambda prog: AMDSMIParserHelpFormatter(prog),
        description=self.description,
        epilog="For detailed help on specific commands: amd-smi [command] -h",
        add_help=True,
        prog=self.program_name)

    # Setup subparsers
    self.subparsers = self.add_subparsers(
        title="AMD-SMI Commands",
        parser_class=argparse.ArgumentParser,
        help="Descriptions:",
        metavar='')

    # Store possible subcommands & aliases for later errors
    self.possible_commands = ['version', 'list', 'static', 'firmware', 'ucode', 'bad-pages',
                              'metric', 'process', 'profile', 'event', 'topology', 'set',
                              'reset', 'monitor', 'dmon', 'xgmi', 'partition', 'ras',
                              'node', 'default']

    if sys_argv is None:
        return

    # (tokens that select the command, registration method, callback).
    # The order is the lookup precedence, so it must not be rearranged.
    command_table = (
        (('version',), self._add_version_parser, version),
        (('list',), self._add_list_parser, list),
        (('static',), self._add_static_parser, static),
        (('firmware', 'ucode'), self._add_firmware_parser, firmware),
        (('bad-pages',), self._add_bad_pages_parser, bad_pages),
        (('metric',), self._add_metric_parser, metric),
        (('process',), self._add_process_parser, process),
        (('profile',), self._add_profile_parser, profile),
        (('event',), self._add_event_parser, event),
        (('topology',), self._add_topology_parser, topology),
        (('set',), self._add_set_value_parser, set_value),
        (('reset',), self._add_reset_parser, reset),
        (('monitor', 'dmon'), self._add_monitor_parser, monitor),
        (('xgmi',), self._add_xgmi_parser, xgmi),
        (('partition',), self._add_partition_parser, partition),
        (('ras',), self._add_ras_parser, ras),
        (('node',), self._add_node_parser, node),
    )

    # A help request registers every subcommand (but not the default parser)
    if '--help' in sys_argv or '-h' in sys_argv:
        for _tokens, register, callback in command_table:
            register(self.subparsers, callback)
        return

    # Otherwise only the first subcommand found in sys_argv is registered
    for tokens, register, callback in command_table:
        if any(token in sys_argv for token in tokens):
            register(self.subparsers, callback)
            return

    # If no subcommand is given, add the default parser
    self._add_default_parser(self.subparsers, default)
string_value, sub_arg=None): + # Argument type validator + # This is for triggering a cli exception if an empty string is detected + if string_value: + return string_value + + outputformat = self.helpers.get_output_format() + if string_value == "": + raise amdsmi_cli_exceptions.AmdSmiMissingParameterValueException(sub_arg, outputformat) + else: + raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(sys.argv[1], string_value, outputformat) + + + def _is_command_supported(self, user_input, acceptable_values, command_name): + if acceptable_values == "N/A": + outputformat = self.helpers.get_output_format() + raise amdsmi_cli_exceptions.AmdSmiPermissionDeniedException(command_name, outputformat) + elif str(user_input).upper() not in acceptable_values: + print(f"Valid inputs are {acceptable_values}") + raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(sys.argv[1], str(user_input).upper(), self.helpers.get_output_format()) + else: + return str(user_input).upper() + + + def _limit_select(self): + """Custom action for setting clock limits""" + output_format = self.helpers.get_output_format() + + class AMDSMILimitArgs(argparse.Action): + def __call__(self, parser: AMDSMIParser, namespace: argparse.Namespace, + values: Union[str, list, None], option_string: Optional[str] = None) -> None: + # valid values + valid_clk_types = ('sclk', 'mclk') + valid_lim_types = ('min', 'max') + clk_type, lim_type, val = values + + # Check if the sclk and mclk parameters are valid + if clk_type not in valid_clk_types: + raise amdsmi_cli_exceptions.AmdSmiInvalidParameterException(sys.argv[1], clk_type, output_format) + if lim_type not in valid_lim_types: + raise amdsmi_cli_exceptions.AmdSmiInvalidParameterException(sys.argv[1], lim_type, output_format) + + # Check if the val is a valid integer value + if not val.isdigit(): + raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(sys.argv[1], val, output_format) + val = int(val) + if val < 0: + raise 
amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(sys.argv[1], val, output_format) + clk_limit_args = collections.namedtuple('clk_limit_args', ['clk_type', 'lim_type', 'val']) + setattr(namespace, self.dest, clk_limit_args(clk_type, lim_type, val)) + return AMDSMILimitArgs + + + def _level_select(self): + """Custom action for setting clock frequencies to particular performance level""" + output_format = self.helpers.get_output_format() + + class AMDSMIFreqArgs(argparse.Action): + def __call__(self, parser: AMDSMIParser, namespace: argparse.Namespace, + values: list, option_string: Optional[str] = None) -> None: + # valid values + valid_clk_types = ('sclk', 'mclk', 'pcie', 'fclk', 'socclk') + clk_type = values[0] + perf_levels_str = values[1:] + + # Check if the sclk and mclk parameters are valid + if clk_type not in valid_clk_types: + raise amdsmi_cli_exceptions.AmdSmiInvalidParameterException(sys.argv[1], clk_type, output_format) + + perf_levels = [] + # Check if every item in perf level is valid + for level in perf_levels_str: + if not level.isdigit(): + raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(sys.argv[1], level, output_format) + level = int(level) + if level < 0: + raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(sys.argv[1], level, output_format) + perf_levels.append(level) + + clk_level_args = collections.namedtuple('clk_level_args', ['clk_type', 'perf_levels']) + setattr(namespace, self.dest, clk_level_args(clk_type, perf_levels)) + return AMDSMIFreqArgs + + + def _power_cap_options(self): + """Custom action for setting power cap options""" + output_format = self.helpers.get_output_format() + + class AMDSMIPowerCapArgs(argparse.Action): + def __call__(self, parser: AMDSMIParser, namespace: argparse.Namespace, + values: list, option_string: Optional[str] = None) -> None: + if len(values) != 2: + raise amdsmi_cli_exceptions.AmdSmiInvalidParameterException(sys.argv[1], values, output_format) + + power_cap_type = 
values[0] + power_cap_value = values[1] + + if power_cap_type not in ['ppt0', 'ppt1']: + raise amdsmi_cli_exceptions.AmdSmiInvalidParameterException(sys.argv[1], power_cap_type, output_format) + + if not power_cap_value.isdigit(): + raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(sys.argv[1], power_cap_value, output_format) + + power_cap_args = collections.namedtuple('power_cap_args', ['pwr_type', 'watts']) + setattr(namespace, self.dest, power_cap_args(power_cap_type, int(power_cap_value))) + return AMDSMIPowerCapArgs + + + def _check_folder_path(self): + """ Argument action validator: + Returns a path to folder from the folder path provided. + If the path doesn't exist create it. + """ + class CheckOutputFilePath(argparse.Action): + outputformat = self.helpers.get_output_format() + # Checks the values + def __call__(self, parser, args, values, option_string=None): + path = Path(values) + try: + path.mkdir(parents=True, exist_ok=True) + except OSError as e: + raise amdsmi_cli_exceptions.AmdSmiInvalidFilePathException(path, + CheckOutputFilePath.outputformat, + f"Unable to make '{path}' a folder.") + if not path.exists(): + raise amdsmi_cli_exceptions.AmdSmiInvalidFilePathException(path, CheckOutputFilePath.outputformat) + elif path.is_dir(): + setattr(args, self.dest, path) + return CheckOutputFilePath + + + def _check_output_file_path(self): + """ Argument action validator: + Returns a path to a file from the output file path provided. 
+ If the path is a directory then create a file within it and return that file path + If the path is a file and it exists return the file path + If the path is a file and it doesn't exist create and return the file path + """ + class CheckOutputFilePath(argparse.Action): + outputformat = self.helpers.get_output_format() + # Checks the values + def __call__(self, parser, args, values, option_string=None): + path = Path(values) + if not path.exists(): + if path.parent.is_dir(): + path.touch() + else: + raise amdsmi_cli_exceptions.AmdSmiInvalidFilePathException(path, CheckOutputFilePath.outputformat) + + if path.is_dir(): + file_name = f"{int(time.time())}-amdsmi-output" + if args.json: + file_name += ".json" + elif args.csv: + file_name += ".csv" + else: + file_name += ".txt" + path = path / file_name + path.touch() + setattr(args, self.dest, path) + elif path.is_file(): + path.touch() + setattr(args, self.dest, path) + else: + raise amdsmi_cli_exceptions.AmdSmiInvalidFilePathException(path, CheckOutputFilePath.outputformat) + return CheckOutputFilePath + + + def _check_cper_file_path(self): + """ Argument action validator: + Returns a path to a file from the input file path provided. + If the file doesn't exist, is empty, or is invalid, raise an error. + """ + class _CheckInputFilePath(argparse.Action): + # Checks the values + outputformat=self.helpers.get_output_format() + def __call__(self, parser, args, values, option_string=None): + path = Path(values) + try: + if not path.exists(): + raise FileNotFoundError(f"CPER file could not be read. 
Make sure the path '{path}' is correct.") + if path.is_dir(): + raise IsADirectoryError(f"Invalid Path: {path} is a directory when it needs to be a specific file.") + if path.is_file(): + if os.stat(values).st_size == 0: + raise ValueError(f"Invalid Path: {path} Input file is empty.") + setattr(args, self.dest, path) + else: + raise FileNotFoundError(f"Invalid Path: {path} Could not determine if the value given is a valid path.") + except Exception as root_cause: + raise amdsmi_cli_exceptions.AmdSmiInvalidFilePathException(path, _CheckInputFilePath.outputformat) from root_cause + return _CheckInputFilePath + + + def _check_watch_selected(self): + """ Validate that the -w/--watch argument was selected + This is because -W/--watch_time and -i/--iterations are dependent on watch + """ + class WatchSelectedAction(argparse.Action): + # Checks the values + def __call__(self, parser, args, values, option_string=None): + if args.watch is None: + raise argparse.ArgumentError(self, f"invalid argument: '{self.dest}' needs to be paired with -w/--watch") + else: + setattr(args, self.dest, values) + return WatchSelectedAction + + + def _gpu_select(self, gpu_choices): + """ Custom argparse action to return the device handle(s) for the gpu(s) selected + This will set the destination (args.gpu) to a list of 1 or more device handles + If 1 or more device handles are not found then raise an ArgumentError for the first invalid gpu seen + """ + + amdsmi_helpers = self.helpers + class _GPUSelectAction(argparse.Action): + outputformat=self.helpers.get_output_format() + # Checks the values + def __call__(self, parser, args, values, option_string=None): + if "all" in gpu_choices: + del gpu_choices["all"] + status, gpu_format, selected_device_handles = amdsmi_helpers.get_device_handles_from_gpu_selections(gpu_selections=values, + gpu_choices=gpu_choices) + if status: + setattr(args, self.dest, selected_device_handles) + else: + if selected_device_handles == '': + raise 
amdsmi_cli_exceptions.AmdSmiMissingParameterValueException("--gpu", _GPUSelectAction.outputformat) + elif not gpu_format: + raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(sys.argv[1], selected_device_handles, + _GPUSelectAction.outputformat) + else: + raise amdsmi_cli_exceptions.AmdSmiDeviceNotFoundException(selected_device_handles, + _GPUSelectAction.outputformat, + True, False, False) + + return _GPUSelectAction + + + def _cpu_select(self, cpu_choices): + """ Custom argparse action to return the device handle(s) for the cpu(s) selected + This will set the destination (args.cpu) to a list of 1 or more device handles + If 1 or more device handles are not found then raise an ArgumentError for the first invalid cpu seen + """ + amdsmi_helpers = self.helpers + class _CPUSelectAction(argparse.Action): + outputformat=self.helpers.get_output_format() + # Checks the values + def __call__(self, parser, args, values, option_string=None): + if "all" in cpu_choices: + del cpu_choices["all"] + status, cpu_format, selected_device_handles = amdsmi_helpers.get_device_handles_from_cpu_selections(cpu_selections=values, + cpu_choices=cpu_choices) + if status: + setattr(args, self.dest, selected_device_handles) + else: + if selected_device_handles == '': + raise amdsmi_cli_exceptions.AmdSmiMissingParameterValueException("--cpu", _CPUSelectAction.outputformat) + elif not cpu_format: + raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(sys.argv[1], selected_device_handles, + _CPUSelectAction.outputformat) + else: + raise amdsmi_cli_exceptions.AmdSmiDeviceNotFoundException(selected_device_handles, + _CPUSelectAction.outputformat, + False, True, False) + return _CPUSelectAction + + + def _core_select(self, core_choices): + """ Custom argparse action to return the device handle(s) for the core(s) selected + This will set the destination (args.core) to a list of 1 or more device handles + If 1 or more device handles are not found then raise an ArgumentError 
for the first invalid core seen + """ + amdsmi_helpers = self.helpers + class _CoreSelectAction(argparse.Action): + outputformat=self.helpers.get_output_format() + # Checks the values + def __call__(self, parser, args, values, option_string=None): + if "all" in core_choices: + del core_choices["all"] + status, core_format, selected_device_handles = amdsmi_helpers.get_device_handles_from_core_selections(core_selections=values, + core_choices=core_choices) + if status: + setattr(args, self.dest, selected_device_handles) + else: + if selected_device_handles == '': + raise amdsmi_cli_exceptions.AmdSmiMissingParameterValueException("--core", _CoreSelectAction.outputformat) + elif not core_format: + raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(sys.argv[1], selected_device_handles, + _CoreSelectAction.outputformat) + else: + raise amdsmi_cli_exceptions.AmdSmiDeviceNotFoundException(selected_device_handles, + _CoreSelectAction.outputformat, + False, False, True) + return _CoreSelectAction + + + def _add_watch_arguments(self, subcommand_parser): + # Device arguments help text + watch_help = "Reprint the command in a loop of INTERVAL seconds" + watch_time_help = "The total TIME to watch the given command" + iterations_help = "Total number of ITERATIONS to loop on the given command" + + # Mutually Exclusive Args within the subparser + subcommand_parser.add_argument('-w', '--watch', action='store', metavar='INTERVAL', + type=lambda value: self._positive_int(value, '--watch'), required=False, help=watch_help) + subcommand_parser.add_argument('-W', '--watch_time', action=self._check_watch_selected(), metavar='TIME', + type=lambda value: self._positive_int(value, '--watch_time'), required=False, help=watch_time_help) + subcommand_parser.add_argument('-i', '--iterations', action=self._check_watch_selected(), metavar='ITERATIONS', + type=lambda value: self._positive_int(value, '--iterations'), required=False, help=iterations_help) + + + def 
_validate_cpu_core(self, value): + if value == '': + outputformat = self.helpers.get_output_format() + raise amdsmi_cli_exceptions.AmdSmiMissingParameterValueException(value, outputformat) + if isinstance(value, str): + if value.lower() == "all": + return value + if value.isdigit(): + if int(value) < 0: + outputformat = self.helpers.get_output_format() + raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(sys.argv[1], value, outputformat) + else: + outputformat = self.helpers.get_output_format() + raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(sys.argv[1], value, outputformat) + + if isinstance(value, int): + if int(value) < 0: + outputformat = self.helpers.get_output_format() + raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(sys.argv[1], value, outputformat) + + return value + + + def _validate_set_clock(self, validate_clock_type=True): + """ Validate Clock input""" + amdsmi_helpers = self.helpers + class _ValidateClockType(argparse.Action): + # Checks the clock type and clock values + def __call__(self, parser, args, values, option_string=None): + if validate_clock_type: + clock_type = values[0] + clock_types = amdsmi_helpers.get_clock_types()[0] + valid_clock_type, amdsmi_clock_type = amdsmi_helpers.validate_clock_type(input_clock_type=clock_type) + if not valid_clock_type: + raise argparse.ArgumentError(self, f"Invalid argument: '{clock_type}' needs to be a valid clock type:{clock_types}") + + clock_levels = values[1:] + else: + clock_levels = values + + freq_bitmask = 0 + for level in clock_levels: + if level > 63: + raise argparse.ArgumentError(self, f"Invalid argument: '{level}' needs to be a valid clock level 0-63") + freq_bitmask |= (1 << level) + + if validate_clock_type: + setattr(args, self.dest, (amdsmi_clock_type, freq_bitmask)) + else: + setattr(args, self.dest, freq_bitmask) + return _ValidateClockType + + + def _prompt_spec_warning(self): + """ Prompt out of spec warning""" + amdsmi_helpers = 
self.helpers + class _PromptSpecWarning(argparse.Action): + # Checks the values + def __call__(self, parser, args, values, option_string=None): + amdsmi_helpers.confirm_out_of_spec_warning() + setattr(args, self.dest, values) + return _PromptSpecWarning + + @staticmethod + def _custom_ceil(x): + """ Custom ceiling function to round up float values to the nearest integer. + This is used to ensure that fan speed percentages are rounded up correctly. + """ + if x == int(x): # If x is already an integer + return int(x) + elif x > 0: # For positive numbers, floor division + 1 + return int(x) + 1 + else: # For negative numbers, floor division directly gives the ceiling + return int(x) + + def _validate_fan_speed(self): + """ Validate fan speed input""" + amdsmi_helpers = self.helpers + class _ValidateFanSpeed(argparse.Action): + # Checks the values + def __call__(self, parser, args, values, option_string=None): + if isinstance(values, str): + # Convert percentage to fan level + if '%' in values: + try: + amdsmi_helpers.confirm_out_of_spec_warning() + # Convert percentage to fan speed level + values = (int(values[:-1]) / 100) * 255 + values = AMDSMIParser._custom_ceil(values) # Round up (Ceiling) + setattr(args, self.dest, values) + except ValueError as e: + raise argparse.ArgumentError(self, f"Invalid argument: '{values}' needs to be 0-100%") + else: # Store the fan level as fan_speed + values = int(values) + if 0 <= values <= 255: + amdsmi_helpers.confirm_out_of_spec_warning() + setattr(args, self.dest, values) + else: + raise argparse.ArgumentError(self, f"Invalid argument: '{values}' needs to be 0-255") + else: + raise argparse.ArgumentError(self, f"Invalid argument: '{values}' needs to be 0-255 or 0-100%") + return _ValidateFanSpeed + + + def _validate_overdrive_percent(self): + """ Validate overdrive percentage input""" + amdsmi_helpers = self.helpers + class _ValidateOverdrivePercent(argparse.Action): + # Checks the values + def __call__(self, parser, args, values, 
option_string=None): + if isinstance(values, str): + try: + if values[-1] == '%': + over_drive_percent = int(values[:-1]) + else: + over_drive_percent = int(values) + + if 0 <= over_drive_percent <= 20: + amdsmi_helpers.confirm_out_of_spec_warning() + setattr(args, self.dest, over_drive_percent) + else: + raise argparse.ArgumentError(self, f"Invalid argument: '{values}' needs to be within range 0-20 or 0-20%") + except ValueError: + raise argparse.ArgumentError(self, f"Invalid argument: '{values}' needs to be 0-20 or 0-20%") + else: + raise argparse.ArgumentError(self, f"Invalid argument: '{values}' needs to be 0-20 or 0-20%") + return _ValidateOverdrivePercent + + +### Building parsers ### + def _add_device_arguments(self, subcommand_parser: argparse.ArgumentParser, required=False): + # Device arguments help text + gpu_help = f"Select a GPU ID, BDF, or UUID from the possible choices:\n{self.gpu_choices_str}" + vf_help = "Gets general information about the specified VF (timeslice, fb info, …).\ + \nAvailable only on virtualization OSs" + cpu_help = f"Select a CPU ID from the possible choices:\n{self.cpu_choices_str}" + core_help = f"Select a Core ID from the possible choices:\n{self.core_choices_str}" + + # Create argument group for all the devices + device_group = subcommand_parser.add_argument_group('Device Arguments') + + # Mutually Exclusive Args within the subparser + device_args = device_group.add_mutually_exclusive_group(required=required) + + if self.helpers.is_amdgpu_initialized(): + device_args.add_argument('-g', '--gpu', action=self._gpu_select(self.gpu_choices), + nargs='+', help=gpu_help) + + if self.helpers.is_amd_hsmp_initialized(): + device_args.add_argument('-U', '--cpu', type=self._validate_cpu_core, + action=self._cpu_select(self.cpu_choices), + nargs='+', help=cpu_help) + if subcommand_parser._optionals.title != "Static Arguments": + device_args.add_argument('-O', '--core', type=self._validate_cpu_core, + 
action=self._core_select(self.core_choices), + nargs='+', help=core_help) + + if self.helpers.is_hypervisor(): + device_args.add_argument('-v', '--vf', action='store', nargs='+', + help=vf_help, choices=self.vf_choices) + + + def _add_command_modifiers(self, subcommand_parser: argparse.ArgumentParser): + json_help = "Displays output in JSON format" + csv_help = "Displays output in CSV format" + file_help = "Saves output into a file on the provided path" + loglevel_choices = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] + loglevel_choices_str = ", ".join(loglevel_choices) + loglevel_help = ( + f"Set the logging level from the possible choices:\n " + f"{loglevel_choices_str}" + ) + + command_modifier_group = subcommand_parser.add_argument_group('Command Modifiers') + + # Output Format options + logging_args = command_modifier_group.add_mutually_exclusive_group() + logging_args.add_argument('--json', action='store_true', required=False, help=json_help) + logging_args.add_argument('--csv', action='store_true', required=False, help=csv_help) + + command_modifier_group.add_argument('--file', action=self._check_output_file_path(), type=str, required=False, help=file_help) + # Placing loglevel outside the subcommands so it can be used with any subcommand + command_modifier_group.add_argument('--loglevel', action='store', type=str.upper, required=False, help=loglevel_help, default='ERROR', metavar='LEVEL', + choices=loglevel_choices) + + return command_modifier_group + + + def _add_watch_arguments(self, subcommand_parser: argparse.ArgumentParser): + # Device arguments help text + watch_help = "Reprint the command in a loop of INTERVAL seconds" + watch_time_help = "The total duration of TIME to watch the command" + iterations_help = "The total number of ITERATIONS to repeat the command" + + watch_arguments_group = subcommand_parser.add_argument_group('Watch Arguments') + + # Mutually Exclusive Args within the subparser + watch_arguments_group.add_argument('-w', 
'--watch', action='store', metavar='INTERVAL', + type=lambda value: self._positive_int(value, '--watch'), required=False, help=watch_help) + watch_arguments_group.add_argument('-W', '--watch_time', action=self._check_watch_selected(), metavar='TIME', + type=lambda value: self._positive_int(value, '--watch_time'), required=False, help=watch_time_help) + watch_arguments_group.add_argument('-i', '--iterations', action=self._check_watch_selected(), metavar='ITERATIONS', + type=lambda value: self._positive_int(value, '--iterations'), required=False, help=iterations_help) + + return watch_arguments_group + + def _add_default_parser(self, subparsers: argparse._SubParsersAction, func): + # there should be no args to parse here so let this be a dummy function to preserve later logic + default_parser = subparsers.add_parser('default', description=None) + default_parser._optionals.title = None + default_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) + default_parser.set_defaults(func=func) + + # Add Universal Arguments + self._add_command_modifiers(default_parser) + + + def _add_version_parser(self, subparsers: argparse._SubParsersAction, func): + # Subparser help text + version_help = "Display version information" + description = self.description + + # Create version subparser + version_parser = subparsers.add_parser('version', help=version_help, description=description) + version_parser._optionals.title = None + version_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) + version_parser.set_defaults(func=func) + + # Add Universal Arguments + self._add_command_modifiers(version_parser) + + # help info + gpu_version_help = "Display the current amdgpu driver version" + cpu_version_help = "Display the current amd_hsmp or hsmp_acpi driver version" + + # Add GPU and CPU version Arguments + version_parser.add_argument('-g', '--gpu_version', action='store_true', required=False, help=gpu_version_help, default=None) + 
version_parser.add_argument('-c', '--cpu_version', action='store_true', required=False, help=cpu_version_help, default=None) + + + def _add_list_parser(self, subparsers: argparse._SubParsersAction, func): + if not self.helpers.is_amdgpu_initialized(): + # The list subcommand is only applicable to systems with amdgpu initialized + return + + # Subparser help text + list_help = "List GPU information" + list_optionals_title = "List Arguments" + list_subcommand_help = f"{self.description}\n\nLists all detected devices on the system.\ + \nLists the BDF, UUID, KFD_ID, NODE_ID, and Partition ID for each GPU and/or CPUs.\ + \nIn virtualization environments, it can also list VFs associated to each\ + \nGPU with some basic information for each VF." + enumeration_help = "Enumeration mapping to other features.\ + \n Includes CARD, RENDER, HSA_ID, HIP_ID, and HIP_UUID." + + # Create list subparser + list_parser = subparsers.add_parser('list', help=list_help, description=list_subcommand_help) + list_parser._optionals.title = list_optionals_title + list_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) + list_parser.set_defaults(func=func) + + # Create -e subparser + list_parser.add_argument("-e", action="store_true", help=enumeration_help) + + # Add Universal Arguments + self._add_device_arguments(list_parser, required=False) + self._add_command_modifiers(list_parser) + + + def _add_static_parser(self, subparsers: argparse._SubParsersAction, func): + # Subparser help text + static_help = "Gets static information about the specified GPU" + static_subcommand_help = f"{self.description}\n\nIf no GPU is specified, returns static information for all GPUs on the system.\ + \nIf no static argument is provided, all static information will be displayed." 
+ static_optionals_title = "Static Arguments" + + # Optional arguments help text + asic_help = "All asic information" + bus_help = "All bus information" + vbios_help = "All video bios/IFWI information (if available)" + limit_help = "All limit metric values (i.e. power and thermal limits)" + driver_help = "Displays driver version" + vram_help = "All vram information" + cache_help = "All cache information" + board_help = "All board information" + soc_pstate_help = "The available soc pstate policy" + xgmi_plpd_help = "The available XGMI per-link power down policy" + process_isolation_help = "The process isolation status" + clk_options = self.helpers.get_clock_types()[0] + clk_options.remove('PCIE') + clk_option_str = ", ".join(clk_options) + ", ALL" + clock_help = f"Show one or more valid clock frequency levels. Available options:\n\t{clk_option_str}" + + # Options arguments help text for Hypervisors and Baremetal + # Might be able to remove Sudo requirement in ROCm 7.0 + ras_help = "Displays RAS features information;\n\tSudo may be required for some features" + numa_help = "All numa node information" # Linux Baremetal only + partition_help = "Partition information:\n\t" \ + "No longer available in default output.\n\tArgument is required to display." \ + "\n\tEx. 
`amd-smi static -p` or use" \ + "\n\t`amd-smi partition -c -m`/`sudo amd-smi partition -a`" + + # Options arguments help text for Hypervisors + dfc_help = "All DFC FW table information" + fb_help = "Displays Frame Buffer information" + num_vf_help = "Displays number of supported and enabled VFs" + + # Options arguments help text for CPUs + smu_help = "All SMU FW information" + interface_help = "Displays hsmp interface version" + + # Create static subparser + static_parser = subparsers.add_parser('static', help=static_help, description=static_subcommand_help) + static_parser._optionals.title = static_optionals_title + static_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) + static_parser.set_defaults(func=func) + + # Handle GPU Options + if self.helpers.is_amdgpu_initialized(): + static_parser.add_argument('-a', '--asic', action='store_true', required=False, help=asic_help) + static_parser.add_argument('-b', '--bus', action='store_true', required=False, help=bus_help) + # Accept vbios args without displaying them + static_parser.add_argument('-V', '--vbios', dest='vbios', action='store_true', required=False, help=argparse.SUPPRESS) + static_parser.add_argument('-I', '--ifwi', dest='vbios', action='store_true', required=False, help=vbios_help) + static_parser.add_argument('-d', '--driver', action='store_true', required=False, help=driver_help) + static_parser.add_argument('-v', '--vram', action='store_true', required=False, help=vram_help) + static_parser.add_argument('-c', '--cache', action='store_true', required=False, help=cache_help) + static_parser.add_argument('-B', '--board', action='store_true', required=False, help=board_help) + static_parser.add_argument('-R', '--process-isolation', action='store_true', required=False, help=process_isolation_help) + static_parser.add_argument('-r', '--ras', action='store_true', required=False, help=ras_help) + static_parser.add_argument('-C', '--clock', action='store', default=False, nargs='*', 
def _add_firmware_parser(self, subparsers: argparse._SubParsersAction, func):
    """Register the ``firmware`` subcommand (alias ``ucode``) on *subparsers*.

    Nothing is registered when ``self.helpers.is_amdgpu_initialized()`` is
    falsy.

    Args:
        subparsers: The sub-parser action the new parser is attached to.
        func: Handler stored as the parser's ``func`` default.
    """
    # The firmware subcommand is only applicable to systems with amdgpu initialized
    if not self.helpers.is_amdgpu_initialized():
        return

    description = f"{self.description}\n\nIf no GPU is specified, return firmware information for all GPUs on the system."

    parser = subparsers.add_parser(
        'firmware',
        help="Gets firmware information about the specified GPU",
        description=description,
        aliases=['ucode'],
    )
    parser._optionals.title = "Firmware Arguments"
    parser.formatter_class = lambda prog: AMDSMISubparserHelpFormatter(prog)
    parser.set_defaults(func=func)

    # store_true combined with default=True: fw_list is True whether or not
    # the flag is given on the command line.
    parser.add_argument(
        '-f', '--ucode-list', '--fw-list',
        dest='fw_list',
        action='store_true',
        required=False,
        help="All FW list information",
        default=True,
    )

    # Error records are only offered when running on a hypervisor
    if self.helpers.is_hypervisor():
        parser.add_argument(
            '-e', '--error-records',
            action='store_true',
            required=False,
            help="All error records information",
        )

    # Universal arguments: device selection first, then command modifiers
    self._add_device_arguments(parser, required=False)
    self._add_command_modifiers(parser)
+ bad_pages_optionals_title = "Bad Pages Arguments" + + # Optional arguments help text + pending_help = "Displays all pending retired pages" + retired_help = "Displays retired pages" + un_res_help = "Displays unreservable pages" + + # Create bad_pages subparser + bad_pages_parser = subparsers.add_parser('bad-pages', help=bad_pages_help, description=bad_pages_subcommand_help) + bad_pages_parser._optionals.title = bad_pages_optionals_title + bad_pages_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) + bad_pages_parser.set_defaults(func=func) + + # Optional Args + bad_pages_parser.add_argument('-p', '--pending', action='store_true', required=False, help=pending_help) + bad_pages_parser.add_argument('-r', '--retired', action='store_true', required=False, help=retired_help) + bad_pages_parser.add_argument('-u', '--un-res', action='store_true', required=False, help=un_res_help) + + # Add Universal Arguments + self._add_device_arguments(bad_pages_parser, required=False) + self._add_command_modifiers(bad_pages_parser) + + def _add_metric_parser(self, subparsers: argparse._SubParsersAction, func): + # Subparser help text + metric_help = "Gets metric/performance information about the specified GPU" + metric_subcommand_help = f"{self.description}\n\nIf no GPU is specified, returns metric information for all GPUs on the system.\ + \nIf no metric argument is provided, all metric information will be displayed." 
+ metric_optionals_title = "Metric arguments" + + # Optional arguments help text + usage_help = "Displays engine usage information" + + # Help text for Arguments only Available on Linux Virtual OS and Baremetal platforms + mem_usage_help = "Memory usage per block" + + # Help text for Arguments only on Hypervisor and Baremetal platforms + power_help = "Current power usage" + clock_help = "Average, max, and current clock frequencies" + temperature_help = "Current temperatures" + ecc_help = "Total number of ECC errors" + ecc_blocks_help = "Number of ECC errors per block" + pcie_help = "Current PCIe speed, width, and replay count" + voltage_help = "GPU voltage" + base_board_help = "base_board temperatures" + gpu_board_help = "gpu_board temperatures" + + # Help text for Arguments only on Linux Baremetal platforms + fan_help = "Current fan speed" + vc_help = "Display voltage curve" + overdrive_help = "Current GFX and MEM clock overdrive level" + perf_level_help = "Current DPM performance level" + xgmi_err_help = "XGMI error information since last read" + energy_help = "Amount of energy consumed" + throttle_help = "Displays throttle accumulators;\n Only available for MI300 or newer ASICs" + + # Help text for Arguments only on Hypervisors + schedule_help = "All scheduling information" + guard_help = "All guard information" + guest_data_help = "All guest data information" + fb_usage_help = "Displays total and used Frame Buffer usage information" + xgmi_help = "Table of current XGMI metrics information" + + # Help text for cpu options + cpu_power_metrics_help = "CPU power metrics" + cpu_proc_help = "Displays prochot status" + cpu_freq_help = "Displays currentFclkMemclk frequencies and cclk frequency limit" + cpu_c0_res_help = "Displays C0 residency" + cpu_lclk_dpm_help = "Displays lclk dpm level range. 
Requires socket ID and NBOID as inputs" + cpu_pwr_svi_telemetry_rails_help = "Displays svi based telemetry for all rails" + cpu_io_bandwidth_help = "Displays current IO bandwidth for the selected CPU.\ + \n input parameters are bandwidth type(1) and link ID encodings\ + \n i.e. P2, P3, G0 - G7" + cpu_xgmi_bandwidth_help = "Displays current XGMI bandwidth for the selected CPU\ + \n input parameters are bandwidth type(1,2,4) and link ID encodings\ + \n i.e. P2, P3, G0 - G7" + cpu_metrics_ver_help = "Displays metrics table version" + cpu_metrics_table_help = "Displays metric table" + cpu_socket_energy_help = "Displays socket energy for the selected CPU socket" + cpu_ddr_bandwidth_help = "Displays per socket max ddr bw, current utilized bw,\ + \n and current utilized ddr bw in percentage" + cpu_temp_help = "Displays cpu socket temperature" + cpu_dimm_temp_range_rate_help = "Displays dimm temperature range and refresh rate" + cpu_dimm_pow_consumption_help = "Displays dimm power consumption" + cpu_dimm_thermal_sensor_help = "Displays dimm thermal sensor" + + # Help text for core options + core_energy_help = "Displays core energy for the selected core" + core_boost_limit_help = "Get boost limit for the selected cores" + core_curr_active_freq_core_limit_help = "Get Current CCLK limit set per Core" + + # Create metric subparser + metric_parser = subparsers.add_parser('metric', help=metric_help, description=metric_subcommand_help) + metric_parser._optionals.title = metric_optionals_title + metric_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) + metric_parser.set_defaults(func=func) + + # Optional Args for Linux Virtual OS and Baremetal systems + if not self.helpers.is_hypervisor() and not self.helpers.is_windows(): + metric_parser.add_argument('-m', '--mem-usage', action='store_true', required=False, help=mem_usage_help) + + if self.helpers.is_amdgpu_initialized(): + # Optional Args for Hypervisors and Baremetal systems + if 
self.helpers.is_hypervisor() or self.helpers.is_baremetal() or self.helpers.is_linux(): + metric_parser.add_argument('-u', '--usage', action='store_true', required=False, help=usage_help) + metric_parser.add_argument('-p', '--power', action='store_true', required=False, help=power_help) + metric_parser.add_argument('-c', '--clock', action='store_true', required=False, help=clock_help) + metric_parser.add_argument('-t', '--temperature', action='store_true', required=False, help=temperature_help) + metric_parser.add_argument('-P', '--pcie', action='store_true', required=False, help=pcie_help) + metric_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help) + metric_parser.add_argument('-k', '--ecc-blocks', action='store_true', required=False, help=ecc_blocks_help) + metric_parser.add_argument('-V', '--voltage', action='store_true', required=False, help=voltage_help) + metric_parser.add_argument('-b', '--base-board', action='store_true', required=False, help=base_board_help, default=False) + metric_parser.add_argument('-G', '--gpu-board', action='store_true', required=False, help=gpu_board_help, default=False) + + # Options that only apply to Hypervisors and Baremetal Linux + if self.helpers.is_hypervisor() or (self.helpers.is_baremetal() and self.helpers.is_linux()): + pass + + # Optional Args for Linux Baremetal Systems + if self.helpers.is_baremetal() and self.helpers.is_linux(): + metric_parser.add_argument('-f', '--fan', action='store_true', required=False, help=fan_help) + metric_parser.add_argument('-C', '--voltage-curve', action='store_true', required=False, help=vc_help) + metric_parser.add_argument('-o', '--overdrive', action='store_true', required=False, help=overdrive_help) + metric_parser.add_argument('-l', '--perf-level', action='store_true', required=False, help=perf_level_help) + metric_parser.add_argument('-x', '--xgmi-err', action='store_true', required=False, help=xgmi_err_help) + metric_parser.add_argument('-E', 
'--energy', action='store_true', required=False, help=energy_help) + metric_parser.add_argument('-v', '--violation', dest='throttle', action='store_true', required=False, help=throttle_help) + metric_parser.add_argument('-T', '--throttle', dest='throttle', action='store_true', required=False, help=argparse.SUPPRESS) + + # Options to only display to Hypervisors + # Need to resolve the -G for guard, but technically should never intersect since it's VF only + if self.helpers.is_hypervisor(): + metric_parser.add_argument('-s', '--schedule', action='store_true', required=False, help=schedule_help) + metric_parser.add_argument('-G', '--guard', action='store_true', required=False, help=guard_help) + metric_parser.add_argument('-u', '--guest-data', action='store_true', required=False, help=guest_data_help) + metric_parser.add_argument('-f', '--fb-usage', action='store_true', required=False, help=fb_usage_help) + metric_parser.add_argument('-m', '--xgmi', action='store_true', required=False, help=xgmi_help) + + if self.helpers.is_amd_hsmp_initialized(): + # Optional Args for CPUs + cpu_group = metric_parser.add_argument_group("CPU Arguments") + cpu_group.add_argument('--cpu-power-metrics', action='store_true', required=False, help=cpu_power_metrics_help) + cpu_group.add_argument('--cpu-prochot', action='store_true', required=False, help=cpu_proc_help) + cpu_group.add_argument('--cpu-freq-metrics', action='store_true', required=False, help=cpu_freq_help) + cpu_group.add_argument('--cpu-c0-res', action='store_true', required=False, help=cpu_c0_res_help) + cpu_group.add_argument('--cpu-lclk-dpm-level', action='append', required=False, type=self._not_negative_int, + nargs=1, metavar=("NBIOID"), help=cpu_lclk_dpm_help) + cpu_group.add_argument('--cpu-pwr-svi-telemetry-rails', action='store_true', required=False, + help=cpu_pwr_svi_telemetry_rails_help) + cpu_group.add_argument('--cpu-io-bandwidth', action='append', required=False, nargs=2, + metavar=("IO_BW", "LINKID_NAME"), 
def _add_process_parser(self, subparsers: argparse._SubParsersAction, func):
    """Register the ``process`` subcommand on *subparsers*.

    Nothing is registered on hypervisors, nor when
    ``self.helpers.is_amdgpu_initialized()`` is falsy.

    Args:
        subparsers: The sub-parser action the new parser is attached to.
        func: Handler stored as the parser's ``func`` default.
    """
    if self.helpers.is_hypervisor():
        # Don't add this subparser on Hypervisors
        # This subparser is only available to Guest and Baremetal systems
        return

    if not self.helpers.is_amdgpu_initialized():
        # The process subcommand is currently only applicable to systems with amdgpu initialized
        return

    # Subparser help text.
    # Multi-line texts are built from adjacent string literals: a backslash
    # continuation inside a literal would embed the next source line's
    # indentation into the displayed help as trailing whitespace.
    process_help = "Lists compute process information running on the specified GPU"
    process_subcommand_help = (
        f"{self.description}\n\nIf no GPU is specified, returns information for all GPUs on the system."
        "\nIf no process argument is provided, all process information will be displayed."
    )
    process_optionals_title = "Process arguments"

    # Optional Arguments help text
    general_help = "pid, process name, memory usage"
    engine_help = "All engine usages"
    pid_help = "Gets compute process GPU information about the specified process based on Process ID"
    name_help = (
        "Gets compute process GPU information about the specified process based on Process Name."
        "\nIf multiple processes have the same name, information is returned for all of them."
        "\nProcess Name may require elevated permissions."
    )

    # Create process subparser
    process_parser = subparsers.add_parser('process', help=process_help, description=process_subcommand_help)
    process_parser._optionals.title = process_optionals_title
    process_parser.formatter_class = lambda prog: AMDSMISubparserHelpFormatter(prog)
    process_parser.set_defaults(func=func)

    # Optional Args
    process_parser.add_argument('-G', '--general', action='store_true', required=False, help=general_help)
    process_parser.add_argument('-e', '--engine', action='store_true', required=False, help=engine_help)
    # The validators receive the option name as their second argument.
    process_parser.add_argument('-p', '--pid', action='store',
                                type=lambda value: self._not_negative_int(value, '--pid'),
                                required=False, help=pid_help)
    process_parser.add_argument('-n', '--name', action='store',
                                type=lambda value: self._is_valid_string(value, '--name'),
                                required=False, help=name_help)

    # Add Universal Arguments & watch Args
    self._add_watch_arguments(process_parser)
    self._add_device_arguments(process_parser, required=False)
    self._add_command_modifiers(process_parser)
def _add_topology_parser(self, subparsers: argparse._SubParsersAction, func):
    """Register the ``topology`` subcommand on *subparsers*.

    Nothing is registered when ``self.helpers.is_amdgpu_initialized()`` is
    falsy.

    Args:
        subparsers: The sub-parser action the new parser is attached to.
        func: Handler stored as the parser's ``func`` default.
    """
    if not self.helpers.is_amdgpu_initialized():
        # The topology subcommand is only applicable to systems with amdgpu initialized
        return

    # Subparser help text.
    # Adjacent literals are used instead of a backslash continuation inside the
    # string, which would embed source indentation into the displayed text.
    topology_help = "Displays topology information of the devices"
    topology_subcommand_help = (
        f"{self.description}\n\nIf no GPU is specified, returns information for all GPUs on the system."
        "\nIf no topology argument is provided, all topology information will be displayed."
    )
    topology_optionals_title = "Topology arguments"

    # (short flag, long flag, help text) for every topology option, in the
    # order they are registered. All of them are plain boolean switches.
    topology_options = (
        ('-a', '--access', "Displays link accessibility between GPUs"),
        ('-w', '--weight', "Displays relative weight between GPUs"),
        ('-o', '--hops', "Displays the number of hops between GPUs"),
        ('-t', '--link-type', "Displays the link type between GPUs"),
        ('-b', '--numa-bw', "Display max and min bandwidth between nodes"),
        ('-c', '--coherent', "Display cache coherent (or non-coherent) link capability between nodes"),
        ('-n', '--atomics', "Display 32 and 64-bit atomic io link capability between nodes"),
        ('-d', '--dma', "Display P2P direct memory access (DMA) link capability between nodes"),
        ('-z', '--bi-dir', "Display P2P bi-directional link capability between nodes"),
    )

    # Create topology subparser
    topology_parser = subparsers.add_parser('topology', help=topology_help, description=topology_subcommand_help)
    topology_parser._optionals.title = topology_optionals_title
    topology_parser.formatter_class = lambda prog: AMDSMISubparserHelpFormatter(prog)
    topology_parser.set_defaults(func=func)

    # Add Universal Arguments (registered before the topology options, as in
    # the original, so the registration order is unchanged)
    self._add_command_modifiers(topology_parser)
    self._add_device_arguments(topology_parser, required=False)

    # Optional Args
    for short_flag, long_flag, help_text in topology_options:
        topology_parser.add_argument(short_flag, long_flag, action='store_true', required=False, help=help_text)
+ set_memory_partition_help = f"Set one of the following the memory partition modes:\n\t{memory_partition_choices_str}" + soc_pstate_help_info = ", ".join(self.helpers.get_soc_pstates()) + set_soc_pstate_help = f"Set the GPU soc pstate policy using policy id, an integer. Valid id's include:\n\t{soc_pstate_help_info}" + xgmi_plpd_help_info = ", ".join(self.helpers.get_xgmi_plpd_policies()) + set_xgmi_plpd_help = f"Set the GPU XGMI per-link power down policy using policy id, an integer. Valid id's include:\n\t{xgmi_plpd_help_info}" + set_clock_freq_help = "Set one or more sclk (aka gfxclk), mclk, fclk, pcie, or socclk frequency levels.\n\tUse `amd-smi static --clock` to find acceptable levels.\n\tUse `amd-smi static --bus` to find acceptable pcie levels." + ppt0_power_cap_min, ppt0_power_cap_max, ppt1_power_cap_min, ppt1_power_cap_max = self.helpers.get_power_caps() + set_power_cap_help = f"Set either PPT0 or PPT1 power capacity limit:\n\tEx: `amd-smi set -o ppt0 1300`\n\tPPT0 min cap: {ppt0_power_cap_min}, PPT0 max cap: {ppt0_power_cap_max}\n\tPPT1 min cap: {ppt1_power_cap_min}, PPT1 max cap: {ppt1_power_cap_max}" + set_clk_limit_help = "Sets the sclk (aka gfxclk) or mclk minimum and maximum frequencies. \n\tex: amd-smi set -L (sclk | mclk) (min | max) value" + set_process_isolation_help = "Enable or disable the GPU process isolation on a per partition basis:\n 0 for disable and 1 for enable.\n" + + # Help text for CPU set options + set_cpu_pwr_limit_help = "Set power limit for the given socket. Input parameter is power limit value." + set_cpu_xgmi_link_width_help = "Set max and Min linkwidth. Input parameters are min and max link width values" + set_cpu_lclk_dpm_level_help = "Sets the max and min dpm level on a given NBIO.\ + \n Input parameters are die_index, min dpm, max dpm." + set_cpu_pwr_eff_mode_help = "Sets the power efficency mode policy. Input parameter is mode." 
+ set_cpu_gmi3_link_width_help = "Sets max and min gmi3 link width range" + set_cpu_pcie_link_rate_help = "Sets pcie link rate" + set_cpu_df_pstate_range_help = "Sets max and min df-pstates" + set_cpu_enable_apb_help = "Enables the DF p-state performance boost algorithm" + set_cpu_disable_apb_help = "Disables the DF p-state performance boost algorithm. Input parameter is DFPstate (0-3)" + set_soc_boost_limit_help = "Sets the boost limit for the given socket. Input parameter is socket BOOST_LIMIT value" + + # Help text for CPU Core set options + set_core_boost_limit_help = "Sets the boost limit for the given core. Input parameter is core BOOST_LIMIT value" + + # Create set_value subparser + set_value_parser = subparsers.add_parser('set', help=set_value_help, description=set_value_subcommand_help) + set_value_parser._optionals.title = set_value_optionals_title + set_value_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) + set_value_parser.set_defaults(func=func) + + if self.helpers.is_amdgpu_initialized(): + # set value should only take one of these at a time so args below will be mutually exclusive + set_value_exclusive_group = set_value_parser.add_mutually_exclusive_group() + if self.helpers.is_baremetal(): + # Optional GPU Args + set_value_exclusive_group.add_argument('-f', '--fan', action=self._validate_fan_speed(), required=False, help=set_fan_help, metavar='%') + set_value_exclusive_group.add_argument('-l', '--perf-level', action='store', choices=self.helpers.get_perf_levels()[0], type=str.upper, required=False, help=set_perf_level_help, metavar='LEVEL') + set_value_exclusive_group.add_argument('-P', '--profile', action='store', required=False, help=set_profile_help, metavar='PROFILE_LEVEL') + set_value_exclusive_group.add_argument('-d', '--perf-determinism', action='store', type=lambda value: self._not_negative_int(value, '--perf-determinism'), required=False, help=set_perf_det_help, metavar='SCLKMAX') + 
set_value_exclusive_group.add_argument('-C', '--compute-partition', action='store', choices=accelerator_set_choices, type=lambda value: self._is_command_supported(value, accelerator_set_choices, '--compute-partition'), + required=False, help=set_compute_partition_help, metavar=('TYPE/INDEX')) + set_value_exclusive_group.add_argument('-M', '--memory-partition', action='store', choices=self.helpers.get_memory_partition_types(), type=str.upper, required=False, help=set_memory_partition_help, metavar='PARTITION') + # Power cap is enabled on guest, maintain order + set_value_exclusive_group.add_argument('-o', '--power-cap', action=self._power_cap_options(), nargs=2, required=False, help=set_power_cap_help, metavar=('PWR_TYPE', 'WATTS')) + if self.helpers.is_baremetal(): + set_value_exclusive_group.add_argument('-p', '--soc-pstate', action='store', required=False, type=lambda value: self._not_negative_int(value, '--soc-pstate'), help=set_soc_pstate_help, metavar='POLICY_ID') + set_value_exclusive_group.add_argument('-x', '--xgmi-plpd', action='store', required=False, type=lambda value: self._not_negative_int(value, '--xgmi-plpd'), help=set_xgmi_plpd_help, metavar='POLICY_ID') + set_value_exclusive_group.add_argument('-c', '--clk-level', action=self._level_select(), nargs='+', required=False, help=set_clock_freq_help, metavar=('CLK_TYPE', 'PERF_LEVELS')) + + set_value_exclusive_group.add_argument('-L', '--clk-limit', action=self._limit_select(), nargs=3, required=False, help=set_clk_limit_help, metavar=('CLK_TYPE', 'LIM_TYPE', 'VALUE')) + set_value_exclusive_group.add_argument('-R', '--process-isolation', action='store', choices=[0,1], type=lambda value: self._not_negative_int(value, '--process-isolation'), required=False, help=set_process_isolation_help, metavar='STATUS') + + if self.helpers.is_amd_hsmp_initialized(): + if self.helpers.is_baremetal(): + # Optional CPU Args + cpu_group = set_value_parser.add_argument_group("CPU Arguments") + 
cpu_group.add_argument('--cpu-pwr-limit', action='append', required=False, type=self._positive_int, nargs=1, metavar=("PWR_LIMIT"), help=set_cpu_pwr_limit_help) + cpu_group.add_argument('--cpu-xgmi-link-width', action='append', required=False, type=self._not_negative_int, nargs=2, metavar=("MIN_WIDTH", "MAX_WIDTH"), help=set_cpu_xgmi_link_width_help) + cpu_group.add_argument('--cpu-lclk-dpm-level', action='append', required=False, type=self._not_negative_int, nargs=3, metavar=("NBIOID", "MIN_DPM", "MAX_DPM"), help=set_cpu_lclk_dpm_level_help) + cpu_group.add_argument('--cpu-pwr-eff-mode', action='append', required=False, type=self._not_negative_int, nargs=1, metavar=("MODE"), help=set_cpu_pwr_eff_mode_help) + cpu_group.add_argument('--cpu-gmi3-link-width', action='append', required=False, type=self._not_negative_int, nargs=2, metavar=("MIN_LW", "MAX_LW"), help=set_cpu_gmi3_link_width_help) + cpu_group.add_argument('--cpu-pcie-link-rate', action='append', required=False, type=self._not_negative_int, nargs=1, metavar=("LINK_RATE"), help=set_cpu_pcie_link_rate_help) + cpu_group.add_argument('--cpu-df-pstate-range', action='append', required=False, type=self._not_negative_int, nargs=2, metavar=("MAX_PSTATE", "MIN_PSTATE"), help=set_cpu_df_pstate_range_help) + cpu_group.add_argument('--cpu-enable-apb', action='store_true', required=False, help=set_cpu_enable_apb_help) + cpu_group.add_argument('--cpu-disable-apb', action='append', required=False, type=self._not_negative_int, nargs=1, metavar=("DF_PSTATE"), help=set_cpu_disable_apb_help) + cpu_group.add_argument('--soc-boost-limit', action='append', required=False, type=self._positive_int, nargs=1, metavar=("BOOST_LIMIT"), help=set_soc_boost_limit_help) + + # Optional CPU Core Args + core_group = set_value_parser.add_argument_group("CPU Core Arguments") + core_group.add_argument('--core-boost-limit', action='append', required=False, type=self._positive_int, nargs=1, metavar=("BOOST_LIMIT"), help=set_core_boost_limit_help) 
def _add_reset_parser(self, subparsers: argparse._SubParsersAction, func):
    """Register the ``reset`` subcommand on *subparsers*.

    Nothing is registered unless ``self.helpers.is_linux()`` and
    ``self.helpers.is_amdgpu_initialized()`` are both truthy. All reset
    options are placed in one mutually exclusive group, so argparse accepts
    at most one of them per invocation.

    Args:
        subparsers: The sub-parser action the new parser is attached to.
        func: Handler stored as the parser's ``func`` default.
    """
    if not self.helpers.is_linux():
        # This subparser is only applicable to Linux
        return

    if not self.helpers.is_amdgpu_initialized():
        # The reset subcommand is only applicable to systems with amdgpu initialized
        return

    # Subparser help text.
    # The description states the one-argument-at-a-time rule enforced by the
    # mutually exclusive group below. Adjacent literals replace the backslash
    # continuation, which embedded source indentation into the text.
    reset_help = "Reset options for devices"
    reset_subcommand_help = (
        f"{self.description}\n\nIf no GPU is specified, will select all GPUs on the system."
        "\nA reset argument must be provided; only one reset argument is accepted per invocation."
        "\nRequires 'sudo' privileges."
    )
    reset_optionals_title = "Reset Arguments"

    # Help text for Arguments only on Guest and BM platforms
    gpureset_help = "Reset the specified GPU"
    reset_clocks_help = "Reset clocks and overdrive to default"
    reset_fans_help = "Reset fans to automatic (driver) control"
    reset_profile_help = "Reset power profile back to default"
    reset_xgmierr_help = "Reset XGMI error counts"
    reset_perf_det_help = "Disable performance determinism"
    reset_power_cap_help = "Reset the PPT0 and PPT1 power capacity limit to max capable"
    reset_gpu_clean_local_data_help = "Clean up local data in LDS/GPRs on a per partition basis"
    reset_gpu_driver_help = "Reset (reload) AMD GPU driver"

    # Create reset subparser
    reset_parser = subparsers.add_parser('reset', help=reset_help, description=reset_subcommand_help)
    reset_parser._optionals.title = reset_optionals_title
    reset_parser.formatter_class = lambda prog: AMDSMISubparserHelpFormatter(prog)
    reset_parser.set_defaults(func=func)

    # make reset args mutually exclusive
    reset_exclusive_group = reset_parser.add_mutually_exclusive_group()

    if self.helpers.is_baremetal():
        # Add Baremetal reset arguments
        baremetal_options = (
            ('-G', '--gpureset', gpureset_help),
            ('-c', '--clocks', reset_clocks_help),
            ('-f', '--fans', reset_fans_help),
            ('-p', '--profile', reset_profile_help),
            ('-x', '--xgmierr', reset_xgmierr_help),
            ('-d', '--perf-determinism', reset_perf_det_help),
            ('-o', '--power-cap', reset_power_cap_help),
            ('-r', '--reload-driver', reset_gpu_driver_help),
        )
        for short_flag, long_flag, help_text in baremetal_options:
            reset_exclusive_group.add_argument(short_flag, long_flag, action='store_true',
                                               required=False, help=help_text)

    # Add Baremetal and Virtual OS reset arguments
    reset_exclusive_group.add_argument('-l', '--clean-local-data', action='store_true',
                                       required=False, help=reset_gpu_clean_local_data_help)

    # Reset accepts default devices of all
    self._add_device_arguments(reset_parser, required=False)
    # Add Universal Arguments
    self._add_command_modifiers(reset_parser)
+ monitor_optionals_title = "Monitor Arguments" + + # Help text for Arguments only on Guest and BM platforms + power_usage_help = "Monitor power usage and power cap in Watts" + temperature_help = "Monitor temperature in Celsius" + gfx_util_help = "Monitor graphics utilization (%%) and clock (MHz)" + mem_util_help = "Monitor memory utilization (%%) and clock (MHz)" + encoder_util_help = "Monitor encoder utilization (%%) and clock (MHz)" + decoder_util_help = "Monitor decoder utilization (%%) and clock (MHz)" + ecc_help = "Monitor ECC single bit, ECC double bit, and PCIe replay error counts" + mem_usage_help = "Monitor memory usage in MB" + pcie_bandwidth_help = "Monitor PCIe bandwidth in Mb/s" + process_help = "Enable Process information table below monitor output;\n Process Name may require elevated permissions" + violation_help = "Monitor power and thermal violation status (%%);\n Only available for MI300 or newer ASICs" + + # Create monitor subparser + monitor_parser = subparsers.add_parser('monitor', help=monitor_help, description=monitor_subcommand_help, aliases=["dmon"]) + monitor_parser._optionals.title = monitor_optionals_title + monitor_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) + monitor_parser.set_defaults(func=func) + + # Add monitor arguments + monitor_parser.add_argument('-p', '--power-usage', action='store_true', required=False, help=power_usage_help) + monitor_parser.add_argument('-t', '--temperature', action='store_true', required=False, help=temperature_help) + monitor_parser.add_argument('-u', '--gfx', action='store_true', required=False, help=gfx_util_help) + monitor_parser.add_argument('-m', '--mem', action='store_true', required=False, help=mem_util_help) + monitor_parser.add_argument('-n', '--encoder', action='store_true', required=False, help=encoder_util_help) + monitor_parser.add_argument('-d', '--decoder', action='store_true', required=False, help=decoder_util_help) + monitor_parser.add_argument('-e', '--ecc', 
action='store_true', required=False, help=ecc_help) + monitor_parser.add_argument('-v', '--vram-usage', action='store_true', required=False, help=mem_usage_help) + monitor_parser.add_argument('-r', '--pcie', action='store_true', required=False, help=pcie_bandwidth_help) + monitor_parser.add_argument('-q', '--process', action='store_true', required=False, help=process_help) + if not self.helpers.is_virtual_os(): + monitor_parser.add_argument('-V', '--violation', action='store_true', required=False, help=violation_help) + + # Add Universal Arguments & Watch Args + self._add_watch_arguments(monitor_parser) + self._add_device_arguments(monitor_parser, required=False) + self._add_command_modifiers(monitor_parser) + + + def _add_xgmi_parser(self, subparsers: argparse._SubParsersAction, func): + if not self.helpers.is_amdgpu_initialized(): + # The xgmi subcommand is only applicable to systems with amdgpu initialized + return + + # Subparser help text + xgmi_help = "Displays xgmi information of the devices" + xgmi_subcommand_help = f"{self.description}\n\nIf no GPU is specified, returns information for all GPUs on the system.\ + \nIf no xgmi argument is provided, all xgmi information will be displayed." 
+ xgmi_optionals_title = "XGMI arguments" + + # Help text for Arguments only on Guest and BM platforms + metrics_help = "Metric XGMI information" + xgmi_source_status_help = "Source GPU XGMI Link information" + xgmi_link_status_help = "XGMI Link Status information" + + # Create xgmi subparser + xgmi_parser = subparsers.add_parser('xgmi', help=xgmi_help, description=xgmi_subcommand_help) + xgmi_parser._optionals.title = xgmi_optionals_title + xgmi_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) + xgmi_parser.set_defaults(func=func) + + # Optional Args + xgmi_parser.add_argument('-m', '--metric', action='store_true', required=False, help=metrics_help) + xgmi_parser.add_argument('-s', '--source-status', action='store_true', required=False, help=xgmi_source_status_help) + xgmi_parser.add_argument('-l', '--link-status', action='store_true', required=False, help=xgmi_link_status_help) + + # Add Universal Arguments + self._add_device_arguments(xgmi_parser, required=False) + self._add_command_modifiers(xgmi_parser) + + + def _add_partition_parser(self, subparsers: argparse._SubParsersAction, func): + if not self.helpers.is_amdgpu_initialized(): + # The partition subcommand is only applicable to systems with amdgpu initialized + return + + # Subparser help text + partition_help = "Displays partition information of the devices" + partition_subcommand_help = f"{self.description}\n\nIf no GPU is specified, returns information for all GPUs on the system.\ + \nIf no partition argument is provided, all partition information will be displayed." 
+ partition_optionals_title = "Partition arguments" + + # Options help text + current_help = "display the current partition information" + memory_help = "display the current memory partition mode and capabilities" + accelerator_help = "display accelerator partition information" + + # Create partition subparser + partition_parser = subparsers.add_parser('partition', help=partition_help, description=partition_subcommand_help) + partition_parser._optionals.title = partition_optionals_title + partition_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) + partition_parser.set_defaults(func=func) + + # Handle GPU Options + partition_parser.add_argument('-c', '--current', action='store_true', required=False, help=current_help) + partition_parser.add_argument('-m', '--memory', action='store_true', required=False, help=memory_help) + partition_parser.add_argument('-a', '--accelerator', action='store_true', required=False, help=accelerator_help) + + # Add Universal Arguments + self._add_device_arguments(partition_parser, required=False) + self._add_command_modifiers(partition_parser) + + + def _add_ras_parser(self, subparsers: argparse._SubParsersAction, func): + """ + Adds the 'ras' subcommand. + + Expected command: + amd-smi ras --cper --severity=nonfatal-uncorrected,fatal --folder --file-limit=1000 --follow + + All parameters are provided via options; no positional arguments or optional --file/--gpu are used. + """ + # Subparser help text + ras_help = "Retrieve RAS (CPER) entries from the driver" + ras_description = ( + f"{self.description}\n\n" + "Retrieve and decode RAS (CPER) entries from the kernel driver.\n" + "Supports filtering by severity, exporting to different formats, and continuous monitoring.\n" + "This command accepts options only; no positional arguments are required." 
+ ) + ras_optionals_title = "RAS arguments" + + # Help text for RAS arguments + cper_help = "Trigger current CPER data retrieval" + afid_help = "Generate an AFID (AMD Field ID) given a CPER record file" + severity_choices = ["nonfatal-uncorrected", "fatal", "nonfatal-corrected", "all"] + severity_choices_str = ", ".join(severity_choices) + severity_help = f"Set the SEVERITY filters from the following:\n {severity_choices_str}" + folder_help = "Folder to dump current CPER report files" + file_limit_help = "Maximum number of current CPER files in target folder\n Older files beyond limit will be deleted" + cper_file_help = "Full path of a retrieved CPER record file to generate the AFID" + follow_help = "Continuously monitor for new CPER entries" + + ras_parser = subparsers.add_parser("ras", help=ras_help, description=ras_description) + ras_parser._optionals.title = ras_optionals_title + ras_parser.formatter_class = lambda prog: AMDSMISubparserHelpFormatter(prog) + ras_parser.set_defaults(func=func) + + # Create mutually exclusive command ras group (--cper or --afid) + ras_exclusive_group = ras_parser.add_mutually_exclusive_group(required=True) + ras_exclusive_group.add_argument("--cper", action="store_true", help=cper_help) + ras_exclusive_group.add_argument("--afid", action="store_true", help=afid_help) + + # CPER Arguments remove defaults + cper_group = ras_parser.add_argument_group("CPER Arguments") + cper_group.add_argument("--severity", type=str.lower, nargs='+', default=['all'], help=severity_help, choices=severity_choices, metavar='SEVERITY') + cper_group.add_argument("--folder", type=str, action=self._check_folder_path(), help=folder_help) + cper_group.add_argument("--file-limit", type=self._positive_int, action='store', help=file_limit_help) + cper_group.add_argument("--follow", action="store_true", help=follow_help) + + # AFID Arguments + afid_group = ras_parser.add_argument_group("AFID Arguments") + afid_group.add_argument("--cper-file", 
action=self._check_cper_file_path(), metavar="CPER_FILE", help=cper_file_help) + + # Add common modifiers and device selection arguments. + self._add_device_arguments(ras_parser, required=False) + self._add_command_modifiers(ras_parser) + + + def _add_node_parser(self, subparsers: argparse._SubParsersAction, func): + if self.helpers.is_virtual_os(): + # This subparser is only available to Guest and Hypervisor systems + return + + # Subparser help text + node_help = "Gets power information for the node" + node_subcommand_help = f"{self.description}\n\nReturns information for node 0 on the system.\ + \nIf no node argument is provided, all node information will be displayed." + node_optionals_title = "Node arguments" + + # Help text for Node arguments + power_management_help = "Displays power management information" + + node_parser = subparsers.add_parser("node", help=node_help, description=node_subcommand_help) + node_parser._optionals.title = node_optionals_title + node_parser.formatter_class = lambda prog: AMDSMISubparserHelpFormatter(prog) + node_parser.set_defaults(func=func) + + # Optional Args + node_parser.add_argument('-p', '--power-management', action='store_true', required=False, help=power_management_help) + + # Add Universal Arguments + self._add_command_modifiers(node_parser) + + + def error(self, message): + outputformat = self.helpers.get_output_format() + + if "argument : invalid choice: " in message: + l = len("argument : invalid choice: ") + 1 + message = message[l:] + message = message.split("'")[0] + # Check if the command is possible in other system configurations and error accordingly + if message in self.possible_commands: + raise amdsmi_cli_exceptions.AmdSmiCommandNotSupportedException(message, outputformat) + raise amdsmi_cli_exceptions.AmdSmiInvalidCommandException(message, outputformat) + elif "unrecognized arguments: " in message: + l = len("unrecognized arguments: ") + message = message[l:] + raise 
# This module provides common functions used for building
# and packaging ROCm projects

option(CMAKE_VERBOSE_MAKEFILE "Enable verbose output" ON)
option(CMAKE_EXPORT_COMPILE_COMMANDS "Export compile commands for linters and autocompleters" ON)

# Sets the ROCm install/packaging prefixes in the cache and extends the
# caller's package/library search paths.
# When the ROCM_RPATH environment variable is defined and LIB_RUNPATH is not,
# LIB_RUNPATH is set in the caller's scope to an $ORIGIN-relative run path.
function(generic_add_rocm)
    set(ROCM_DIR "/opt/rocm" CACHE STRING "ROCm directory.")
    if(DEFINED ENV{ROCM_RPATH} AND NOT DEFINED LIB_RUNPATH)
        set(LIB_RUNPATH "\$ORIGIN:\$ORIGIN/../lib:\$ORIGIN/../lib64" PARENT_SCOPE)
    endif()

    set(CMAKE_INSTALL_PREFIX ${ROCM_DIR} CACHE STRING "Default installation directory.")
    set(CPACK_PACKAGING_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}" CACHE STRING "Default packaging prefix.")
    # add package search paths
    set(CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH} /usr/local PARENT_SCOPE)
    set(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} /usr/lib64 /usr/lib/x86_64-linux-gnu PARENT_SCOPE)
endfunction()

# Configures compiler/linker flags and CPack variables for the calling project.
#
# Flags are accumulated in this function's own copies of CMAKE_*_FLAGS and
# exported to the caller once at the end. set(... PARENT_SCOPE) does not modify
# the current scope, so exporting after every step would make each export
# overwrite the previous one in the caller.
function(generic_package)
    # Used by test and example CMakeLists
    set(SHARE_INSTALL_PREFIX "share/${CMAKE_PROJECT_NAME}" CACHE STRING "Tests and Example install directory")

    if(CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.4.0)
        message("Compiler version is " ${CMAKE_CXX_COMPILER_VERSION})
        message(FATAL_ERROR "Require at least gcc-5.4.0")
    endif()

    if("${CMAKE_BUILD_TYPE}" STREQUAL Release)
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
    else()
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb -O0 -DDEBUG")
    endif()

    # Add address sanitizer
    # derived from:
    # https://github.com/RadeonOpenCompute/ROCm-OpenCL-Runtime/blob/e176056061bf11fdd98b58dd57deb4ac5625844d/amdocl/CMakeLists.txt#L27
    if(ADDRESS_SANITIZER)
        set(ASAN_COMPILER_FLAGS "-fno-omit-frame-pointer -fsanitize=address")
        set(ASAN_LINKER_FLAGS "-fsanitize=address")

        if(BUILD_SHARED_LIBS)
            # Clang-specific flag for shared ASAN library
            if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
                set(ASAN_COMPILER_FLAGS "${ASAN_COMPILER_FLAGS} -shared-libsan")
                set(ASAN_LINKER_FLAGS "${ASAN_LINKER_FLAGS} -shared-libsan")
            endif()
        else()
            set(ASAN_LINKER_FLAGS "${ASAN_LINKER_FLAGS} -static-libsan")
        endif()
        # C flags are only touched (and exported) on the sanitizer path.
        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ASAN_COMPILER_FLAGS}")
        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" PARENT_SCOPE)
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ASAN_COMPILER_FLAGS}")
        set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${ASAN_LINKER_FLAGS}")
        set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${ASAN_LINKER_FLAGS}")
    else()
        ## Security breach mitigation flags
        # NOTE(review): the glibc hardening macro is spelled _FORTIFY_SOURCE;
        # -DFORTIFY_SOURCE=2 as written defines an unrelated macro. Left as-is
        # because enabling it interacts with -O0 builds -- confirm and fix.
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DFORTIFY_SOURCE=2 -fstack-protector-all -Wcast-align")
        ## More security breach mitigation flags
        set(HARDENING_LDFLAGS "${HARDENING_LDFLAGS} -Wl,-z,noexecstack -Wl,-z,relro -Wl,-z,now")
        set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${HARDENING_LDFLAGS}")
        set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${HARDENING_LDFLAGS}")

        include(CheckCXXCompilerFlag)
        check_cxx_compiler_flag("-Wtrampolines" CXX_SUPPORTS_WTRAMPOLINES)
        if(CXX_SUPPORTS_WTRAMPOLINES)
            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wtrampolines")
        endif()
    endif()

    # Clang does not set the build-id
    # similar to if(NOT CMAKE_COMPILER_IS_GNUCC)
    if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
        set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--build-id=sha1")
        set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--build-id=sha1")
    endif()

    # Export the accumulated flags to the caller exactly once.
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" PARENT_SCOPE)
    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}" PARENT_SCOPE)
    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS}" PARENT_SCOPE)

    # configure packaging
    # cpack version is populated with CMAKE_PROJECT_VERSION implicitly
    set(CPACK_PACKAGE_NAME ${CMAKE_PROJECT_NAME} CACHE STRING "")
    set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc." CACHE STRING "")
    set(CPACK_PACKAGING_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}" CACHE STRING "Default packaging prefix.")
    set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE" CACHE STRING "")
    set(CPACK_RPM_PACKAGE_LICENSE "MIT" CACHE STRING "")
    set(CPACK_GENERATOR "DEB;RPM" CACHE STRING "Default packaging generators.")
    set(CPACK_VERBATIM_VARIABLES ON CACHE BOOL "Escape strings passed to CPACK.")
    set(CPACK_DEB_COMPONENT_INSTALL ON PARENT_SCOPE)
    set(CPACK_RPM_COMPONENT_INSTALL ON PARENT_SCOPE)
    mark_as_advanced(CPACK_PACKAGE_NAME CPACK_PACKAGE_VENDOR CPACK_PACKAGE_CONTACT CPACK_RESOURCE_FILE_LICENSE
                     CPACK_RPM_PACKAGE_LICENSE CPACK_GENERATOR)

    # Debian package specific variables
    # The value is kept in a local so the message below reports what is
    # actually exported (a PARENT_SCOPE set is not visible in this scope).
    if(DEFINED ENV{CPACK_DEBIAN_PACKAGE_RELEASE})
        set(_deb_release $ENV{CPACK_DEBIAN_PACKAGE_RELEASE})
    else()
        set(_deb_release "local")
    endif()
    set(CPACK_DEBIAN_PACKAGE_RELEASE ${_deb_release} PARENT_SCOPE)
    message("Using CPACK_DEBIAN_PACKAGE_RELEASE ${_deb_release}")
    set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT" PARENT_SCOPE)

    # RPM package specific variables
    if(DEFINED ENV{CPACK_RPM_PACKAGE_RELEASE})
        set(_rpm_release $ENV{CPACK_RPM_PACKAGE_RELEASE})
    else()
        set(_rpm_release "local")
    endif()
    set(CPACK_RPM_PACKAGE_RELEASE ${_rpm_release} PARENT_SCOPE)
    message("Using CPACK_RPM_PACKAGE_RELEASE ${_rpm_release}")
    set(CPACK_RPM_FILE_NAME "RPM-DEFAULT" PARENT_SCOPE)
    set(CPACK_RPM_PACKAGE_AUTOREQ 0 PARENT_SCOPE)
    set(CPACK_RPM_PACKAGE_AUTOPROV 1 PARENT_SCOPE)
    list(
        APPEND
        CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION
        "/lib"
        "/usr/sbin"
        "/lib/systemd"
        "/lib/systemd/system"
        "/usr"
        "/opt")
    # list(APPEND) only changes this function's copy; export it so the
    # exclusions are visible to the caller.
    set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "${CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION}" PARENT_SCOPE)

    # PACKAGE-tests need PACKAGE
    set(CPACK_DEBIAN_TESTS_PACKAGE_DEPENDS "${CPACK_PACKAGE_NAME}" PARENT_SCOPE)
    set(CPACK_RPM_TESTS_PACKAGE_REQUIRES "${CPACK_PACKAGE_NAME}" PARENT_SCOPE)

    # Treat runtime group as package base.
    # Without it - the base package would be named '<package>-runtime'
    # resulting in <package>-runtime*.deb and <package>-runtime*.rpm
    set(CPACK_DEBIAN_RUNTIME_PACKAGE_NAME "${CPACK_PACKAGE_NAME}" PARENT_SCOPE)
    set(CPACK_RPM_RUNTIME_PACKAGE_NAME "${CPACK_PACKAGE_NAME}" PARENT_SCOPE)
endfunction()

# Declares the CPack component groups and components.
# this function goes after 'include(CPack)'
function(generic_package_post)
    # PACKAGE package, no postfix
    cpack_add_component_group("runtime")
    cpack_add_component(dev GROUP runtime DESCRIPTION "Development components of the library")
    cpack_add_component(unspecified GROUP runtime)
    # not quite sure why this is the only way to populate cpack description
    cpack_add_component(runtime GROUP runtime DESCRIPTION "Runtime components of the library")

    # PACKAGE-tests package, -tests postfix
    cpack_add_component_group("tests")
    cpack_add_component(tests GROUP tests DESCRIPTION "Test components of the library")
endfunction()
+## +## Permission is hereby granted, free of charge, to any person obtaining a copy of +## this software and associated documentation files (the "Software"), to deal in +## the Software without restriction, including without limitation the rights to +## use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +## the Software, and to permit persons to whom the Software is furnished to do so, +## subject to the following conditions: +## +## The above copyright notice and this permission notice shall be included in all +## copies or substantial portions of the Software. +## +## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +## FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +## COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +## IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +## CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +################################################################################ + +## Parses the VERSION_STRING variable and places +## the first, second and third number values in +## the major, minor and patch variables. 
## Parses VERSION_STRING and sets, in the caller's scope:
##   VERSION_MAJOR / VERSION_MINOR / VERSION_PATCH - the first, second and
##       third runs of digits found (each only set when present)
##   VERSION_STRING - those numbers re-joined with "."
function(parse_version VERSION_STRING)

    # Quoted so an empty version string is still a valid argument.
    string(FIND "${VERSION_STRING}" "-" STRING_INDEX)

    if(STRING_INDEX GREATER -1)
        math(EXPR STRING_INDEX "${STRING_INDEX} + 1")
        # NOTE(review): VERSION_BUILD (text after the first "-") is computed
        # but never exported -- confirm whether callers were meant to get it.
        string(SUBSTRING "${VERSION_STRING}" ${STRING_INDEX} -1 VERSION_BUILD)
    endif()

    string(REGEX MATCHALL "[0-9]+" VERSIONS "${VERSION_STRING}")
    list(LENGTH VERSIONS VERSION_COUNT)

    # Start from a known value; function scopes inherit the caller's variables.
    set(TEMP_VERSION_STRING "")

    if(VERSION_COUNT GREATER 0)
        list(GET VERSIONS 0 MAJOR)
        set(VERSION_MAJOR ${MAJOR} PARENT_SCOPE)
        set(TEMP_VERSION_STRING "${MAJOR}")
    endif()

    if(VERSION_COUNT GREATER 1)
        list(GET VERSIONS 1 MINOR)
        set(VERSION_MINOR ${MINOR} PARENT_SCOPE)
        set(TEMP_VERSION_STRING "${TEMP_VERSION_STRING}.${MINOR}")
    endif()

    if(VERSION_COUNT GREATER 2)
        list(GET VERSIONS 2 PATCH)
        set(VERSION_PATCH ${PATCH} PARENT_SCOPE)
        set(TEMP_VERSION_STRING "${TEMP_VERSION_STRING}.${PATCH}")
    endif()

    set(VERSION_STRING "${TEMP_VERSION_STRING}" PARENT_SCOPE)
endfunction()

## Reads the number following "AMDSMI_LIB_VERSION_<ITEM>" in the file at
## REL_FILE_PATH (relative to CMAKE_CURRENT_SOURCE_DIR) and stores it in the
## caller's variable named <ITEM>. Stores "0" when the file or the pattern
## is missing.
function(get_version_from_file REL_FILE_PATH ITEM)
    set(FILE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/${REL_FILE_PATH}")
    set(OUTPUT_ITEM "0")

    if(EXISTS "${FILE_PATH}")
        file(READ "${FILE_PATH}" file_contents)
        string(REGEX MATCHALL "AMDSMI_LIB_VERSION_${ITEM} *[0-9]+" OUTPUT_STR "${file_contents}")
        list(LENGTH OUTPUT_STR OUTPUT_STR_LENGTH)
        if(OUTPUT_STR_LENGTH GREATER 0)
            string(REGEX MATCH "[0-9]+" OUTPUT_ITEM "${OUTPUT_STR}")
        endif()
    endif()

    set(${ITEM} "${OUTPUT_ITEM}" PARENT_SCOPE)
endfunction()

# Parses file for a pattern and replaces the value
# associated with that pattern with a specified value
# Replaces VERSION(MAJOR.MINOR.RELEASE) with updated values
#
# The file is only rewritten when DEFAULT_VERSION is greater than the version
# currently recorded in it. VERSION_STRING in the caller's scope is set to
# whichever of the two versions is in effect afterwards.
function(update_version_in_file REL_FILE_PATH DEFAULT_VERSION PAT1 PAT2 PAT3)
    get_version_from_file("${REL_FILE_PATH}" "MAJOR")
    get_version_from_file("${REL_FILE_PATH}" "MINOR")
    get_version_from_file("${REL_FILE_PATH}" "RELEASE")
    set(FILE_VERSION "${MAJOR}.${MINOR}.${RELEASE}")

    if(DEFAULT_VERSION VERSION_GREATER FILE_VERSION)
        set(FILE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/${REL_FILE_PATH}")
        if(EXISTS "${FILE_PATH}")
            parse_version("${DEFAULT_VERSION}")
            file(READ "${FILE_PATH}" file_contents_new)

            # The two buffers alternate as input/output of each replacement.
            string(REGEX REPLACE "${PAT1}MAJOR${PAT2} *[0-9]*" "${PAT1}MAJOR${PAT3}${VERSION_MAJOR}" file_contents
                                 "${file_contents_new}")
            string(REGEX REPLACE "${PAT1}MINOR${PAT2} *[0-9]*" "${PAT1}MINOR${PAT3}${VERSION_MINOR}" file_contents_new
                                 "${file_contents}")
            string(REGEX REPLACE "${PAT1}RELEASE${PAT2} *[0-9]*" "${PAT1}RELEASE${PAT3}${VERSION_PATCH}" file_contents
                                 "${file_contents_new}")

            file(WRITE "${FILE_PATH}" "${file_contents}")
        endif()
        set(VERSION_STRING "${DEFAULT_VERSION}" PARENT_SCOPE)
    else()
        set(VERSION_STRING "${FILE_VERSION}" PARENT_SCOPE)
    endif()
endfunction()

## Gets the current version of the repository
## using versioning tags and git describe.
## Passes back a packaging version string
## and a library version string.
##
## Sets VERSION_STRING, VERSION_MAJOR, VERSION_MINOR and VERSION_PATCH in the
## caller's scope to the newest matching tag's version when it is greater
## than DEFAULT_VERSION_STRING, otherwise to the default's values.
function(get_version_from_tag DEFAULT_VERSION_STRING VERSION_PREFIX GIT)
    # parse_version() writes its results into this function's scope.
    parse_version("${DEFAULT_VERSION_STRING}")
    set(DEFAULT_VERSION_MAJOR "${VERSION_MAJOR}")
    set(DEFAULT_VERSION_MINOR "${VERSION_MINOR}")
    set(DEFAULT_VERSION_PATCH "${VERSION_PATCH}")

    if(GIT)
        execute_process(
            COMMAND git tag --list --sort=-version:refname "${VERSION_PREFIX}*"
            COMMAND head -n 1
            WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
            OUTPUT_VARIABLE GIT_TAG_STRING
            OUTPUT_STRIP_TRAILING_WHITESPACE RESULTS_VARIABLE RESULTS)
        if(GIT_TAG_STRING)
            parse_version("${GIT_TAG_STRING}")
        endif()
    endif()

    if(VERSION_STRING VERSION_GREATER DEFAULT_VERSION_STRING)
        set(VERSION_STRING "${VERSION_STRING}" PARENT_SCOPE)
        set(VERSION_MAJOR "${VERSION_MAJOR}" PARENT_SCOPE)
        set(VERSION_MINOR "${VERSION_MINOR}" PARENT_SCOPE)
        set(VERSION_PATCH "${VERSION_PATCH}" PARENT_SCOPE)
    else()
        set(VERSION_STRING "${DEFAULT_VERSION_STRING}" PARENT_SCOPE)
        set(VERSION_MAJOR "${DEFAULT_VERSION_MAJOR}" PARENT_SCOPE)
        set(VERSION_MINOR "${DEFAULT_VERSION_MINOR}" PARENT_SCOPE)
        set(VERSION_PATCH "${DEFAULT_VERSION_PATCH}" PARENT_SCOPE)
    endif()
endfunction()

## Runs cmake_modules/version_util.sh -c <VERSION_PREFIX> and stores its output
## in NUM_COMMITS in the caller's scope ("unknown" when the script is not found).
function(num_change_since_prev_pkg VERSION_PREFIX)
    find_program(get_commits NAMES version_util.sh PATHS ${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules)
    if(get_commits)
        execute_process(
            COMMAND ${get_commits} -c ${VERSION_PREFIX}
            WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
            OUTPUT_VARIABLE NUM_COMMITS
            OUTPUT_STRIP_TRAILING_WHITESPACE
            RESULT_VARIABLE RESULT)

        set(NUM_COMMITS "${NUM_COMMITS}" PARENT_SCOPE)

        # RESULT may hold an error string rather than an exit code; comparing
        # by variable name keeps such a value from being expanded into the
        # if() arguments.
        if(RESULT EQUAL 0)
            message("${NUM_COMMITS} were found since previous release")
        else()
            message("Unable to determine number of commits since previous release")
        endif()
    else()
        message("WARNING: Didn't find version_util.sh")
        set(NUM_COMMITS "unknown" PARENT_SCOPE)
    endif()
endfunction()
#!/bin/bash
#
# Usage: version_util.sh [-c] TAG_PREFIX
#
#   -c   Print the difference in commit count between the two most recent
#        tags (ordered by committer date) whose names start with TAG_PREFIX.
#        This is also the action taken when only TAG_PREFIX is given.
#
# Prints 0 when fewer than two matching tags exist or a count cannot be read.

# Print a message on stderr and abort with a non-zero status.
die() {
    echo "$*" >&2
    exit 1
}

# Handle commandline args
while [ "$1" != "" ]; do
    case $1 in
        -c )  # Commits since previous tag
            TARGET="count" ;;
        * )
            # First non-option argument is the tag prefix; stop parsing.
            TARGET="count"
            break ;;
    esac
    shift 1
done
TAG_PREFIX=$1
reg_ex="${TAG_PREFIX}*"

commits_since_last_tag() {
    local tags previous_tag current_tag prev_count curr_count

    # The pattern is quoted so git sees it instead of the shell expanding it
    # against files in the working directory.
    tags=($(git tag --sort=committerdate -l "${reg_ex}" | tail -2))
    previous_tag=${tags[0]}
    current_tag=${tags[1]}

    # Fewer than two matching tags: nothing to compare.
    if [[ -z $previous_tag || -z $current_tag ]]; then
        echo 0
        return
    fi

    prev_count=$(git rev-list --count "$previous_tag")
    curr_count=$(git rev-list --count "$current_tag")

    # Commits since previous tag:
    if [[ -z $prev_count || -z $curr_count ]]; then
        echo 0
    else
        echo $(( curr_count - prev_count ))
    fi
}

case $TARGET in
    count) commits_since_last_tag ;;
    *) die "Invalid target $TARGET" ;;
esac

exit 0
+/latex/ +404.md +data/AMD-404.png + +# file below is overwritten by sphinx script! +./esmi_lib_readme_link.md diff --git a/projects/amdsmi/docs/_extension/go_api_ref.py b/projects/amdsmi/docs/_extension/go_api_ref.py new file mode 100644 index 0000000000..31ef03124e --- /dev/null +++ b/projects/amdsmi/docs/_extension/go_api_ref.py @@ -0,0 +1,296 @@ +# +# Copyright (C) Advanced Micro Devices. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +import re +import os +from pathlib import Path + +from docutils import nodes +from docutils.parsers.rst import Directive, directives +from sphinx.application import Sphinx +from sphinx.util.typing import ExtensionMetadata + + +class GoApiRefDirective(Directive): + """ + Directive for generating Go API reference documentation. + + Usage: + .. 
# Matches a run of ``//`` comment lines directly followed by
# ``func Name(params) [return type] {``.
#   group 1: the comment block      group 2: the function name
#   group 3: the parameter list     group 4: the return type (may be empty)
# The return type is either a parenthesised list or a single type expression
# without parentheses; it is optional so functions returning nothing match too.
_GO_FUNC_PATTERN = re.compile(
    r"(//[^\n]*(?:\n//[^\n]*)*)\n\s*func\s+([A-Za-z0-9_]+)\s*\((.*?)\)\s*"
    r"(\(.*?\)|[^\s{()][^\n{()]*?)?\s*\{",
    re.DOTALL,
)


def _comment_lines(doc_comment):
    """Return the text of each ``//`` line with the marker and one space removed."""
    texts = []
    for raw in doc_comment.split("\n"):
        stripped = raw.strip()
        if not stripped.startswith("//"):
            continue
        text = stripped[2:]
        if text.startswith(" "):
            text = text[1:]
        texts.append(text)
    return texts


def _split_doc_sections(doc_lines):
    """Split comment lines into description / input / output / example buckets.

    A line starting with ``Input parameter``, ``Output:`` or ``Example:``
    switches the current bucket and is itself stored in the new bucket.
    """
    buckets = {"description": [], "input": [], "output": [], "example": []}
    current = "description"
    for line in doc_lines:
        if line.startswith("Input parameter"):
            current = "input"
        elif line.startswith("Output:"):
            current = "output"
        elif line.startswith("Example:"):
            current = "example"
        buckets[current].append(line)
    return buckets


def parse_go_file(file_path):
    """Parse a Go file and extract function documentation.

    Only functions that are directly preceded by a ``//`` comment block are
    returned. Each entry is a dict with the keys ``name``, ``params``,
    ``return_type`` (empty string when the function returns nothing),
    ``description``, ``input_params``, ``output_params``, ``example`` and
    ``section``. ``section`` is the lower-cased second ``_``-separated part of
    the function name, or ``"other"`` when the name has no underscore.
    """
    # Go source files are UTF-8; do not depend on the platform default encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    functions = []
    for doc_comment, func_name, params, return_type in _GO_FUNC_PATTERN.findall(content):
        buckets = _split_doc_sections(_comment_lines(doc_comment))

        # Description and output are flattened to a single line each.
        desc_text = " ".join(line.strip() for line in buckets["description"] if line.strip())
        output_text = " ".join(line.strip() for line in buckets["output"] if line.strip())

        # Determine the section based on function name
        parts = func_name.split("_")
        section = parts[1] if len(parts) > 1 else "other"

        functions.append(
            {
                "name": func_name,
                "params": params.strip(),
                "return_type": return_type.strip(),
                "description": desc_text,
                "input_params": "\n".join(buckets["input"]).strip(),
                "output_params": output_text,
                "example": "\n".join(buckets["example"]).strip(),
                "section": section.lower(),  # Store the section for filtering
            }
        )

    return functions


def generate_rst_content(functions, section_filter=None):
    """Generate reStructuredText content from parsed functions.

    With a ``section_filter`` only functions of that section (compared
    case-insensitively) are documented, without section headings; if none
    match, a single "No functions found" line is returned. Without a filter
    the functions are grouped under ``<SECTION> Functions`` headings, ``gpu``
    first, then ``cpu``, then the remaining sections alphabetically.

    Returns a list of text lines.
    """
    lines = []

    if section_filter:
        section_filter = section_filter.lower()
        functions = [f for f in functions if f["section"] == section_filter]

        if not functions:
            lines.append(f"No functions found in section: {section_filter}")
            return lines

        # Filtered output carries no section headers.
        for func in functions:
            add_function_documentation(lines, func)
        return lines

    # Group functions by section, preserving their order within each group.
    function_groups = {}
    for func in functions:
        function_groups.setdefault(func["section"], []).append(func)

    leading = ("gpu", "cpu")
    section_order = [name for name in leading if name in function_groups]
    section_order.extend(sorted(name for name in function_groups if name not in leading))

    for section in section_order:
        heading = f"{section.upper()} Functions"
        lines.append(heading)
        lines.append("-" * len(heading))
        lines.append("")

        for func in function_groups[section]:
            add_function_documentation(lines, func)

    return lines


def add_function_documentation(lines, func):
    """Add documentation for a single function to the lines list.

    Appends, in order: the function name as a ``~``-underlined title, a Go
    code block with the signature, the description, the input-parameter
    lines, the output text and the example. ``lines`` is modified in place.
    """
    lines.append(func['name'])
    # The underline is four characters longer than the title.
    lines.append("~" * len(f"``{func['name']}``"))
    lines.append("")

    # Function signature
    return_type = func["return_type"]
    if return_type.startswith("(") and return_type.endswith(")"):
        return_type = return_type[1:-1]

    lines.append(".. code-block:: go")
    lines.append("")
    # NOTE(review): the indentation width inside the two code-block literals
    # below is reproduced as it appears in the reviewed text -- confirm it
    # against the original file.
    # rstrip() avoids a trailing space when the function has no return type.
    lines.append(f" func {func['name']}({func['params']}) {return_type}".rstrip())
    lines.append("")

    # Description
    if func["description"]:
        lines.append(func["description"])
        lines.append("")

    # Input parameters
    if func["input_params"]:
        lines.extend(func["input_params"].split("\n"))
        lines.append("")

    # Output parameters
    if func["output_params"]:
        lines.append(func["output_params"])
        lines.append("")

    # Example
    if func["example"]:
        in_code_block = False

        for i, line in enumerate(func["example"].split("\n")):
            stripped_line = line.strip()

            # Check if this is the Example: line
            if stripped_line == "Example:":
                lines.append("Example:")
                continue

            # The first line (after the first) that starts with a Go keyword
            # opens the code block; everything after it is treated as code.
            if not in_code_block and i > 0 and stripped_line.startswith(("import", "if", "for")):
                in_code_block = True
                lines.append("")
                lines.append(".. code-block:: go")
                lines.append("")

            if in_code_block:
                # For code blocks, add indentation
                lines.append(f" {line}")
            elif stripped_line:  # Only add non-empty lines outside code blocks
                lines.append(line)

        lines.append("")
+ """ + # Register the directive + app.add_directive("go-api-ref", GoApiRefDirective) + + return { + "version": "0.1.0", + "parallel_read_safe": True, + "parallel_write_safe": True, + } diff --git a/projects/amdsmi/docs/conceptual/ras.md b/projects/amdsmi/docs/conceptual/ras.md new file mode 100644 index 0000000000..bfde30683b --- /dev/null +++ b/projects/amdsmi/docs/conceptual/ras.md @@ -0,0 +1,95 @@ +--- +myst: + html_meta: + "description lang=en": "AMD SMI for reliability, availability, serviceability." + "keywords": "system, management, interface, cper, log, error, spec, ecc, afid, fault, ras" +--- + +# Reliability, availability, serviceability (RAS) + +RAS aims to increase the robustness of a system by detecting hardware errors, recording them, and +correcting them where possible. See [Reliability, availability, serviceability (Linux +kernel)](https://docs.kernel.org/admin-guide/RAS/main.html) for more general information. + +## ECC + +ECC (Error-Correcting Code) is a type of memory to automatically detect errors. Correctable 1-bit +errors are handled by the ECC logic and logged by the hardware. Uncorrectable 2-bit errors can be +detected but not reliably fixed; this is a more serious event that must be reported. See [RAS Error +Count sysfs Interface](https://docs.kernel.org/gpu/amdgpu/ras.html#ras-error-count-sysfs-interface) +to learn how AMD SMI accesses error counts. + +While ECC is a mechanism to handle different errors, CPER is the standard used to report that the event +occurred. + +## CPER + +At its core, CPER (Common Platform Error Record) is a standard format included in the [UEFI +specification](https://uefi.org/specs/UEFI/2.10/01_Introduction.html) to report errors to the +operating system. It works as a standard error report template that different hardware components +can fill out when something goes wrong. 
It consists of a header, one or more section descriptors -- +and for each descriptor, an associated section containing error or informational data. See [CPER +(UEFI Specification)](https://uefi.org/specs/UEFI/2.10/Apx_N_Common_Platform_Error_Record.html) for +more information. + +A CPER record consists of vital information for diagnostics such as: + +- Error source +- Error type +- Error severity + - 0 - Recoverable (also called non-fatal uncorrected) + - 1 - Fatal + - 2 - Corrected + - 3 - Informational +- Timestamp +- Other data + +A CPER record might contain an AFID in its data to help map a complex error to a more actionable service task. + +## AFID + +AFIDs (AMD Field IDs) are unique numerical IDs associated with specific events or errors produced by +AMD Instinct accelerators. An AFID provides a specific identifier for a known condition, which helps +facilitate root cause analysis. Each AFID is associated with category, type, and severity fields. See +[AFID Event List](https://docs.amd.com/r/en-US/AMD_Field_ID_70122_v1.0/AFID-Event-List) for more +information. + +## From concept to action + +AMD SMI provides tools to programmatically monitor and manage these RAS features. + +:::::{tab-set} +::::{tab-item} C/C++ +The AMD SMI library provides APIs to query ECC error counts and manage CPER records +(list, decode, and clear). + +See [ECC information](/doxygen/docBin/html/group__tagECCInfo) and [RAS +information](/doxygen/docBin/html/group__tagRasInfo) for available APIs.
+:::: + +::::{tab-item} Python +See related APIs: + +- [](/reference/amdsmi-py-api.md#amdsmi_get_gpu_ecc_count) +- [](/reference/amdsmi-py-api.md#amdsmi_get_gpu_ecc_enabled) +- [](/reference/amdsmi-py-api.md#amdsmi_get_gpu_ecc_status) +- [](/reference/amdsmi-py-api.md#amdsmi_get_gpu_total_ecc_count) +- [](/reference/amdsmi-py-api.md#amdsmi_get_gpu_cper_entries) +- [](/reference/amdsmi-py-api.md#amdsmi_get_afids_from_cper) +- [](/reference/amdsmi-py-api.md#amdsmi_get_gpu_ras_feature_info) +- [](/reference/amdsmi-py-api.md#amdsmi_get_gpu_ras_block_features_enabled) +:::: + +::::{tab-item} amd-smi CLI +See [`amd-smi ras --help`](/how-to/amdsmi-cli-tool.md#amd-smi-ras) for details and available options. +```shell +amd-smi ras --help +``` +:::: +::::: + +## Further reading + +- [AMD Field ID](https://docs.amd.com/r/en-US/AMD_Field_ID_70122_v1.0/Introduction) +- [CPER (UEFI specification)](https://uefi.org/specs/UEFI/2.10/Apx_N_Common_Platform_Error_Record.html) +- [Reliability, availability, serviceability (Linux kernel)](https://docs.kernel.org/admin-guide/RAS/main.html) diff --git a/projects/amdsmi/docs/conf.py b/projects/amdsmi/docs/conf.py new file mode 100644 index 0000000000..72dcc2c05e --- /dev/null +++ b/projects/amdsmi/docs/conf.py @@ -0,0 +1,85 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. 
def get_version_info(filepath):
    """Read the AMD SMI library version from a C header.

    Looks for one ``#define`` each of ``AMDSMI_LIB_VERSION_MAJOR``,
    ``AMDSMI_LIB_VERSION_MINOR`` and ``AMDSMI_LIB_VERSION_RELEASE`` followed
    by a decimal number. Each macro is looked up by name, so the order in
    which they appear in the header does not matter.

    Args:
        filepath: Path of the header file to scan.

    Returns:
        A ``(major, minor, release)`` tuple of digit strings.

    Raises:
        ValueError: If any of the three macros is missing or defined more
            than once.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()

    numbers = []
    for field in ("MAJOR", "MINOR", "RELEASE"):
        found = re.findall(
            rf"^#define\s+AMDSMI_LIB_VERSION_{field}\s+(\d+)\s*$",
            content,
            re.MULTILINE,
        )
        # Exactly one definition per macro; anything else is ambiguous.
        if len(found) != 1:
            raise ValueError("Couldn't find all VERSION numbers.")
        numbers.append(found[0])

    return tuple(numbers)
def generate_doxyfile(app, _):
    """Render ``Doxyfile`` from ``Doxyfile.in`` with the project version.

    Reads ``<confdir>/<doxygen_root>/Doxyfile.in``, replaces every
    ``@PROJECT_NUMBER@`` placeholder with ``version_number`` and writes the
    result to ``Doxyfile`` in the same directory.

    Args:
        app: The Sphinx application; only ``app.confdir`` is used.
        _: Second argument passed by the ``config-inited`` event; ignored.

    Raises:
        sphinx.errors.ConfigError: If ``Doxyfile.in`` does not exist.
    """
    doxygen_dir = Path(app.confdir) / doxygen_root
    doxyfile_in = doxygen_dir / "Doxyfile.in"
    doxyfile_out = doxygen_dir / "Doxyfile"

    if not doxyfile_in.exists():
        from sphinx.errors import ConfigError

        raise ConfigError(f"Missing Doxyfile.in at {doxyfile_in}")

    # The template declares DOXYFILE_ENCODING = UTF-8, so read and write it
    # as UTF-8 instead of whatever the build machine's locale happens to be.
    content = doxyfile_in.read_text(encoding="utf-8")
    doxyfile_out.write_text(
        content.replace("@PROJECT_NUMBER@", version_number), encoding="utf-8"
    )


def setup(app):
    """Hook ``generate_doxyfile`` into Sphinx's ``config-inited`` event."""
    app.connect("config-inited", generate_doxyfile, priority=100)
    return {"parallel_read_safe": True, "parallel_write_safe": True}
describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a double hash (##) is considered a comment and is placed in +# front of the TAG it is preceding. +# +# All text after a single hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists, items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (\" \"). +# +# Note: +# +# Use doxygen to compare the used configuration file with the template +# configuration file: +# doxygen -x [configFile] +# Use doxygen to compare the used configuration file with the template +# configuration file without replacing the environment variables or CMake type +# replacement variables: +# doxygen -x_noenv [configFile] + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the configuration +# file that follow. The default is UTF-8 which is also the encoding used for all +# text before the first occurrence of this tag. Doxygen uses libiconv (or the +# iconv built into libc) for the transcoding. See +# https://www.gnu.org/software/libiconv/ for the list of possible encodings. +# The default value is: UTF-8. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by +# double-quotes, unless you are using Doxywizard) that should identify the +# project for which the documentation is generated. This name is used in the +# title of most generated pages and in a few other places. +# The default value is: My Project. + +PROJECT_NAME = "AMD SMI" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. 
This +# could be handy for archiving the generated documentation or if some version +# control system is used. + +PROJECT_NUMBER = @PROJECT_NUMBER@ + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer a +# quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = "AMD SMI Library API Guide" + +# With the PROJECT_LOGO tag one can specify a logo or an icon that is included +# in the documentation. The maximum height of the logo should not exceed 55 +# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy +# the logo to the output directory. + +PROJECT_LOGO = + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path +# into which the generated documentation will be written. If a relative path is +# entered, it will be relative to the location where doxygen was started. If +# left blank the current directory will be used. + +OUTPUT_DIRECTORY = docBin + +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create up to 4096 +# sub-directories (in 2 levels) under the output directory of each output format +# and will distribute the generated files over these directories. Enabling this +# option can be useful when feeding doxygen a huge amount of source files, where +# putting all generated files in the same directory would otherwise causes +# performance problems for the file system. Adapt CREATE_SUBDIRS_LEVEL to +# control the number of sub-directories. +# The default value is: NO. + +CREATE_SUBDIRS = NO + +# Controls the number of sub-directories that will be created when +# CREATE_SUBDIRS tag is set to YES. Level 0 represents 16 directories, and every +# level increment doubles the number of directories, resulting in 4096 +# directories at level 8 which is the default and also the maximum value. 
The +# sub-directories are organized in 2 levels, the first level always has a fixed +# number of 16 directories. +# Minimum value: 0, maximum value: 8, default value: 8. +# This tag requires that the tag CREATE_SUBDIRS is set to YES. + +CREATE_SUBDIRS_LEVEL = 8 + +# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII +# characters to appear in the names of generated files. If set to NO, non-ASCII +# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode +# U+3044. +# The default value is: NO. + +ALLOW_UNICODE_NAMES = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. +# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Bulgarian, +# Catalan, Chinese, Chinese-Traditional, Croatian, Czech, Danish, Dutch, English +# (United States), Esperanto, Farsi (Persian), Finnish, French, German, Greek, +# Hindi, Hungarian, Indonesian, Italian, Japanese, Japanese-en (Japanese with +# English messages), Korean, Korean-en (Korean with English messages), Latvian, +# Lithuanian, Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, +# Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, +# Swedish, Turkish, Ukrainian and Vietnamese. +# The default value is: English. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member +# descriptions after the members that are listed in the file and class +# documentation (similar to Javadoc). Set to NO to disable this. +# The default value is: YES. 
+ +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief +# description of a member or function before the detailed description +# +# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. +# The default value is: YES. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator that is +# used to form the text in various listings. Each string in this list, if found +# as the leading text of the brief description, will be stripped from the text +# and the result, after processing the whole list, is used as the annotated +# text. Otherwise, the brief description is used as-is. If left blank, the +# following values are used ($name is automatically replaced with the name of +# the entity):The $name class, The $name widget, The $name file, is, provides, +# specifies, contains, represents, a, an and the. + +ABBREVIATE_BRIEF = + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# doxygen will generate a detailed section even if there is only a brief +# description. +# The default value is: NO. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. +# The default value is: NO. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path +# before files name in the file list and in the header files. If set to NO the +# shortest path that makes the file name unique will be used +# The default value is: YES. + +FULL_PATH_NAMES = NO + +# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. 
+# Stripping is only done if one of the specified strings matches the left-hand +# part of the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the path to +# strip. +# +# Note that you can specify absolute paths here, but also relative paths, which +# will be relative from the directory where doxygen is started. +# This tag requires that the tag FULL_PATH_NAMES is set to YES. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the +# path mentioned in the documentation of a class, which tells the reader which +# header file to include in order to use a class. If left blank only the name of +# the header file containing the class definition is used. Otherwise one should +# specify the list of include paths that are normally passed to the compiler +# using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but +# less readable) file names. This can be useful is your file systems doesn't +# support long names like on DOS, Mac, or CD-ROM. +# The default value is: NO. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the +# first line (until the first dot) of a Javadoc-style comment as the brief +# description. If set to NO, the Javadoc-style will behave just like regular Qt- +# style comments (thus requiring an explicit @brief command for a brief +# description.) +# The default value is: NO. + +JAVADOC_AUTOBRIEF = NO + +# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line +# such as +# /*************** +# as being the beginning of a Javadoc-style comment "banner". If set to NO, the +# Javadoc-style will behave just like regular comments and it will not be +# interpreted by doxygen. +# The default value is: NO. 
+ +JAVADOC_BANNER = NO + +# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first +# line (until the first dot) of a Qt-style comment as the brief description. If +# set to NO, the Qt-style will behave just like regular Qt-style comments (thus +# requiring an explicit \brief command for a brief description.) +# The default value is: NO. + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a +# multi-line C++ special comment block (i.e. a block of //! or /// comments) as +# a brief description. This used to be the default behavior. The new default is +# to treat a multi-line C++ comment block as a detailed description. Set this +# tag to YES if you prefer the old behavior instead. +# +# Note that setting this tag to YES also means that rational rose comments are +# not recognized any more. +# The default value is: NO. + +MULTILINE_CPP_IS_BRIEF = NO + +# By default Python docstrings are displayed as preformatted text and doxygen's +# special commands cannot be used. By setting PYTHON_DOCSTRING to NO the +# doxygen's special commands can be used and the contents of the docstring +# documentation blocks is shown as doxygen documentation. +# The default value is: YES. + +PYTHON_DOCSTRING = YES + +# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the +# documentation from any documented member that it re-implements. +# The default value is: YES. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new +# page for each member. If set to NO, the documentation of a member will be part +# of the file/class/namespace that contains it. +# The default value is: NO. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen +# uses this value to replace tabs by spaces in code fragments. +# Minimum value: 1, maximum value: 16, default value: 4. 
+ +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that act as commands in +# the documentation. An alias has the form: +# name=value +# For example adding +# "sideeffect=@par Side Effects:^^" +# will allow you to put the command \sideeffect (or @sideeffect) in the +# documentation, which will result in a user-defined paragraph with heading +# "Side Effects:". Note that you cannot put \n's in the value part of an alias +# to insert newlines (in the resulting output). You can put ^^ in the value part +# of an alias to insert a newline as if a physical newline was in the original +# file. When you need a literal { or } or , in the value part of an alias you +# have to escape them by means of a backslash (\), this can lead to conflicts +# with the commands \{ and \} for these it is advised to use the version @{ and +# @} or use a double escape (\\{ and \\}) + +ALIASES = "platform{1}=\xrefitem platform \"Platform\" \"Platforms\" \1" + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. For +# instance, some of the names that are used will be different. The list of all +# members will be omitted, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_FOR_C = YES + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or +# Python sources only. Doxygen will then generate output that is more tailored +# for that language. For instance, namespaces will be presented as packages, +# qualified scopes will look different, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources. Doxygen will then generate output that is tailored for Fortran. +# The default value is: NO. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. 
Doxygen will then generate output that is tailored for VHDL. +# The default value is: NO. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice +# sources only. Doxygen will then generate output that is more tailored for that +# language. For instance, namespaces will be presented as modules, types will be +# separated into more groups, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_SLICE = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given +# extension. Doxygen has a built-in mapping, but you can override or extend it +# using this tag. The format is ext=language, where ext is a file extension, and +# language is one of the parsers supported by doxygen: IDL, Java, JavaScript, +# Csharp (C#), C, C++, Lex, D, PHP, md (Markdown), Objective-C, Python, Slice, +# VHDL, Fortran (fixed format Fortran: FortranFixed, free formatted Fortran: +# FortranFree, unknown formatted Fortran: Fortran. In the later case the parser +# tries to guess whether the code is fixed or free formatted code, this is the +# default for Fortran type files). For instance to make doxygen treat .inc files +# as Fortran files (default is PHP), and .f files as C (default is Fortran), +# use: inc=Fortran f=C. +# +# Note: For files without extension you can use no_extension as a placeholder. +# +# Note that for custom extensions you also need to set FILE_PATTERNS otherwise +# the files are not read by doxygen. When specifying no_extension you should add +# * to the FILE_PATTERNS. +# +# Note see also the list of default file extension mappings. + +EXTENSION_MAPPING = + +# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments +# according to the Markdown format, which allows for more readable +# documentation. See https://daringfireball.net/projects/markdown/ for details. 
+# The output of markdown processing is further processed by doxygen, so you can +# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in +# case of backward compatibilities issues. +# The default value is: YES. + +MARKDOWN_SUPPORT = YES + +# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up +# to that level are automatically included in the table of contents, even if +# they do not have an id attribute. +# Note: This feature currently applies only to Markdown headings. +# Minimum value: 0, maximum value: 99, default value: 5. +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. + +TOC_INCLUDE_HEADINGS = 5 + +# The MARKDOWN_ID_STYLE tag can be used to specify the algorithm used to +# generate identifiers for the Markdown headings. Note: Every identifier is +# unique. +# Possible values are: DOXYGEN use a fixed 'autotoc_md' string followed by a +# sequence number starting at 0 and GITHUB use the lower case version of title +# with any whitespace replaced by '-' and punctuation characters removed. +# The default value is: DOXYGEN. +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. + +MARKDOWN_ID_STYLE = DOXYGEN + +# When enabled doxygen tries to link words that correspond to documented +# classes, or namespaces to their corresponding documentation. Such a link can +# be prevented in individual cases by putting a % sign in front of the word or +# globally by setting AUTOLINK_SUPPORT to NO. +# The default value is: YES. + +AUTOLINK_SUPPORT = YES + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want +# to include (a tag file for) the STL sources as input, then you should set this +# tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); +# versus func(std::string) {}). 
This also make the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. +# The default value is: NO. + +BUILTIN_STL_SUPPORT = NO + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. +# The default value is: NO. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: +# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen +# will parse them like normal C++ but will assume all classes use public instead +# of private inheritance when no explicit protection keyword is present. +# The default value is: NO. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate +# getter and setter methods for a property. Setting this option to YES will make +# doxygen to replace the get and set methods by a property in the documentation. +# This will only work if the methods are indeed getting or setting a simple +# type. If this is not the case, or you want to show the methods anyway, you +# should set this option to NO. +# The default value is: YES. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. +# The default value is: NO. + +DISTRIBUTE_GROUP_DOC = NO + +# If one adds a struct or class to a group and this option is enabled, then also +# any nested class or struct is added to the same group. By default this option +# is disabled and one has to add nested compounds explicitly via \ingroup. +# The default value is: NO. 
+ +GROUP_NESTED_COMPOUNDS = NO + +# Set the SUBGROUPING tag to YES to allow class member groups of the same type +# (for instance a group of public functions) to be put as a subgroup of that +# type (e.g. under the Public Functions section). Set it to NO to prevent +# subgrouping. Alternatively, this can be done per class using the +# \nosubgrouping command. +# The default value is: YES. + +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions +# are shown inside the group in which they are included (e.g. using \ingroup) +# instead of on a separate page (for HTML and Man pages) or section (for LaTeX +# and RTF). +# +# Note that this feature does not work in combination with +# SEPARATE_MEMBER_PAGES. +# The default value is: NO. + +INLINE_GROUPED_CLASSES = NO + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions +# with only public data fields or simple typedef fields will be shown inline in +# the documentation of the scope in which they are defined (i.e. file, +# namespace, or group documentation), provided this scope is documented. If set +# to NO, structs, classes, and unions are shown on a separate page (for HTML and +# Man pages) or section (for LaTeX and RTF). +# The default value is: NO. + +INLINE_SIMPLE_STRUCTS = NO + +# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or +# enum is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically be +# useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. +# The default value is: NO. 
+ +TYPEDEF_HIDES_STRUCT = NO + +# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This +# cache is used to resolve symbols given their name and scope. Since this can be +# an expensive process and often the same symbol appears multiple times in the +# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small +# doxygen will become slower. If the cache is too large, memory is wasted. The +# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range +# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 +# symbols. At the end of a run doxygen will report the cache usage and suggest +# the optimal cache size from a speed point of view. +# Minimum value: 0, maximum value: 9, default value: 0. + +LOOKUP_CACHE_SIZE = 0 + +# The NUM_PROC_THREADS specifies the number of threads doxygen is allowed to use +# during processing. When set to 0 doxygen will based this on the number of +# cores available in the system. You can set it explicitly to a value larger +# than 0 to get more control over the balance between CPU load and processing +# speed. At this moment only the input processing can be done using multiple +# threads. Since this is still an experimental feature the default is set to 1, +# which effectively disables parallel processing. Please report any issues you +# encounter. Generating dot graphs in parallel is controlled by the +# DOT_NUM_THREADS setting. +# Minimum value: 0, maximum value: 32, default value: 1. + +NUM_PROC_THREADS = 1 + +# If the TIMESTAMP tag is set different from NO then each generated page will +# contain the date or date and time when the page was generated. Setting this to +# NO can help when comparing the output of multiple runs. +# Possible values are: YES, NO, DATETIME and DATE. +# The default value is: NO. 
+ +TIMESTAMP = NO + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in +# documentation are documented, even if no documentation was available. Private +# class members and static file members will be hidden unless the +# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. +# Note: This will also disable the warnings about undocumented members that are +# normally produced when WARNINGS is set to YES. +# The default value is: NO. + +EXTRACT_ALL = NO + +# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will +# be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIVATE = NO + +# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual +# methods of a class will be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIV_VIRTUAL = NO + +# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal +# scope will be included in the documentation. +# The default value is: NO. + +EXTRACT_PACKAGE = NO + +# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be +# included in the documentation. +# The default value is: NO. + +EXTRACT_STATIC = NO + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined +# locally in source files will be included in the documentation. If set to NO, +# only classes defined in header files are included. Does not have any effect +# for Java sources. +# The default value is: YES. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. If set to YES, local methods, +# which are defined in the implementation section but not in the interface are +# included in the documentation. If set to NO, only methods in the interface are +# included. 
+# The default value is: NO. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base name of +# the file that contains the anonymous namespace. By default anonymous namespace +# are hidden. +# The default value is: NO. + +EXTRACT_ANON_NSPACES = NO + +# If this flag is set to YES, the name of an unnamed parameter in a declaration +# will be determined by the corresponding definition. By default unnamed +# parameters remain unnamed in the output. +# The default value is: YES. + +RESOLVE_UNNAMED_PARAMS = YES + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all +# undocumented members inside documented classes or files. If set to NO these +# members will be included in the various overviews, but no documentation +# section is generated. This option has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. If set +# to NO, these classes will be included in the various overviews. This option +# will also hide undocumented C++ concepts if enabled. This option has no effect +# if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend +# declarations. If set to NO, these declarations will be included in the +# documentation. +# The default value is: NO. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any +# documentation blocks found inside the body of a function. If set to NO, these +# blocks will be appended to the function's detailed documentation block. +# The default value is: NO. 
+ +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation that is typed after a +# \internal command is included. If the tag is set to NO then the documentation +# will be excluded. Set it to YES to include the internal documentation. +# The default value is: NO. + +INTERNAL_DOCS = NO + +# With the correct setting of option CASE_SENSE_NAMES doxygen will better be +# able to match the capabilities of the underlying filesystem. In case the +# filesystem is case sensitive (i.e. it supports files in the same directory +# whose names only differ in casing), the option must be set to YES to properly +# deal with such files in case they appear in the input. For filesystems that +# are not case sensitive the option should be set to NO to properly deal with +# output files written for symbols that only differ in casing, such as for two +# classes, one named CLASS and the other named Class, and to also support +# references to files without having to specify the exact matching casing. On +# Windows (including Cygwin) and MacOS, users should typically set this option +# to NO, whereas on Linux or other Unix flavors it should typically be set to +# YES. +# Possible values are: SYSTEM, NO and YES. +# The default value is: SYSTEM. + +CASE_SENSE_NAMES = YES + +# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with +# their full class and namespace scopes in the documentation. If set to YES, the +# scope will be hidden. +# The default value is: NO. + +HIDE_SCOPE_NAMES = NO + +# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will +# append additional text to a page's title, such as Class Reference. If set to +# YES the compound reference will be hidden. +# The default value is: NO. + +HIDE_COMPOUND_REFERENCE= NO + +# If the SHOW_HEADERFILE tag is set to YES then the documentation for a class +# will show which file needs to be included to use the class. +# The default value is: YES. 
+ +SHOW_HEADERFILE = YES + +# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of +# the files that are included by a file in the documentation of that file. +# The default value is: YES. + +SHOW_INCLUDE_FILES = YES + +# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each +# grouped member an include statement to the documentation, telling the reader +# which file to include in order to use the member. +# The default value is: NO. + +SHOW_GROUPED_MEMB_INC = NO + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include +# files with double quotes in the documentation rather than with sharp brackets. +# The default value is: NO. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the +# documentation for inline members. +# The default value is: YES. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the +# (detailed) documentation of file and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. +# The default value is: YES. + +SORT_MEMBER_DOCS = NO + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief +# descriptions of file, namespace and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. Note that +# this will also influence the order of the classes in the class list. +# The default value is: NO. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the +# (brief and detailed) documentation of class members so that constructors and +# destructors are listed first. If set to NO the constructors will appear in the +# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. +# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief +# member documentation. 
+# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting +# detailed member documentation. +# The default value is: NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy +# of group names into alphabetical order. If set to NO the group names will +# appear in their defined order. +# The default value is: NO. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by +# fully-qualified names, including namespaces. If set to NO, the class list will +# be sorted only by class name, not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the alphabetical +# list. +# The default value is: NO. + +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper +# type resolution of all parameters of a function it will reject a match between +# the prototype and the implementation of a member function even if there is +# only one candidate or it is obvious which candidate to choose by doing a +# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still +# accept a match between prototype and implementation in such cases. +# The default value is: NO. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo +# list. This list is created by putting \todo commands in the documentation. +# The default value is: YES. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test +# list. This list is created by putting \test commands in the documentation. +# The default value is: YES. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug +# list. This list is created by putting \bug commands in the documentation. 
+# The default value is: YES. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) +# the deprecated list. This list is created by putting \deprecated commands in +# the documentation. +# The default value is: YES. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional documentation +# sections, marked by \if <section_label> ... \endif and \cond <section_label> +# ... \endcond blocks. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the +# initial value of a variable or macro / define can have for it to appear in the +# documentation. If the initializer consists of more lines than specified here +# it will be hidden. Use a value of 0 to hide initializers completely. The +# appearance of the value of individual variables and macros / defines can be +# controlled using \showinitializer or \hideinitializer command in the +# documentation regardless of this setting. +# Minimum value: 0, maximum value: 10000, default value: 30. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at +# the bottom of the documentation of classes and structs. If set to YES, the +# list will mention the files that were used to generate the documentation. +# The default value is: YES. + +SHOW_USED_FILES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This +# will remove the Files entry from the Quick Index and from the Folder Tree View +# (if specified). +# The default value is: YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces +# page. This will remove the Namespaces entry from the Quick Index and from the +# Folder Tree View (if specified). +# The default value is: YES.
+ +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command command input-file, where command is the value of the +# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided +# by doxygen. Whatever the program writes to standard output is used as the file +# version. For an example see the documentation. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. The layout file controls the global structure of the generated +# output files in an output format independent way. To create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. You can +# optionally specify a file name after the option, if omitted DoxygenLayout.xml +# will be used as the name of the layout file. See also section "Changing the +# layout of pages" for information. +# +# Note that if you run doxygen from a directory containing a file called +# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE +# tag is left empty. + +LAYOUT_FILE = + +# The CITE_BIB_FILES tag can be used to specify one or more bib files containing +# the reference definitions. This must be a list of .bib files. The .bib +# extension is automatically appended if omitted. This requires the bibtex tool +# to be installed. +# For LaTeX the style of the bibliography can be controlled using +# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the +# search path. See also \cite for info how to create references. 
+ +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# Configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated to +# standard output by doxygen. If QUIET is set to YES this implies that the +# messages are off. +# The default value is: NO. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES +# this implies that the warnings are on. +# +# Tip: Turn warnings on while writing the documentation. +# The default value is: YES. + +WARNINGS = YES + +# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate +# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: YES. + +WARN_IF_UNDOCUMENTED = YES + +# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as documenting some parameters in +# a documented function twice, or documenting parameters that don't exist or +# using markup commands wrongly. +# The default value is: YES. + +WARN_IF_DOC_ERROR = YES + +# If WARN_IF_INCOMPLETE_DOC is set to YES, doxygen will warn about incomplete +# function parameter documentation. If set to NO, doxygen will accept that some +# parameters have no documentation without warning. +# The default value is: YES. + +WARN_IF_INCOMPLETE_DOC = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that +# are documented, but have no documentation for their parameters or return +# value. If set to NO, doxygen will only warn about wrong parameter +# documentation, but not about the absence of documentation. 
If EXTRACT_ALL is +# set to YES then this flag will automatically be disabled. See also +# WARN_IF_INCOMPLETE_DOC +# The default value is: NO. + +WARN_NO_PARAMDOC = NO + +# If WARN_IF_UNDOC_ENUM_VAL option is set to YES, doxygen will warn about +# undocumented enumeration values. If set to NO, doxygen will accept +# undocumented enumeration values. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: NO. + +WARN_IF_UNDOC_ENUM_VAL = NO + +# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when +# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS +# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but +# at the end of the doxygen process doxygen will return with a non-zero status. +# If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS_PRINT then doxygen behaves +# like FAIL_ON_WARNINGS but in case no WARN_LOGFILE is defined doxygen will not +# write the warning messages in between other messages but write them at the end +# of a run, in case a WARN_LOGFILE is defined the warning messages will be +# besides being in the defined file also be shown at the end of a run, unless +# the WARN_LOGFILE is defined as - i.e. standard output (stdout) in that case +# the behavior will remain as with the setting FAIL_ON_WARNINGS. +# Possible values are: NO, YES, FAIL_ON_WARNINGS and FAIL_ON_WARNINGS_PRINT. +# The default value is: NO. + +WARN_AS_ERROR = NO + +# The WARN_FORMAT tag determines the format of the warning messages that doxygen +# can produce. The string should contain the $file, $line, and $text tags, which +# will be replaced by the file and line number from which the warning originated +# and the warning text. Optionally the format may contain $version, which will +# be replaced by the version of the file (if it could be obtained via +# FILE_VERSION_FILTER) +# See also: WARN_LINE_FORMAT +# The default value is: $file:$line: $text. 
+ +WARN_FORMAT = "$file:$line: $text" + +# In the $text part of the WARN_FORMAT command it is possible that a reference +# to a more specific place is given. To make it easier to jump to this place +# (outside of doxygen) the user can define a custom "cut" / "paste" string. +# Example: +# WARN_LINE_FORMAT = "'vi $file +$line'" +# See also: WARN_FORMAT +# The default value is: at line $line of file $file. + +WARN_LINE_FORMAT = "at line $line of file $file" + +# The WARN_LOGFILE tag can be used to specify a file to which warning and error +# messages should be written. If left blank the output is written to standard +# error (stderr). In case the file specified cannot be opened for writing the +# warning and error messages are written to standard error. When as file - is +# specified the warning and error messages are written to standard output +# (stdout). + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# Configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag is used to specify the files and/or directories that contain +# documented source files. You may enter file names like myfile.cpp or +# directories like /usr/src/myproject. Separate the files or directories with +# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING +# Note: If this tag is empty the current directory is searched. + +INPUT = ../reference/index.rst \ + ../../include/amd_smi/amdsmi.h + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses +# libiconv (or the iconv built into libc) for the transcoding. See the libiconv +# documentation (see: +# https://www.gnu.org/software/libiconv/) for the list of possible encodings. +# See also: INPUT_FILE_ENCODING +# The default value is: UTF-8. 
+ +INPUT_ENCODING = UTF-8 + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. The INPUT_FILE_ENCODING tag can be used to specify +# character encoding on a per file pattern basis. Doxygen will compare the file +# name with each pattern and apply the encoding (instead of the default +# INPUT_ENCODING) if there is a match. The character encodings are a list of the +# form: pattern=encoding (like *.php=ISO-8859-1). See INPUT_ENCODING for +# further information on supported encodings. + +INPUT_FILE_ENCODING = + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and +# *.h) to filter out the source-files in the directories. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# read by doxygen. +# +# Note the list of default checked file patterns might differ from the list of +# default file extension mappings. +# +# If left blank the following patterns are tested: *.c, *.cc, *.cxx, *.cxxm, +# *.cpp, *.cppm, *.c++, *.c++m, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, +# *.ddl, *.odl, *.h, *.hh, *.hxx, *.hpp, *.h++, *.ixx, *.l, *.cs, *.d, *.php, +# *.php4, *.php5, *.phtml, *.inc, *.m, *.markdown, *.md, *.mm, *.dox (to be +# provided as doxygen C comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, +# *.f18, *.f, *.for, *.vhd, *.vhdl, *.ucf, *.qsf and *.ice. + +FILE_PATTERNS = + +# The RECURSIVE tag can be used to specify whether or not subdirectories should +# be searched for input files as well. +# The default value is: NO. + +RECURSIVE = NO + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag.
+# +# Note that relative paths are relative to the directory from which doxygen is +# run. + +EXCLUDE = + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. +# The default value is: NO. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. Examples: ANamespace, AClass, +# ANamespace::AClass, ANamespace::*Test + +EXCLUDE_SYMBOLS = + +# The EXAMPLE_PATH tag can be used to specify one or more files or directories +# that contain example code fragments that are included (see the \include +# command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and +# *.h) to filter out the source-files in the directories. If left blank all +# files are included. + +EXAMPLE_PATTERNS = + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude commands +# irrespective of the value of the RECURSIVE tag. +# The default value is: NO. + +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or directories +# that contain images that are to be included in the documentation (see the +# \image command). 
+ +IMAGE_PATH = + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command: +# +# <filter> <input-file> +# +# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the +# name of an input file. Doxygen will then use the output that the filter +# program writes to standard output. If FILTER_PATTERNS is specified, this tag +# will be ignored. +# +# Note that the filter must not add or remove lines; it is applied before the +# code is scanned, but not when the output code is generated. If lines are added +# or removed, the anchors will not be placed correctly. +# +# Note that doxygen will use the data processed and written to standard output +# for further processing, therefore nothing else, like debug statements or used +# commands (so in case of a Windows batch file always use @echo OFF), should be +# written to standard output. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. The filters are a list of the form: pattern=filter +# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how +# filters are used. If the FILTER_PATTERNS tag is empty or if none of the +# patterns match the file name, INPUT_FILTER is applied. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen.
+ +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will also be used to filter the input files that are used for +# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). +# The default value is: NO. + +FILTER_SOURCE_FILES = NO + +# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file +# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and +# it is also possible to disable source filtering for a specific pattern using +# *.ext= (so without naming a filter). +# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. + +FILTER_SOURCE_PATTERNS = + +# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that +# is part of the input, its contents will be placed on the main page +# (index.html). This can be useful if you have a project on for instance GitHub +# and want to reuse the introduction page also for the doxygen output. + +USE_MDFILE_AS_MAINPAGE = + +# The Fortran standard specifies that for fixed formatted Fortran code all +# characters from position 72 are to be considered as comment. A common +# extension is to allow longer lines before the automatic comment starts. The +# setting FORTRAN_COMMENT_AFTER will also make it possible that longer lines can +# be processed before the automatic comment starts. +# Minimum value: 7, maximum value: 10000, default value: 72. + +FORTRAN_COMMENT_AFTER = 72 + +#--------------------------------------------------------------------------- +# Configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will be +# generated. Documented entities will be cross-referenced with these sources. +# +# Note: To get rid of all source code in the generated output, make sure that +# also VERBATIM_HEADERS is set to NO. 
+# The default value is: NO. + +SOURCE_BROWSER = YES + +# Setting the INLINE_SOURCES tag to YES will include the body of functions, +# classes and enums directly into the documentation. +# The default value is: NO. + +INLINE_SOURCES = YES + +# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any +# special comment blocks from generated source code fragments. Normal C, C++ and +# Fortran comments will always remain visible. +# The default value is: YES. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES then for each documented +# entity all documented functions referencing it will be listed. +# The default value is: NO. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES then for each documented function +# all documented entities called/used by that function will be listed. +# The default value is: NO. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set +# to YES then the hyperlinks from functions in REFERENCES_RELATION and +# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will +# link to the documentation. +# The default value is: YES. + +REFERENCES_LINK_SOURCE = YES + +# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the +# source code will show a tooltip with additional information such as prototype, +# brief description and links to the definition and documentation. Since this +# will make the HTML file larger and loading of large files a bit slower, you +# can opt to disable this feature. +# The default value is: YES. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +SOURCE_TOOLTIPS = YES + +# If the USE_HTAGS tag is set to YES then the references to source code will +# point to the HTML generated by the htags(1) tool instead of doxygen built-in +# source browser. 
The htags tool is part of GNU's global source tagging system +# (see https://www.gnu.org/software/global/global.html). You will need version +# 4.8.6 or higher. +# +# To use it do the following: +# - Install the latest version of global +# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file +# - Make sure the INPUT points to the root of the source tree +# - Run doxygen as normal +# +# Doxygen will invoke htags (and that will in turn invoke gtags), so these +# tools must be available from the command line (i.e. in the search path). +# +# The result: instead of the source browser generated by doxygen, the links to +# source code will now point to the output of htags. +# The default value is: NO. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a +# verbatim copy of the header file for each class for which an include is +# specified. Set to NO to disable this. +# See also: Section \class. +# The default value is: YES. + +VERBATIM_HEADERS = YES + +# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the +# clang parser (see: +# http://clang.llvm.org/) for more accurate parsing at the cost of reduced +# performance. This can be particularly helpful with template rich C++ code for +# which doxygen's built-in parser lacks the necessary type information. +# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse_libclang=ON option for CMake. +# The default value is: NO. + +CLANG_ASSISTED_PARSING = NO + +# If the CLANG_ASSISTED_PARSING tag is set to YES and the CLANG_ADD_INC_PATHS +# tag is set to YES then doxygen will add the directory of each input to the +# include path. +# The default value is: YES. +# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
+ +CLANG_ADD_INC_PATHS = YES + +# If clang assisted parsing is enabled you can provide the compiler with command +# line options that you would normally use when invoking the compiler. Note that +# the include paths will already be set by doxygen for the files and directories +# specified with INPUT and INCLUDE_PATH. +# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. + +CLANG_OPTIONS = + +# If clang assisted parsing is enabled you can provide the clang parser with the +# path to the directory containing a file called compile_commands.json. This +# file is the compilation database (see: +# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) containing the +# options used when the source files were built. This is equivalent to +# specifying the -p option to a clang tool, such as clang-check. These options +# will then be passed to the parser. Any options specified with CLANG_OPTIONS +# will be added as well. +# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse_libclang=ON option for CMake. + +CLANG_DATABASE_PATH = + +#--------------------------------------------------------------------------- +# Configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all +# compounds will be generated. Enable this if the project contains a lot of +# classes, structs, unions or interfaces. +# The default value is: YES. + +ALPHABETICAL_INDEX = YES + +# The IGNORE_PREFIX tag can be used to specify a prefix (or a list of prefixes) +# that should be ignored while generating the index headers. The IGNORE_PREFIX +# tag works for classes, function and member names. The entity will be placed in +# the alphabetical list under the first letter of the entity name that remains +# after removing the prefix. 
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output +# The default value is: YES. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a +# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of +# it. +# The default directory is: html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each +# generated HTML page (for example: .htm, .php, .asp). +# The default value is: .html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a user-defined HTML header file for +# each generated HTML page. If the tag is left blank doxygen will generate a +# standard header. +# +# To get valid HTML the header file that includes any scripts and style sheets +# that doxygen needs, which is dependent on the configuration options used (e.g. +# the setting GENERATE_TREEVIEW). It is highly recommended to start with a +# default header using +# doxygen -w html new_header.html new_footer.html new_stylesheet.css +# YourConfigFile +# and then modify the file new_header.html. See also section "Doxygen usage" +# for information on how to generate the default header that doxygen normally +# uses. +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. For a description +# of the possible markers and block names see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +HTML_HEADER = ../_doxygen/header.html + +# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each +# generated HTML page. If the tag is left blank doxygen will generate a standard +# footer. See HTML_HEADER for more information on how to generate a default +# footer and what special commands can be used inside the footer. See also +# section "Doxygen usage" for information on how to generate the default footer +# that doxygen normally uses. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FOOTER = ../_doxygen/footer.html + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style +# sheet that is used by each HTML page. It can be used to fine-tune the look of +# the HTML output. If left blank doxygen will generate a default style sheet. +# See also section "Doxygen usage" for information on how to generate the style +# sheet that doxygen normally uses. +# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as +# it is more robust and this tag (HTML_STYLESHEET) will in the future become +# obsolete. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_STYLESHEET = ../_doxygen/stylesheet.css + +# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined +# cascading style sheets that are included after the standard style sheets +# created by doxygen. Using this option one can overrule certain style aspects. +# This is preferred over using HTML_STYLESHEET since it does not replace the +# standard style sheet and is therefore more robust against future updates. +# Doxygen will copy the style sheet files to the output directory. +# Note: The order of the extra style sheet files is of importance (e.g. the last +# style sheet in the list overrules the setting of the previous ones in the +# list). 
+# Note: Since the styling of scrollbars can currently not be overruled in +# Webkit/Chromium, the styling will be left out of the default doxygen.css if +# one or more extra stylesheets have been specified. So if scrollbar +# customization is desired it has to be added explicitly. For an example see the +# documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_STYLESHEET = ../_doxygen/extra_stylesheet.css + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that the +# files will be copied as-is; there are no commands or markers available. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_FILES = + +# The HTML_COLORSTYLE tag can be used to specify if the generated HTML output +# should be rendered with a dark or light theme. +# Possible values are: LIGHT always generate light mode output, DARK always +# generate dark mode output, AUTO_LIGHT automatically set the mode according to +# the user preference, use light mode if no preference is set (the default), +# AUTO_DARK automatically set the mode according to the user preference, use +# dark mode if no preference is set and TOGGLE allow the user to switch between +# light and dark mode via a button. +# The default value is: AUTO_LIGHT. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE = AUTO_LIGHT + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen +# will adjust the colors in the style sheet and background images according to +# this color.
Hue is specified as an angle on a color-wheel. +# For instance the value +# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 +# purple, and 360 is red again. +# Minimum value: 0, maximum value: 359, default value: 220. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors +# in the HTML output. For a value of 0 the output will use gray-scales only. A +# value of 255 will produce the most vivid colors. +# Minimum value: 0, maximum value: 255, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_SAT = 100 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the +# luminance component of the colors in the HTML output. Values below 100 +# gradually make the output lighter, whereas values above 100 make the output +# darker. The value divided by 100 is the actual gamma applied, so 80 represents +# a gamma of 0.8, the value 220 represents a gamma of 2.2, and 100 does not +# change the gamma. +# Minimum value: 40, maximum value: 240, default value: 80. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML +# documentation will contain a main index with vertical navigation menus that +# are dynamically created via JavaScript. If disabled, the navigation index will +# consist of multiple levels of tabs that are statically embedded in every HTML +# page. Disable this option to support browsers that do not have JavaScript, +# like the Qt help browser. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_MENUS = YES + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded.
+# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_SECTIONS = NO + +# If the HTML_CODE_FOLDING tag is set to YES then classes and functions can be +# dynamically folded and expanded in the generated HTML source code. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_CODE_FOLDING = YES + +# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries +# shown in the various tree structured indices initially; the user can expand +# and collapse entries dynamically later on. Doxygen will expand the tree to +# such a level that at most the specified number of entries are visible (unless +# a fully collapsed tree already exceeds this amount). So setting the number of +# entries to 1 will produce a fully collapsed tree by default. 0 is a special value +# representing an infinite number of entries and will result in a fully expanded +# tree by default. +# Minimum value: 0, maximum value: 9999, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_INDEX_NUM_ENTRIES = 100 + +# If the GENERATE_DOCSET tag is set to YES, additional index files will be +# generated that can be used as input for Apple's Xcode 3 integrated development +# environment (see: +# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To +# create a documentation set, doxygen will generate a Makefile in the HTML +# output directory. Running make will produce the docset in that directory and +# running make install will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at +# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy +# genXcode/_index.html for more information. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_DOCSET = NO + +# This tag determines the name of the docset feed.
A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# The default value is: Doxygen generated docs. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDNAME = "Doxygen generated docs" + +# This tag determines the URL of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDURL = + +# This tag specifies a string that should uniquely identify the documentation +# set bundle. This should be a reverse domain-name style string, e.g. +# com.mycompany.MyDocSet. Doxygen will append .docset to the name. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. +# The default value is: org.doxygen.Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_ID = org.doxygen.Publisher + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. +# The default value is: Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_NAME = Publisher + +# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three +# additional HTML index files: index.hhp, index.hhc, and index.hhk. The +# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop +# on Windows. In the beginning of 2021 Microsoft took the original page, with +# a.o. 
the download links, offline (the HTML help workshop was already many years +# in maintenance mode). You can download the HTML help workshop from the web +# archives at Installation executable (see: +# http://web.archive.org/web/20160201063255/http://download.microsoft.com/downlo +# ad/0/A/9/0A939EF6-E31C-430F-A3DF-DFAE7960D564/htmlhelp.exe). +# +# The HTML Help Workshop contains a compiler that can convert all HTML output +# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML +# files are now used as the Windows 98 help format, and will replace the old +# Windows help format (.hlp) on all Windows platforms in the future. Compressed +# HTML files also contain an index, a table of contents, and you can search for +# words in the documentation. The HTML workshop also contains a viewer for +# compressed HTML files. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_HTMLHELP = NO + +# The CHM_FILE tag can be used to specify the file name of the resulting .chm +# file. You can add a path in front of the file if the result should not be +# written to the html output directory. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_FILE = + +# The HHC_LOCATION tag can be used to specify the location (absolute path +# including file name) of the HTML help compiler (hhc.exe). If non-empty, +# doxygen will try to run the HTML help compiler on the generated index.hhp. +# The file has to be specified with full path. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +HHC_LOCATION = + +# The GENERATE_CHI flag controls if a separate .chi index file is generated +# (YES) or that it should be included in the main .chm file (NO). +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +GENERATE_CHI = NO + +# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) +# and project file content.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_INDEX_ENCODING = + +# The BINARY_TOC flag controls whether a binary table of contents is generated +# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it +# enables the Previous and Next buttons. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members to +# the table of contents of the HTML help documentation and to the tree view. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +TOC_EXPAND = NO + +# The SITEMAP_URL tag is used to specify the full URL of the place where the +# generated documentation will be placed on the server by the user during the +# deployment of the documentation. The generated sitemap is called sitemap.xml +# and placed on the directory specified by HTML_OUTPUT. In case no SITEMAP_URL +# is specified no sitemap is generated. For information about the sitemap +# protocol see https://www.sitemaps.org +# This tag requires that the tag GENERATE_HTML is set to YES. + +SITEMAP_URL = + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that +# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help +# (.qch) of the generated HTML documentation. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify +# the file name of the resulting .qch file. The path specified is relative to +# the HTML output folder. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help +# Project output. 
For more information please see Qt Help Project / Namespace +# (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt +# Help Project output. For more information please see Qt Help Project / Virtual +# Folders (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders). +# The default value is: doc. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_VIRTUAL_FOLDER = doc + +# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom +# filter to add. For more information please see Qt Help Project / Custom +# Filters (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see Qt Help Project / Custom +# Filters (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's filter section matches. Qt Help Project / Filter Attributes (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_SECT_FILTER_ATTRS = + +# The QHG_LOCATION tag can be used to specify the location (absolute path +# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to +# run qhelpgenerator on the generated .qhp file. +# This tag requires that the tag GENERATE_QHP is set to YES. 
+ +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be +# generated, together with the HTML files, they form an Eclipse help plugin. To +# install this plugin and make it available under the help contents menu in +# Eclipse, the contents of the directory containing the HTML and XML files needs +# to be copied into the plugins directory of eclipse. The name of the directory +# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. +# After copying Eclipse needs to be restarted before the help appears. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the Eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have this +# name. Each documentation set should have its own identifier. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. + +ECLIPSE_DOC_ID = org.doxygen.Project + +# If you want full control over the layout of the generated HTML pages it might +# be necessary to disable the index and replace it with your own. The +# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top +# of each HTML page. A value of NO enables the index and the value YES disables +# it. Since the tabs in the index contain the same information as the navigation +# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +DISABLE_INDEX = NO + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. If the tag +# value is set to YES, a side panel will be generated containing a tree-like +# index structure (just like the one that is generated for HTML Help). 
For this +# to work a browser that supports JavaScript, DHTML, CSS and frames is required +# (i.e. any modern browser). Windows users are probably better off using the +# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can +# further fine tune the look of the index (see "Fine-tuning the output"). As an +# example, the default style sheet generated by doxygen has an example that +# shows how to put an image at the root of the tree instead of the PROJECT_NAME. +# Since the tree basically has the same information as the tab index, you could +# consider setting DISABLE_INDEX to YES when enabling this option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_TREEVIEW = NO + +# When both GENERATE_TREEVIEW and DISABLE_INDEX are set to YES, then the +# FULL_SIDEBAR option determines if the side bar is limited to only the treeview +# area (value NO) or if it should extend to the full height of the window (value +# YES). Setting this to YES gives a layout similar to +# https://docs.readthedocs.io with more room for contents, but less room for the +# project logo, title, and description. If either GENERATE_TREEVIEW or +# DISABLE_INDEX is set to NO, this option has no effect. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FULL_SIDEBAR = NO + +# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that +# doxygen will group on one line in the generated HTML documentation. +# +# Note that a value of 0 will completely suppress the enum values from appearing +# in the overview section. +# Minimum value: 0, maximum value: 20, default value: 4. +# This tag requires that the tag GENERATE_HTML is set to YES. + +ENUM_VALUES_PER_LINE = 4 + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used +# to set the initial width (in pixels) of the frame in which the tree is shown. 
+# Minimum value: 0, maximum value: 1500, default value: 250. +# This tag requires that the tag GENERATE_HTML is set to YES. + +TREEVIEW_WIDTH = 250 + +# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to +# external symbols imported via tag files in a separate window. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +EXT_LINKS_IN_WINDOW = NO + +# If the OBFUSCATE_EMAILS tag is set to YES, doxygen will obfuscate email +# addresses. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +OBFUSCATE_EMAILS = YES + +# If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg +# tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see +# https://inkscape.org) to generate formulas as SVG images instead of PNGs for +# the HTML output. These images will generally look nicer at scaled resolutions. +# Possible values are: png (the default) and svg (looks nicer but requires the +# pdf2svg or inkscape tool). +# The default value is: png. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FORMULA_FORMAT = png + +# Use this tag to change the font size of LaTeX formulas included as images in +# the HTML documentation. When you change the font size after a successful +# doxygen run you need to manually remove any form_*.png images from the HTML +# output directory to force them to be regenerated. +# Minimum value: 8, maximum value: 50, default value: 10. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_FONTSIZE = 16 + +# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands +# to create new LaTeX commands to be used in formulas as building blocks. See +# the section "Including formulas" for details. 
+ +FORMULA_MACROFILE = + +# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see +# https://www.mathjax.org) which uses client side JavaScript for the rendering +# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX +# installed or if you want formulas to look prettier in the HTML output. When +# enabled you may also need to install MathJax separately and configure the path +# to it using the MATHJAX_RELPATH option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +USE_MATHJAX = NO + +# With MATHJAX_VERSION it is possible to specify the MathJax version to be used. +# Note that the different versions of MathJax have different requirements with +# regards to the different settings, so it is possible that also other MathJax +# settings have to be changed when switching between the different MathJax +# versions. +# Possible values are: MathJax_2 and MathJax_3. +# The default value is: MathJax_2. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_VERSION = MathJax_2 + +# When MathJax is enabled you can set the default output format to be used for +# the MathJax output. For more details about the output format see MathJax +# version 2 (see: +# http://docs.mathjax.org/en/v2.7-latest/output.html) and MathJax version 3 +# (see: +# http://docs.mathjax.org/en/latest/web/components/output.html). +# Possible values are: HTML-CSS (which is slower, but has the best +# compatibility. This is the name for MathJax version 2, for MathJax version 3 +# this will be translated into chtml), NativeMML (i.e. MathML. Only supported +# for MathJax 2. For MathJax version 3 chtml will be used instead.), chtml (This +# is the name for MathJax version 3, for MathJax version 2 this will be +# translated into HTML-CSS) and SVG. +# The default value is: HTML-CSS. +# This tag requires that the tag USE_MATHJAX is set to YES.
+ +MATHJAX_FORMAT = HTML-CSS + +# When MathJax is enabled you need to specify the location relative to the HTML +# output directory using the MATHJAX_RELPATH option. The destination directory +# should contain the MathJax.js script. For instance, if the mathjax directory +# is located at the same level as the HTML output directory, then +# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax +# Content Delivery Network so you can quickly see the result without installing +# MathJax. However, it is strongly recommended to install a local copy of +# MathJax from https://www.mathjax.org before deployment. The default value is: +# - in case of MathJax version 2: https://cdn.jsdelivr.net/npm/mathjax@2 +# - in case of MathJax version 3: https://cdn.jsdelivr.net/npm/mathjax@3 +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest + +# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax +# extension names that should be enabled during MathJax rendering. For example +# for MathJax version 2 (see +# https://docs.mathjax.org/en/v2.7-latest/tex.html#tex-and-latex-extensions): +# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols +# For example for MathJax version 3 (see +# http://docs.mathjax.org/en/latest/input/tex/extensions/index.html): +# MATHJAX_EXTENSIONS = ams +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_EXTENSIONS = + +# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces +# of code that will be used on startup of the MathJax code. See the MathJax site +# (see: +# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an +# example see the documentation. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_CODEFILE = + +# When the SEARCHENGINE tag is enabled doxygen will generate a search box for +# the HTML output. 
The underlying search engine uses javascript and DHTML and +# should work on any modern browser. Note that when using HTML help +# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) +# there is already a search function so this one should typically be disabled. +# For large projects the javascript based search engine can be slow, then +# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to +# search using the keyboard; to jump to the search box use + S +# (what the is depends on the OS and browser, but it is typically +# , /