Add 'projects/rocm-smi-lib/' from commit '7dba992ebd1beef78ec78d0d2974d6aa531786ff'

git-subtree-dir: projects/rocm-smi-lib
git-subtree-mainline: 25536e61be
git-subtree-split: 7dba992ebd
This commit is contained in:
systems-assistant[bot]
2025-07-22 22:52:41 +00:00
184 changed files with 62289 additions and 0 deletions
@@ -0,0 +1,42 @@
resources:
repositories:
- repository: pipelines_repo
type: github
endpoint: ROCm
name: ROCm/ROCm
variables:
- group: common
- template: /.azuredevops/variables-global.yml@pipelines_repo
trigger:
batch: true
branches:
include:
- amd-staging
- amd-mainline
paths:
exclude:
- .github
- docs
- '.*.y*ml'
- '*.md'
- License.txt
pr:
autoCancel: true
branches:
include:
- amd-staging
- amd-mainline
paths:
exclude:
- .github
- docs
- '.*.y*ml'
- '*.md'
- License.txt
drafts: false
jobs:
- template: ${{ variables.CI_COMPONENT_PATH }}/rocm_smi_lib.yml@pipelines_repo
+4
View File
@@ -0,0 +1,4 @@
---
Language: Cpp
BasedOnStyle: Google
ColumnLimit: 100
+25
View File
@@ -0,0 +1,25 @@
# THIS FILE IS GENERATED FROM .clangd!
# Run .update-clang-tidy.sh to regenerate.
Checks:
bugprone*,
clang-analyzer*,
google*,
misc*,
modernize*,
-abseil*,
-bugprone-easily-swappable-parameters,
-bugprone-reserved-identifier,
-clang-analyzer-security.insecureAPI.strcpy,
-cppcoreguidelines*,
-cppcoreguidelines-pro*,
-misc-non-copyable-objects,
-misc-use-anonymous-namespace,
-modernize-avoid-c-arrays,
-modernize-redundant-void-arg,
-modernize-use-auto,
-modernize-use-nodiscard,
-modernize-use-noexcept,
-modernize-use-trailing-return-type,
-modernize-use-using,
-performance*,
-readability*,
+37
View File
@@ -0,0 +1,37 @@
CompileFlags:
Remove: -W*
Add: [-Wall, -pedantic, -I/opt/rocm/include, -I/opt/rocm/include/hsa, -I/opt/rocm/include/rocprofiler]
Compiler: clang++
# list here: https://clang.llvm.org/extra/clang-tidy/checks/list.html
Diagnostics:
UnusedIncludes: Strict
# rules below are copied into .clang-tidy using ./.update-clang-tidy.sh
# please keep the rules sorted alphabetically
ClangTidy:
Add: [
bugprone*,
clang-analyzer*,
google*,
misc*,
modernize*,
]
Remove: [
abseil*,
bugprone-easily-swappable-parameters,
bugprone-reserved-identifier,
clang-analyzer-security.insecureAPI.strcpy,
cppcoreguidelines*,
cppcoreguidelines-pro*,
misc-non-copyable-objects,
misc-use-anonymous-namespace,
modernize-avoid-c-arrays,
modernize-redundant-void-arg,
modernize-use-auto,
modernize-use-nodiscard,
modernize-use-noexcept,
modernize-use-trailing-return-type,
modernize-use-using,
performance*,
readability*,
]
+13
View File
@@ -0,0 +1,13 @@
# EditorConfig standardizes spacing in all editors: https://EditorConfig.org
# Please get a plugin for your editor to match the formatting
# top-most EditorConfig file
root = true
# Matches multiple files with brace expansion notation
# Set default charset
[*.{c,cc,cpp,h,hh,hpp}]
charset = utf-8
indent_style = space
indent_size = 2
max_line_length = 100
+5
View File
@@ -0,0 +1,5 @@
* @bill-shuzhou-liu @dmitrii-galantsev @charis-poag-amd @oliveiradan
docs/* @ROCm/rocm-documentation
*.md @ROCm/rocm-documentation
*.rst @ROCm/rocm-documentation
+105
View File
@@ -0,0 +1,105 @@
# Contributing to ROCm SMI #
We welcome contributions to ROCm SMI.
Please follow these details to help ensure your contributions will be successfully accepted.
## Issue Discussion ##
Please use the GitHub Issues tab to notify us of issues.
* Use your best judgement for issue creation. If your issue is already listed, upvote the issue and
comment or post to provide additional details, such as how you reproduced this issue.
* If you're not sure if your issue is the same, err on the side of caution and file your issue.
You can add a comment to include the issue number (and link) for the similar issue. If we evaluate
your issue as being the same as the existing issue, we'll close the duplicate.
* If your issue doesn't exist, use the issue template to file a new issue.
* When filing an issue, be sure to provide as much information as possible,
including your amdgpu driver version, GPUs used, and commands run. This
helps reduce the time required to reproduce your issue.
* Set `export RSMI_LOGGING=1` before running the script. Then include the log
file located here: `/var/log/rocm_smi_lib/ROCm-SMI-lib.log`
* Check your issue regularly, as we may require additional information to successfully reproduce the
issue.
* You may also open an issue to ask questions to the maintainers about whether a proposed change
meets the acceptance criteria, or to discuss an idea pertaining to the library.
## Acceptance Criteria ##
The goal of the ROCm SMI project is to provide a simple CLI and a library
for interacting with AMD GPUs.
## Coding Style ##
Please refer to `.clang-format`. We suggest using the `pre-commit` tool.
It mostly follows Google C++ formatting with 100 character line limit.
## Pull Request Guidelines ##
When you create a pull request, you should target the default branch. Our
current default branch is the **develop** branch, which serves as our
integration branch.
### Deliverables ###
For each new file in the repository,
please include the licensing header:
/*
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017-20XX, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
### Process ###
* Reviewers are listed in the CODEOWNERS file
* Code format guidelines
ROCm SMI uses the clang-format tool for formatting code in source files.
The formatting style is captured in .clang-format which is located at
the root of ROCm SMI. These are different options to follow:
1. Using pre-commit and docker - `pre-commit run`
1. Using only clang-format - `clang-format -i \<path-to-the-source-file\>`
## References ##
1. [pre-commit](https://github.com/pre-commit/pre-commit)
1. [clang-format](https://clang.llvm.org/docs/ClangFormat.html)
+18
View File
@@ -0,0 +1,18 @@
# To get started with Dependabot version updates, you'll need to specify which
# package ecosystems to update and where the package manifests are located.
# Please see the documentation for all configuration options:
# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
version: 2
updates:
- package-ecosystem: "pip" # See documentation for possible values
directory: "/docs/sphinx" # Location of package manifests
open-pull-requests-limit: 10
schedule:
interval: "monthly"
labels:
- "documentation"
- "dependencies"
- "ci:docs-only"
reviewers:
- "samjwu"
+5
View File
@@ -0,0 +1,5 @@
disabled: false
scmId: gh-emu-rocm
branchesToScan:
- amd-staging
- amd-mainline
+15
View File
@@ -0,0 +1,15 @@
name: Rocm Validation Suite KWS
on:
push:
branches: [amd-staging, amd-mainline]
pull_request:
types: [opened, synchronize, reopened]
workflow_dispatch:
jobs:
kws:
if: ${{ github.event_name == 'pull_request' }}
uses: AMD-ROCm-Internal/rocm_ci_infra/.github/workflows/kws.yml@mainline
secrets: inherit
with:
pr_number: ${{github.event.pull_request.number}}
base_branch: ${{github.base_ref}}
@@ -0,0 +1,25 @@
name: ROCm CI Caller
on:
pull_request:
branches: [amd-staging, release/rocm-rel-*, amd-mainline]
types: [opened, reopened, synchronize]
push:
branches: [amd-mainline]
workflow_dispatch:
issue_comment:
types: [created]
jobs:
call-workflow:
if: github.event_name != 'issue_comment' ||(github.event_name == 'issue_comment' && github.event.issue.pull_request && (startsWith(github.event.comment.body, '!verify') || startsWith(github.event.comment.body, '!verify release') || startsWith(github.event.comment.body, '!verify retest')))
uses: AMD-ROCm-Internal/rocm_ci_infra/.github/workflows/rocm_ci.yml@mainline
secrets: inherit
with:
input_sha: ${{github.event_name == 'pull_request' && github.event.pull_request.head.sha || (github.event_name == 'push' && github.sha) || (github.event_name == 'issue_comment' && github.event.issue.pull_request.head.sha) || github.sha}}
input_pr_num: ${{github.event_name == 'pull_request' && github.event.pull_request.number || (github.event_name == 'issue_comment' && github.event.issue.number) || 0}}
input_pr_url: ${{github.event_name == 'pull_request' && github.event.pull_request.html_url || (github.event_name == 'issue_comment' && github.event.issue.pull_request.html_url) || ''}}
input_pr_title: ${{github.event_name == 'pull_request' && github.event.pull_request.title || (github.event_name == 'issue_comment' && github.event.issue.pull_request.title) || ''}}
repository_name: ${{ github.repository }}
base_ref: ${{github.event_name == 'pull_request' && github.event.pull_request.base.ref || (github.event_name == 'issue_comment' && github.event.issue.pull_request.base.ref) || github.ref}}
trigger_event_type: ${{ github.event_name }}
comment_text: ${{ github.event_name == 'issue_comment' && github.event.comment.body || '' }}
+35
View File
@@ -0,0 +1,35 @@
# NOTE! Please use 'git ls-files -i --exclude-standard'
# command after changing this file, to see if there are
# any tracked files which get ignored after the change.
# VisualStudioCode
.venv/
.vscode/
_build
# below files are generated via CMake
include/rocm_smi/rocm_smi64Config.h
oam/include/oam/oamConfig.h
python_smi_tools/rsmiBindingsInit.py
# Build directory
build/
# CMake cache
.cache/
# Simulated SYSFS - for early development or debug
device/
# Misc
__pycache__
README
README.html
# do NOT ignore these files
!.clang-format
!.clang-tidy
!.clangd
# avoid duplicating CHANGELOG.md due to conf.py
docs/CHANGELOG.md
@@ -0,0 +1,30 @@
# - How to use:
# python3 -m pip install pre-commit
# pre-commit install --install-hooks
# Upon a new commit - the hooks should automagically run
#
# - How to skip:
# git commit --no-verify
# or
# SKIP=clang-format-docker git commit
# SKIP=cpplint-docker git commit
fail_fast: false
repos:
# For portability I decided to use Docker containers
- repo: https://github.com/dmitrii-galantsev/pre-commit-docker-cpplint
rev: 0.0.3
hooks:
- id: clang-format-docker
- id: cpplint-docker
# Below is a local way of running formatters and linters
# NOTE: clang-tidy is not used in the above tests
# - repo: https://github.com/pocc/pre-commit-hooks
# rev: v1.3.5
# hooks:
# - id: clang-format
# args: [--no-diff, -i]
# - id: clang-tidy
# args: [-p=build, --quiet]
# - id: cpplint
# args: [--verbose=5]
+20
View File
@@ -0,0 +1,20 @@
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
version: 2
sphinx:
configuration: docs/conf.py
formats: [htmlzip, pdf, epub]
python:
install:
- requirements: docs/sphinx/requirements.txt
build:
os: ubuntu-22.04
tools:
python: "3.10"
apt_packages:
- "graphviz" # For dot graphs in doxygen
+36
View File
@@ -0,0 +1,36 @@
#!/usr/bin/env bash
# Regenerates .clang-tidy from the ClangTidy "Add: [" / "Remove: [" lists in
# .clangd. Both files are resolved relative to the current working directory.
# The generated text is also echoed to stdout via tee.
set -x # trace
set -e # exit immediately if command fails
set -u # exit if an undefined variable is found
set -o pipefail # report an awk failure instead of only the exit status of tee
awk '
# Emit the generated-file banner and open the Checks list.
BEGIN {
  print "# THIS FILE IS GENERATED FROM .clangd!"
  print "# Run ./.update-clang-tidy.sh to regenerate."
  print "Checks:"
}
# A line ending in "Add: [" opens the list of enabled checks.
# Single-line lists such as "Add: [-Wall, ...]" do not end in "[" and are skipped.
/Add: \[$/{
  a=1
  next
}
# Any line containing "]" closes the list of enabled checks.
/]/{
  a=0
}
# Inside the Add list: normalize leading whitespace and copy the entry.
# [ \t] is used instead of \s because \s is a gawk-specific regexp operator.
a{
  gsub(/^[ \t]+/," ")
  print
}
# A line ending in "Remove: [" opens the list of disabled checks.
/Remove: \[$/{
  r=1
  next
}
# Any line containing "]" closes the list of disabled checks.
/]/{
  r=0
}
# Inside the Remove list: same normalization, plus the "-" prefix that
# clang-tidy uses to disable a check.
r{
  gsub(/^[ \t]+/," -")
  print
}
' .clangd | tee .clang-tidy
+894
View File
@@ -0,0 +1,894 @@
# Changelog for ROCm SMI Library
Full documentation for rocm_smi_lib is available at [https://rocm.docs.amd.com/](https://rocm.docs.amd.com/projects/rocm_smi_lib/en/latest/).
***All information listed below is for reference and subject to change.***
## rocm_smi_lib for ROCm 6.5.0
### Added
- **Added support for GPU metrics 1.8**.
- Added new fields for `rsmi_gpu_metrics_t` including:
- Adding the following metrics to allow new calculations for violation status:
- Per XCP metrics `gfx_below_host_limit_ppt_acc[XCP][MAX_XCC]` - GFX Clock Host limit Package Power Tracking violation counts
- Per XCP metrics `gfx_below_host_limit_thm_acc[XCP][MAX_XCC]` - GFX Clock Host limit Thermal (TVIOL) violation counts
- Per XCP metrics `gfx_low_utilization_acc[XCP][MAX_XCC]` - violation counts for how long low utilization caused the GPU to be below application clocks.
- Per XCP metrics `gfx_below_host_limit_total_acc[XCP][MAX_XCC]` - violation counts for how long the GPU was held below application clocks by any limiter (see the new violation metrics above).
- Increasing available JPEG engines to 40.
Current ASICs may not support all 40. These will be indicated as UINT16_MAX or N/A in CLI.
### Changed
- N/A
### Removed
- **Removed backwards compatibility for `rsmi_dev_gpu_metrics_info_get()`'s `jpeg_activity` or `vcn_activity` fields: use `xcp_stats.jpeg_busy` or `xcp_stats.vcn_busy`**
- Backwards compatibility is removed for `jpeg_activity` and `vcn_activity` fields, if the `jpeg_busy` or `vcn_busy` field is available.
- <i>Reasons for this change</i>:
- Providing both `vcn_activity`/`jpeg_activity` and XCP (partition) stats `vcn_busy`/`jpeg_busy` caused confusion for users about which field to use. By removing backward compatibility, it is easier to identify the relevant field.
- The `jpeg_busy` field increased in size (for supported ASICs), making backward compatibility unable to fully copy the structure into `jpeg_activity`.
See below for comparison of updated CLI outputs:
Original output:
```shell
$ rocm-smi --showmetrics
GPU[0] : vcn_activity (%): [0, 'N/A', 'N/A', 'N/A']
GPU[0] : jpeg_activity (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']
GPU[0] XCP[0] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']
GPU[0] XCP[1] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']
GPU[0] XCP[2] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']
GPU[0] XCP[3] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']
GPU[0] XCP[4] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']
GPU[0] XCP[5] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']
GPU[0] XCP[6] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']
GPU[0] XCP[7] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']
GPU[0] XCP[0] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A']
GPU[0] XCP[1] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A']
GPU[0] XCP[2] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A']
GPU[0] XCP[3] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A']
GPU[0] XCP[4] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A']
GPU[0] XCP[5] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A']
GPU[0] XCP[6] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A']
GPU[0] XCP[7] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A']
```
New output:
```shell
$ rocm-smi --showmetrics
GPU[0] : vcn_activity (%): ['N/A', 'N/A', 'N/A', 'N/A']
GPU[0] : jpeg_activity (%): ['N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']
GPU[0] XCP[0] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']
GPU[0] XCP[1] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']
GPU[0] XCP[2] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']
GPU[0] XCP[3] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']
GPU[0] XCP[4] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']
GPU[0] XCP[5] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']
GPU[0] XCP[6] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']
GPU[0] XCP[7] : xcp_stats.jpeg_busy (%): [0, 0, 0, 0, 0, 0, 0, 0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']
GPU[0] XCP[0] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A']
GPU[0] XCP[1] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A']
GPU[0] XCP[2] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A']
GPU[0] XCP[3] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A']
GPU[0] XCP[4] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A']
GPU[0] XCP[5] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A']
GPU[0] XCP[6] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A']
GPU[0] XCP[7] : xcp_stats.vcn_busy (%): [0, 'N/A', 'N/A', 'N/A']
```
### Optimized
- N/A
### Resolved issues
- N/A
### Upcoming changes
- N/A
### Known issues
- N/A
## rocm_smi_lib for ROCm 6.4.1
### Added
- N/A
### Changed
- N/A
### Removed
- N/A
### Optimized
- N/A
### Resolved issues
- **Fixed partition enumeration - now refer to correct DRM Render and Card paths**
Previously, partitions incorrectly reflected the primary node (1st GPU) and showed the DRM Render Minor as renderD128. Partition nodes mirrored renderD128's information, which was incorrect. See the "<i>Previous Outputs in CPX</i>" example below.
Device enumeration was updated to correctly map DRM Render Minor paths. See the "<i>Corrected Outputs in CPX</i>" example below.
These changes impact what information is readable/writable for the partition nodes.
<b><i>Example: Previous Outputs in CPX</i></b>
```shell
$ rocm-smi
============================================ ROCm System Management Interface ============================================
====================================================== Concise Info ======================================================
Device Node IDs Temp Power Partitions SCLK MCLK Fan Perf PwrCap VRAM% GPU%
(DID, GUID) (Junction) (Socket) (Mem, Compute, ID)
==========================================================================================================================
0 2 0x74a1, 18421 45.0°C 152.0W NPS1, CPX, 0 133Mhz 900Mhz 0% auto 750.0W 0% 0%
1 3 0x74a1, 48116 45.0°C 152.0W NPS1, CPX, 1 133Mhz 900Mhz 0% auto 750.0W 0% 0%
2 4 0x74a1, 65524 45.0°C 152.0W NPS1, CPX, 2 138Mhz 900Mhz 0% auto 750.0W 0% 0%
3 5 0x74a1, 1013 45.0°C 152.0W NPS1, CPX, 3 138Mhz 900Mhz 0% auto 750.0W 0% 0%
4 6 0x74a1, 30708 45.0°C 152.0W NPS1, CPX, 4 138Mhz 900Mhz 0% auto 750.0W 0% 0%
5 7 0x74a1, 35829 45.0°C 152.0W NPS1, CPX, 5 153Mhz 900Mhz 0% auto 750.0W 0% 0%
6 8 0x74a1, 53237 45.0°C 152.0W NPS1, CPX, 6 153Mhz 900Mhz 0% auto 750.0W 0% 0%
7 9 0x74a1, 13300 45.0°C 152.0W NPS1, CPX, 7 153Mhz 900Mhz 0% auto 750.0W 0% 0%
8 10 0x74a1, 64360 44.0°C 158.0W NPS1, CPX, 0 144Mhz 900Mhz 0% auto 750.0W 0% 0%
...
==========================================================================================================================
================================================== End of ROCm SMI Log ===================================================
```
<b><i>Example: Corrected outputs in CPX</i></b>
```shell
$ rocm-smi
============================================ ROCm System Management Interface ============================================
====================================================== Concise Info ======================================================
Device Node IDs Temp Power Partitions SCLK MCLK Fan Perf PwrCap VRAM% GPU%
(DID, GUID) (Junction) (Socket) (Mem, Compute, ID)
==========================================================================================================================
0 2 0x74a1, 18421 44.0°C 151.0W NPS1, CPX, 0 132Mhz 900Mhz 0% auto 750.0W 0% 0%
1 3 N/A, 48116 N/A N/A N/A, N/A, 1 N/A N/A 0% n/a N/A 2% N/A
2 4 N/A, 65524 N/A N/A N/A, N/A, 2 N/A N/A 0% n/a N/A 2% N/A
3 5 N/A, 1013 N/A N/A N/A, N/A, 3 N/A N/A 0% n/a N/A 2% N/A
4 6 N/A, 30708 N/A N/A N/A, N/A, 4 N/A N/A 0% n/a N/A 2% N/A
5 7 N/A, 35829 N/A N/A N/A, N/A, 5 N/A N/A 0% n/a N/A 2% N/A
6 8 N/A, 53237 N/A N/A N/A, N/A, 6 N/A N/A 0% n/a N/A 2% N/A
7 9 N/A, 13300 N/A N/A N/A, N/A, 7 N/A N/A 0% n/a N/A 2% N/A
8 10 0x74a1, 64360 44.0°C 158.0W NPS1, CPX, 0 132Mhz 900Mhz 0% auto 750.0W 0% 0%
...
==========================================================================================================================
================================================== End of ROCm SMI Log ===================================================
```
### Upcoming changes
- N/A
### Known issues
- N/A
## rocm_smi_lib for ROCm 6.4
### Added
- **Added support for GPU metrics 1.7 to `rsmi_dev_gpu_metrics_info_get()`**
Updated `rsmi_dev_gpu_metrics_info_get()` and structure `rsmi_gpu_metrics_t` to include new fields for XGMI Link Status, graphics clocks below host limit (per XCP), and VRAM max bandwidth:
- `uint64_t vram_max_bandwidth` - VRAM max bandwidth at max memory clock (GB/s)
- `uint16_t xgmi_link_status[MAX_NUM_XGMI_LINKS]` - XGMI link status, 1=Up 0=Down
- `uint64_t gfx_below_host_limit_acc[MAX_NUM_XCC]` - graphics clocks below host limit (per XCP) accumulators. Used for graphic clk below host limit violation status.
- **Added new GPU metrics 1.7 to `rocm-smi --showmetrics`**
New metrics added to `rocm-smi --showmetrics`
```shell
$ rocm-smi --showmetrics
GPU[0] : vram_max_bandwidth (GB/s): 1555
GPU[0] : xgmi_link_status (Up/Down): ['1', '1', '1', '1', '0', '1', '0', '1']
GPU[0] XCP[0] : xcp_stats.gfx_below_host_limit_acc (%): ['0', '0', '0', '0', '0', '0', '0', '0']
GPU[0] XCP[1] : xcp_stats.gfx_below_host_limit_acc (%): ['0', '0', '0', '0', '0', '0', '0', '0']
GPU[0] XCP[2] : xcp_stats.gfx_below_host_limit_acc (%): ['0', '0', '0', '0', '0', '0', '0', '0']
GPU[0] XCP[3] : xcp_stats.gfx_below_host_limit_acc (%): ['0', '0', '0', '0', '0', '0', '0', '0']
GPU[0] XCP[4] : xcp_stats.gfx_below_host_limit_acc (%): ['0', '0', '0', '0', '0', '0', '0', '0']
GPU[0] XCP[5] : xcp_stats.gfx_below_host_limit_acc (%): ['0', '0', '0', '0', '0', '0', '0', '0']
GPU[0] XCP[6] : xcp_stats.gfx_below_host_limit_acc (%): ['0', '0', '0', '0', '0', '0', '0', '0']
GPU[0] XCP[7] : xcp_stats.gfx_below_host_limit_acc (%): ['0', '0', '0', '0', '0', '0', '0', '0']
GPU[1] : vram_max_bandwidth (GB/s): 1555
GPU[1] : xgmi_link_status (Up/Down): ['1', '1', '1', '1', '0', '1', '0', '1']
...
GPU[1] XCP[0] : xcp_stats.gfx_below_host_limit_acc (%): ['0', '0', '0', '0', '0', '0', '0', '0']
GPU[1] XCP[1] : xcp_stats.gfx_below_host_limit_acc (%): ['0', '0', '0', '0', '0', '0', '0', '0']
GPU[1] XCP[2] : xcp_stats.gfx_below_host_limit_acc (%): ['0', '0', '0', '0', '0', '0', '0', '0']
GPU[1] XCP[3] : xcp_stats.gfx_below_host_limit_acc (%): ['0', '0', '0', '0', '0', '0', '0', '0']
GPU[1] XCP[4] : xcp_stats.gfx_below_host_limit_acc (%): ['0', '0', '0', '0', '0', '0', '0', '0']
GPU[1] XCP[5] : xcp_stats.gfx_below_host_limit_acc (%): ['0', '0', '0', '0', '0', '0', '0', '0']
GPU[1] XCP[6] : xcp_stats.gfx_below_host_limit_acc (%): ['0', '0', '0', '0', '0', '0', '0', '0']
GPU[1] XCP[7] : xcp_stats.gfx_below_host_limit_acc (%): ['0', '0', '0', '0', '0', '0', '0', '0']
...
```
### Changed
### Removed
### Resolved issues
- **Fixed `rsmi_dev_target_graphics_version_get`, `rocm-smi --showhw`, and `rocm-smi --showprod` not displaying graphics version properly for MI2x, MI1x or Navi 3x ASICs.**
### Upcoming changes
## rocm_smi_lib for ROCm 6.3
- **Added `rsmi_dev_memory_partition_capabilities_get` which returns driver memory partition capabilities.**
Driver now has the ability to report what the user can set memory partition modes to. User can now see available
memory partition modes upon an invalid argument return from memory partition mode set (`rsmi_dev_memory_partition_set`).
- **Added support for GPU metrics 1.6 to `rsmi_dev_gpu_metrics_info_get()`**
Updated `rsmi_dev_gpu_metrics_info_get()` and structure `rsmi_gpu_metrics_t` to include new fields for PVIOL / TVIOL, XCP (Graphics Compute Partitions) stats, and pcie_lc_perf_other_end_recovery:
- `uint64_t accumulation_counter` - used for all throttled calculations
- `uint64_t prochot_residency_acc` - Processor hot accumulator
- `uint64_t ppt_residency_acc` - Package Power Tracking (PPT) accumulator (used in PVIOL calculations)
- `uint64_t socket_thm_residency_acc` - Socket thermal accumulator - (used in TVIOL calculations)
- `uint64_t vr_thm_residency_acc` - Voltage Rail (VR) thermal accumulator
- `uint64_t hbm_thm_residency_acc` - High Bandwidth Memory (HBM) thermal accumulator
- `uint16_t num_partition` - corresponds to the current total number of partitions
- `struct amdgpu_xcp_metrics_t xcp_stats[MAX_NUM_XCP]` - for each partition associated with current GPU, provides gfx busy & accumulators, jpeg, and decoder (VCN) engine utilizations
- `uint32_t gfx_busy_inst[MAX_NUM_XCC]` - graphic engine utilization (%)
- `uint16_t jpeg_busy[MAX_NUM_JPEG_ENGS]` - jpeg engine utilization (%)
- `uint16_t vcn_busy[MAX_NUM_VCNS]` - decoder (VCN) engine utilization (%)
- `uint64_t gfx_busy_acc[MAX_NUM_XCC]` - graphic engine utilization accumulated (%)
- `uint32_t pcie_lc_perf_other_end_recovery` - corresponds to the pcie other end recovery counter
- **Added ability to view raw GPU metrics using `rocm-smi --showmetrics`**
Users can now view GPU metrics from our new `rocm-smi --showmetrics`. Unlike AMD SMI (or other ROCM-SMI interfaces), these values are ***not*** converted into applicable units as users may see in `amd-smi metric`. Units listed display as indicated by the driver, they are not converted (eg. in other AMD SMI/ROCm SMI interfaces which use the data provided). It is important to note, that fields displaying `N/A` data mean this ASIC does not support or backward compatibility was not provided in a newer ASIC's GPU metric structure.
### Changed
- **Added back in C++ tests for `memorypartition_read_write`**.
Because the driver has added all features needed for memory partition write, we have re-enabled `memorypartition_read_write`.
- **Updated `rsmi_dev_memory_partition_set` to not return until a successful restart of AMD GPU Driver.**
This change keeps checking for ~ up to 40 seconds for a successful restart of the AMD GPU driver. Additionally, the API call continues to check if memory partition (NPS) SYSFS files are successfully updated to reflect the user's requested memory partition (NPS) mode change. Otherwise, reports an error back to the user. Due to these changes, we have updated ROCm SMI's CLI to reflect the maximum wait of 40 seconds, while memory partition change is in progress.
- **All APIs now have the ability to catch driver reporting invalid arguments.**
Now ROCm SMI APIs can show RSMI_STATUS_INVALID_ARGS when driver returns EINVAL.
### Removed
- **Removed `--resetcomputepartition`, and `--resetmemorypartition` options and associated APIs**.
- This change is part of the partition feature redesign.
- The related APIs `rsmi_dev_compute_partition_reset()` and `rsmi_dev_memory_partition_reset()`.
### Resolved issues
- **Fixed `rsmi_dev_target_graphics_version_get`, `rocm-smi --showhw`, and `rocm-smi --showprod` not displaying properly for MI2x or Navi 3x ASICs.**
### Upcoming changes
- **Re-enable C++ tests for `memorypartition_read_write`**.
- This change is part of the partition feature redesign.
- SMI's workflow needs to be adjusted in order to accommodate incoming driver changes to enable
Dynamic memory partition feature. We plan on re-enabling testing for this feature during ROCm
6.4.
## rocm_smi_lib for ROCm 6.2.1
### Optimized
- **Improved handling of UnicodeEncodeErrors with non UTF-8 locales**
Non UTF-8 locales were causing crashing on UTF-8 special characters
### Resolved issues
- **Fixed rsmitstReadWrite.TestComputePartitionReadWrite segfault**
Segfault was caused due to unhandled start conditions:
1) When setting CPX as a partition mode, there is a DRM node limitation of 64.
This is a known limitation of the driver, if other drivers are using other DRM nodes (ex. using PCIe space, such as ast).
The number of DRM nodes can be checked via `ls /sys/class/drm`
Recommended steps for removing unnecessary drivers:
a. unloading amdgpu - `sudo rmmod amdgpu`
b. removing unnecessary driver(s) - ex. `sudo rmmod ast`
c. reload amdgpu - `sudo modprobe amdgpu`
2) Since user could start amdgpu in different partition modes (ex. `sudo modprobe amdgpu user_partt_mode=1`).
Test needed to keep track of total number of devices, in order to ensure test comes back to the original configuration.
The test segfault could be seen on all MI3x ASICs, if brought up in a non-SPX configuration upon boot.
## rocm_smi_lib for ROCm 6.2
### Changed
- **Added Partition ID API (`rsmi_dev_partition_id_get(..)`)**
Previously, the partition ID could only be retrieved by querying `rsmi_dev_pci_id_get()`
and parsing optional bits in our Python CLI/API. We are now making this available directly through the API.
We also added testing to our compute partitioning tests, verifying that partition IDs update accordingly.
### Resolved issues
- **Partition ID CLI output**
Due to driver changes in KFD, some devices may report bits [31:28] or [2:0]. With the newly added `rsmi_dev_partition_id_get(..)`, we provided this fallback to properly retrieve partition ID. We
plan to eventually remove partition ID from the function portion of the BDF (Bus Device Function). See below for PCI ID description.
- bits [63:32] = domain
- bits [31:28] or bits [2:0] = partition id
- bits [27:16] = reserved
- bits [15:8] = Bus
- bits [7:3] = Device
- bits [2:0] = Function (partition id may be in bits [2:0]) <-- Fallback for non SPX modes
## rocm_smi_lib for ROCm 6.1.2
### Added
- **Added Ring Hang event**
Added `RSMI_EVT_NOTIF_RING_HANG` to the possible events in the `rsmi_evt_notification_type_t` enum.
### Resolved issues
- **Fixed parsing of `pp_od_clk_voltage` within `get_od_clk_volt_info`**
The parsing of `pp_od_clk_voltage` was not dynamic enough to work with the dropping of voltage curve support on MI series cards.
## rocm_smi_lib for ROCm 6.1.1
### Added
- **Unlock mutex if process is dead**
Added in order to unlock mutex when process is dead. Additional debug output has been added if further issues are detected.
- **Added Partition ID to rocm-smi CLI**
`rsmi_dev_pci_id_get()` now provides partition ID. See API for better detail. Previously these bits were reserved bits (right before domain) and partition id was within function.
- bits [63:32] = domain
- bits [31:28] = partition id
- bits [27:16] = reserved
- bits [15: 0] = pci bus/device/function
rocm-smi now provides partition ID in `rocm-smi` and `rocm-smi --showhw`. If a device supports partitioning and is in a non-SPX mode (CPX, DPX, TPX, etc.), partition IDs may be non-zero. Devices in SPX mode and devices that do not support partitioning will show a partition ID of 0. See examples provided below.
```shell
$ rocm-smi
========================================= ROCm System Management Interface =========================================
=================================================== Concise Info ===================================================
Device Node IDs Temp Power Partitions SCLK MCLK Fan Perf PwrCap VRAM% GPU%
(DID, GUID) (Edge) (Avg) (Mem, Compute, ID)
====================================================================================================================
0 1 0x73bf, 34495 43.0°C 6.0W N/A, N/A, 0 0Mhz 96Mhz 0% manual 150.0W 3% 0%
1 2 0x73a3, 22215 34.0°C 8.0W N/A, N/A, 0 0Mhz 96Mhz 20.0% manual 213.0W 0% 0%
====================================================================================================================
=============================================== End of ROCm SMI Log ================================================
```
*Device below is in TPX*
```shell
$ rocm-smi --showhw
================================= ROCm System Management Interface =================================
====================================== Concise Hardware Info =======================================
GPU NODE DID GUID GFX VER GFX RAS SDMA RAS UMC RAS VBIOS BUS PARTITION ID
0 4 0x74a0 3877 gfx942 ENABLED ENABLED DISABLED N/A 0000:01:00.0 0
1 5 0x74a0 54196 gfx942 ENABLED ENABLED DISABLED N/A 0000:01:00.0 1
2 6 0x74a0 36891 gfx942 ENABLED ENABLED DISABLED N/A 0000:01:00.0 2
3 7 0x74a0 28397 gfx942 ENABLED ENABLED DISABLED N/A 0001:01:00.0 0
4 8 0x74a0 45692 gfx942 ENABLED ENABLED DISABLED N/A 0001:01:00.0 1
5 9 0x74a0 61907 gfx942 ENABLED ENABLED DISABLED N/A 0001:01:00.0 2
6 10 0x74a0 52404 gfx942 ENABLED ENABLED DISABLED N/A 0002:01:00.0 0
7 11 0x74a0 4133 gfx942 ENABLED ENABLED DISABLED N/A 0002:01:00.0 1
8 12 0x74a0 21386 gfx942 ENABLED ENABLED DISABLED N/A 0002:01:00.0 2
9 13 0x74a0 10876 gfx942 ENABLED ENABLED DISABLED N/A 0003:01:00.0 0
10 14 0x74a0 63213 gfx942 ENABLED ENABLED DISABLED N/A 0003:01:00.0 1
11 15 0x74a0 46402 gfx942 ENABLED ENABLED DISABLED N/A 0003:01:00.0 2
====================================================================================================
======================================= End of ROCm SMI Log ========================================
```
- **Added `NODE`, `GUID`, and `GFX Version`**
Changes impact the following rocm-smi CLIs:
- `rocm-smi`
- `rocm-smi -i`
- `rocm-smi --showhw`
- `rocm-smi --showproduct`
`NODE` - is the KFD node, since these can both be CPU and GPU devices. This field is invariant between boots.
`GUID` - also known as GPU ID. GUID is the KFD GPU's ID. This field may vary between boots.
`GFX Version` - this is the device's target graphics version.
See below for a few example outputs.
```shell
$ rocm-smi --showhw
================================= ROCm System Management Interface =================================
====================================== Concise Hardware Info =======================================
GPU NODE DID GUID GFX VER GFX RAS SDMA RAS UMC RAS VBIOS BUS PARTITION ID
0 4 0x74a0 3877 gfx942 ENABLED ENABLED DISABLED N/A 0000:01:00.0 0
1 5 0x74a0 54196 gfx942 ENABLED ENABLED DISABLED N/A 0000:01:00.0 1
2 6 0x74a0 36891 gfx942 ENABLED ENABLED DISABLED N/A 0000:01:00.0 2
3 7 0x74a0 28397 gfx942 ENABLED ENABLED DISABLED N/A 0001:01:00.0 0
4 8 0x74a0 45692 gfx942 ENABLED ENABLED DISABLED N/A 0001:01:00.0 1
5 9 0x74a0 61907 gfx942 ENABLED ENABLED DISABLED N/A 0001:01:00.0 2
6 10 0x74a0 52404 gfx942 ENABLED ENABLED DISABLED N/A 0002:01:00.0 0
7 11 0x74a0 4133 gfx942 ENABLED ENABLED DISABLED N/A 0002:01:00.0 1
8 12 0x74a0 21386 gfx942 ENABLED ENABLED DISABLED N/A 0002:01:00.0 2
9 13 0x74a0 10876 gfx942 ENABLED ENABLED DISABLED N/A 0003:01:00.0 0
10 14 0x74a0 63213 gfx942 ENABLED ENABLED DISABLED N/A 0003:01:00.0 1
11 15 0x74a0 46402 gfx942 ENABLED ENABLED DISABLED N/A 0003:01:00.0 2
====================================================================================================
======================================= End of ROCm SMI Log ========================================
```
```shell
$ rocm-smi -i
============================ ROCm System Management Interface ============================
=========================================== ID ===========================================
GPU[0] : Device Name: Aqua Vanjaram [Instinct MI300A]
GPU[0] : Device ID: 0x74a0
GPU[0] : Device Rev: 0x00
GPU[0] : Subsystem ID: 0x74a0
GPU[0] : GUID: 60294
GPU[1] : Device Name: Aqua Vanjaram [Instinct MI300A]
GPU[1] : Device ID: 0x74a0
GPU[1] : Device Rev: 0x00
GPU[1] : Subsystem ID: 0x74a0
GPU[1] : GUID: 35406
GPU[2] : Device Name: Aqua Vanjaram [Instinct MI300A]
GPU[2] : Device ID: 0x74a0
GPU[2] : Device Rev: 0x00
GPU[2] : Subsystem ID: 0x74a0
GPU[2] : GUID: 10263
GPU[3] : Device Name: Aqua Vanjaram [Instinct MI300A]
GPU[3] : Device ID: 0x74a0
GPU[3] : Device Rev: 0x00
GPU[3] : Subsystem ID: 0x74a0
GPU[3] : GUID: 52959
==========================================================================================
================================== End of ROCm SMI Log ===================================
```
```shell
$ rocm-smi --showproduct
============================ ROCm System Management Interface ============================
====================================== Product Info ======================================
GPU[0] : Card Series: Aqua Vanjaram [Instinct MI300A]
GPU[0] : Card Model: 0x74a0
GPU[0] : Card Vendor: Advanced Micro Devices, Inc. [AMD/ATI]
GPU[0] : Card SKU: N/A
GPU[0] : Subsystem ID: 0x74a0
GPU[0] : Device Rev: 0x00
GPU[0] : Node ID: 4
GPU[0] : GUID: 60294
GPU[0] : GFX Version: gfx942
GPU[1] : Card Series: Aqua Vanjaram [Instinct MI300A]
GPU[1] : Card Model: 0x74a0
GPU[1] : Card Vendor: Advanced Micro Devices, Inc. [AMD/ATI]
GPU[1] : Card SKU: N/A
GPU[1] : Subsystem ID: 0x74a0
GPU[1] : Device Rev: 0x00
GPU[1] : Node ID: 5
GPU[1] : GUID: 35406
GPU[1] : GFX Version: gfx942
GPU[2] : Card Series: Aqua Vanjaram [Instinct MI300A]
GPU[2] : Card Model: 0x74a0
GPU[2] : Card Vendor: Advanced Micro Devices, Inc. [AMD/ATI]
GPU[2] : Card SKU: N/A
GPU[2] : Subsystem ID: 0x74a0
GPU[2] : Device Rev: 0x00
GPU[2] : Node ID: 6
GPU[2] : GUID: 10263
GPU[2] : GFX Version: gfx942
GPU[3] : Card Series: Aqua Vanjaram [Instinct MI300A]
GPU[3] : Card Model: 0x74a0
GPU[3] : Card Vendor: Advanced Micro Devices, Inc. [AMD/ATI]
GPU[3] : Card SKU: N/A
GPU[3] : Subsystem ID: 0x74a0
GPU[3] : Device Rev: 0x00
GPU[3] : Node ID: 7
GPU[3] : GUID: 52959
GPU[3] : GFX Version: gfx942
==========================================================================================
================================== End of ROCm SMI Log ===================================
```
- **Documentation now includes C++ and Python: tutorials, API guides, and C++ reference pages**
See [https://rocm.docs.amd.com/](https://rocm.docs.amd.com/projects/rocm_smi_lib/en/latest/) once 6.1.1 is released.
### Changed
- **Aligned `rocm-smi` fields display "N/A" instead of "unknown"/"unsupported": `Card ID`, `DID`, `Model`, `SKU`, and `VBIOS`**
Impacts the following commands:
- `rocm-smi` - see other examples above for 6.1.1
- `rocm-smi --showhw` - see other examples above for 6.1.1
- `rocm-smi --showproduct` - see other examples above for 6.1.1
- `rocm-smi -i` - see other examples above for 6.1.1
- `rocm-smi --showvbios` - see example below
```shell
$ rocm-smi --showvbios
============================ ROCm System Management Interface ============================
========================================= VBIOS ==========================================
GPU[0] : VBIOS version: N/A
GPU[1] : VBIOS version: N/A
GPU[2] : VBIOS version: N/A
GPU[3] : VBIOS version: N/A
==========================================================================================
================================== End of ROCm SMI Log ===================================
```
- **Removed stacked id formatting in `rocm-smi`**
This is to simplify identifiers helpful to users. More identifiers can be found on:
- `rocm-smi -i`
- `rocm-smi --showhw`
- `rocm-smi --showproduct`
See examples shown above for 6.1.1. Previous output example can be seen below.
```shell
$ rocm-smi
========================================== ROCm System Management Interface ==========================================
==================================================== Concise Info ====================================================
Device [Model : Revision] Temp Power Partitions SCLK MCLK Fan Perf PwrCap VRAM% GPU%
Name (20 chars) (Junction) (Socket) (Mem, Compute)
======================================================================================================================
0 [0x74a0 : 0x00] 40.0°C 102.0W NPS1, SPX 31Mhz 1300Mhz 0% manual 550.0W 0% 0%
Aqua Vanjaram [Insti
======================================================================================================================
================================================ End of ROCm SMI Log =================================================
```
### Resolved issues
- **Fixed HIP and ROCm SMI mismatch on GPU bus assignments**
These changes prompted us to provide better visibility for our device nodes and partition IDs (see addition provided above). See examples below for fix overview.
1. MI300a GPU device `Domain:Bus:Device.function` clashes with another AMD USB device
Cause(s):
a. ROCm SMI did not propagate domain consistently (for partitioned devices)
b. AMD GPU driver previously reported partition IDs within function node - causing clash with the other AMD USB device PCIe ID displayed.
2. Domain does not propagate for devices which support partitioning (MI300x/a)
Cause(s):
a. ROCm SMI did not propagate domain consistently (for partitioned devices)
3. Displayed topology will show disordered nodes when compared to HIP
Cause(s):
a. ROCm SMI did not propagate domain consistently (for partitioned devices)
*Device in TPX*
```shell
$ rocm-smi --showhw
================================= ROCm System Management Interface =================================
====================================== Concise Hardware Info =======================================
GPU NODE DID GUID GFX VER GFX RAS SDMA RAS UMC RAS VBIOS BUS PARTITION ID
0 4 0x74a0 3877 gfx942 ENABLED ENABLED DISABLED N/A 0000:01:00.0 0
1 5 0x74a0 54196 gfx942 ENABLED ENABLED DISABLED N/A 0000:01:00.0 1
2 6 0x74a0 36891 gfx942 ENABLED ENABLED DISABLED N/A 0000:01:00.0 2
3 7 0x74a0 28397 gfx942 ENABLED ENABLED DISABLED N/A 0001:01:00.0 0
4 8 0x74a0 45692 gfx942 ENABLED ENABLED DISABLED N/A 0001:01:00.0 1
5 9 0x74a0 61907 gfx942 ENABLED ENABLED DISABLED N/A 0001:01:00.0 2
6 10 0x74a0 52404 gfx942 ENABLED ENABLED DISABLED N/A 0002:01:00.0 0
7 11 0x74a0 4133 gfx942 ENABLED ENABLED DISABLED N/A 0002:01:00.0 1
8 12 0x74a0 21386 gfx942 ENABLED ENABLED DISABLED N/A 0002:01:00.0 2
9 13 0x74a0 10876 gfx942 ENABLED ENABLED DISABLED N/A 0003:01:00.0 0
10 14 0x74a0 63213 gfx942 ENABLED ENABLED DISABLED N/A 0003:01:00.0 1
11 15 0x74a0 46402 gfx942 ENABLED ENABLED DISABLED N/A 0003:01:00.0 2
====================================================================================================
======================================= End of ROCm SMI Log ========================================
$ lspci -D|grep -i "process\|usb"
0000:01:00.0 Processing accelerators: Advanced Micro Devices, Inc. [AMD/ATI] Aqua Vanjaram [Instinct MI300A]
0000:01:00.1 USB controller: Advanced Micro Devices, Inc. [AMD] Device 14df
0001:01:00.0 Processing accelerators: Advanced Micro Devices, Inc. [AMD/ATI] Aqua Vanjaram [Instinct MI300A]
0002:01:00.0 Processing accelerators: Advanced Micro Devices, Inc. [AMD/ATI] Aqua Vanjaram [Instinct MI300A]
0003:01:00.0 Processing accelerators: Advanced Micro Devices, Inc. [AMD/ATI] Aqua Vanjaram [Instinct MI300A]
```
```shell
$ rocm-smi --showtoponuma
======================================= Numa Nodes =======================================
GPU[0] : (Topology) Numa Node: 0
GPU[0] : (Topology) Numa Affinity: 0
GPU[1] : (Topology) Numa Node: 0
GPU[1] : (Topology) Numa Affinity: 0
GPU[2] : (Topology) Numa Node: 0
GPU[2] : (Topology) Numa Affinity: 0
GPU[3] : (Topology) Numa Node: 1
GPU[3] : (Topology) Numa Affinity: 1
GPU[4] : (Topology) Numa Node: 1
GPU[4] : (Topology) Numa Affinity: 1
GPU[5] : (Topology) Numa Node: 1
GPU[5] : (Topology) Numa Affinity: 1
GPU[6] : (Topology) Numa Node: 2
GPU[6] : (Topology) Numa Affinity: 2
GPU[7] : (Topology) Numa Node: 2
GPU[7] : (Topology) Numa Affinity: 2
GPU[8] : (Topology) Numa Node: 2
GPU[8] : (Topology) Numa Affinity: 2
GPU[9] : (Topology) Numa Node: 3
GPU[9] : (Topology) Numa Affinity: 3
GPU[10] : (Topology) Numa Node: 3
GPU[10] : (Topology) Numa Affinity: 3
GPU[11] : (Topology) Numa Node: 3
GPU[11] : (Topology) Numa Affinity: 3
================================== End of ROCm SMI Log ===================================
```
- **Fixed memory leaks**
Caused by not closing directories and by creating map nodes instead of checking with `.at()`.
- **Fixed Python rocm_smi API calls**
Fixed initializing calls which reuse rocmsmi.initializeRsmi() bindings.
```shell
Traceback (most recent call last):
File "/home/charpoag/rocmsmi_pythonapi.py", line 9, in <module>
rocm_smi.initializeRsmi()
File "/opt/rocm/libexec/rocm_smi/rocm_smi.py", line 3531, in initializeRsmi
ret_init = rocmsmi.rsmi_init(0)
NameError: name 'rocmsmi' is not defined
```
- **Fixed rsmi_dev_activity_metric_get gfx/memory activity does not update with GPU activity**
Checks and forces rereading gpu metrics unconditionally.
## rocm_smi_lib for ROCm 6.1.0
### Added
- **Added support to set max/min clock level for sclk (`RSMI_CLK_TYPE_SYS`) or mclk (`RSMI_CLK_TYPE_MEM`)**
Users can now set a maximum or minimum sclk or mclk value through `rsmi_dev_clk_extremum_set()` API provided ASIC support. Alternatively, users can
use our Python CLI tool (`rocm-smi --setextremum max sclk 1500`). See example below.
```shell
$ sudo /opt/rocm/bin/rocm-smi --setextremum max sclk 2100
============================ ROCm System Management Interface ============================
******WARNING******
Operating your AMD GPU outside of official AMD specifications or outside of
factory settings, including but not limited to the conducting of overclocking,
over-volting or under-volting (including use of this interface software,
even if such software has been directly or indirectly provided by AMD or otherwise
affiliated in any way with AMD), may cause damage to your AMD GPU, system components
and/or result in system failure, as well as cause other problems.
DAMAGES CAUSED BY USE OF YOUR AMD GPU OUTSIDE OF OFFICIAL AMD SPECIFICATIONS OR
OUTSIDE OF FACTORY SETTINGS ARE NOT COVERED UNDER ANY AMD PRODUCT WARRANTY AND
MAY NOT BE COVERED BY YOUR BOARD OR SYSTEM MANUFACTURER'S WARRANTY.
Please use this utility with caution.
Do you accept these terms? [y/N] y
================================ Set Valid sclk Extremum =================================
GPU[0] : Successfully set max sclk to 2100(MHz)
GPU[1] : Successfully set max sclk to 2100(MHz)
GPU[2] : Successfully set max sclk to 2100(MHz)
GPU[3] : Successfully set max sclk to 2100(MHz)
================================== End of ROCm SMI Log ===================================
```
- **Added `rsmi_dev_target_graphics_version_get()`**
Users can now query through ROCm SMI API (`rsmi_dev_target_graphics_version_get()`) to retrieve the target graphics version for a GPU device. Currently, this output is not supplied through our rocm-smi CLI.
### Changed
- **Removed non-unified API headers: Individual GPU metric APIs are no longer supported**
The individual metric APIs (`rsmi_dev_metrics_*`) were removed in order to keep updates easier for new GPU metric support. Because a single API (`rsmi_dev_gpu_metrics_info_get()`) now reports all device metrics, it is worth noting that there is a risk of ABI breakage when using `rsmi_dev_gpu_metrics_info_get()`. It is vital to understand that ABI breaks are necessary (in some cases) in order to support newer ASICs and metrics for our customers. We will continue to support `rsmi_dev_gpu_metrics_info_get()` with these considerations and limitations in mind.
- **Deprecated rsmi_dev_power_ave_get(), use newer API rsmi_dev_power_get()**
As outlined in the change below for 6.0.0 (***Added a generic power API: rsmi_dev_power_get***), `rsmi_dev_power_ave_get()` is now deprecated. Please update your ROCm SMI API calls accordingly.
### Resolved issues
- Fix `--showpids` reporting `[PID] [PROCESS NAME] 1 UNKNOWN UNKNOWN UNKNOWN`
Output was failing because cu_occupancy debugfs method is not provided on some graphics cards by design. `get_compute_process_info_by_pid` was updated to reflect this and returns with output needed by CLI.
- Fix `rocm-smi --showpower` output was inconsistent on Navi32/31 devices
Updated to use `rsmi_dev_power_get()` within CLI to provide a consistent device power output. This was caused by using the now deprecated `rsmi_dev_power_ave_get()` API.
- Fixed `rocm-smi --setcomputepartition` and `rocm-smi --resetcomputepartition` to notate if device is EBUSY
- Fixed `rocm-smi --setmemorypartition` and `rocm-smi --resetmemorypartition` read only SYSFS to return RSMI_STATUS_NOT_SUPPORTED
The `rsmi_dev_memory_partition_set` API is updated to handle the readonly SYSFS check. Corresponding tests and CLI (`rocm-smi --setmemorypartition` and `rocm-smi --resetmemorypartition`) calls were updated accordingly.
- Fix `rocm-smi --showclkvolt` and `rocm-smi --showvc` displaying 0 for overdrive and voltage curve is not supported
### Known issues
- **HIP and ROCm SMI mismatch on GPU bus assignments**
Three separate issues have been identified:
1. MI300a GPU device `Domain:Bus:Device.function` clashes with another AMD USB device
```shell
$ lspci|grep -i "process\|usb"
0000:01:00.0 Processing accelerators: Advanced Micro Devices, Inc. [AMD/ATI] Device 74a0
0000:01:00.1 USB controller: Advanced Micro Devices, Inc. [AMD] Device 14df
0001:01:00.0 Processing accelerators: Advanced Micro Devices, Inc. [AMD/ATI] Device 74a0
0002:01:00.0 Processing accelerators: Advanced Micro Devices, Inc. [AMD/ATI] Device 74a0
0003:01:00.0 Processing accelerators: Advanced Micro Devices, Inc. [AMD/ATI] Device 74a0
```
```shell
$ rocm-smi --showbus
============================ ROCm System Management Interface ============================
======================================= PCI Bus ID =======================================
GPU[0] : PCI Bus: 0000:01:00.0
GPU[1] : PCI Bus: 0000:01:00.1
GPU[2] : PCI Bus: 0000:01:00.2
GPU[3] : PCI Bus: 0000:01:00.3
...
==========================================================================================
================================== End of ROCm SMI Log ===================================
```
2. Domain does not propagate for devices which support partitioning (MI300x/a)
For example, a device in non-SPX (single partition) - devices will overlap in function device.
```shell
$ rocm-smi --showbus
============================ ROCm System Management Interface ============================
======================================= PCI Bus ID =======================================
GPU[0] : PCI Bus: 0000:01:00.0
GPU[1] : PCI Bus: 0000:01:00.1
GPU[2] : PCI Bus: 0000:01:00.1
GPU[3] : PCI Bus: 0000:01:00.1
GPU[4] : PCI Bus: 0000:01:00.1
GPU[5] : PCI Bus: 0000:01:00.2
GPU[6] : PCI Bus: 0000:01:00.2
GPU[7] : PCI Bus: 0000:01:00.2
GPU[8] : PCI Bus: 0000:01:00.2
GPU[9] : PCI Bus: 0000:01:00.3
GPU[10] : PCI Bus: 0000:01:00.3
GPU[11] : PCI Bus: 0000:01:00.3
GPU[12] : PCI Bus: 0000:01:00.3
GPU[13] : PCI Bus: 0000:01:00.4
GPU[14] : PCI Bus: 0000:01:00.4
GPU[15] : PCI Bus: 0000:01:00.4
GPU[16] : PCI Bus: 0000:01:00.4
GPU[17] : PCI Bus: 0000:01:00.5
GPU[18] : PCI Bus: 0000:01:00.5
GPU[19] : PCI Bus: 0000:01:00.5
GPU[20] : PCI Bus: 0000:01:00.5
GPU[21] : PCI Bus: 0001:01:00.0
GPU[22] : PCI Bus: 0002:01:00.0
GPU[23] : PCI Bus: 0003:01:00.0
================================== End of ROCm SMI Log ===================================
```
3. Displayed topology will show disordered nodes when compared to HIP
See rocm-smi output vs transferbench.
```shell
rocm-smi --showtopo option is not displaying the correct information when the MI300 driver is loaded in TPX mode.
============================ ROCm System Management Interface ============================
================================ Weight between two GPUs =================================
get_link_weight_topology, Not supported on the given system
ERROR: GPU[1] : Cannot read Link Weight: Not supported on this machine
GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 GPU8 GPU9 GPU10 GPU11
GPU0 0 XGMI XGMI XGMI XGMI XGMI XGMI XGMI XGMI XGMI XGMI XGMI
GPU1 XGMI 0 XXXX XXXX XXXX XGMI XGMI XGMI XGMI XGMI XGMI XGMI
GPU2 XGMI XXXX 0 XXXX XXXX XGMI XGMI XGMI XGMI XGMI XGMI XGMI
GPU3 XGMI XXXX XXXX 0 XXXX XGMI XGMI XGMI XGMI XGMI XGMI XGMI
GPU4 XGMI XXXX XXXX XXXX 0 XGMI XGMI XGMI XGMI XGMI XGMI XGMI
GPU5 XGMI XGMI XGMI XGMI XGMI 0 XXXX XXXX XXXX XGMI XGMI XGMI
GPU6 XGMI XGMI XGMI XGMI XGMI XXXX 0 XXXX XXXX XGMI XGMI XGMI
GPU7 XGMI XGMI XGMI XGMI XGMI XXXX XXXX 0 XXXX XGMI XGMI XGMI
GPU8 XGMI XGMI XGMI XGMI XGMI XXXX XXXX XXXX 0 XGMI XGMI XGMI
GPU9 XGMI XGMI XGMI XGMI XGMI XGMI XGMI XGMI XGMI 0 XGMI XGMI
GPU10 XGMI XGMI XGMI XGMI XGMI XGMI XGMI XGMI XGMI XGMI 0 XGMI
GPU11 XGMI XGMI XGMI XGMI XGMI XGMI XGMI XGMI XGMI XGMI XGMI 0
======================================= Numa Nodes =======================================
GPU[0] : (Topology) Numa Node: 0
GPU[0] : (Topology) Numa Affinity: 0
GPU[1] : (Topology) Numa Node: 0
GPU[1] : (Topology) Numa Affinity: 0
GPU[2] : (Topology) Numa Node: 0
GPU[2] : (Topology) Numa Affinity: 1
GPU[3] : (Topology) Numa Node: 0
GPU[3] : (Topology) Numa Affinity: 2
GPU[4] : (Topology) Numa Node: 0
GPU[4] : (Topology) Numa Affinity: 3
GPU[5] : (Topology) Numa Node: 0
GPU[5] : (Topology) Numa Affinity: 0
GPU[6] : (Topology) Numa Node: 0
GPU[6] : (Topology) Numa Affinity: 1
GPU[7] : (Topology) Numa Node: 0
GPU[7] : (Topology) Numa Affinity: 2
GPU[8] : (Topology) Numa Node: 0
GPU[8] : (Topology) Numa Affinity: 3
GPU[9] : (Topology) Numa Node: 1
GPU[9] : (Topology) Numa Affinity: 1
GPU[10] : (Topology) Numa Node: 2
GPU[10] : (Topology) Numa Affinity: 2
GPU[11] : (Topology) Numa Node: 3
GPU[11] : (Topology) Numa Affinity: 3
================================== End of ROCm SMI Log ===================================
```
```shell
./Transferbench
...
| GPU 00 | GPU 01 | GPU 02 | GPU 03 | GPU 04 | GPU 05 | GPU 06 | GPU 07 | PCIe Bus ID | #CUs | Closest NUMA | DMA engines
--------+--------+--------+--------+--------+--------+--------+--------+--------+--------------+------+-------------+------------
GPU 00 | - | XGMI-1 | XGMI-1 | XGMI-1 | XGMI-1 | XGMI-1 | XGMI-1 | XGMI-1 | 0000:0c:00.0 | 304 | 0 |0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
GPU 01 | XGMI-1 | - | XGMI-1 | XGMI-1 | XGMI-1 | XGMI-1 | XGMI-1 | XGMI-1 | 0000:22:00.0 | 304 | 0 |0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
GPU 02 | XGMI-1 | XGMI-1 | - | XGMI-1 | XGMI-1 | XGMI-1 | XGMI-1 | XGMI-1 | 0000:38:00.0 | 304 | 0 |0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
GPU 03 | XGMI-1 | XGMI-1 | XGMI-1 | - | XGMI-1 | XGMI-1 | XGMI-1 | XGMI-1 | 0000:5c:00.0 | 304 | 0 |0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
GPU 04 | XGMI-1 | XGMI-1 | XGMI-1 | XGMI-1 | - | XGMI-1 | XGMI-1 | XGMI-1 | 0000:9f:00.0 | 304 | 1 |0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
GPU 05 | XGMI-1 | XGMI-1 | XGMI-1 | XGMI-1 | XGMI-1 | - | XGMI-1 | XGMI-1 | 0000:af:00.0 | 304 | 1 |0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
GPU 06 | XGMI-1 | XGMI-1 | XGMI-1 | XGMI-1 | XGMI-1 | XGMI-1 | - | XGMI-1 | 0000:bf:00.0 | 304 | 1 |0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
GPU 07 | XGMI-1 | XGMI-1 | XGMI-1 | XGMI-1 | XGMI-1 | XGMI-1 | XGMI-1 | - | 0000:df:00.0 | 304 | 1 |0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
...
```
## rocm_smi_lib for ROCm 6.0.0
### Added
- **Added rocm-smi --version**
The SMI will report two "versions": the ROCM-SMI version and the ROCM-SMI-LIB version.
- The ROCM-SMI version is the CLI/tool version number with commit ID appended after `+` sign.
- The ROCM-SMI-LIB version is the library package version number.
```
$ rocm-smi --version
ROCM-SMI version: 2.0.0+8e78352
ROCM-SMI-LIB version: 6.0.0
```
- **Added support for gfx941/gfx942 metrics**
You can now query MI300 device metrics to get real-time information. Metrics include power, temperature, energy, and performance. Users can query through `rsmi_dev_gpu_metrics_info_get()`.
- **Compute and memory partition support**
Users can now view, set, and reset partitions. The topology display can provide a more in-depth look at the device's current configuration. If your ASIC supports these features, the following commands can help get started:
- `rocm-smi --showcomputepartition`
- `rocm-smi --setcomputepartition <SPX, DPX, CPX, TPX, QPX>`
- `rocm-smi --resetcomputepartition`
- `rocm-smi --showmemorypartition`
- `rocm-smi --setmemorypartition <NPS1, NPS2, NPS4, NPS8>`
- `rocm-smi --resetmemorypartition`
### Changed
- **GPU index sorting made consistent with other tools**
To ensure alignment with other ROCm software tools, GPU index sorting is optimized to use Bus:Device.Function (BDF) rather than the card number.
- **Increase max BDF ID length**
To allow for larger BDF data, we have increased the maximum BDF length from 256 to 512 buffer size.
- **Documentation is transitioning to Sphinx**
Sphinx allows us to generate code documentation more easily for our users. It helps us provide centralized HTML documentation at a single website location, where customers can see how to use our software and tools.
- **Added a generic power API: `rsmi_dev_power_get()`**
Older ASICs provided average socket power, newer ASICs (MI300) provide current socket power. The generic API provides one interface to retrieve either of these power readings, allowing backwards compatibility.
- **Added flexible temperature readings (`rocm-smi` and `rocm-smi --showtempgraph`)**
Older ASICs provided edge temperature, newer ASICs (MI300) provide junction temperature (not edge). The rocm-smi CLI now provides a way to view which type of temperature is read across all sockets.
- **Added deep sleep frequency readings**
Newer ASICs (MI300) provide ability to know if a clock is in deep sleep.
### Optimized
- Add new test to measure api execution time.
- Remove the shared mutex if no process is using it.
- Updated to C++17, gtest-1.14, and cmake 3.14
### Resolved issues
- Fix memory usage division by 0
- Fix missing firmware blocks (rocm-smi --showfw)
- Fix rocm-smi --showevents shows wrong gpuID
## rocm_smi_lib for ROCm 5.5.0
### Added
- ROCm SMI CLI: Add --showtempgraph Feature.
### Changed
- Relying on vendor ID to detect AMDGPU.
- Change pragma message to warning for backward compatibility.
### Optimized
- Add new test to measure api execution time.
- Remove the shared mutex if no process is using it.
### Resolved issues
- Fix --showproductname when device's SKU cannot be parsed out of the VBIOS string.
- Fix compile error: memcpy was not declared.
- Fix order of CE and UE reporting in ROCm SMI CLI.
- Handle error return value from ReadSysfsStr function.
+411
View File
@@ -0,0 +1,411 @@
#
# Minimum version of cmake and C++ required
#
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
message(" CMake ROCm SMI (Library) [root] ")
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
cmake_minimum_required(VERSION 3.14)
project(rocm_smi_lib)
set(ROCM_SMI_LIBS_TARGET "rocm_smi_libraries")
set(BUILD_SHARED_LIBS ON CACHE BOOL "Build shared library (.so) or not.")
# Put the project's own module directory first on the module search path.
# Written with set() instead of list(PREPEND ...): the PREPEND sub-command was
# only added in CMake 3.15, while this file declares 3.14 as its minimum.
set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules" ${CMAKE_MODULE_PATH})
## Include common cmake modules
include(utils)
# pkg_check_modules() below is called unconditionally, so a missing pkg-config
# is reported here with a clear message instead of an "unknown command" error.
find_package(PkgConfig REQUIRED)
# Default libdir to "lib", this skips GNUInstallDirs from trying to take a guess if it's unset:
set(CMAKE_INSTALL_LIBDIR "lib" CACHE STRING "Library install directory")
# Callers may preset the license file; otherwise use the one next to this file.
if(NOT DEFINED CPACK_RESOURCE_FILE_LICENSE)
  set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/License.txt")
endif()
# Naming scheme derived from the base name:
#   component "librocm_smi", target "rocm_smi64", library name "librocm_smi64"
set(ROCM_SMI "rocm_smi")
set(ROCM_SMI_COMPONENT "lib${ROCM_SMI}")
set(ROCM_SMI_TARGET "${ROCM_SMI}64")
set(ROCM_SMI_LIB_NAME "lib${ROCM_SMI_TARGET}")
# Expose project info to IDEs
option(CMAKE_EXPORT_COMPILE_COMMANDS "Export compile commands for linters and autocompleters" ON)
option(BUILD_TESTS "Build test suite" OFF)
set(SHARE_INSTALL_PREFIX
    "share/${ROCM_SMI}"
    CACHE STRING "Tests and Example install directory")
# provide git to utilities
find_program(GIT NAMES git)
# sets DRM_INCLUDE_DIRS
pkg_check_modules(DRM REQUIRED libdrm)
## Setup the package version based on git tags.
set(PKG_VERSION_GIT_TAG_PREFIX "rsmi_pkg_ver")
# get_package_version_number() is not defined in this file; presumably it comes
# from the "utils" module included earlier - TODO confirm. The variables read
# below (PKG_VERSION_STR, CPACK_PACKAGE_VERSION_*, PKG_VERSION_HASH) are not set
# anywhere else in this file before this point, so they are expected to be
# produced by that call.
get_package_version_number("7.6.0" ${PKG_VERSION_GIT_TAG_PREFIX} GIT)
message("Package version: ${PKG_VERSION_STR}")
# Mirror the package version into <target>_VERSION_* variables.
set(${ROCM_SMI_LIBS_TARGET}_VERSION_MAJOR "${CPACK_PACKAGE_VERSION_MAJOR}")
set(${ROCM_SMI_LIBS_TARGET}_VERSION_MINOR "${CPACK_PACKAGE_VERSION_MINOR}")
set(${ROCM_SMI_LIBS_TARGET}_VERSION_PATCH "${CPACK_PACKAGE_VERSION_PATCH}")
set(${ROCM_SMI_LIBS_TARGET}_VERSION_BUILD "0")
set(${ROCM_SMI_LIBS_TARGET}_VERSION_HASH "${PKG_VERSION_HASH}")
# Resulting string has the form MAJOR.MINOR.PATCH+HASH.
set(${ROCM_SMI_LIBS_TARGET}_VERSION_STRING "${${ROCM_SMI_LIBS_TARGET}_VERSION_MAJOR}.${${ROCM_SMI_LIBS_TARGET}_VERSION_MINOR}.${${ROCM_SMI_LIBS_TARGET}_VERSION_PATCH}+${${ROCM_SMI_LIBS_TARGET}_VERSION_HASH}")
# The following default version values should be updated as appropriate for
# ABI breaks (update MAJOR and MINOR), and ABI/API additions (update MINOR).
# Until ABI stabilizes VERSION_MAJOR will be 0. This should be over-ridden
# by git tags (through "git describe") when they are present.
# NOTE(review): VERSION_MAJOR and VERSION_MINOR are not assigned in this file;
# presumably the version helper sets them - confirm, otherwise these expand empty.
set(PKG_VERSION_MAJOR "${VERSION_MAJOR}")
set(PKG_VERSION_MINOR "${VERSION_MINOR}")
set(PKG_VERSION_PATCH "${CPACK_PACKAGE_VERSION_PATCH}")
set(PKG_VERSION_NUM_COMMIT 0)
## Define default variable and variables for the optional build target
## rocm_smi_lib
set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE STRING "Default installation directory.")
set(COMMON_SRC_ROOT ${CMAKE_CURRENT_SOURCE_DIR} CACHE STRING "Location source code common root.")
set(ROCM_SMI_PACKAGE rocm-smi-lib)
# Second project() call in this file: renames the project to the value of
# ROCM_SMI_LIBS_TARGET ("rocm_smi_libraries").
project(${ROCM_SMI_LIBS_TARGET})
# Build as C++17 and fail if the compiler cannot provide it.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
include(GNUInstallDirs)
set(COMMON_PROJ_ROOT ${PROJECT_SOURCE_DIR})
# Reject GCC releases older than 5.4.0.
if (CMAKE_COMPILER_IS_GNUCC AND
CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.4.0)
message("Compiler version is " ${CMAKE_CXX_COMPILER_VERSION})
message(FATAL_ERROR "Require at least gcc-5.4.0")
endif()
## Compiler flags
# Baseline warnings; RTTI is disabled for the whole project.
string(APPEND CMAKE_CXX_FLAGS " -Wall -Wextra -fno-rtti")
# The processor name is quoted so that an empty CMAKE_SYSTEM_PROCESSOR (for
# example with an incomplete toolchain file) compares as a string instead of
# turning the condition into a syntax error.
if("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
  string(APPEND CMAKE_CXX_FLAGS " -m64 -msse -msse2")
endif()
# Security options
string(APPEND CMAKE_CXX_FLAGS " -Wconversion -Wcast-align ")
string(APPEND CMAKE_CXX_FLAGS " -Wformat=2 -fno-common -Wstrict-overflow ")
# Intentionally leave out -Wsign-promo. It causes spurious warnings.
string(APPEND CMAKE_CXX_FLAGS " -Woverloaded-virtual -Wreorder ")
# Clang does not set the build-id
if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
  string(APPEND CMAKE_SHARED_LINKER_FLAGS " -Wl,--build-id=sha1")
endif()
# Use this instead of above for 32 bit
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m32")
# Release builds are optimized; every other build type (including an unset
# CMAKE_BUILD_TYPE) gets an unoptimized debug configuration with DEBUG defined.
if("${CMAKE_BUILD_TYPE}" STREQUAL Release)
  string(APPEND CMAKE_CXX_FLAGS " -O2")
else()
  string(APPEND CMAKE_CXX_FLAGS " -ggdb -O0 -DDEBUG")
endif()
## Address Sanitize Flag
# ADDRESS_SANITIZER selects between an AddressSanitizer build and the default
# hardened build; the two flag sets are mutually exclusive.
if(ADDRESS_SANITIZER)
  string(APPEND CMAKE_CXX_FLAGS " -fsanitize=address -g -fno-omit-frame-pointer")
  # Append instead of overwriting, so executable linker flags that were passed
  # on the command line or by a toolchain file are kept.
  string(APPEND CMAKE_EXE_LINKER_FLAGS " -fsanitize=address")
  if(BUILD_SHARED_LIBS)
    # Only Clang needs to be told to use the shared sanitizer runtime here.
    if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
      string(APPEND CMAKE_EXE_LINKER_FLAGS " -shared-libasan")
      string(APPEND CMAKE_SHARED_LINKER_FLAGS " -shared-libasan")
    endif()
  else()
    # Static runtime: the option is spelled differently for Clang and others.
    if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
      string(APPEND CMAKE_EXE_LINKER_FLAGS " -static-libsan")
    else()
      string(APPEND CMAKE_EXE_LINKER_FLAGS " -static-libasan")
    endif()
  endif()
else()
  ## Security breach mitigation flags
  # NOTE(review): glibc's fortification macro is spelled _FORTIFY_SOURCE (with a
  # leading underscore); -DFORTIFY_SOURCE=2 defines a differently named macro.
  # Left unchanged here because enabling it affects -O0 builds - confirm intent.
  string(APPEND CMAKE_CXX_FLAGS " -DFORTIFY_SOURCE=2 -fstack-protector-all -Wcast-align")
  ## More security breach mitigation flags
  set(HARDENING_LDFLAGS
      "${HARDENING_LDFLAGS} -Wl,-z,noexecstack -Wl,-z,relro -Wl,-z,now")
  string(APPEND CMAKE_EXE_LINKER_FLAGS " ${HARDENING_LDFLAGS}")
  string(APPEND CMAKE_SHARED_LINKER_FLAGS " ${HARDENING_LDFLAGS}")
  # -Wtrampolines is added only when the compiler accepts it.
  include(CheckCXXCompilerFlag)
  check_cxx_compiler_flag("-Wtrampolines" CXX_SUPPORTS_WTRAMPOLINES)
  if(CXX_SUPPORTS_WTRAMPOLINES)
    string(APPEND CMAKE_CXX_FLAGS " -Wtrampolines")
  endif()
endif()
# Locations of the library sources, public headers and the bundled shared mutex.
set(COMMON_SRC_DIR "${PROJECT_SOURCE_DIR}/src")
set(COMMON_INC_DIR "${PROJECT_SOURCE_DIR}/include/rocm_smi")
set(SHR_MUTEX_DIR "${PROJECT_SOURCE_DIR}/third_party/shared_mutex")

include_directories(
  ${CMAKE_CURRENT_SOURCE_DIR}/include
  ${CMAKE_CURRENT_SOURCE_DIR}/third_party/shared_mutex)

# Common source files, in the order they are handed to the build.
set(CMN_SRC_LIST
  "${COMMON_SRC_DIR}/rocm_smi_device.cc"
  "${COMMON_SRC_DIR}/rocm_smi_main.cc"
  "${COMMON_SRC_DIR}/rocm_smi_monitor.cc"
  "${COMMON_SRC_DIR}/rocm_smi_power_mon.cc"
  "${COMMON_SRC_DIR}/rocm_smi_utils.cc"
  "${COMMON_SRC_DIR}/rocm_smi_counters.cc"
  "${COMMON_SRC_DIR}/rocm_smi_kfd.cc"
  "${COMMON_SRC_DIR}/rocm_smi_io_link.cc"
  "${COMMON_SRC_DIR}/rocm_smi_gpu_metrics.cc"
  "${COMMON_SRC_DIR}/rocm_smi.cc"
  "${COMMON_SRC_DIR}/rocm_smi_logger.cc"
  "${COMMON_SRC_DIR}/rocm_smi_properties.cc"
  "${COMMON_SRC_DIR}/rocm_smi_lib_loader.cc"
  "${SHR_MUTEX_DIR}/shared_mutex.cc")

# Matching header files.
set(CMN_INC_LIST
  "${COMMON_INC_DIR}/rocm_smi_device.h"
  "${COMMON_INC_DIR}/rocm_smi_main.h"
  "${COMMON_INC_DIR}/rocm_smi_monitor.h"
  "${COMMON_INC_DIR}/rocm_smi_power_mon.h"
  "${COMMON_INC_DIR}/rocm_smi_utils.h"
  "${COMMON_INC_DIR}/rocm_smi_common.h"
  "${COMMON_INC_DIR}/rocm_smi_exception.h"
  "${COMMON_INC_DIR}/rocm_smi_counters.h"
  "${COMMON_INC_DIR}/rocm_smi_kfd.h"
  "${COMMON_INC_DIR}/rocm_smi_io_link.h"
  "${COMMON_INC_DIR}/rocm_smi_gpu_metrics.h"
  "${COMMON_INC_DIR}/rocm_smi.h"
  "${COMMON_INC_DIR}/rocm_smi_logger.h"
  "${COMMON_INC_DIR}/rocm_smi_properties.h"
  "${COMMON_INC_DIR}/rocm_smi_lib_loader.h"
  "${SHR_MUTEX_DIR}/shared_mutex.h")
## set components
# A regular build packages the dev and tests components; an ASAN packaging
# build ships only the asan component (libraries and license file).
if(NOT ENABLE_ASAN_PACKAGING)
  set(CPACK_COMPONENTS_ALL dev tests)
else()
  set(CPACK_COMPONENTS_ALL asan)
endif()

# Enable Component Mode & Install Settings
set(CPACK_DEB_COMPONENT_INSTALL ON)
set(CPACK_RPM_COMPONENT_INSTALL ON)

# Base dependencies (no rocm-core). The ASAN variants start out as copies of
# the regular ones because python does not need to be an asan build.
set(CPACK_DEBIAN_PACKAGE_DEPENDS "libc6, python3")
set(CPACK_DEBIAN_ASAN_PACKAGE_DEPENDS "${CPACK_DEBIAN_PACKAGE_DEPENDS}")
set(CPACK_RPM_PACKAGE_REQUIRES "python3")
set(CPACK_RPM_ASAN_PACKAGE_REQUIRES "${CPACK_RPM_PACKAGE_REQUIRES}")

# rocm-core becomes a dependency only when -DROCM_DEP_ROCMCORE=ON is given
if(ROCM_DEP_ROCMCORE)
  set(CPACK_DEBIAN_PACKAGE_DEPENDS "${CPACK_DEBIAN_PACKAGE_DEPENDS}, rocm-core")
  set(CPACK_DEBIAN_ASAN_PACKAGE_DEPENDS "${CPACK_DEBIAN_ASAN_PACKAGE_DEPENDS}, rocm-core-asan")
  set(CPACK_RPM_PACKAGE_REQUIRES "${CPACK_RPM_PACKAGE_REQUIRES}, rocm-core")
  set(CPACK_RPM_ASAN_PACKAGE_REQUIRES "${CPACK_RPM_ASAN_PACKAGE_REQUIRES}, rocm-core-asan")
endif()

# Per-component package names. dev, asan and tests use the same names for both
# generators; only the static package suffix differs between DEB and RPM.
foreach(_rsmi_pkg_generator IN ITEMS DEBIAN RPM)
  set(CPACK_${_rsmi_pkg_generator}_DEV_PACKAGE_NAME ${ROCM_SMI_PACKAGE})
  set(CPACK_${_rsmi_pkg_generator}_ASAN_PACKAGE_NAME ${ROCM_SMI_PACKAGE}-asan)
  set(CPACK_${_rsmi_pkg_generator}_TESTS_PACKAGE_NAME ${ROCM_SMI_PACKAGE}-tests)
endforeach()
set(CPACK_DEBIAN_STATIC_PACKAGE_NAME ${ROCM_SMI_PACKAGE}-static-dev)
set(CPACK_RPM_STATIC_PACKAGE_NAME ${ROCM_SMI_PACKAGE}-static-devel)
# Build the two library sub-projects.
add_subdirectory("rocm_smi")
add_subdirectory("oam")
# Add tests
if(BUILD_TESTS)
set(TESTS_COMPONENT "tests")
add_subdirectory("tests/rocm_smi_test")
endif()
# Generate the package config and version files consumed by find_package().
include(CMakePackageConfigHelpers)
# These three variables are passed as PATH_VARS to the config template below.
set(LIB_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}")
set(INCLUDE_INSTALL_DIR "${CMAKE_INSTALL_INCLUDEDIR}")
set(BIN_INSTALL_DIR "${CMAKE_INSTALL_BINDIR}")
configure_package_config_file(
rocm_smi-config.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/rocm_smi-config.cmake
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/rocm_smi
PATH_VARS LIB_INSTALL_DIR INCLUDE_INSTALL_DIR BIN_INSTALL_DIR
)
# Consumers are treated as compatible when the major version matches.
write_basic_package_version_file(
${CMAKE_CURRENT_BINARY_DIR}/rocm_smi-config-version.cmake
VERSION "${PKG_VERSION_STR}"
COMPATIBILITY SameMajorVersion
)
# Both generated files are installed as part of the dev component.
install(
FILES
${CMAKE_CURRENT_BINARY_DIR}/rocm_smi-config.cmake ${CMAKE_CURRENT_BINARY_DIR}/rocm_smi-config-version.cmake
DESTINATION
${CMAKE_INSTALL_LIBDIR}/cmake/${ROCM_SMI}
COMPONENT dev
)
# Create cmake target
# Add all targets to the build-tree export set
# NOTE(review): OAM_TARGET is not set in this file; presumably the "oam"
# subdirectory makes it visible here - confirm, otherwise it expands empty.
export(TARGETS ${ROCM_SMI_TARGET} ${OAM_TARGET}
FILE "${PROJECT_BINARY_DIR}/rocm_smi_target.cmake")
# Export the package for use from the build-tree
# (this registers the build-tree with a global CMake-registry)
export(PACKAGE rocm_smi)
# Create the rocm_smiConfig.cmake and rocm_smiConfigVersion files
# ... for the build tree
# The rocm_smiTargets export set is not defined in this file; it is expected to
# be populated by install(TARGETS ... EXPORT ...) calls elsewhere - TODO confirm.
install(EXPORT rocm_smiTargets
DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${ROCM_SMI}"
COMPONENT dev)
#License file
# NOTE(review): confirm that this license tag matches the text of the file
# referenced by CPACK_RESOURCE_FILE_LICENSE.
set(CPACK_RPM_PACKAGE_LICENSE "NCSA")
# install license file in share/doc/rocm-smi-lib-asan folder
if( ENABLE_ASAN_PACKAGING )
install(FILES ${CPACK_RESOURCE_FILE_LICENSE}
DESTINATION ${CMAKE_INSTALL_DATADIR}/doc/${ROCM_SMI_PACKAGE}-asan RENAME LICENSE.txt
COMPONENT asan)
endif()
# The dev component always receives a copy, installed under the regular
# package's doc directory and renamed to LICENSE.txt.
install( FILES ${CPACK_RESOURCE_FILE_LICENSE}
DESTINATION ${CMAKE_INSTALL_DATADIR}/doc/${ROCM_SMI_PACKAGE} RENAME LICENSE.txt
COMPONENT dev)
###########################
# Packaging directives
###########################
# User-overridable packaging defaults.
set(CPACK_GENERATOR "DEB;RPM" CACHE STRING "Default packaging generators.")
set(CPACK_PACKAGING_INSTALL_PREFIX "/opt/rocm" CACHE STRING "Default packaging prefix.")
set(ENABLE_LDCONFIG ON CACHE BOOL "Set library links and caches using ldconfig.")

# The package carries an -asan suffix when ASAN packaging is requested.
set(CPACK_PACKAGE_NAME ${ROCM_SMI_PACKAGE})
if(ENABLE_ASAN_PACKAGING)
  set(CPACK_PACKAGE_NAME ${ROCM_SMI_PACKAGE}-asan)
endif()

set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.")
set(CPACK_PACKAGE_CONTACT "RocmSMILib Support <rocm-smi.support@amd.com>")
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "AMD System Management libraries")

# Version suffix appended to the package version: taken from the
# ROCM_LIBPATCH_VERSION environment variable when it is defined, 99999 otherwise.
if(DEFINED ENV{ROCM_LIBPATCH_VERSION})
  set(ROCM_VERSION_FOR_PACKAGE $ENV{ROCM_LIBPATCH_VERSION})
else()
  set(ROCM_VERSION_FOR_PACKAGE "99999")
endif()
#Debian package specific variables
# The URL is the cache entry's default VALUE. It used to sit in the docstring
# position with an empty value, so the homepage was never actually set. A value
# supplied with -D on the command line still takes precedence.
set(CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/RadeonOpenCompute/ROCm"
    CACHE STRING "Debian package homepage URL")
# Package release string: taken from the environment when defined there,
# otherwise "local".
if(DEFINED ENV{CPACK_DEBIAN_PACKAGE_RELEASE})
  set(CPACK_DEBIAN_PACKAGE_RELEASE $ENV{CPACK_DEBIAN_PACKAGE_RELEASE})
else()
  set(CPACK_DEBIAN_PACKAGE_RELEASE "local")
endif()
# Optional companions suggested to the package manager.
set(CPACK_DEBIAN_PACKAGE_SUGGESTS "sudo, libdrm-dev, libdrm-amdgpu-dev")
set(CPACK_RPM_PACKAGE_SUGGESTS "sudo, libdrm-dev, libdrm-amdgpu-dev")
## Process the Debian install/remove scripts to update the CPACK variables
# Each template is expanded into the build tree with only @VAR@ references
# substituted, and the result is made readable and executable by everyone and
# writable by the owner.
# NOTE(review): FILE_PERMISSIONS is a configure_file() option introduced in
# CMake 3.20, while this file declares 3.14 as its minimum - confirm.
configure_file(
${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/postinst.in
DEBIAN/postinst
@ONLY
FILE_PERMISSIONS
OWNER_READ
OWNER_WRITE
OWNER_EXECUTE
GROUP_READ
GROUP_EXECUTE
WORLD_READ
WORLD_EXECUTE)
configure_file(
${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/prerm.in
DEBIAN/prerm
@ONLY
FILE_PERMISSIONS
OWNER_READ
OWNER_WRITE
OWNER_EXECUTE
GROUP_READ
GROUP_EXECUTE
WORLD_READ
WORLD_EXECUTE)
# Register the generated scripts as extra control files of the Debian package.
list(APPEND CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${CMAKE_CURRENT_BINARY_DIR}/DEBIAN/postinst"
"${CMAKE_CURRENT_BINARY_DIR}/DEBIAN/prerm")
# RPM package specific variables
# Directories the RPM must not claim ownership of. This variable is a CMake
# list, so the two paths are passed as separate elements; joined by a space
# inside one quoted string they formed a single entry that matched no directory.
set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION
    "${CPACK_PACKAGING_INSTALL_PREFIX}"
    "${CPACK_PACKAGING_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}")
# Package release string: taken from the environment when defined there,
# otherwise "local".
if(DEFINED ENV{CPACK_RPM_PACKAGE_RELEASE})
  set(CPACK_RPM_PACKAGE_RELEASE $ENV{CPACK_RPM_PACKAGE_RELEASE})
else()
  set(CPACK_RPM_PACKAGE_RELEASE "local")
endif()
#Set rpm distro
if(CPACK_RPM_PACKAGE_RELEASE)
  set(CPACK_RPM_PACKAGE_RELEASE_DIST ON)
endif()
# don't terminate if bytecompile of python files fails
set(CPACK_RPM_SPEC_MORE_DEFINE "%define _python_bytecompile_errors_terminate_build 0")
# Cpack converts !/usr/bin/env python3 to /usr/libexec/platform-python in RHEL8.
# prevent the BRP(buildroot policy) script from checking and modifying interpreter directives
string(APPEND CPACK_RPM_SPEC_MORE_DEFINE "\n%undefine __brp_mangle_shebangs")
# The line below doesn't currently work; it may be this issue:
# https://bugzilla.redhat.com/show_bug.cgi?id=1811358
# set (CPACK_RPM_PACKAGE_SUGGESTS "sudo")
# Final package version: MAJOR.MINOR.PATCH followed by the ROCm version suffix.
string(CONCAT CPACK_PACKAGE_VERSION
  "${CPACK_PACKAGE_VERSION_MAJOR}"
  "."
  "${CPACK_PACKAGE_VERSION_MINOR}"
  "."
  "${CPACK_PACKAGE_VERSION_PATCH}"
  "."
  "${ROCM_VERSION_FOR_PACKAGE}")

# Expand the RPM scriptlet templates (only @VAR@ references are substituted)
# into the build tree.
foreach(_rsmi_rpm_script IN ITEMS post preun postun)
  configure_file("${CMAKE_CURRENT_SOURCE_DIR}/RPM/${_rsmi_rpm_script}.in" RPM/${_rsmi_rpm_script} @ONLY)
endforeach()

# Point CPack at the generated scriptlets.
set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_BINARY_DIR}/RPM/post")
set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_BINARY_DIR}/RPM/preun")
set(CPACK_RPM_POST_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_BINARY_DIR}/RPM/postun")

# Let CPack choose the conventional file names for each generator.
set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT")
set(CPACK_RPM_FILE_NAME "RPM-DEFAULT")

# Must come after every CPACK_* variable above has been set.
include(CPack)
# Component declarations. A component must not name itself in DEPENDS, so the
# self-referencing "DEPENDS dev" / "DEPENDS asan" entries are not carried over.
cpack_add_component(
  dev
  DISPLAY_NAME "Development"
  DESCRIPTION "Development needed header files for ROCM-SMI")
cpack_add_component(
  asan
  DISPLAY_NAME "ASAN"
  DESCRIPTION "ASAN libraries for the ROCM-SMI")
cpack_add_component_group("tests")
cpack_add_component(tests GROUP tests)
# For static builds the dev component is placed in a "static" group.
if(NOT BUILD_SHARED_LIBS)
  cpack_add_component_group("static")
  cpack_add_component(dev GROUP static)
endif()
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
message(" CMake ROCm SMI (Library) [root] END ")
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
+3
View File
@@ -0,0 +1,3 @@
set noparent
linelength=100
filter=-build/include_subdir,-legal/copyright,-runtime/printf,-build/c++11,-runtime/int,-build/header_guard
+9
View File
@@ -0,0 +1,9 @@
Package: rocm_smi_lib
Architecture: any
Maintainer: Advanced Micro Devices Inc.
Priority: optional
Version: MODULE_VERSION
Depends:
Homepage: https://github.com/RadeonOpenCompute/rocm_smi_lib
Description: System Management Interface Library for ROCm
+118
View File
@@ -0,0 +1,118 @@
#!/bin/bash
# Prepare the library's log file and arrange for it to be rotated hourly.
# Steps, in order:
#   1. create the log directory and log file and make both readable and
#      writable by every user;
#   2. warn and return early when the logrotate command is not available;
#   3. write the logrotate configuration file unless it already exists;
#   4. when a logrotate systemd timer is listed, back up the stock timer unit
#      once and replace it with an hourly one; otherwise move the daily
#      logrotate cron script to the hourly cron directory when both exist.
# Takes no arguments.
do_configureLogrotate() {
local IS_SYSTEMD=0
local packageName="rocm-smi-lib"
local logPath=/var/log/rocm_smi_lib
local logFile="${logPath}/ROCm-SMI-lib.log"
local logrotateConfFile=/etc/logrotate.d/rocm_smi.conf
mkdir -p "${logPath}"
touch "${logFile}"
chmod -R a+rw "${logPath}"
chmod a+rw "${logFile}"
# Without logrotate there is nothing further to configure.
command -v logrotate &>/dev/null
if [ $? -ne 0 ]; then
echo "[WARNING] Detected logrotate is not installed."\
"$packageName logs (when turned on) will not rotate properly."
return
fi
# An existing configuration file is left untouched.
if [ ! -f $logrotateConfFile ]; then
touch "${logrotateConfFile}"
chmod 644 "${logrotateConfFile}" # root r/w, all others read
# ROCm SMI logging rotation, rotates files using root user/group
# Hourly logrotation check
# Only rotates if size grew larger than 1MB
# Max of 4 rotation files, oldest will be removed
# Rotated files use date extension of ex. ROCm-SMI-lib.log.2023-05-09_16:51:42
# NOTE(review): every conversion in the dateformat line below is written with a
# doubled percent sign except the hour one - confirm that is intended.
cat << EOF > "${logrotateConfFile}"
${logFile} {
su root root
hourly
missingok
notifempty
rotate 4
size 1M
copytruncate
dateext
dateformat .%%Y-%%m-%%d_%H:%%M:%%S
}
EOF
# Fix for % S argument not found (now we escape with %%)
# issue was RPM build thought we were using macros
# https://gitlab.kitware.com/cmake/cmake/-/issues/22965
# https://rpm-software-management.github.io/rpm/manual/spec.html
sed -i s/%%/%/g "${logrotateConfFile}"
# workaround: remove extra 'OURCE' text
# from rocm_smi.conf. Unsure if CMAKE,
# bash, or here document
# issue (only seen on RHEL 8.7)
sed -i s/OURCE//g "${logrotateConfFile}"
fi
# check if logrotate uses system timers, Ubuntu/modern OS's do
# Several older OS's like RHEL 8.7, do not. Instead defaults
# to use daily cron jobs - see https://stackoverflow.com/a/69465677
if [ -d /run/systemd/system ]; then
systemctl list-timers | grep -iq logrotate
if [ $? -eq 0 ]; then
IS_SYSTEMD=1
fi
fi
if [ "$IS_SYSTEMD" -eq 1 ]; then
# Configure systemd timers - the typical setup for modern Linux logrotation setups
if [ -f /lib/systemd/system/logrotate.timer ]; then
# Keep a single backup of the stock timer unit so it can be restored later.
if [ ! -f /lib/systemd/system/logrotate.timer.backup ]; then
cp /lib/systemd/system/logrotate.timer /lib/systemd/system/logrotate.timer.backup
fi
cat << EOF > /lib/systemd/system/logrotate.timer
[Unit]
Description=Hourly rotation of log files
Documentation=man:logrotate(8) man:logrotate.conf(5)
[Timer]
OnCalendar=
OnCalendar=hourly
AccuracySec=1m
Persistent=true
[Install]
WantedBy=timers.target
EOF
systemctl reenable --now logrotate.timer
else
echo "[WARNING] Could not configure systemd timer for $packageName's logrotate."\
"$packageName logs (when turned on) will not rotate properly."
fi
else
# $IS_SYSTEMD -eq 0
if [ -f /etc/cron.daily/logrotate ]; then
# move logrotate daily to hourly
if [ -d /etc/cron.hourly ]; then
mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate
fi
fi
fi
}
# Register the package's library directory with the dynamic linker.
# Does nothing unless the package was built with ENABLE_LDCONFIG set to ON;
# the string compared below is substituted at package build time.
do_ldconfig() {
    if [ "@ENABLE_LDCONFIG@" != "ON" ]; then
        return 0
    fi
    # Write the library directory into a drop-in file, then refresh the cache.
    echo @CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@ > /etc/ld.so.conf.d/x86_64-librocm_smi_lib.conf
    ldconfig
}
# Dispatch on the maintainer-script action passed as the first argument.
case "$1" in
    configure)
        do_ldconfig
        # A failure while setting up log rotation must not fail the install.
        if ! do_configureLogrotate; then
            exit 0
        fi
        ;;
    abort-upgrade|abort-remove|abort-deconfigure)
        echo "$1"
        ;;
    *)
        exit 0
        ;;
esac
+48
View File
@@ -0,0 +1,48 @@
#!/bin/bash
# Remove the package's dynamic-linker drop-in file and refresh the cache.
# Does nothing unless the package was built with ENABLE_LDCONFIG set to ON;
# the string compared below is substituted at package build time.
rm_ldconfig() {
    if [ "@ENABLE_LDCONFIG@" != "ON" ]; then
        return 0
    fi
    rm -f /etc/ld.so.conf.d/x86_64-librocm_smi_lib.conf
    ldconfig
}
# Delete the Python bytecode cache directory below the installed rocm_smi
# libexec directory.
rm_pyc() {
# remove pyc file generated by python
rm -rf @CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBEXECDIR@/rocm_smi/__pycache__
}
# Delete the library's log directory together with everything inside it.
rm_logFolder() {
rm -rf /var/log/rocm_smi_lib
}
# Undo the log-rotation changes made at install time: drop the package's
# logrotate configuration, move an hourly logrotate cron script back to the
# daily directory, and restore the backed-up systemd timer unit if one exists.
return_logrotateToOrigConfig() {
    local rotateConf=/etc/logrotate.d/rocm_smi.conf
    local hourlyCron=/etc/cron.hourly/logrotate
    local timerUnit=/lib/systemd/system/logrotate.timer

    if [[ -f $rotateConf ]]; then
        rm -rf "$rotateConf"
    fi

    if [[ -f $hourlyCron ]]; then
        mv "$hourlyCron" /etc/cron.daily/logrotate
    fi

    # The backup is created at install time only when the timer was replaced.
    if [[ -f ${timerUnit}.backup ]]; then
        cp "${timerUnit}.backup" "$timerUnit"
        rm -rf "${timerUnit}.backup"
        systemctl reenable --now logrotate.timer
    fi
}
# Dispatch on the maintainer-script action passed as the first argument.
case "$1" in
    remove|upgrade)
        # Clean up everything the post-install script set up.
        rm_ldconfig
        rm_pyc
        rm_logFolder
        return_logrotateToOrigConfig
        ;;
    purge)
        # nothing to do
        ;;
    *)
        exit 0
        ;;
esac
@@ -0,0 +1 @@
/opt/rocm/lib
+22
View File
@@ -0,0 +1,22 @@
MIT License
Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
+43
View File
@@ -0,0 +1,43 @@
# 🛠️ Maintenance Mode Notice 🛠️
Starting with ROCm 7.0, only critical bug fixes will be applied to ROCm-SMI.
For a seamless experience and continued support, please switch to [AMD-SMI](https://github.com/ROCm/amdsmi).
## Use C++ in ROCm SMI
### Device Indices
Many of the functions in the library take a "device index". The device index is a number greater than or equal to 0, and less than the number of devices detected, as determined by `rsmi_num_monitor_devices()`. The index is used to distinguish the detected devices from one another. It is important to note that a device may end up with a different index after a reboot, so an index should not be relied upon to be constant over reboots.
## Hello ROCm SMI
The only required ROCm-SMI call for any program that wants to use ROCm-SMI is the `rsmi_init()` call. This call initializes some internal data structures that will be used by subsequent ROCm-SMI calls.
When ROCm-SMI is no longer being used, `rsmi_shut_down()` should be called. This provides a way to do any releasing of resources that ROCm-SMI may have held. In many cases, this may have no effect, but may be necessary in future versions of the library.
A simple "Hello World" type program that displays the device ID of detected devices would look like this:
```c
#include <stdint.h>
#include "rocm_smi/rocm_smi.h"
int main() {
  rsmi_status_t ret;
  // Start at zero so the loop below is skipped if the device count
  // cannot be obtained.
  uint32_t num_devices = 0;
  uint16_t dev_id;

  // We will skip return code checks for this example, but it
  // is recommended to always check this as some calls may not
  // apply for some devices or ROCm releases

  ret = rsmi_init(0);
  ret = rsmi_num_monitor_devices(&num_devices);

  // The index uses the same unsigned type as the device count.
  for (uint32_t i = 0; i < num_devices; ++i) {
    ret = rsmi_dev_id_get(i, &dev_id);
    // dev_id holds the device ID of device i, upon a
    // successful call
  }
  ret = rsmi_shut_down();
  (void)ret;  // return codes are intentionally ignored in this example
  return 0;
}
```
+111
View File
@@ -0,0 +1,111 @@
#!/bin/bash
# Prepare the library's log file and arrange for it to be rotated hourly.
# Steps, in order:
#   1. create the log directory and log file and make both readable and
#      writable by every user;
#   2. warn and return early when the logrotate command is not available;
#   3. write the logrotate configuration file unless it already exists;
#   4. when a logrotate systemd timer is listed, back up the stock timer unit
#      once and replace it with an hourly one; otherwise move the daily
#      logrotate cron script to the hourly cron directory when both exist.
# Takes no arguments.
do_configureLogrotate() {
local IS_SYSTEMD=0
local packageName="rocm-smi-lib"
local logPath=/var/log/rocm_smi_lib
local logFile="${logPath}/ROCm-SMI-lib.log"
local logrotateConfFile=/etc/logrotate.d/rocm_smi.conf
mkdir -p "${logPath}"
touch "${logFile}"
chmod -R a+rw "${logPath}"
chmod a+rw "${logFile}"
# Without logrotate there is nothing further to configure.
command -v logrotate &>/dev/null
if [ $? -ne 0 ]; then
echo "[WARNING] Detected logrotate is not installed."\
"$packageName logs (when turned on) will not rotate properly."
return
fi
# An existing configuration file is left untouched.
if [ ! -f $logrotateConfFile ]; then
touch "${logrotateConfFile}"
chmod 644 "${logrotateConfFile}" # root r/w, all others read
# ROCm SMI logging rotation, rotates files using root user/group
# Hourly logrotation check
# Only rotates if size grew larger than 1MB
# Max of 4 rotation files, oldest will be removed
# Rotated files use date extension of ex. ROCm-SMI-lib.log.2023-05-09_16:51:42
# NOTE(review): every conversion in the dateformat line below is written with a
# doubled percent sign except the hour one - confirm that is intended.
cat << EOF > "${logrotateConfFile}"
${logFile} {
su root root
hourly
missingok
notifempty
rotate 4
size 1M
copytruncate
dateext
dateformat .%%Y-%%m-%%d_%H:%%M:%%S
}
EOF
# Fix for % S argument not found (now we escape with %%)
# issue was RPM build thought we were using macros
# https://gitlab.kitware.com/cmake/cmake/-/issues/22965
# https://rpm-software-management.github.io/rpm/manual/spec.html
sed -i s/%%/%/g "${logrotateConfFile}"
# workaround: remove extra 'OURCE' text
# from rocm_smi.conf. Unsure if CMAKE,
# bash, or here document
# issue (only seen on RHEL 8.7)
sed -i s/OURCE//g "${logrotateConfFile}"
fi
# check if logrotate uses system timers, Ubuntu/modern OS's do
# Several older OS's like RHEL 8.7, do not. Instead defaults
# to use daily cron jobs - see https://stackoverflow.com/a/69465677
if [ -d /run/systemd/system ]; then
systemctl list-timers | grep -iq logrotate
if [ $? -eq 0 ]; then
IS_SYSTEMD=1
fi
fi
if [ "$IS_SYSTEMD" -eq 1 ]; then
# Configure systemd timers - the typical setup for modern Linux logrotation setups
if [ -f /lib/systemd/system/logrotate.timer ]; then
# Keep a single backup of the stock timer unit so it can be restored later.
if [ ! -f /lib/systemd/system/logrotate.timer.backup ]; then
cp /lib/systemd/system/logrotate.timer /lib/systemd/system/logrotate.timer.backup
fi
cat << EOF > /lib/systemd/system/logrotate.timer
[Unit]
Description=Hourly rotation of log files
Documentation=man:logrotate(8) man:logrotate.conf(5)
[Timer]
OnCalendar=
OnCalendar=hourly
AccuracySec=1m
Persistent=true
[Install]
WantedBy=timers.target
EOF
systemctl reenable --now logrotate.timer
else
echo "[WARNING] Could not configure systemd timer for $packageName's logrotate."\
"$packageName logs (when turned on) will not rotate properly."
fi
else
# $IS_SYSTEMD -eq 0
if [ -f /etc/cron.daily/logrotate ]; then
# move logrotate daily to hourly
if [ -d /etc/cron.hourly ]; then
mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate
fi
fi
fi
}
# Register the package's library directory with the dynamic linker.
# Does nothing unless the package was built with ENABLE_LDCONFIG set to ON;
# the string compared below is substituted at package build time.
do_ldconfig() {
    if [ "@ENABLE_LDCONFIG@" != "ON" ]; then
        return 0
    fi
    # Write the library directory into a drop-in file, then refresh the cache.
    echo @CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@ > /etc/ld.so.conf.d/x86_64-librocm_smi_lib.conf
    ldconfig
}
# post install or upgrade, $1 is 1 or 2 -> do these actions
# A failure while setting up log rotation ends the script with status 0.
if [ "$1" -ge 1 ]; then
do_ldconfig
do_configureLogrotate || exit 0
fi
+8
View File
@@ -0,0 +1,8 @@
#!/bin/bash
# second term originates from ENABLE_LDCONFIG = ON/OFF at package build
# Only a full uninstall (first argument 0) removes the linker drop-in file.
# During an upgrade (first argument 1) rpm runs this scriptlet of the old
# package after the post-install scriptlet of the new package, so removing the
# file here would delete the entry the new package has just written.
if [ "$1" -eq 0 ] && [ "@ENABLE_LDCONFIG@" == "ON" ]; then
    rm -f /etc/ld.so.conf.d/x86_64-librocm_smi_lib.conf
    ldconfig
fi
+32
View File
@@ -0,0 +1,32 @@
#!/bin/bash
# Delete the Python bytecode cache directory below the installed rocm_smi
# libexec directory.
rm_pyc() {
# remove pyc file generated by python
rm -rf @CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBEXECDIR@/rocm_smi/__pycache__
}
# Delete the library's log directory together with everything inside it.
rm_logFolder() {
rm -rf /var/log/rocm_smi_lib
}
# Undo the log-rotation changes made at install time: drop the package's
# logrotate configuration, move an hourly logrotate cron script back to the
# daily directory, and restore the backed-up systemd timer unit if one exists.
return_logrotateToOrigConfig() {
    local rotateConf=/etc/logrotate.d/rocm_smi.conf
    local hourlyCron=/etc/cron.hourly/logrotate
    local timerUnit=/lib/systemd/system/logrotate.timer

    if [[ -f $rotateConf ]]; then
        rm -rf "$rotateConf"
    fi

    if [[ -f $hourlyCron ]]; then
        mv "$hourlyCron" /etc/cron.daily/logrotate
    fi

    # The backup is created at install time only when the timer was replaced.
    if [[ -f ${timerUnit}.backup ]]; then
        cp "${timerUnit}.backup" "$timerUnit"
        rm -rf "${timerUnit}.backup"
        systemctl reenable --now logrotate.timer
    fi
}
# The first argument is 0 for a full uninstall and 1 for an upgrade.
if [ "$1" -le 1 ]; then
    # Stale bytecode caches are dropped for both remove and upgrade.
    rm_pyc
fi
if [ "$1" -eq 0 ]; then
    # Logs and the log-rotation setup are removed on a full uninstall only.
    # During an upgrade this scriptlet of the old package runs after the
    # post-install scriptlet of the new package, so cleaning up here would
    # delete what the new package has just configured.
    rm_logFolder
    return_logrotateToOrigConfig
fi
@@ -0,0 +1,151 @@
# This module provides common functions used for building
# and packaging ROCm projects
# Both options below default to ON.
option(CMAKE_VERBOSE_MAKEFILE "Enable verbose output" ON)
option(CMAKE_EXPORT_COMPILE_COMMANDS "Export compile commands for linters and autocompleters" ON)
# generic_add_rocm()
#
# Sets up ROCm-related defaults. Takes no arguments.
#   - Cache entries (an existing value wins): ROCM_DIR, CMAKE_INSTALL_PREFIX,
#     CPACK_PACKAGING_INSTALL_PREFIX.
#   - Caller-scope variables: LIB_RUNPATH (only when the ROCM_RPATH environment
#     variable is defined and LIB_RUNPATH is not), CMAKE_PREFIX_PATH and
#     CMAKE_LIBRARY_PATH (extended with extra search locations).
function(generic_add_rocm)
  set(ROCM_DIR "/opt/rocm" CACHE STRING "ROCm directory.")

  if(NOT DEFINED LIB_RUNPATH AND DEFINED ENV{ROCM_RPATH})
    set(LIB_RUNPATH "\$ORIGIN:\$ORIGIN/../lib:\$ORIGIN/../lib64" PARENT_SCOPE)
  endif()

  # Install and packaging prefixes both follow the ROCm directory.
  set(CMAKE_INSTALL_PREFIX
      ${ROCM_DIR}
      CACHE STRING "Default installation directory.")
  set(CPACK_PACKAGING_INSTALL_PREFIX
      "${CMAKE_INSTALL_PREFIX}"
      CACHE STRING "Default packaging prefix.")

  # add package search paths
  set(CMAKE_PREFIX_PATH
      ${CMAKE_PREFIX_PATH} /usr/local
      PARENT_SCOPE)
  set(CMAKE_LIBRARY_PATH
      ${CMAKE_LIBRARY_PATH} /usr/lib64 /usr/lib/x86_64-linux-gnu
      PARENT_SCOPE)
endfunction()
# generic_package()
#
# Configures compiler/linker flags and CPack defaults for a project.
# Call it before include(CPack). Takes no arguments.
#
# Results are delivered in two ways:
#   - cache entries (SHARE_INSTALL_PREFIX and the CPACK_* defaults set with
#     CACHE); an existing value wins;
#   - variables written into the caller's scope (compiler and linker flags and
#     the remaining CPACK_* settings).
#
# Flags are accumulated in function-local copies and exported once at the end.
# set(... PARENT_SCOPE) does not change the local value, so exporting after
# every step would restart from the unmodified flags each time and only the
# last export per variable would reach the caller.
function(generic_package)
  # Used by test and example CMakeLists
  set(SHARE_INSTALL_PREFIX "share/${CMAKE_PROJECT_NAME}" CACHE STRING "Tests and Example install directory")

  if(CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.4.0)
    message("Compiler version is " ${CMAKE_CXX_COMPILER_VERSION})
    message(FATAL_ERROR "Require at least gcc-5.4.0")
  endif()

  # Release builds are optimized; every other build type gets debug flags.
  if("${CMAKE_BUILD_TYPE}" STREQUAL Release)
    string(APPEND CMAKE_CXX_FLAGS " -O2")
  else()
    string(APPEND CMAKE_CXX_FLAGS " -ggdb -O0 -DDEBUG")
  endif()

  # Add address sanitizer
  # derived from:
  # https://github.com/RadeonOpenCompute/ROCm-OpenCL-Runtime/blob/e176056061bf11fdd98b58dd57deb4ac5625844d/amdocl/CMakeLists.txt#L27
  if(ADDRESS_SANITIZER)
    set(ASAN_COMPILER_FLAGS "-fno-omit-frame-pointer -fsanitize=address")
    set(ASAN_LINKER_FLAGS "-fsanitize=address")

    if(BUILD_SHARED_LIBS)
      set(ASAN_COMPILER_FLAGS "${ASAN_COMPILER_FLAGS} -shared-libsan")
      set(ASAN_LINKER_FLAGS "${ASAN_LINKER_FLAGS} -shared-libsan")
    else()
      set(ASAN_LINKER_FLAGS "${ASAN_LINKER_FLAGS} -static-libsan")
    endif()

    # C flags are only touched (and exported) for sanitizer builds.
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ASAN_COMPILER_FLAGS}" PARENT_SCOPE)
    string(APPEND CMAKE_CXX_FLAGS " ${ASAN_COMPILER_FLAGS}")
    string(APPEND CMAKE_EXE_LINKER_FLAGS " ${ASAN_LINKER_FLAGS}")
    string(APPEND CMAKE_SHARED_LINKER_FLAGS " ${ASAN_LINKER_FLAGS}")
  else()
    ## Security breach mitigation flags
    string(APPEND CMAKE_CXX_FLAGS " -DFORTIFY_SOURCE=2 -fstack-protector-all -Wcast-align")
    ## More security breach mitigation flags
    string(APPEND CMAKE_CXX_FLAGS " -Wl,-z,noexecstack -Wl,-znoexecheap -Wl,-z,relro")
    string(APPEND CMAKE_CXX_FLAGS " -Wtrampolines -Wl,-z,now -fPIE")
  endif()

  # Clang does not set the build-id
  # similar to if(NOT CMAKE_COMPILER_IS_GNUCC)
  if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
    string(APPEND CMAKE_EXE_LINKER_FLAGS " -Wl,--build-id=sha1")
    string(APPEND CMAKE_SHARED_LINKER_FLAGS " -Wl,--build-id=sha1")
  endif()

  # Export the accumulated flags to the caller in one step.
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" PARENT_SCOPE)
  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}" PARENT_SCOPE)
  set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS}" PARENT_SCOPE)

  # configure packaging
  # cpack version is populated with CMAKE_PROJECT_VERSION implicitly
  set(CPACK_PACKAGE_NAME
      ${CMAKE_PROJECT_NAME}
      CACHE STRING "")
  set(CPACK_PACKAGE_VENDOR
      "Advanced Micro Devices, Inc."
      CACHE STRING "")
  set(CPACK_PACKAGE_DESCRIPTION_SUMMARY
      "Placeholder Tool"
      CACHE STRING "")
  set(CPACK_PACKAGE_DESCRIPTION
      "This package contains the AMD ${CPACK_PACKAGE_DESCRIPTION_SUMMARY}."
      CACHE STRING "")
  set(CPACK_PACKAGING_INSTALL_PREFIX
      "${CMAKE_INSTALL_PREFIX}"
      CACHE STRING "Default packaging prefix.")
  set(CPACK_RESOURCE_FILE_LICENSE
      "${CMAKE_CURRENT_SOURCE_DIR}/License.txt"
      CACHE STRING "")
  set(CPACK_RPM_PACKAGE_LICENSE
      "MIT"
      CACHE STRING "")
  set(CPACK_GENERATOR
      "DEB;RPM"
      CACHE STRING "Default packaging generators.")
  set(CPACK_DEB_COMPONENT_INSTALL ON PARENT_SCOPE)
  set(CPACK_RPM_COMPONENT_INSTALL ON PARENT_SCOPE)
  mark_as_advanced(CPACK_PACKAGE_NAME CPACK_PACKAGE_VENDOR CPACK_PACKAGE_CONTACT CPACK_PACKAGE_DESCRIPTION_SUMMARY
                   CPACK_PACKAGE_DESCRIPTION CPACK_RESOURCE_FILE_LICENSE CPACK_RPM_PACKAGE_LICENSE CPACK_GENERATOR)

  # Debian package specific variables
  # The release value is set locally first so that the message below reports
  # the value that is actually exported.
  if(DEFINED ENV{CPACK_DEBIAN_PACKAGE_RELEASE})
    set(CPACK_DEBIAN_PACKAGE_RELEASE "$ENV{CPACK_DEBIAN_PACKAGE_RELEASE}")
  else()
    set(CPACK_DEBIAN_PACKAGE_RELEASE "local")
  endif()
  set(CPACK_DEBIAN_PACKAGE_RELEASE "${CPACK_DEBIAN_PACKAGE_RELEASE}" PARENT_SCOPE)
  message("Using CPACK_DEBIAN_PACKAGE_RELEASE ${CPACK_DEBIAN_PACKAGE_RELEASE}")
  set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT" PARENT_SCOPE)

  # RPM package specific variables
  if(DEFINED ENV{CPACK_RPM_PACKAGE_RELEASE})
    set(CPACK_RPM_PACKAGE_RELEASE "$ENV{CPACK_RPM_PACKAGE_RELEASE}")
  else()
    set(CPACK_RPM_PACKAGE_RELEASE "local")
  endif()
  set(CPACK_RPM_PACKAGE_RELEASE "${CPACK_RPM_PACKAGE_RELEASE}" PARENT_SCOPE)
  message("Using CPACK_RPM_PACKAGE_RELEASE ${CPACK_RPM_PACKAGE_RELEASE}")
  set(CPACK_RPM_FILE_NAME "RPM-DEFAULT" PARENT_SCOPE)
  set(CPACK_RPM_PACKAGE_AUTOREQ 0 PARENT_SCOPE)
  set(CPACK_RPM_PACKAGE_AUTOPROV 0 PARENT_SCOPE)

  # System directories the RPM must not claim ownership of. The extended list
  # is exported explicitly; list(APPEND) alone only changes the local copy.
  list(
    APPEND
    CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION
    "/lib"
    "/usr/sbin"
    "/lib/systemd"
    "/lib/systemd/system"
    "/usr"
    "/opt")
  set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION
      "${CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION}"
      PARENT_SCOPE)

  # PACKAGE-tests need PACKAGE
  set(CPACK_DEBIAN_TESTS_PACKAGE_DEPENDS "${CPACK_PACKAGE_NAME}" PARENT_SCOPE)
  set(CPACK_RPM_TESTS_PACKAGE_REQUIRES "${CPACK_PACKAGE_NAME}" PARENT_SCOPE)

  # Treat runtime group as package base.
  # Without it - the base package would be named 'rdc-runtime'
  # resulting in rdc-runtime*.deb and rdc-runtime*.rpm
  set(CPACK_DEBIAN_RUNTIME_PACKAGE_NAME "${CPACK_PACKAGE_NAME}" PARENT_SCOPE)
  set(CPACK_RPM_RUNTIME_PACKAGE_NAME "${CPACK_PACKAGE_NAME}" PARENT_SCOPE)
endfunction()
# generic_package_post()
#
# Declares the CPack component groups and their members. Takes no arguments.
# this function goes after 'include(CPack)'
function(generic_package_post)
  # PACKAGE package, no postfix: the dev and unspecified components both
  # belong to the runtime group.
  cpack_add_component_group("runtime")
  foreach(_runtime_component IN ITEMS dev unspecified)
    cpack_add_component(${_runtime_component} GROUP runtime)
  endforeach()

  # PACKAGE-tests package, -tests postfix
  cpack_add_component_group("tests")
  cpack_add_component(tests GROUP tests)
endfunction()
+166
View File
@@ -0,0 +1,166 @@
################################################################################
##
## The University of Illinois/NCSA
## Open Source License (NCSA)
##
## Copyright (c) 2014-2025, Advanced Micro Devices, Inc. All rights reserved.
##
## Developed by:
##
## AMD Research and AMD HSA Software Development
##
## Advanced Micro Devices, Inc.
##
## www.amd.com
##
## Permission is hereby granted, free of charge, to any person obtaining a copy
## of this software and associated documentation files (the "Software"), to
## deal with the Software without restriction, including without limitation
## the rights to use, copy, modify, merge, publish, distribute, sublicense,
## and/or sell copies of the Software, and to permit persons to whom the
## Software is furnished to do so, subject to the following conditions:
##
## - Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimers.
## - Redistributions in binary form must reproduce the above copyright
## notice, this list of conditions and the following disclaimers in
## the documentation and/or other materials provided with the distribution.
## - Neither the names of Advanced Micro Devices, Inc,
## nor the names of its contributors may be used to endorse or promote
## products derived from this Software without specific prior written
## permission.
##
## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
## THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
## OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
## ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
## DEALINGS WITH THE SOFTWARE.
##
################################################################################
## Parses VERSION_STRING and hands its numeric components back to the caller.
##
## Every run of decimal digits in VERSION_STRING counts as one component, so
## any non-digit character acts as a separator.
##
## Variables set in the caller's scope:
##   VERSION_MAJOR  - first component  (only set when present)
##   VERSION_MINOR  - second component (only set when present)
##   VERSION_PATCH  - third component  (only set when present)
##   VERSION_STRING - the components found (at most three) joined with ".";
##                    empty when VERSION_STRING contains no digits
function( parse_version VERSION_STRING )
    # Quote the input so that an empty string, or one containing ';', is still
    # passed to the command as exactly one argument.
    string ( REGEX MATCHALL "[0-9]+" VERSIONS "${VERSION_STRING}" )
    list ( LENGTH VERSIONS VERSION_COUNT )

    # Start from a known value: without this, an input with no digits would
    # return whatever TEMP_VERSION_STRING happens to hold in an enclosing scope.
    set ( TEMP_VERSION_STRING "" )

    if ( VERSION_COUNT GREATER 0 )
        list ( GET VERSIONS 0 MAJOR )
        set ( VERSION_MAJOR ${MAJOR} PARENT_SCOPE )
        set ( TEMP_VERSION_STRING "${MAJOR}" )
    endif ()

    if ( VERSION_COUNT GREATER 1 )
        list ( GET VERSIONS 1 MINOR )
        set ( VERSION_MINOR ${MINOR} PARENT_SCOPE )
        set ( TEMP_VERSION_STRING "${TEMP_VERSION_STRING}.${MINOR}" )
    endif ()

    if ( VERSION_COUNT GREATER 2 )
        list ( GET VERSIONS 2 PATCH )
        set ( VERSION_PATCH ${PATCH} PARENT_SCOPE )
        set ( TEMP_VERSION_STRING "${TEMP_VERSION_STRING}.${PATCH}" )
    endif ()

    set ( VERSION_STRING "${TEMP_VERSION_STRING}" PARENT_SCOPE )
endfunction ()
## Determines the version of the repository from versioning tags.
##
## DEFAULT_VERSION_STRING is parsed first and serves as the fallback. When GIT
## evaluates to true, 'git describe' is asked for a tag matching
## "${VERSION_PREFIX}-[0-9.]*"; if that command succeeds, its output is parsed
## as well and overrides the fallback components.
##
## Passes back VERSION_STRING, VERSION_MAJOR, VERSION_MINOR and VERSION_PATCH
## to the caller's scope.
function(get_version_from_tag DEFAULT_VERSION_STRING VERSION_PREFIX GIT)
    # Seed the version variables from the fallback string.
    parse_version ( ${DEFAULT_VERSION_STRING} )

    if ( GIT )
        execute_process (
            COMMAND git describe --tags --dirty --long --match ${VERSION_PREFIX}-[0-9.]*
            RESULT_VARIABLE describe_status
            OUTPUT_VARIABLE described_tag
            OUTPUT_STRIP_TRAILING_WHITESPACE
            WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} )

        # Only trust the output when git reported success.
        if ( ${describe_status} EQUAL 0 )
            parse_version ( ${described_tag} )
        endif ()
    endif ()

    # Re-export what parse_version() left in this scope.
    foreach ( version_var IN ITEMS VERSION_STRING VERSION_MAJOR VERSION_MINOR VERSION_PATCH )
        set ( ${version_var} "${${version_var}}" PARENT_SCOPE )
    endforeach ()
endfunction()
## Asks cmake_modules/version_util.sh (invoked as "-c <VERSION_PREFIX>") for
## a commit count and passes it back to the caller.
##
## Variables set in the caller's scope:
##   NUM_COMMITS - the script's output on success; "unknown" when the script
##                 cannot be found, exits with a non-zero status, or prints
##                 nothing
function(num_change_since_prev_pkg VERSION_PREFIX)
    find_program(get_commits NAMES version_util.sh
                 PATHS ${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules)

    if (NOT get_commits)
        message("WARNING: Didn't find version_util.sh")
        set(NUM_COMMITS "unknown" PARENT_SCOPE )
        return()
    endif()

    execute_process( COMMAND ${get_commits} -c ${VERSION_PREFIX}
                     WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
                     OUTPUT_VARIABLE NUM_COMMITS
                     OUTPUT_STRIP_TRAILING_WHITESPACE
                     RESULT_VARIABLE RESULT )

    if ( RESULT EQUAL 0 AND NOT "${NUM_COMMITS}" STREQUAL "" )
        message("${NUM_COMMITS} were found since previous release")
    else()
        message("Unable to determine number of commits since previous release")
        # Use the same placeholder as the "script not found" case so callers
        # never receive an empty or partial value.
        set(NUM_COMMITS "unknown")
    endif()

    set(NUM_COMMITS "${NUM_COMMITS}" PARENT_SCOPE )
endfunction()
## Builds the package version string and exports the CPack version numbers.
##
## Arguments:
##   DEFAULT_VERSION_STRING - fallback version, used when no tag is found
##   VERSION_PREFIX         - tag prefix handed to get_version_from_tag() and
##                            num_change_since_prev_pkg()
##   GIT                    - when it evaluates to true, git is queried for
##                            the tag and the commit hash
##
## The version string has the form
##   <version>.<NUM_COMMITS>-<build id>-<hash>
## where <build id> is $ENV{ROCM_BUILD_ID} (or "local-build-0" when that is
## not defined) and <hash> is the short HEAD hash, with "-dirty" appended when
## 'git diff --quiet' returns 1, or "unknown" when git is disabled or fails.
##
## Variables set in the caller's scope:
##   PKG_VERSION_STR, PKG_VERSION_HASH,
##   CPACK_PACKAGE_VERSION_MAJOR, CPACK_PACKAGE_VERSION_MINOR,
##   CPACK_PACKAGE_VERSION_PATCH
function(get_package_version_number DEFAULT_VERSION_STRING VERSION_PREFIX GIT)
    # Forward the *value* of GIT rather than the literal word "GIT", so the
    # callee honours the same switch as this function. The quotes keep an
    # empty value from being dropped from the argument list.
    get_version_from_tag(${DEFAULT_VERSION_STRING} ${VERSION_PREFIX} "${GIT}")
    num_change_since_prev_pkg(${VERSION_PREFIX})

    set(PKG_VERSION_STR "${VERSION_STRING}.${NUM_COMMITS}")

    # Build identifier: taken from the environment at configure time.
    if (DEFINED ENV{ROCM_BUILD_ID})
        set(VERSION_ID $ENV{ROCM_BUILD_ID})
    else()
        set(VERSION_ID "local-build-0")
    endif()
    set(PKG_VERSION_STR "${PKG_VERSION_STR}-${VERSION_ID}")

    # Commit hash of HEAD, or "unknown" when it cannot be determined.
    if (GIT)
        execute_process(COMMAND git rev-parse --short HEAD
                        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
                        OUTPUT_VARIABLE VERSION_HASH
                        OUTPUT_STRIP_TRAILING_WHITESPACE
                        RESULT_VARIABLE RESULT )
        if( ${RESULT} EQUAL 0 )
            # Check for dirty workspace.
            execute_process(COMMAND git diff --quiet
                            WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
                            RESULT_VARIABLE RESULT )
            if(${RESULT} EQUAL 1)
                set(VERSION_HASH "${VERSION_HASH}-dirty")
            endif()
        else()
            set( VERSION_HASH "unknown" )
        endif()
    else()
        set( VERSION_HASH "unknown" )
    endif()
    set(PKG_VERSION_STR "${PKG_VERSION_STR}-${VERSION_HASH}")

    set(PKG_VERSION_STR ${PKG_VERSION_STR} PARENT_SCOPE)
    set(PKG_VERSION_HASH ${VERSION_HASH} PARENT_SCOPE)
    set(CPACK_PACKAGE_VERSION_MAJOR ${VERSION_MAJOR} PARENT_SCOPE)
    set(CPACK_PACKAGE_VERSION_MINOR ${VERSION_MINOR} PARENT_SCOPE)
    set(CPACK_PACKAGE_VERSION_PATCH ${VERSION_PATCH} PARENT_SCOPE)
endfunction()
+40
View File
@@ -0,0 +1,40 @@
#!/bin/bash
# Parse command-line arguments.
# Every leading "-c" (commits since previous tag) is consumed. The first
# other non-empty argument ends the scan and stays in $1. In both cases the
# target is "count".
while [[ -n "$1" ]]; do
    TARGET="count"
    if [[ "$1" != "-c" ]]; then
        break
    fi
    shift
done

# Whatever is left in $1 is the tag prefix; tags are matched as "<prefix>*".
TAG_PREFIX="$1"
reg_ex="${TAG_PREFIX}*"
# Print the difference between the commit counts of the two most recent tags
# (ordered by committer date) whose names match ${reg_ex}.
# Prints 0 when fewer than two matching tags exist or when either commit
# count cannot be obtained.
commits_since_last_tag() {
    # Quote the pattern so git receives it verbatim; unquoted, the shell may
    # expand the "*" against file names in the current directory first.
    TAG_ARR=($(git tag --sort=committerdate -l "${reg_ex}" | tail -2))
    PREVIOUS_TAG=${TAG_ARR[0]}
    CURRENT_TAG=${TAG_ARR[1]}

    # Only ask git for counts when both tags exist; calling rev-list with an
    # empty revision merely produces an error message.
    PREV_CMT_NUM=""
    CURR_CMT_NUM=""
    if [[ -n $PREVIOUS_TAG && -n $CURRENT_TAG ]]; then
        PREV_CMT_NUM=$(git rev-list --count "$PREVIOUS_TAG")
        CURR_CMT_NUM=$(git rev-list --count "$CURRENT_TAG")
    fi

    # Commits since previous tag:
    if [[ -z $PREV_CMT_NUM || -z $CURR_CMT_NUM ]]; then
        NUM_COMMITS=0
    else
        NUM_COMMITS=$(( CURR_CMT_NUM - PREV_CMT_NUM ))
    fi
    echo "$NUM_COMMITS"
}
# Run the requested target.
case $TARGET in
    count) commits_since_last_tag ;;
    *)
        # Unknown or missing target: report it on stderr and fail, so the
        # caller does not mistake this for a successful run.
        echo "Invalid target ${TARGET}" >&2
        exit 1 ;;
esac
exit 0
+11
View File
@@ -0,0 +1,11 @@
!.sphinx/
!.doxygen/
/_build/
/_doxygen/
/_images/
/_static/
/_templates/
/html/
/latex/
404.md
data/AMD-404.png
+60
View File
@@ -0,0 +1,60 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
import re
import pathlib
import shutil
import sys
from rocm_docs import ROCmDocs
# We need to add the location of the rocm_smi Python module (python_smi_tools)
# to sys.path in order to build the documentation of that module
# Directory that contains this conf.py.
docs_dir_path = pathlib.Path(__file__).parent
# The Python SMI tools live next to the docs directory.
python_dir_path = docs_dir_path.parent.joinpath('python_smi_tools')
# Appended rather than prepended, so entries already on sys.path keep priority.
sys.path.append(str(python_dir_path))
# The documented version is the first argument of the
# get_package_version_number(...) call in the parent directory's CMakeLists.txt.
cmake_text = pathlib.Path('../CMakeLists.txt').read_text(encoding='utf-8')
match = re.search(r'get_package_version_number\(\"?([0-9.]+)[^0-9.]+', cmake_text)
if match is None:
    raise ValueError("VERSION not found!")
version_number = match.group(1)
left_nav_title = f"ROCm SMI LIB {version_number} Documentation"

# Copy the changelog from the parent directory into the docs directory.
shutil.copy2('../CHANGELOG.md', './CHANGELOG.md')
# Project metadata (for PDF output on Read the Docs).
project = "ROCm SMI LIB Documentation"
author = "Advanced Micro Devices, Inc."
copyright = "Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved."
version = release = version_number

external_toc_path = "./sphinx/_toc.yml"
exclude_patterns = ['CHANGELOG.md']

# Configure the shared ROCm documentation helper; the call order is kept as is.
docs_core = ROCmDocs(left_nav_title)
docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/xml")
docs_core.enable_api_reference()
docs_core.setup()

external_projects_current_project = "rocm_smi_lib"
suppress_warnings = ["etoc.toctree"]

# Copy every variable listed in ROCmDocs.SPHINX_VARS from the helper into this
# module's namespace. This may overwrite names assigned above.
for sphinx_var in ROCmDocs.SPHINX_VARS:
    globals()[sphinx_var] = getattr(docs_core, sphinx_var)

# 'extensions' is expected to come from the loop above -- TODO confirm.
extensions += ['sphinx.ext.mathjax']
def remove_module_docstring(app, what, name, obj, options, lines):
    """Drop the docstring of module objects.

    Necessary to remove the header comments from the rocm_smi module.
    ``lines`` is emptied in place when ``what`` is ``"module"``; for any
    other value nothing is changed. The remaining arguments are unused.
    """
    if what != "module":
        return
    lines.clear()
def setup(app):
    """Connect remove_module_docstring to the autodoc-process-docstring event."""
    docstring_event = "autodoc-process-docstring"
    app.connect(docstring_event, remove_module_docstring)
@@ -0,0 +1,2 @@
html/
xml/
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,57 @@
.. meta::
:description: Learn about using the ROCm SMI library with C++.
:keywords: install, SMI, library, api, cpp, system management interface
***********************
Using ROCm SMI with C++
***********************
Installation
============
Follow the installation procedure for rocm_smi_lib. Refer to the :doc:`installation section <../install/install>`.
.. note::
``hipcc`` and other compilers will not automatically link in the ``librocm_smi64`` dynamic library. To ensure the
``librocm_smi64.so`` can be located, you must either set the ``LD_LIBRARY_PATH`` environment variable to the
directory containing ``librocm_smi64.so`` (usually ``/opt/rocm/lib``) or pass the ``-lrocm_smi64`` flag to the compiler.
Device indices
==============
Many of the functions in the library take a "device index". The device index is a number greater than or equal to 0, and less than the number of devices detected, as determined by ``rsmi_num_monitor_devices()``. The index is used to distinguish the detected devices from one another. It is important to note that a device may end up with a different index after a reboot, so an index should not be relied upon to be constant over reboots.
Hello ROCm SMI
================
The only required ROCm-SMI call for any program that wants to use ROCm-SMI is the ``rsmi_init()`` call. This call initializes some internal data structures that will be used by subsequent ROCm-SMI calls.
When ROCm-SMI is no longer being used, ``rsmi_shut_down()`` should be called. This provides a way to do any releasing of resources that ROCm-SMI may have held. In many cases, this may have no effect, but may be necessary in future versions of the library.
A simple "Hello World" type program that displays the device ID of detected devices would look like this:
.. code-block:: c
#include <stdint.h>
#include "rocm_smi/rocm_smi.h"
int main() {
rsmi_status_t ret;
uint32_t num_devices;
uint16_t dev_id;
// We will skip return code checks for this example, but it
// is recommended to always check this as some calls may not
// apply for some devices or ROCm releases
ret = rsmi_init(0);
ret = rsmi_num_monitor_devices(&num_devices);
for (int i=0; i < num_devices; ++i) {
ret = rsmi_dev_id_get(i, &dev_id);
// dev_id holds the device ID of device i, upon a
// successful call
}
ret = rsmi_shut_down();
return 0;
}
@@ -0,0 +1,519 @@
.. meta::
:description: Learn about using the ROCm SMI library with Python.
:keywords: install, SMI, library, api, python, system management interface
**************************
Using ROCm SMI with Python
**************************
This tool acts as a command line interface for manipulating and monitoring the amdgpu kernel, and is intended to replace and deprecate the existing ``rocm_smi.py`` CLI tool.
It uses Ctypes to call the ``rocm_smi_lib`` API.
To use ROCm SMI, you must have the ROCm SMI library (``librocm_smi64``) installed and should have at least one AMD GPU with the ROCm driver installed.
Installation
============
Follow the installation procedure for ``rocm_smi_lib``. Refer to :doc:`../install/install`.
.. note::
``hipcc`` and other compilers will not automatically link in the ``librocm_smi64`` dynamic library. To ensure the
``librocm_smi64.so`` can be located, you must either set the ``LD_LIBRARY_PATH`` environment variable to the
directory containing ``librocm_smi64.so`` (usually ``/opt/rocm/lib``) or pass the ``-lrocm_smi64`` flag to the compiler.
Version
=======
The SMI will report two "versions": the ``ROCM-SMI`` version and the ``ROCM-SMI-LIB`` version.
- ``ROCM-SMI`` version is the CLI/tool version number with commit ID appended after + sign.
- ``ROCM-SMI-LIB`` version is the library package version number.
.. code-block:: shell-session
ROCM-SMI version: 2.0.0+8e78352
ROCM-SMI-LIB version: 6.1.0
Usage
=====
For detailed and up to date usage information, consult the help.
.. code-block:: shell-session
/opt/rocm/bin/rocm-smi -h
The following is the output from the ``-h`` flag:
.. code-block:: shell-session
$ /opt/rocm/bin/rocm-smi -h
usage: rocm-smi [-h] [-V] [-d DEVICE [DEVICE ...]] [--alldevices] [--showhw] [-a] [-i] [-v] [-e [EVENT [EVENT ...]]]
[--showdriverversion] [--showtempgraph] [--showfwinfo [BLOCK [BLOCK ...]]] [--showmclkrange]
[--showmemvendor] [--showsclkrange] [--showproductname] [--showserial] [--showuniqueid]
[--showvoltagerange] [--showbus] [--showpagesinfo] [--showpendingpages] [--showretiredpages]
[--showunreservablepages] [-f] [-P] [-t] [-u] [--showmemuse] [--showvoltage] [-b] [-c] [-g] [-l] [-M]
[-m] [-o] [-p] [-S] [-s] [--showmeminfo TYPE [TYPE ...]] [--showpids [VERBOSE]]
[--showpidgpus [SHOWPIDGPUS [SHOWPIDGPUS ...]]] [--showreplaycount]
[--showrasinfo [SHOWRASINFO [SHOWRASINFO ...]]] [--showvc] [--showxgmierr] [--showtopo]
[--showtopoaccess] [--showtopoweight] [--showtopohops] [--showtopotype] [--showtoponuma]
[--showenergycounter] [--shownodesbw] [--showcomputepartition] [--showmemorypartition] [-r]
[--resetfans] [--resetprofile] [--resetpoweroverdrive] [--resetxgmierr] [--resetperfdeterminism]
[--resetcomputepartition] [--resetmemorypartition] [--setclock TYPE LEVEL] [--setsclk LEVEL [LEVEL ...]]
[--setmclk LEVEL [LEVEL ...]] [--setpcie LEVEL [LEVEL ...]] [--setslevel SCLKLEVEL SCLK SVOLT]
[--setmlevel MCLKLEVEL MCLK MVOLT] [--setvc POINT SCLK SVOLT] [--setsrange SCLKMIN SCLKMAX]
[--setextremum min|max sclk|mclk CLK] [--setmrange MCLKMIN MCLKMAX] [--setfan LEVEL]
[--setperflevel LEVEL] [--setoverdrive %] [--setmemoverdrive %] [--setpoweroverdrive WATTS]
[--setprofile SETPROFILE] [--setperfdeterminism SCLK]
[--setcomputepartition {CPX,SPX,DPX,TPX,QPX,cpx,spx,dpx,tpx,qpx}]
[--setmemorypartition {NPS1,NPS2,NPS4,NPS8,nps1,nps2,nps4,nps8}] [--rasenable BLOCK ERRTYPE]
[--rasdisable BLOCK ERRTYPE] [--rasinject BLOCK] [--gpureset] [--load FILE | --save FILE]
[--autorespond RESPONSE] [--loglevel LEVEL] [--json] [--csv]
AMD ROCm System Management Interface | ROCM-SMI version: 2.0.0+8e78352
optional arguments:
-h, --help show this help message and exit
--gpureset Reset specified GPU (One GPU must be specified)
--load FILE Load Clock, Fan, Performance and Profile settings
from FILE
--save FILE Save Clock, Fan, Performance and Profile settings to
FILE
-V, --version Show version information
-d DEVICE [DEVICE ...], --device DEVICE [DEVICE ...] Execute command on specified device
Display Options:
--alldevices
--showhw Show Hardware details
-a, --showallinfo Show Temperature, Fan and Clock values
Topology:
-i, --showid Show DEVICE ID
-v, --showvbios Show VBIOS version
-e [EVENT [EVENT ...]], --showevents [EVENT [EVENT ...]] Show event list
--showdriverversion Show kernel driver version
--showtempgraph Show Temperature Graph
--showfwinfo [BLOCK [BLOCK ...]] Show FW information
--showmclkrange Show mclk range
--showmemvendor Show GPU memory vendor
--showsclkrange Show sclk range
--showproductname Show SKU/Vendor name
--showserial Show GPU's Serial Number
--showuniqueid Show GPU's Unique ID
--showvoltagerange Show voltage range
--showbus Show PCI bus number
Pages information:
--showpagesinfo Show retired, pending and unreservable pages
--showpendingpages Show pending retired pages
--showretiredpages Show retired pages
--showunreservablepages Show unreservable pages
Hardware-related information:
-f, --showfan Show current fan speed
-P, --showpower Show current Average or Socket Graphics Package Power
Consumption
-t, --showtemp Show current temperature
-u, --showuse Show current GPU use
--showmemuse Show current GPU memory used
--showvoltage Show current GPU voltage
Software-related/controlled information:
-b, --showbw Show estimated PCIe use
-c, --showclocks Show current clock frequencies
-g, --showgpuclocks Show current GPU clock frequencies
-l, --showprofile Show Compute Profile attributes
-M, --showmaxpower Show maximum graphics package power this GPU will
consume
-m, --showmemoverdrive Show current GPU Memory Clock OverDrive level
-o, --showoverdrive Show current GPU Clock OverDrive level
-p, --showperflevel Show current DPM Performance Level
-S, --showclkvolt Show supported GPU and Memory Clocks and Voltages
-s, --showclkfrq Show supported GPU and Memory Clock
--showmeminfo TYPE [TYPE ...] Show Memory usage information for given block(s) TYPE
--showpids [VERBOSE] Show current running KFD PIDs (pass details to
VERBOSE for detailed information)
--showpidgpus [SHOWPIDGPUS [SHOWPIDGPUS ...]] Show GPUs used by specified KFD PIDs (all if no arg
given)
--showreplaycount Show PCIe Replay Count
--showrasinfo [SHOWRASINFO [SHOWRASINFO ...]] Show RAS enablement information and error counts for
the specified block(s) (all if no arg given)
--showvc Show voltage curve
--showxgmierr Show XGMI error information since last read
--showtopo Show hardware topology information
--showtopoaccess Shows the link accessibility between GPUs
--showtopoweight Shows the relative weight between GPUs
--showtopohops Shows the number of hops between GPUs
--showtopotype Shows the link type between GPUs
--showtoponuma Shows the numa nodes
--showenergycounter Energy accumulator that stores amount of energy
consumed
--shownodesbw Shows the numa nodes
--showcomputepartition Shows current compute partitioning
--showmemorypartition Shows current memory partition
Set options:
--setclock TYPE LEVEL Set Clock Frequency Level(s) for specified clock
(requires manual Perf level)
--setsclk LEVEL [LEVEL ...] Set GPU Clock Frequency Level(s) (requires manual
Perf level)
--setmclk LEVEL [LEVEL ...] Set GPU Memory Clock Frequency Level(s) (requires
manual Perf level)
--setpcie LEVEL [LEVEL ...] Set PCIE Clock Frequency Level(s) (requires manual
Perf level)
--setslevel SCLKLEVEL SCLK SVOLT Change GPU Clock frequency (MHz) and Voltage (mV) for
a specific Level
--setmlevel MCLKLEVEL MCLK MVOLT Change GPU Memory clock frequency (MHz) and Voltage
for (mV) a specific Level
--setvc POINT SCLK SVOLT Change SCLK Voltage Curve (MHz mV) for a specific
point
--setsrange SCLKMIN SCLKMAX Set min and max SCLK speed
--setextremum min|max sclk|mclk CLK Set min/max of SCLK/MCLK speed
--setmrange MCLKMIN MCLKMAX Set min and max MCLK speed
--setfan LEVEL Set GPU Fan Speed (Level or %)
--setperflevel LEVEL Set Performance Level
--setoverdrive % Set GPU OverDrive level (requires manual|high Perf
level)
--setmemoverdrive % Set GPU Memory Overclock OverDrive level (requires
manual|high Perf level)
--setpoweroverdrive WATTS Set the maximum GPU power using Power OverDrive in
Watts
--setprofile SETPROFILE Specify Power Profile level (#) or a quoted string of
CUSTOM Profile attributes "# # # #..." (requires
manual Perf level)
--setperfdeterminism SCLK Set clock frequency limit to get minimal performance
variation
--setcomputepartition {CPX,SPX,DPX,TPX,QPX,cpx,spx,dpx,tpx,qpx} Set compute partition
--setmemorypartition {NPS1,NPS2,NPS4,NPS8,nps1,nps2,nps4,nps8} Set memory partition
--rasenable BLOCK ERRTYPE Enable RAS for specified block and error type
--rasdisable BLOCK ERRTYPE Disable RAS for specified block and error type
--rasinject BLOCK Inject RAS poison for specified block (ONLY WORKS ON
UNSECURE BOARDS)
Reset options:
-r, --resetclocks Reset clocks and OverDrive to default
--resetfans Reset fans to automatic (driver) control
--resetprofile Reset Power Profile back to default
--resetpoweroverdrive Set the maximum GPU power back to the device deafult
state
--resetxgmierr Reset XGMI error count
--resetperfdeterminism Disable performance determinism
--resetcomputepartition Resets to boot compute partition state
--resetmemorypartition Resets to boot memory partition state
Auto-response options:
--autorespond RESPONSE Response to automatically provide for all prompts
(NOT RECOMMENDED)
Output options:
--loglevel LEVEL How much output will be printed for what program is
doing, one of debug/info/warning/error/critical
--json Print output in JSON format
--csv Print output in CSV format
Detailed option descriptions
============================
--setextremum <[min or max] [sclk or mclk] [value in MHz to set to]>
Provided ASIC support, users can now set a maximum or minimum sclk or mclk value through our Python CLI tool (``rocm-smi --setextremum max sclk 1500``). See the example below.
.. code-block:: shell-session
$ sudo /opt/rocm/bin/rocm-smi --setextremum max sclk 2100
============================ ROCm System Management Interface ============================
******WARNING******
Operating your AMD GPU outside of official AMD specifications or outside of
factory settings, including but not limited to the conducting of overclocking,
over-volting or under-volting (including use of this interface software,
even if such software has been directly or indirectly provided by AMD or otherwise
affiliated in any way with AMD), may cause damage to your AMD GPU, system components
and/or result in system failure, as well as cause other problems.
DAMAGES CAUSED BY USE OF YOUR AMD GPU OUTSIDE OF OFFICIAL AMD SPECIFICATIONS OR
OUTSIDE OF FACTORY SETTINGS ARE NOT COVERED UNDER ANY AMD PRODUCT WARRANTY AND
MAY NOT BE COVERED BY YOUR BOARD OR SYSTEM MANUFACTURER'S WARRANTY.
Use this utility with caution.
Do you accept these terms? [y/N] y
================================ Set Valid sclk Extremum =================================
GPU[0] : Successfully set max sclk to 2100(MHz)
GPU[1] : Successfully set max sclk to 2100(MHz)
GPU[2] : Successfully set max sclk to 2100(MHz)
GPU[3] : Successfully set max sclk to 2100(MHz)
================================== End of ROCm SMI Log ===================================
--setsclk, --setmclk <# [# # ...]>
This allows you to set a mask for the levels. For example, if a GPU has 8 clock levels,
you can set a mask to use levels 0, 5, 6 and 7 with --setsclk 0 5 6 7 . This will only
use the base level, and the top 3 clock levels. This will allow you to keep the GPU at
base level when there is no GPU load, and the top 3 levels when the GPU load increases.
.. note::
The clock levels will change dynamically based on GPU load based on the default
Compute and Graphics profiles. The thresholds and delays for a custom mask cannot
be controlled through the SMI tool.
This flag automatically sets the Performance Level to ``manual`` as the mask is not
applied when the Performance level is set to ``auto``.
--setfan <LEVEL>
This sets the fan speed to a value ranging from 0 to maxlevel, or from 0%-100%.
If the level ends with a %, the fan speed is calculated as pct*maxlevel/100 (maxlevel is usually 255, but is determined by the ASIC).
.. note::
While the hardware is usually capable of overriding this value when required, it is
recommended to not set the fan level lower than the default value for extended periods
of time.
--setperflevel <LEVEL>
This lets you use the pre-defined Performance Level values for clocks and power profile, which can include:
* ``auto`` (Automatically change values based on GPU workload)
* ``low`` (Keep values low, regardless of workload)
* ``high`` (Keep values high, regardless of workload)
* ``manual`` (Only use values defined by ``--setsclk`` and ``--setmclk``)
--setoverdrive, --setmemoverdrive <#>
.. warning::
DEPRECATED IN NEWER KERNEL VERSIONS. Use ``--setslevel`` or ``--setmlevel`` instead.
This sets the percentage above maximum for the max Performance Level.
For example, ``--setoverdrive`` 20 will increase the top sclk level by 20%, similarly
``--setmemoverdrive`` 20 will increase the top mclk level by 20%. Thus if the maximum
clock level is 1000MHz, then ``--setoverdrive`` 20 will increase the maximum clock to 1200MHz.
.. note::
This option can be used in conjunction with the ``--setsclk``/``--setmclk`` mask.
Operating the GPU outside of specifications can cause irreparable damage to your hardware.
Observe the warning displayed when using this option.
This flag automatically sets the clock to the highest level, as only the highest level is
increased by the :ref:`OverDrive <overdrive-settings>` value.
--setpoweroverdrive, --resetpoweroverdrive <#>
This allows users to change the maximum power available to a GPU package.
The input value is in Watts. This limit is enforced by the hardware, and
some cards allow users to set it to a higher value than the default that
ships with the GPU. This Power OverDrive mode allows the GPU to run at
higher frequencies for longer periods of time, though this may mean the
GPU uses more power than it is allowed to use per power supply
specifications. Each GPU has a model-specific maximum Power OverDrive that
it will take; attempting to set a higher limit than that will cause this
command to fail.
.. note::
Operating the GPU outside of specifications can cause irreparable damage to your hardware.
Observe the warning displayed when using this option.
--setprofile <SETPROFILE>
The Compute Profile accepts 1 or n parameters, either the Profile to select (see ``--showprofile`` for a list
of preset Power Profiles) or a quoted string of values for the CUSTOM profile.
These values can vary based on the ASIC, and may include:
.. code-block:: shell-session
| Setting | Description |
|---------------------|--------------------------------------------------------------------------------------------|
| SCLK_PROFILE_ENABLE | Whether or not to apply the 3 following SCLK settings (0=disable,1=enable) |
| | **NOTE: This is a hidden field. If set to 0, the following 3 values are displayed as '-'** |
| SCLK_UP_HYST | Delay before sclk is increased (in milliseconds) |
| SCLK_DOWN_HYST | Delay before sclk is decreased (in milliseconds) |
| SCLK_ACTIVE_LEVEL | Workload required before sclk levels change (in %) |
| MCLK_PROFILE_ENABLE | Whether or not to apply the 3 following MCLK settings (0=disable,1=enable) |
| | **NOTE: This is a hidden field. If set to 0, the following 3 values are displayed as '-'** |
| MCLK_UP_HYST | Delay before mclk is increased (in milliseconds) |
| MCLK_DOWN_HYST | Delay before mclk is decreased (in milliseconds) |
| MCLK_ACTIVE_LEVEL | Workload required before mclk levels change (in %) |
Other settings
--------------
.. code-block:: shell-session
| Setting | Description |
|------------------|---------------------------------------------------------------------------|
| BUSY_SET_POINT | Threshold for raw activity level before levels change |
| FPS | Frames Per Second |
| USE_RLC_BUSY | When set to 1, DPM is switched up as long as RLC busy message is received |
| MIN_ACTIVE_LEVEL | Workload required before levels change (in %) |
.. note::
When a compute queue is detected, the COMPUTE Power Profile values will be automatically
applied to the system, provided that the Perf Level is set to ``auto``.
The CUSTOM Power Profile is only applied when the Performance Level is set to ``manual``
so using this flag will automatically set the performance level to ``manual``.
It is not possible to modify the non-CUSTOM Profiles. These are hard-coded by the kernel.
-P, --showpower
Show average or instantaneous socket graphics package power consumption.
"Graphics Package" refers to the GPU plus any HBM (High-Bandwidth memory) modules, if present.
-M, --showmaxpower
Show the maximum Graphics Package power that the GPU will attempt to consume.
This limit is enforced by the hardware.
--loglevel
This will allow the user to set a logging level for the SMI's actions. Currently this is
only implemented for ``sysfs`` writes, but can easily be expanded upon in the future to log
other things from the SMI.
--showmeminfo
This allows the user to see the amount of used and total memory for a given block (``vram``,
``vis_vram``, ``gtt``). It returns the number of bytes used and total number of bytes for each block.
``all`` can be passed as a field to return all blocks, otherwise a quoted-string is used for
multiple values (such as ``vram vis_vram``).
``vram`` refers to the Video RAM, or graphics memory, on the specified device
``vis_vram`` refers to Visible VRAM, which is the CPU-accessible video memory on the device
``gtt`` refers to the Graphics Translation Table
-b, --showbw
This shows an approximation of the number of bytes received and sent by the GPU over
the last second through the PCIe bus. Note that this will not work for APUs since data for
the GPU portion of the APU goes through the memory fabric and does not 'enter/exit'
the chip via the PCIe interface, thus no accesses are generated, and the performance
counters can't count accesses that are not generated.
.. note::
It is not possible to easily grab the size of every packet that is transmitted
in real time, so the kernel estimates the bandwidth by taking the maximum payload size (mps),
which is the max size that a PCIe packet can be, and multiplies it by the number of packets
received and sent. This means that the SMI will report the maximum estimated bandwidth;
the actual usage could be (and likely will be) less.
--showrasinfo
This shows the RAS information for a given block. This includes enablement of the block
(currently GFX, SDMA and UMC are the only supported blocks) and the number of errors.
* ``ue`` - Uncorrectable errors
* ``ce`` - Correctable errors
Clock type descriptions
=======================
.. code-block:: shell-session
| Clock type | Description |
| ---------- | -------------------------------------------------------------------------------------------- |
| DCEFCLK | DCE (Display) |
| FCLK | Data fabric (VG20 and later) - Data flow from XGMI, Memory, PCIe |
| SCLK | GFXCLK (Graphics core) |
| | **Note - SOCCLK split from SCLK as of Vega10. Pre-Vega10 they were both controlled by SCLK** |
| MCLK | GPU Memory (VRAM) |
| PCLK | PCIe bus |
| | **Note - This gives 2 speeds, PCIe Gen1 x1 and the highest available based on the hardware** |
| SOCCLK | System clock (VG10 and later) - Data Fabric (DF), MM HUB, AT HUB, SYSTEM HUB, OSS, DFD |
| | **Note - DF split from SOCCLK as of Vega20. Pre-Vega20 they were both controlled by SOCCLK** |
--gpureset
This flag will attempt to reset the GPU for a specified device. This will invoke the GPU reset through
the kernel debugfs file ``amdgpu_gpu_recover``. Note that GPU reset will not always work, depending on the
manner in which the GPU is hung.
--showdriverversion
This flag will print out the AMDGPU module version for amdgpu-pro or ROCm kernels. For other kernels,
it will simply print out the name of the kernel (``uname -r``).
--showserial
This flag will print out the serial number for the graphics card.
.. note::
This is currently only supported on Vega20 server cards that support it. Consumer cards and
cards older than Vega20 will not support this feature.
--showproductname
This uses the ``pci.ids`` file to print out more information regarding the GPUs on the system.
``update-pciids`` may need to be executed on the machine to get the latest PCI ID snapshot,
as certain newer GPUs will not be present in the stock ``pci.ids`` file, and the file may even
be absent on certain OS installation types.
--showpagesinfo, --showretiredpages, --showpendingpages, --showunreservablepages
These flags display the different "bad pages" as reported by the kernel. The three
types of pages are:
* Retired pages (reserved pages) - These pages are reserved and are unable to be used.
* Pending pages - These pages are pending for reservation, and will be reserved/retired.
* Unreservable pages - These pages are not reservable for some reason.
--showmemuse, --showuse, --showmeminfo
``--showuse`` and ``--showmemuse`` are used to indicate how busy the respective blocks are. For
example, for ``--showuse`` (``gpu_busy_percent`` sysfs file), the SMU samples every ms or so to see
if any GPU block (RLC, MEC, PFP, CP) is busy. If so, that's 1 (or high). If not, that's 0 (low).
If we have 5 high and 5 low samples, that means 50% utilization (50% GPU busy, or 50% GPU use).
The windows and sampling vary from generation to generation, but that is how GPU and VRAM use
is calculated in a generic sense.
``--showmeminfo`` (and VRAM% in concise output) will show the amount of VRAM used (visible, total, GTT),
as well as the total available for those partitions. The percentage shown there indicates the
amount of used memory in terms of current allocations.
.. _overdrive-settings:
OverDrive settings
==================
Enabling OverDrive requires both a card that supports OverDrive and a driver parameter that enables its use.
Because OverDrive features can damage your card, most workstation and server GPUs cannot use OverDrive.
Consumer GPUs that can use OverDrive must enable this feature by setting bit 14 in the amdgpu driver's
ppfeaturemask module parameter.
For OverDrive functionality, the OverDrive bit (bit 14) must be enabled (by default, the
OverDrive bit is disabled on the ROCK and upstream kernels). This can be done by setting
amdgpu.ppfeaturemask accordingly in the kernel parameters, or by changing the default value
inside amdgpu_drv.c (if building your own kernel).
As an example, if the ``ppfeaturemask`` is set to ``0xffffbfff`` (``11111111111111111011111111111111``),
then enabling the OverDrive bit would make it ``0xffffffff`` (``11111111111111111111111111111111``).
These are the flags that require OverDrive functionality to be enabled for the flag to work:
* ``--showclkvolt``
* ``--showvoltagerange``
* ``--showvc``
* ``--showsclkrange``
* ``--showmclkrange``
* ``--setslevel``
* ``--setmlevel``
* ``--setoverdrive``
* ``--setpoweroverdrive``
* ``--resetpoweroverdrive``
* ``--setvc``
* ``--setsrange``
* ``--setmrange``
+53
View File
@@ -0,0 +1,53 @@
.. meta::
:description: ROCm SMI
:keywords: install, SMI, library, api, AMD, ROCm
****************************************************
ROCm System Management Interface (ROCm SMI) library
****************************************************
The ROCm SMI library is part of the ROCm software stack. It is a C++ library for Linux that provides a user space interface for applications to monitor and control GPU applications.
For more information, refer to `<https://github.com/ROCm/rocm_smi_lib>`__.
.. note::
The AMD System Management Interface Library (AMD SMI) is a successor to ROCm SMI. It is a unified system management
interface tool that provides a user space interface for applications to monitor and control GPU applications and gives
users the ability to query information about drivers and GPUs on the system.
AMD SMI will replace ``rocm_smi_lib`` over time. We recommend that users transition to AMD SMI.
For more information, refer to `<https://github.com/ROCm/amdsmi>`__ and the :doc:`AMD SMI documentation <amdsmi:index>`.
.. grid:: 2
:gutter: 3
.. grid-item-card:: Install
* :doc:`ROCm SMI installation <./install/install>`
.. grid-item-card:: API Reference
* :doc:`Files <../doxygen/html/files>`
* :doc:`Globals <../doxygen/html/globals>`
* :doc:`Data structures <../doxygen/html/annotated>`
* :doc:`Modules <../doxygen/html/modules>`
* :doc:`Python API <reference/python_api>`
.. grid-item-card:: How to
* :doc:`Use C++ in ROCm SMI <how-to/use-cpp>`
* :doc:`Use Python in ROCm SMI <how-to/use-python>`
.. grid-item-card:: Tutorials
* :doc:`C++ <tutorials/cpp_tutorials>`
* :doc:`Python <tutorials/python_tutorials>`
To contribute to the documentation, refer to `Contributing to ROCm <https://rocm.docs.amd.com/en/latest/contribute/contributing.html>`_.
You can find licensing information on the `Licensing <https://rocm.docs.amd.com/en/latest/about/license.html>`_ page.
@@ -0,0 +1,96 @@
.. meta::
:description: Install ROCm SMI
:keywords: install, SMI, library, api, AMD, ROCm
*********************
Installing ROCm SMI
*********************
Planned deprecation notice
----------------------------
The ROCm System Management Interface (ROCm SMI) Library is planned to be **deprecated**, and the release date will be announced soon. We recommend migration to AMD SMI.
Install amdgpu using ROCm
--------------------------
Use the following instructions to install AMDGPU using ROCm:
1. Install the amdgpu driver. Refer to the following example; your release and link may differ. The ``amdgpu-install --usecase=rocm`` command triggers both an amdgpu driver update and the installation of the ROCm SMI packages on your device.
.. code-block:: shell
sudo apt update
wget https://repo.radeon.com/amdgpu-install/6.0.2/ubuntu/jammy/amdgpu-install_6.0.60002-1_all.deb
sudo apt install ./amdgpu-install_6.0.60002-1_all.deb
sudo amdgpu-install --usecase=rocm
* `rocm-smi --help`
Building ROCm SMI
******************
Additional required software
=============================
To build the ROCm SMI library, the following components are required.
The following software versions are what was used in development. Earlier versions are not guaranteed to work:
* CMake (v3.14.0)
* g++ (5.4.0)
To build the latest documentation, the following are required:
* Python 3.8+
* NPM (sass)
The source code for ROCm SMI is available on `GitHub <https://github.com/RadeonOpenCompute/rocm_smi_lib>`_.
After the ROCm SMI library git repository is cloned to a local Linux machine, use the following CMake build sequence to build the library. Specifically,
.. code-block:: shell
mkdir -p build
cd build
cmake ..
make -j $(nproc)
# Install library file and header; default location is /opt/rocm
make install
The built library will appear in the `build` folder.
To build the rpm and deb packages, follow the preceding steps and then run:
.. code-block:: shell
make package
Building documentation
=======================
The following is an example of how to build the docs:
.. code-block:: shell
python3 -m venv .venv
.venv/bin/python3 -m pip install -r docs/sphinx/requirements.txt
.venv/bin/python3 -m sphinx -T -E -b html -d docs/_build/doctrees -D language=en docs docs/_build/html
Building tests
=================
To verify the build and capability of ROCm SMI on your system and to see an example of how ROCm SMI can be used, you may build and run the tests that are available in the repo. To build the tests, follow these steps:
.. code-block:: bash
mkdir build
cd build
cmake -DBUILD_TESTS=ON ..
make -j $(nproc)
To run the tests, execute the ``rsmitst`` program that is built by the preceding steps.
+4
View File
@@ -0,0 +1,4 @@
# License
```{include} ../License.txt
```
@@ -0,0 +1,16 @@
:orphan:
.. meta::
:description: ROCm SMI API reference
:keywords: API, SMI, AMD, ROCm
******************
API reference
******************
This section provides technical descriptions and important information about the different ROCm SMI and library components.
* :doc:`Library <../doxygen/docBin/html/files>`
* :doc:`Functions <../doxygen/docBin/html/globals>`
* :doc:`Data structures <../doxygen/docBin/html/annotated>`
@@ -0,0 +1,13 @@
=====================
Python API reference
=====================
This section describes the ROCm SMI Python module API.
Functions
---------
.. automodule:: rocm_smi
:members:
:undoc-members:
:exclude-members: UIntegerTypes, validateIfMaxUint
@@ -0,0 +1 @@
_toc.yml
@@ -0,0 +1,43 @@
# Anywhere {branch} is used, the branch name will be substituted.
# These comments will also be removed.
defaults:
numbered: False
maxdepth: 6
root: index
subtrees:
- caption: Installation
entries:
- file: install/install
title: ROCm SMI installation
- caption: How to
entries:
- file: how-to/use-cpp
- file: how-to/use-python
- caption: API Reference
entries:
- file: doxygen/html/files
title: Files
- file: doxygen/html/globals
title: Globals
- file: doxygen/html/annotated
title: Data structures
- file: doxygen/html/modules
title: Modules
- file: reference/python_api
title: Python API
- caption: Tutorials
entries:
- file: tutorials/cpp_tutorials
title: C++
- file: tutorials/python_tutorials
title: Python
- caption: About
entries:
- file: license
title: License
@@ -0,0 +1 @@
rocm-docs-core[api_reference]==1.13.0
@@ -0,0 +1,169 @@
#
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
# pip-compile requirements.in
#
accessible-pygments==0.0.5
# via pydata-sphinx-theme
alabaster==0.7.16
# via sphinx
babel==2.15.0
# via
# pydata-sphinx-theme
# sphinx
beautifulsoup4==4.12.3
# via pydata-sphinx-theme
breathe==4.35.0
# via rocm-docs-core
certifi==2024.6.2
# via requests
cffi==1.16.0
# via
# cryptography
# pynacl
charset-normalizer==3.3.2
# via requests
click==8.1.7
# via
# click-log
# doxysphinx
# sphinx-external-toc
click-log==0.4.0
# via doxysphinx
cryptography==42.0.8
# via pyjwt
deprecated==1.2.14
# via pygithub
docutils==0.21.2
# via
# breathe
# myst-parser
# pydata-sphinx-theme
# sphinx
doxysphinx==3.3.8
# via rocm-docs-core
fastjsonschema==2.19.1
# via rocm-docs-core
gitdb==4.0.11
# via gitpython
gitpython==3.1.43
# via rocm-docs-core
idna==3.7
# via requests
imagesize==1.4.1
# via sphinx
jinja2==3.1.4
# via
# myst-parser
# sphinx
libsass==0.22.0
# via doxysphinx
lxml==4.9.4
# via doxysphinx
markdown-it-py==3.0.0
# via
# mdit-py-plugins
# myst-parser
markupsafe==2.1.5
# via jinja2
mdit-py-plugins==0.4.1
# via myst-parser
mdurl==0.1.2
# via markdown-it-py
mpire==2.10.2
# via doxysphinx
myst-parser==4.0.0
# via rocm-docs-core
numpy==1.26.4
# via doxysphinx
packaging==24.1
# via
# pydata-sphinx-theme
# sphinx
pycparser==2.22
# via cffi
pydata-sphinx-theme==0.15.3
# via
# rocm-docs-core
# sphinx-book-theme
pygithub==2.3.0
# via rocm-docs-core
pygments==2.18.0
# via
# accessible-pygments
# mpire
# pydata-sphinx-theme
# sphinx
pyjson5==1.6.6
# via doxysphinx
pyjwt[crypto]==2.8.0
# via pygithub
pynacl==1.5.0
# via pygithub
pyparsing==3.1.2
# via doxysphinx
pyyaml==6.0.1
# via
# myst-parser
# rocm-docs-core
# sphinx-external-toc
requests==2.32.3
# via
# pygithub
# sphinx
rocm-docs-core[api-reference]==1.13.0
# via -r requirements.in
smmap==5.0.1
# via gitdb
snowballstemmer==2.2.0
# via sphinx
soupsieve==2.5
# via beautifulsoup4
sphinx==8.0.2
# via
# breathe
# myst-parser
# pydata-sphinx-theme
# rocm-docs-core
# sphinx-book-theme
# sphinx-copybutton
# sphinx-design
# sphinx-external-toc
# sphinx-notfound-page
sphinx-book-theme==1.1.3
# via rocm-docs-core
sphinx-copybutton==0.5.2
# via rocm-docs-core
sphinx-design==0.6.1
# via rocm-docs-core
sphinx-external-toc==1.0.1
# via rocm-docs-core
sphinx-notfound-page==1.0.4
# via rocm-docs-core
sphinxcontrib-applehelp==2.0.0
# via sphinx
sphinxcontrib-devhelp==2.0.0
# via sphinx
sphinxcontrib-htmlhelp==2.1.0
# via sphinx
sphinxcontrib-jsmath==1.0.1
# via sphinx
sphinxcontrib-qthelp==2.0.0
# via sphinx
sphinxcontrib-serializinghtml==2.0.0
# via sphinx
tomli==2.0.1
# via sphinx
tqdm==4.66.5
# via mpire
typing-extensions==4.12.2
# via
# pydata-sphinx-theme
# pygithub
urllib3==2.2.2
# via
# pygithub
# requests
wrapt==1.16.0
# via deprecated
@@ -0,0 +1,35 @@
.. meta::
:description: ROCm SMI tutorial
:keywords: install, SMI, library, api, AMD, ROCm
ROCm SMI C++ API tutorial
----------------------------
.. code-block:: c++
#include <stdint.h>
#include "rocm_smi/rocm_smi.h"
int main() {
  rsmi_status_t ret;
  // Start at zero so the loop below is simply skipped, instead of reading an
  // indeterminate value, if rsmi_num_monitor_devices() does not fill it in.
  uint32_t num_devices = 0;
  uint16_t dev_id;

  // We will skip return code checks for this example, but it
  // is recommended to always check this as some calls may not
  // apply for some devices or ROCm releases
  ret = rsmi_init(0);
  ret = rsmi_num_monitor_devices(&num_devices);

  // The index has the same unsigned type as num_devices, which avoids a
  // signed/unsigned comparison in the loop condition.
  for (uint32_t i = 0; i < num_devices; ++i) {
    ret = rsmi_dev_id_get(i, &dev_id);
    // dev_id holds the device ID of device i, upon a
    // successful call
  }

  ret = rsmi_shut_down();
  return 0;
}
For more examples, see the `C++ example <https://github.com/ROCm/rocm_smi_lib/blob/develop/rocm_smi/example/rocm_smi_example.cc>`_
or the `tests <https://github.com/ROCm/rocm_smi_lib/tree/develop/tests/rocm_smi_test/functional>`_.
@@ -0,0 +1,31 @@
.. meta::
:description: ROCm SMI Python tutorial
:keywords: install, SMI, library, api, AMD, ROCm
ROCm SMI Python API tutorial
-----------------------------
.. code-block:: python
import sys
sys.path.append("/opt/rocm/libexec/rocm_smi/")
try:
import rocm_smi
except ImportError:
raise ImportError("Could not import /opt/rocm/libexec/rocm_smi/rocm_smi.py")
class prof_utils:
    """Thin convenience wrapper around the ``rocm_smi`` Python module."""

    def __init__(self, mode) -> None:
        # ``mode`` is accepted but not used here; constructing the object
        # only initializes the library.
        rocm_smi.initializeRsmi()

    def getPower(self, device):
        """Return whatever ``rocm_smi.getPower`` reports for ``device``."""
        power = rocm_smi.getPower(device)
        return power

    def listDevices(self):
        """Return the result of ``rocm_smi.listDevices()`` unchanged."""
        devices = rocm_smi.listDevices()
        return devices

    def getMemInfo(self, device):
        """Return used VRAM of ``device`` as a percentage, rounded to 2 decimals."""
        used, total = rocm_smi.getMemInfo(device, "vram")
        fraction = float(used) / float(total)
        return round(fraction * 100, 2)
@@ -0,0 +1 @@
:orphan:
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+204
View File
@@ -0,0 +1,204 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2018-2025, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#ifndef INCLUDE_ROCM_SMI_ROCM_SMI_COMMON_H_
#define INCLUDE_ROCM_SMI_ROCM_SMI_COMMON_H_
#include <cstdint>
#include <memory>
#include <map>
#include <vector>
#include <string>
#include <unordered_set>
// Validates the caller's `dv_ind` against the number of known devices and
// returns RSMI_STATUS_INVALID_ARGS from the enclosing function when it is out
// of range. Declares a local reference `smi` to the RocmSMI singleton, which
// stays in scope for the code that follows the macro invocation.
// The last line intentionally carries no trailing backslash, so the macro
// definition ends here regardless of what the next line of the file contains.
#define CHECK_DV_IND_RANGE \
    amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); \
    if (dv_ind >= smi.devices().size()) { \
      return RSMI_STATUS_INVALID_ARGS; \
    }
// Resolves `dv_ind` to its device object. Expands CHECK_DV_IND_RANGE first
// (so an out-of-range index returns early), then declares `dev` as the
// shared_ptr stored at smi.devices()[dv_ind] and asserts that it is non-null.
// NOTE(review): relies on assert(), which this header does not include
// itself -- presumably the including file provides <cassert>; confirm.
#define GET_DEV_FROM_INDX \
CHECK_DV_IND_RANGE \
std::shared_ptr<amd::smi::Device> dev = smi.devices()[dv_ind]; \
assert(dev != nullptr);
// Expands GET_DEV_FROM_INDX and additionally declares `kfd_node`, looked up in
// smi.kfd_node_map() by the device's kfd_gpu_id(). If the map has no entry for
// that id, the enclosing function returns RSMI_INITIALIZATION_ERROR.
// The key is searched with find() before operator[] is used, so the lookup
// never inserts a new entry into the map.
#define GET_DEV_AND_KFDNODE_FROM_INDX \
GET_DEV_FROM_INDX \
std::shared_ptr<amd::smi::KFDNode> kfd_node; \
if (smi.kfd_node_map().find(dev->kfd_gpu_id()) == \
smi.kfd_node_map().end()) { \
return RSMI_INITIALIZATION_ERROR; \
} \
kfd_node = smi.kfd_node_map()[dev->kfd_gpu_id()];
// Returns RSMI_STATUS_PERMISSION from the enclosing function when the
// RocmSMI singleton reports a non-zero euid() (i.e. the caller is not root).
#define REQUIRE_ROOT_ACCESS \
if (amd::smi::RocmSMI::getInstance().euid()) { \
return RSMI_STATUS_PERMISSION; \
}
// Takes the mutex returned by amd::smi::GetMutex(dv_ind) for the remainder of
// the enclosing scope, via the ScopedPthread object `_lock`.
// Acquisition is blocking unless the library was initialized with the
// RSMI_INIT_FLAG_RESRV_TEST1 option bit set; in that non-blocking case the
// enclosing function returns RSMI_STATUS_BUSY if the mutex was not acquired.
// Declares the locals `_pw`, `smi_`, `blocking_` and `_lock` in the caller's
// scope.
#define DEVICE_MUTEX \
amd::smi::pthread_wrap _pw(*amd::smi::GetMutex(dv_ind)); \
amd::smi::RocmSMI& smi_ = amd::smi::RocmSMI::getInstance(); \
bool blocking_ = !(smi_.init_options() & \
static_cast<uint64_t>(RSMI_INIT_FLAG_RESRV_TEST1)); \
amd::smi::ScopedPthread _lock(_pw, blocking_); \
if (!blocking_ && _lock.mutex_not_acquired()) { \
return RSMI_STATUS_BUSY; \
}
/* This group of macros is used to facilitate checking of support for rsmi_dev*
* "getter" functions. When the return buffer is set to nullptr, the macro will
* check the previously gathered device support data to see if the function,
* with possible variants (e.g., memory types, firmware types,...) and
* subvariants (e.g. monitors/sensors) are supported.
*/
// This macro assumes dev already available
// It only acts when RT_PTR is nullptr, and in that case always returns from
// the enclosing function:
//   - RSMI_STATUS_NOT_SUPPORTED when dev->DeviceAPISupported() reports the
//     calling function (__FUNCTION__ at the expansion site) with variant VR
//     and sub-variant SUB_VR as unsupported;
//   - RSMI_STATUS_INVALID_ARGS when it is supported (the null buffer is then
//     simply an invalid argument), or when an rsmi_exception is caught while
//     checking (the exception text is passed to debug_print).
// When RT_PTR is not nullptr the macro does nothing.
#define CHK_API_SUPPORT_ONLY(RT_PTR, VR, SUB_VR) \
if ((RT_PTR) == nullptr) { \
try { \
if (!dev->DeviceAPISupported(__FUNCTION__, (VR), (SUB_VR))) { \
return RSMI_STATUS_NOT_SUPPORTED; \
} \
return RSMI_STATUS_INVALID_ARGS; \
} catch (const amd::smi::rsmi_exception& e) { \
debug_print( \
"Exception caught when checking if API is supported %s.\n", \
e.what()); \
return RSMI_STATUS_INVALID_ARGS; \
} \
}
// Looks up `dev` from `dv_ind` (GET_DEV_FROM_INDX) and then performs the
// null-buffer support check of CHK_API_SUPPORT_ONLY for the given variant
// (VR) and sub-variant (SUB_VR).
#define CHK_SUPPORT(RT_PTR, VR, SUB_VR) \
    GET_DEV_FROM_INDX \
    CHK_API_SUPPORT_ONLY((RT_PTR), (VR), (SUB_VR))

// Convenience forms of CHK_SUPPORT that pass RSMI_DEFAULT_VARIANT for the
// variant and/or sub-variant the caller does not care about.
// None of these definitions ends in a line continuation, so each macro body
// stops at its own last line regardless of what follows it in the file.
#define CHK_SUPPORT_NAME_ONLY(RT_PTR) \
    CHK_SUPPORT((RT_PTR), RSMI_DEFAULT_VARIANT, RSMI_DEFAULT_VARIANT)

#define CHK_SUPPORT_VAR(RT_PTR, VR) \
    CHK_SUPPORT((RT_PTR), (VR), RSMI_DEFAULT_VARIANT)

#define CHK_SUPPORT_SUBVAR_ONLY(RT_PTR, SUB_VR) \
    CHK_SUPPORT((RT_PTR), RSMI_DEFAULT_VARIANT, (SUB_VR))
// Debug trace for file accesses. When `env_` is non-null and its
// debug_output_bitfield has the RSMI_DEBUG_SYSFS_FILE_PATHS bit set, prints to
// std::cout: the calling function name, the file name FN, whether the file is
// opened for writing (WR_STR != nullptr, in which case the string is printed)
// or for reading, and the source file/line of the expansion site.
// Expects a pointer named `env_` to be in scope where the macro is expanded.
#define DBG_FILE_ERROR(FN, WR_STR) \
if (env_ && env_->debug_output_bitfield & RSMI_DEBUG_SYSFS_FILE_PATHS) { \
std::cout << "*****" << __FUNCTION__ << std::endl; \
std::cout << "*****Opening file: " << (FN) << std::endl; \
if ((WR_STR) != nullptr) { \
std::cout << "***** for writing. Writing: \"" << (WR_STR) << "\""; \
} else { std::cout << "***** for reading.";} \
std::cout << std::endl; \
std::cout << " at " << __FILE__ << ":" << std::dec << __LINE__ << \
std::endl;\
}
// Prints "<WR_STR> <VR>" to std::cout when the RSMI_DEBUG_VAL bit is set in
// the singleton's environment debug bitfield and WR_STR is not nullptr.
// NOTE(review): this declares a local named `smi` in the caller's scope, the
// same name CHECK_DV_IND_RANGE declares -- presumably the two are never
// expanded in the same scope; confirm before combining them.
#define DEBUG_LOG(WR_STR, VR) \
amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); \
if (smi.getEnv().debug_output_bitfield & RSMI_DEBUG_VAL) { \
if ((WR_STR) != nullptr) \
std::cout << (WR_STR) << " " << (VR) << std::endl;\
}
// Add different debug filters here, as powers of 2; e.g, 1, 2, 4, 8, ...
// Each value is fully parenthesized so that it behaves as a single operand
// wherever it is expanded (without the parentheses, an expression such as
// `RSMI_DEBUG_VAL + 1` would parse as `1 << (1 + 1)`).
#define RSMI_DEBUG_SYSFS_FILE_PATHS (1 << 0)
#define RSMI_DEBUG_VAL (1 << 1)
// Iterator handle made of two pointer-sized integers and a 32-bit type tag.
// NOTE(review): presumably func_id_iter holds a type-erased iterator,
// container_ptr the container it walks, and id_type which level (function /
// variant / sub-variant) is being iterated -- confirm against the code that
// creates and consumes these handles.
struct rsmi_func_id_iter_handle {
uintptr_t func_id_iter;
uintptr_t container_ptr;
uint32_t id_type;
};
// Settings taken from RSMI_* environment variables. The comment on each field
// names the environment variable it corresponds to.
// NOTE(review): the code that reads the environment and fills this struct is
// not part of this header.
struct RocmSMI_env_vars {
// If RSMI_DEBUG_INFINITE_LOOP is non-zero, rsmi_init() will go into
// an infinite loop in debug builds. For release builds, this is
// ignored. This is useful for debugging RSMI applications with
// gdb. After attaching with gdb, the inf. loop can be exited and
// RSMI can be debugged.
uint32_t debug_inf_loop;
// Bitfield that is AND'd with various RSMI_DEBUG_* bits to determine
// which debugging information should be turned on. Env. variable
// RSMI_DEBUG_BITFIELD is used to set all the debug info bits.
uint32_t debug_output_bitfield;
// The integer value of sysfs field enum that is to be over-ridden.
// Env. variable RSMI_DEBUG_ENUM_OVERRIDE is used to specify this.
// A set of enum overrides, RSMI_DEBUG_ENUM_OVERRIDE now supports
// comma delimited values.
std::unordered_set<uint32_t> enum_overrides;
// If RSMI_LOGGING is set, enables logging.
// Otherwise unset values, signify logging is turned off.
uint32_t logging_on;
// Sysfs path overrides
// Env. var. RSMI_DEBUG_DRM_ROOT_OVERRIDE
const char *path_DRM_root_override;
// Env. var. RSMI_DEBUG_HWMON_ROOT_OVERRIDE
const char *path_HWMon_root_override;
// Env. var. RSMI_DEBUG_PP_ROOT_OVERRIDE
const char *path_power_root_override;
};
// Use this bit offset to store the label-mapped file index
// MONITOR_IND_BIT_MASK selects the bits below MONITOR_TYPE_BIT_POSITION
// (the low 16 bits, i.e. 0xFFFF).
#define MONITOR_TYPE_BIT_POSITION 16
#define MONITOR_IND_BIT_MASK ((1 << MONITOR_TYPE_BIT_POSITION) - 1)
// Support information data structures.
// Three nested levels, each with a matching const_iterator alias:
//   SubVariant        - list of 64-bit sub-variant ids
//   VariantMap        - 64-bit variant id -> shared SubVariant list
//   SupportedFuncMap  - string key -> shared VariantMap
using SubVariant = std::vector<uint64_t>;
using SubVariantIt = SubVariant::const_iterator;

using VariantMap = std::map<uint64_t, std::shared_ptr<SubVariant>>;
using VariantMapIt = VariantMap::const_iterator;

using SupportedFuncMap = std::map<std::string, std::shared_ptr<VariantMap>>;
using SupportedFuncMapIt = SupportedFuncMap::const_iterator;
#endif // INCLUDE_ROCM_SMI_ROCM_SMI_COMMON_H_
+124
View File
@@ -0,0 +1,124 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2025, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#ifndef INCLUDE_ROCM_SMI_ROCM_SMI_COUNTERS_H_
#define INCLUDE_ROCM_SMI_ROCM_SMI_COUNTERS_H_
#include <linux/perf_event.h>
#include <cstdint>
#include <vector>
#include <unordered_set>
#include <string>
#include "rocm_smi/rocm_smi.h"
namespace amd {
namespace smi {
namespace evt {
// Hash functor for containers keyed by rsmi_event_group_t: the hash is simply
// the group value converted to size_t.
class RSMIEventGrpHashFunction {
 public:
  size_t operator()(const rsmi_event_group_t& grp) const {
    const size_t hashed = static_cast<size_t>(grp);
    return hashed;
  }
};
// Set of event groups, hashed with RSMIEventGrpHashFunction.
typedef std::unordered_set<rsmi_event_group_t, RSMIEventGrpHashFunction>
dev_evt_grp_set_t;
// Declaration only; the definition is not in this header.
// NOTE(review): presumably fills *supported_grps with the event groups that
// device `dev_ind` supports -- confirm against the implementation.
void
GetSupportedEventGroups(uint32_t dev_ind, dev_evt_grp_set_t*supported_grps);
// One bit-field description: a starting bit, a width, and a 64-bit value.
// Event keeps a vector of these under the comment "perf_event_attr fields".
// NOTE(review): presumably each entry says where `value` is placed inside a
// perf_event_attr config word -- confirm against the Event implementation.
struct evnt_info_t {
uint8_t start_bit;
uint8_t field_size;
uint64_t value;
};
// Three 64-bit counters that can be accessed either by name (value,
// enabled_time, run_time) or positionally through values[3]; both views
// share the same storage via the union.
// NOTE(review): the field order looks like the buffer layout read() returns
// for a perf event opened with the total-time-enabled and total-time-running
// read_format flags -- confirm against the code that sets attr_.read_format.
// NOTE(review): an unnamed struct inside a union is a compiler extension in
// C++ and may warn under -pedantic -- confirm this is acceptable.
struct perf_read_format_t {
union {
struct {
uint64_t value;
uint64_t enabled_time;
uint64_t run_time;
};
uint64_t values[3];
};
};
// A single counter event of type `event` on device `dev_ind`. The class is
// only declared here; all non-inline members are defined elsewhere.
// NOTE(review): the members (fd_, perf_event_attr attr_) and method names
// suggest this wraps a Linux perf event file descriptor -- confirm in the
// implementation.
// NOTE(review): the class stores a descriptor in fd_ and has a user-declared
// destructor but no copy control is declared here -- confirm that instances
// are never copied, or that copying is safe.
class Event {
public:
explicit Event(rsmi_event_type_t event, uint32_t dev_ind);
~Event(void);
int32_t openPerfHandle();
int32_t startCounter(void);
int32_t stopCounter(void);
uint32_t getValue(rsmi_counter_value_t *val);
// Simple accessors for the stored indices.
uint32_t dev_file_ind(void) const {return dev_file_ind_;}
uint32_t dev_ind(void) const {return dev_ind_;}
private:
// perf_event_attr fields
std::vector<evnt_info_t> event_info_;
std::string evt_path_root_;
rsmi_event_type_t event_type_;
uint32_t dev_file_ind_;
uint32_t dev_ind_;
int32_t fd_;
perf_event_attr attr_;
uint64_t prev_cntr_val_;
int32_t get_event_file_info(void);
int32_t get_event_type(uint32_t *ev_type);
};
} // namespace evt
} // namespace smi
} // namespace amd
#endif // INCLUDE_ROCM_SMI_ROCM_SMI_COUNTERS_H_
+297
View File
@@ -0,0 +1,297 @@
/*
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017-2025, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#ifndef INCLUDE_ROCM_SMI_ROCM_SMI_DEVICE_H_
#define INCLUDE_ROCM_SMI_ROCM_SMI_DEVICE_H_
#include <pthread.h>
#include <string>
#include <memory>
#include <utility>
#include <cstdint>
#include <vector>
#include <unordered_set>
#include <map>
#include <type_traits>
#include <optional>
#include "rocm_smi/rocm_smi_monitor.h"
#include "rocm_smi/rocm_smi_power_mon.h"
#include "rocm_smi/rocm_smi_common.h"
#include "rocm_smi/rocm_smi.h"
#include "rocm_smi/rocm_smi_counters.h"
#include "rocm_smi/rocm_smi_properties.h"
#include "rocm_smi/rocm_smi_gpu_metrics.h"
#include "shared_mutex.h" //NOLINT
namespace amd {
namespace smi {
// Identifiers for individual KFD node properties.
// The enumerators have no explicit values, so they are numbered 0, 1, 2, ...
// in declaration order; inserting or reordering entries changes the numeric
// value of every entry after the change.
// NOTE(review): presumably each entry corresponds to one named field of a KFD
// topology node's properties -- the name mapping is not in this header;
// confirm.
enum DevKFDNodePropTypes {
kDevKFDNodePropCachesCnt,
kDevKFDNodePropIoLinksCnt,
kDevKFDNodePropCPUCoreIdBase,
kDevKFDNodePropSimdIdBase,
kDevKFDNodePropMaxWavePerSimd,
kDevKFDNodePropLdsSz,
kDevKFDNodePropGdsSz,
kDevKFDNodePropNumGWS,
kDevKFDNodePropWaveFrontSize,
kDevKFDNodePropArrCnt,
kDevKFDNodePropSimdArrPerEng,
kDevKFDNodePropCuPerSimdArr,
kDevKFDNodePropSimdPerCU,
kDevKFDNodePropMaxSlotsScratchCu,
kDevKFDNodePropVendorId,
kDevKFDNodePropDeviceId,
kDevKFDNodePropLocationId,
kDevKFDNodePropDrmRenderMinor,
kDevKFDNodePropHiveId,
kDevKFDNodePropNumSdmaEngines,
kDevKFDNodePropNumSdmaXgmiEngs,
kDevKFDNodePropMaxEngClkFComp,
kDevKFDNodePropLocMemSz,
kDevKFDNodePropFwVer,
kDevKFDNodePropCapability,
kDevKFDNodePropDbgProp,
kDevKFDNodePropSdmaFwVer,
kDevKFDNodePropMaxEngClkCComp,
kDevKFDNodePropDomain,
};
// Identifiers for the per-device pieces of information that Device can read
// or write: this is the `type` argument of Device::readDevInfo(),
// Device::readDevInfoLine() and Device::writeDevInfo(), and the key type of
// Device::devInfoTypesStrings.
// The enumerators have no explicit values, so they are numbered 0, 1, 2, ...
// in declaration order; inserting or reordering entries changes the numeric
// value of every entry after the change.
// NOTE(review): RocmSMI_env_vars::enum_overrides is documented as holding
// "the integer value of sysfs field enum" -- presumably those integers are
// values of this enum, which would make the order externally visible; confirm.
enum DevInfoTypes {
kDevPerfLevel,
kDevOverDriveLevel,
kDevMemOverDriveLevel,
kDevDevID,
kDevXGMIPhysicalID,
kDevDevRevID,
kDevDevProdName,
kDevDevProdNum,
kDevVendorID,
kDevSubSysDevID,
kDevSubSysVendorID,
kDevGPUMClk,
kDevGPUSClk,
kDevDCEFClk,
kDevFClk,
kDevSOCClk,
kDevPCIEClk,
kDevPowerProfileMode,
kDevUsage,
kDevPowerODVoltage,
kDevVBiosVer,
kDevPCIEThruPut,
kDevErrCntSDMA,
kDevErrCntUMC,
kDevErrCntGFX,
kDevErrCntMMHUB,
kDevErrCntPCIEBIF,
kDevErrCntHDP,
kDevErrCntXGMIWAFL,
kDevErrCntFeatures,
kDevMemTotGTT,
kDevMemTotVisVRAM,
kDevMemTotVRAM,
kDevMemUsedGTT,
kDevMemUsedVisVRAM,
kDevMemUsedVRAM,
kDevVramVendor,
kDevPCIEReplayCount,
kDevUniqueId,
kDevDFCountersAvailable,
kDevMemBusyPercent,
kDevXGMIError,
kDevFwVersionAsd,
kDevFwVersionCe,
kDevFwVersionDmcu,
kDevFwVersionMc,
kDevFwVersionMe,
kDevFwVersionMec,
kDevFwVersionMec2,
kDevFwVersionMes,
kDevFwVersionMesKiq,
kDevFwVersionPfp,
kDevFwVersionRlc,
kDevFwVersionRlcSrlc,
kDevFwVersionRlcSrlg,
kDevFwVersionRlcSrls,
kDevFwVersionSdma,
kDevFwVersionSdma2,
kDevFwVersionSmc,
kDevFwVersionSos,
kDevFwVersionTaRas,
kDevFwVersionTaXgmi,
kDevFwVersionUvd,
kDevFwVersionVce,
kDevFwVersionVcn,
kDevSerialNumber,
kDevMemPageBad,
kDevNumaNode,
kDevGpuMetrics,
kDevGpuReset,
kDevAvailableComputePartition,
kDevComputePartition,
kDevMemoryPartition,
kDevAvailableMemoryPartition,
};
// Dependency description: a list of C-string names plus a list of
// DevInfoTypes variants.
// NOTE(review): presumably used when building the supported-function map
// (names that must be present for an API to count as supported, and the
// variants that API accepts) -- confirm against Device::fillSupportedFuncs().
typedef struct {
std::vector<const char *> mandatory_depends;
std::vector<DevInfoTypes> variants;
} dev_depends_t;
// One device managed by the library. Holds the device's path, its
// Monitor/PowerMon helpers, identifiers (card index, DRM render minor, BDF id,
// KFD GPU id), the per-device mutex, event-notification file handles, the
// supported-function map and cached GPU-metrics state.
// Only the trivial accessors are defined inline; everything else is declared
// here and defined elsewhere.
class Device {
 public:
  explicit Device(std::string path, RocmSMI_env_vars const *e);
  ~Device(void);

  // Monitor / power-monitor ownership. The setters receive the shared_ptr by
  // value and move it into the member, so no extra reference-count
  // increment/decrement pair is performed.
  void set_monitor(std::shared_ptr<Monitor> m) {monitor_ = std::move(m);}
  std::string path(void) const {return path_;}
  const std::shared_ptr<Monitor>& monitor() {return monitor_;}
  const std::shared_ptr<PowerMon>& power_monitor() {return power_monitor_;}
  void set_power_monitor(std::shared_ptr<PowerMon> pm) {
    power_monitor_ = std::move(pm);
  }

  // Read/write one piece of device information selected by `type`. The
  // overloads differ only in the form of the value (integer, string, list of
  // lines, raw binary buffer of b_size bytes).
  // NOTE(review): the int return is presumably 0 on success and an errno-style
  // code otherwise -- confirm against the implementation.
  int readDevInfo(DevInfoTypes type, uint64_t *val);
  int readDevInfoLine(DevInfoTypes type, std::string *line);
  int readDevInfo(DevInfoTypes type, std::string *val);
  int readDevInfo(DevInfoTypes type, std::vector<std::string> *retVec);
  int readDevInfo(DevInfoTypes type, std::size_t b_size,
                  void *p_binary_data);
  int writeDevInfo(DevInfoTypes type, uint64_t val);
  int writeDevInfo(DevInfoTypes type, std::string val);

  // Identifier accessors.
  uint32_t index(void) const {return card_indx_;}
  void set_card_index(uint32_t index) {card_indx_ = index;}
  uint32_t drm_render_minor(void) const {return drm_render_minor_;}
  void set_drm_render_minor(uint32_t minor) {drm_render_minor_ = minor;}
  static rsmi_dev_perf_level perfLvlStrToEnum(std::string s);
  uint64_t bdfid(void) const {return bdfid_;}
  void set_bdfid(uint64_t val) {bdfid_ = val;}

  // Raw pointer to the device's mutex (the `ptr` member of mutex_).
  pthread_mutex_t *mutex(void) {return mutex_.ptr;}

  // Pointers to internal containers; callers may modify them in place.
  evt::dev_evt_grp_set_t* supported_event_groups(void) {
    return &supported_event_groups_;
  }
  SupportedFuncMap *supported_funcs(void) {return &supported_funcs_;}

  uint64_t kfd_gpu_id(void) const {return kfd_gpu_id_;}
  void set_kfd_gpu_id(uint64_t id) {kfd_gpu_id_ = id;}

  // Event-notification file handle, stored both as FILE* and as descriptor.
  void set_evt_notif_anon_file_ptr(FILE *f) {evt_notif_anon_file_ptr_ = f;}
  FILE *evt_notif_anon_file_ptr(void) const {return evt_notif_anon_file_ptr_;}
  void set_evt_notif_anon_fd(int fd) {evt_notif_anon_fd_ = fd;}
  // Unsigned overload: the value is converted to int before being stored.
  void set_evt_notif_anon_fd(uint32_t fd) {
    evt_notif_anon_fd_ = static_cast<int>(fd);
  }
  int evt_notif_anon_fd(void) const {return evt_notif_anon_fd_;}

  // Supported-function bookkeeping (queried by the CHK_SUPPORT macros through
  // DeviceAPISupported()).
  void fillSupportedFuncs(void);
  void DumpSupportedFunctions(void);
  bool DeviceAPISupported(std::string name, uint64_t variant,
                          uint64_t sub_variant);

  rsmi_status_t restartAMDGpuDriver(void);
  rsmi_status_t isRestartInProgress(bool *isRestartInProgress,
                                    bool *isAMDGPUModuleLive);
  rsmi_status_t storeDevicePartitions(uint32_t dv_ind);
  template <typename T> std::string readBootPartitionState(uint32_t dv_ind);
  rsmi_status_t check_amdgpu_property_reinforcement_query(
      uint32_t dev_idx, AMDGpuVerbTypes_t verb_type);

  // GPU metrics state.
  void dev_set_gpu_metric(GpuMetricsBasePtr gpu_metrics_ptr) {
    m_gpu_metrics_ptr = std::move(gpu_metrics_ptr);
  }
  GpuMetricsBasePtr& dev_get_gpu_metric() { return m_gpu_metrics_ptr; }
  const AMDGpuMetricsHeader_v1_t& dev_get_metrics_header() {
    return m_gpu_metrics_header;
  }
  rsmi_status_t setup_gpu_metrics_reading();
  rsmi_status_t dev_read_gpu_metrics_header_data();
  rsmi_status_t dev_read_gpu_metrics_all_data();
  rsmi_status_t run_internal_gpu_metrics_query(
      AMDGpuMetricsUnitType_t metric_counter,
      AMDGpuDynamicMetricTblValues_t& values);
  rsmi_status_t dev_log_gpu_metrics(std::ostringstream& outstream_metrics);
  AMGpuMetricsPublicLatestTupl_t dev_copy_internal_to_external_metrics();

  // Printable name for each DevInfoTypes value.
  static const std::map<DevInfoTypes, const char*> devInfoTypesStrings;
  void set_smi_device_id(uint32_t i) { m_device_id = i; }
  void set_smi_partition_id(uint32_t i) { m_partition_id = i; }
  static const char* get_type_string(DevInfoTypes type);
  rsmi_status_t get_smi_device_identifiers(uint32_t device_id,
                          rsmi_device_identifiers_t *device_identifiers);

 private:
  std::shared_ptr<Monitor> monitor_;
  std::shared_ptr<PowerMon> power_monitor_;
  std::string path_;
  shared_mutex_t mutex_;
  uint32_t card_indx_;  // This index corresponds to the drm index (ie, card#)
  uint32_t drm_render_minor_;
  const RocmSMI_env_vars *env_;

  // File-stream helpers; `str` is an optional string, nullptr by default.
  template <typename T> int openDebugFileStream(DevInfoTypes type, T *fs,
                                                const char *str = nullptr);
  template <typename T> int openSysfsFileStream(DevInfoTypes type, T *fs,
                                                const char *str = nullptr);
  int readDebugInfoStr(DevInfoTypes type, std::string *retStr);
  int readDevInfoStr(DevInfoTypes type, std::string *retStr);
  int readDevInfoMultiLineStr(DevInfoTypes type,
                              std::vector<std::string> *retVec);
  int readDevInfoBinary(DevInfoTypes type, std::size_t b_size,
                        void *p_binary_data);
  int writeDevInfoStr(DevInfoTypes type, std::string valStr,
                      bool returnWriteErr = false);
  rsmi_status_t run_amdgpu_property_reinforcement_query(
      const AMDGpuPropertyQuery_t& amdgpu_property_query);

  uint64_t bdfid_;
  uint64_t kfd_gpu_id_;
  std::unordered_set<rsmi_event_group_t,
                     evt::RSMIEventGrpHashFunction> supported_event_groups_;
  // std::map<std::string, uint64_t> kfdNodePropMap_;
  SupportedFuncMap supported_funcs_;
  int evt_notif_anon_fd_;
  FILE *evt_notif_anon_file_ptr_;
  GpuMetricsBasePtr m_gpu_metrics_ptr;
  AMDGpuMetricsHeader_v1_t m_gpu_metrics_header;
  uint64_t m_gpu_metrics_updated_timestamp;
  uint32_t m_device_id;
  uint32_t m_partition_id;
};
} // namespace smi
} // namespace amd
#endif // INCLUDE_ROCM_SMI_ROCM_SMI_DEVICE_H_
@@ -0,0 +1,80 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2025, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#ifndef INCLUDE_ROCM_SMI_ROCM_SMI_EXCEPTION_H_
#define INCLUDE_ROCM_SMI_ROCM_SMI_EXCEPTION_H_
#include <exception>
#include <string>
#include "rocm_smi/rocm_smi.h"
// Guards against dereferencing a null pointer. Where assert() is active the
// assertion fires first; otherwise a null PTR throws amd::smi::rsmi_exception
// carrying RSMI_STATUS_INVALID_ARGS and the name of the calling function.
//
// The expansion is a single compound statement so that both checks stay
// together when the macro is used as the body of an unbraced if/for/while.
// It may be written with or without a trailing semicolon at the call site.
// PTR is evaluated more than once, so it must not have side effects.
// NOTE: this header does not include <cassert>; the including file must.
#define THROW_IF_NULLPTR_DEREF(PTR) \
  { \
    assert((PTR) != nullptr); \
    if ((PTR) == nullptr) { \
      throw amd::smi::rsmi_exception(RSMI_STATUS_INVALID_ARGS, __FUNCTION__); \
    } \
  }
namespace amd {
namespace smi {
/// @brief Exception type which carries an error code to return to the user.
class rsmi_exception : public std::exception {
public:
rsmi_exception(rsmi_status_t error, const std::string description) :
err_(error), desc_(description) {}
rsmi_status_t error_code() const noexcept { return err_; }
const char* what() const noexcept override { return desc_.c_str(); }
private:
rsmi_status_t err_;
std::string desc_;
};
} // namespace smi
} // namespace amd
#endif // INCLUDE_ROCM_SMI_ROCM_SMI_EXCEPTION_H_
File diff suppressed because it is too large Load Diff
+137
View File
@@ -0,0 +1,137 @@
/*
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2025, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#ifndef INCLUDE_ROCM_SMI_ROCM_SMI_IO_LINK_H_
#define INCLUDE_ROCM_SMI_ROCM_SMI_IO_LINK_H_
#include <string>
#include <vector>
#include <unordered_set>
#include <memory>
#include <map>
#include <utility>
#include "rocm_smi/rocm_smi.h"
namespace amd {
namespace smi {
// Identifies the kind of interconnect described by an IOLink (see
// IOLink::type()). Values 0-16 are assigned explicitly;
// IOLINK_TYPE_NUMIOLINKTYPES takes the next implicit value (17), i.e. the
// count of enumerators that precede it.
// NOTE(review): the numeric values presumably mirror what the kernel driver
// reports for each link -- confirm before renumbering anything.
typedef enum _IO_LINK_TYPE {
IOLINK_TYPE_UNDEFINED = 0,
IOLINK_TYPE_HYPERTRANSPORT = 1,
IOLINK_TYPE_PCIEXPRESS = 2,
IOLINK_TYPE_AMBA = 3,
IOLINK_TYPE_MIPI = 4,
IOLINK_TYPE_QPI_1_1 = 5,
IOLINK_TYPE_RESERVED1 = 6,
IOLINK_TYPE_RESERVED2 = 7,
IOLINK_TYPE_RAPID_IO = 8,
IOLINK_TYPE_INFINIBAND = 9,
IOLINK_TYPE_RESERVED3 = 10,
IOLINK_TYPE_XGMI = 11,
IOLINK_TYPE_XGOP = 12,
IOLINK_TYPE_GZ = 13,
IOLINK_TYPE_ETHERNET_RDMA = 14,
IOLINK_TYPE_RDMA_OTHER = 15,
IOLINK_TYPE_OTHER = 16,
// Implicitly 17: one past the last real link type.
IOLINK_TYPE_NUMIOLINKTYPES,
// NOTE(review): presumably present only to force a 32-bit value range for
// the enum -- confirm; it is not a real link type.
IOLINK_TYPE_SIZE = 0xFFFFFFFF
} IO_LINK_TYPE;
// Distinguishes the two categories of link an IOLink can represent. The value
// is supplied to the IOLink constructor and returned unchanged by
// IOLink::get_directory_type().
// NOTE(review): presumably selects which topology sub-directory the link is
// read from -- confirm against the implementation.
typedef enum _LINK_DIRECTORY_TYPE {
IO_LINK_DIRECTORY = 0,
P2P_LINK_DIRECTORY = 1
} LINK_DIRECTORY_TYPE;
// Describes a single link, identified by (node index, link index, directory
// type), together with cached values exposed through the inline getters.
//
// The constructor only records the identifying triple. The cached scalar
// fields carry default member initializers so that the getters return
// well-defined values even if they are called before the fields are filled in.
// NOTE(review): Initialize()/ReadProperties() are defined elsewhere and
// presumably populate the cached fields and properties_ -- confirm.
class IOLink {
 public:
  // Records which link this object refers to; does not read any link data.
  explicit IOLink(uint32_t node_indx, uint32_t link_indx, LINK_DIRECTORY_TYPE link_dir_type) :
      node_indx_(node_indx), link_indx_(link_indx), link_dir_type_(link_dir_type) {}
  ~IOLink();

  int Initialize();
  int ReadProperties(void);
  // Writes the looked-up value through `value`.
  // NOTE(review): return convention presumably 0 on success -- confirm.
  int get_property_value(std::string property, uint64_t *value);

  // Identity of the link as given to the constructor.
  uint32_t get_node_indx(void) const {return node_indx_;}
  uint32_t get_link_indx(void) const {return link_indx_;}
  LINK_DIRECTORY_TYPE get_directory_type(void) const {return link_dir_type_;}

  // Cached link attributes; defaults are IOLINK_TYPE_UNDEFINED / 0 until set.
  IO_LINK_TYPE type(void) const {return type_;}
  uint32_t node_from(void) const {return node_from_;}
  uint32_t node_to(void) const {return node_to_;}
  uint64_t weight(void) const {return weight_;}
  uint64_t min_bandwidth(void) const {return min_bandwidth_;}
  uint64_t max_bandwidth(void) const {return max_bandwidth_;}

 private:
  // Declaration order is kept unchanged: it fixes both the object layout and
  // the member initialization order used by the constructor.
  uint32_t node_indx_;
  uint32_t link_indx_;
  IO_LINK_TYPE type_ = IOLINK_TYPE_UNDEFINED;
  uint32_t node_from_ = 0;
  uint32_t node_to_ = 0;
  uint64_t weight_ = 0;
  uint64_t min_bandwidth_ = 0;
  uint64_t max_bandwidth_ = 0;
  std::map<std::string, uint64_t> properties_;
  LINK_DIRECTORY_TYPE link_dir_type_;
};
// Link discovery entry points (declarations only; defined elsewhere).
// Each function receives a caller-owned map through `links` and returns an
// int status.
// NOTE(review): presumably 0 means success and the map is filled with one
// IOLink per discovered link -- confirm against the implementation.
//
// Per-node variants: take the node index and a map keyed by a single
// uint32_t.
// NOTE(review): the key is presumably the link's destination node -- confirm.
int
DiscoverIOLinksPerNode(uint32_t node_indx, std::map<uint32_t,
std::shared_ptr<IOLink>> *links);
int
DiscoverP2PLinksPerNode(uint32_t node_indx, std::map<uint32_t,
std::shared_ptr<IOLink>> *links);
// Whole-system variants: the map is keyed by a pair of uint32_t values.
// NOTE(review): the pair is presumably (node_from, node_to) -- confirm.
int
DiscoverIOLinks(std::map<std::pair<uint32_t, uint32_t>,
std::shared_ptr<IOLink>> *links);
int
DiscoverP2PLinks(std::map<std::pair<uint32_t, uint32_t>,
std::shared_ptr<IOLink>> *links);
} // namespace smi
} // namespace amd
#endif // INCLUDE_ROCM_SMI_ROCM_SMI_IO_LINK_H_
+135
View File
@@ -0,0 +1,135 @@
/*
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2025, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#ifndef INCLUDE_ROCM_SMI_ROCM_SMI_KFD_H_
#define INCLUDE_ROCM_SMI_ROCM_SMI_KFD_H_
#include <string>
#include <vector>
#include <unordered_set>
#include <memory>
#include <map>
#include "rocm_smi/rocm_smi.h"
#include "rocm_smi/rocm_smi_device.h"
#include "rocm_smi/rocm_smi_io_link.h"
namespace amd {
namespace smi {
// Represents one KFD node, identified by the node index given to the
// constructor, and caches a number of per-node values exposed through the
// inline getters below.
//
// The constructor only stores the node index. The cached scalar fields carry
// default member initializers so that the getters return well-defined values
// even when called before the fields have been filled in.
// NOTE(review): Initialize()/ReadProperties() are defined elsewhere and
// presumably populate the cached fields and maps -- confirm.
class KFDNode {
 public:
  explicit KFDNode(uint32_t node_ind) : node_indx_(node_ind) {}
  ~KFDNode();

  int Initialize();
  int ReadProperties(void);
  // Writes the looked-up value through `value`.
  // NOTE(review): return convention presumably 0 on success -- confirm.
  int get_property_value(std::string property, uint64_t *value);

  // Cached values; scalars default to 0 / IOLINK_TYPE_UNDEFINED until set.
  uint64_t gpu_id(void) const {return gpu_id_;}
  std::string name(void) const {return name_;}
  uint32_t node_index(void) const {return node_indx_;}
  uint32_t numa_node_number(void) const {return numa_node_number_;}
  uint64_t numa_node_weight(void) const {return numa_node_weight_;}
  uint64_t xgmi_hive_id(void) const {return xgmi_hive_id_;}
  uint32_t cu_count(void) const {return cu_count_;}
  IO_LINK_TYPE numa_node_type(void) const {return numa_node_type_;}

  // Per-destination link queries; results are written through the out
  // pointers.
  int get_io_link_type(uint32_t node_to, IO_LINK_TYPE *type);
  int get_io_link_weight(uint32_t node_to, uint64_t *weight);
  int get_io_link_bandwidth(uint32_t node_to, uint64_t *max_bandwidth,
                            uint64_t *min_bandwidth);

  std::shared_ptr<Device> amdgpu_device(void) const {return amdgpu_device_;}
  uint32_t amdgpu_dev_index(void) const {return amdgpu_dev_index_;}
  void set_amdgpu_dev_index(uint32_t val) {amdgpu_dev_index_ = val;}

  // Get memory from kfd
  int get_total_memory(uint64_t* total);
  int get_used_memory(uint64_t* used);
  // Get gfx target version from kfd
  int get_gfx_target_version(uint64_t* gfx_target_version);
  // Get gpu_id (AKA GUID) version from kfd
  int get_gpu_id(uint64_t *gpu_id);
  // Get node id from kfd
  int get_node_id(uint32_t *node_id);

 private:
  // Declaration order is kept unchanged: it fixes the object layout and the
  // member initialization order.
  uint32_t node_indx_;
  uint32_t amdgpu_dev_index_ = 0;
  uint64_t gpu_id_ = 0;
  std::string name_;
  uint32_t numa_node_number_ = 0;
  uint64_t numa_node_weight_ = 0;
  IO_LINK_TYPE numa_node_type_ = IOLINK_TYPE_UNDEFINED;
  uint64_t xgmi_hive_id_ = 0;
  uint32_t cu_count_ = 0;
  std::map<uint32_t, IO_LINK_TYPE> io_link_type_;
  std::map<uint32_t, uint64_t> io_link_weight_;
  std::map<uint32_t, uint64_t> io_link_max_bandwidth_;
  std::map<uint32_t, uint64_t> io_link_min_bandwidth_;
  std::map<uint32_t, std::shared_ptr<IOLink>> io_link_map_;
  std::map<std::string, uint64_t> properties_;
  std::shared_ptr<Device> amdgpu_device_;
};
// Free functions for querying KFD (declarations only; defined elsewhere).
// All of them return an int status and deliver results through pointer
// out-parameters.
// NOTE(review): presumably 0 means success -- confirm against the
// implementation.
//
// Fills `nodes` with KFDNode objects keyed by a uint64_t.
// NOTE(review): the key is presumably the node's gpu_id -- confirm.
int
DiscoverKFDNodes(std::map<uint64_t, std::shared_ptr<KFDNode>> *nodes);
// Writes process records into the caller-provided array `procs`, whose
// capacity is `num_allocated`, and reports a count through `num_procs_found`.
int
GetProcessInfo(rsmi_process_info_t *procs, uint32_t num_allocated,
uint32_t *num_procs_found);
// Single-process variant: fills `proc` for `pid`; `gpu_set` is a set of
// uint64_t GPU identifiers.
// NOTE(review): whether `gpu_set` is an input filter or an output is not
// visible here -- confirm.
int
GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc,
std::unordered_set<uint64_t> *gpu_set);
// Fills the set pointed to by `gpu_count` for process `pid`. Despite the
// parameter name, the type is a set of uint64_t, not a count.
int
GetProcessGPUs(uint32_t pid, std::unordered_set<uint64_t> *gpu_count);
// Appends/stores property lines for device `dev_id` into `retVec`.
int
ReadKFDDeviceProperties(uint32_t dev_id, std::vector<std::string> *retVec);
// Reads the property named `property_name` of node `node` into `val`.
int read_node_properties(uint32_t node, std::string property_name,
uint64_t *val);
// Writes the gpu_id of node `node` through `gpu_id`.
int get_gpu_id(uint32_t node, uint64_t *gpu_id);
} // namespace smi
} // namespace amd
#endif // INCLUDE_ROCM_SMI_ROCM_SMI_KFD_H_
@@ -0,0 +1,86 @@
/*
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef ROCM_SMI_INCLUDE_ROCM_SMI_LIB_LOADER_H_
#define ROCM_SMI_INCLUDE_ROCM_SMI_LIB_LOADER_H_
#include <dlfcn.h>
#include <string.h>
#include <map>
#include <iostream>
#include <mutex> // NOLINT(build/c++11)
#include "rocm_smi/rocm_smi.h"
namespace amd {
namespace smi {
// Thin wrapper around a dynamically loaded library handle.
//
// load_symbol() (defined below the class) resolves a symbol from libHandler_
// with dlsym() while holding library_mutex_. load()/unload() and the
// constructor/destructor are defined elsewhere.
// NOTE(review): load()/unload() presumably wrap dlopen()/dlclose() and take
// the same mutex -- confirm against the implementation.
class ROCmSmiLibraryLoader {
 public:
  ROCmSmiLibraryLoader();

  // Loads the library named by `filename`.
  rsmi_status_t load(const char* filename);

  // Resolves `func_name` and stores the address through `func_handler`.
  template<typename T> rsmi_status_t load_symbol(T* func_handler,
                                                 const char* func_name);

  rsmi_status_t unload();

  ~ROCmSmiLibraryLoader();

 private:
  // Defaults to nullptr, matching the "no library" test in load_symbol(), so
  // the handle is never indeterminate regardless of what the out-of-line
  // constructor does.
  void* libHandler_ = nullptr;
  std::mutex library_mutex_;
  bool library_loaded_ = false;
};
// Resolves `func_name` in the currently loaded library and stores the address
// through `func_handler`.
//
// Returns:
//   RSMI_STATUS_FAIL_LOAD_MODULE  - no library handle is available.
//   RSMI_STATUS_FAIL_LOAD_SYMBOL  - `func_handler` or `func_name` is null, or
//                                   dlsym() yielded a null address.
//   RSMI_STATUS_SUCCESS           - *func_handler now holds the symbol.
template<typename T> rsmi_status_t ROCmSmiLibraryLoader::load_symbol(
    T* func_handler,
    const char* func_name) {
  // Take the lock before looking at libHandler_ so the null test and the
  // dlsym() call observe the same handle.
  // NOTE(review): assumes load()/unload() modify libHandler_ under
  // library_mutex_ and never call load_symbol() while holding it -- confirm.
  std::lock_guard<std::mutex> guard(library_mutex_);

  if (!libHandler_) {
    return RSMI_STATUS_FAIL_LOAD_MODULE;
  }

  if (!func_handler || !func_name) {
    return RSMI_STATUS_FAIL_LOAD_SYMBOL;
  }

  // Discard any stale error so that the dlerror() call below reports only
  // what this dlsym() produced.
  dlerror();

  // POSIX idiom for storing the void* from dlsym() into a function pointer.
  *reinterpret_cast<void**>(func_handler) =
      dlsym(libHandler_, func_name);

  if (*func_handler == nullptr) {
    // dlerror() may legitimately return NULL; streaming a null char* is
    // undefined behaviour, so substitute a placeholder.
    const char* error = dlerror();
    std::cerr << "ROCmSmiLibraryLoader: Fail to load the symbol "
              << func_name << ": " << (error ? error : "unknown error")
              << std::endl;
    return RSMI_STATUS_FAIL_LOAD_SYMBOL;
  }

  return RSMI_STATUS_SUCCESS;
}
} // namespace smi
} // namespace amd
#endif // ROCM_SMI_INCLUDE_ROCM_SMI_LIB_LOADER_H_
@@ -0,0 +1,225 @@
/*
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2025, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
* Detailed Description:
* Implements a complete logging mechanism supporting multiple logging types,
* such as file-based logging and console-based logging. It also supports
* different log types.
*
* Thread-safe logging mechanism. Compatible with G++ (Linux platform).
*
* Supported Log Types: ERROR, ALARM, ALWAYS, INFO, BUFFER, TRACE, DEBUG
* There is no control for ERROR, ALARM and ALWAYS messages. These types of
* messages should always be captured -- IF logging is enabled.
*
* WARNING: Logging is controlled by the user's environment variable -
* RSMI_LOGGING. Enable it with: export RSMI_LOGGING=<any value>. No logs will
* be printed unless RSMI_LOGGING is enabled.
*
* The BUFFER log type should be used when logging raw buffers or raw messages.
* Both a direct (macro) interface and a C++ singleton interface are provided.
* Use whichever interface fits your needs.
*/
#ifndef _ROCM_SMI_LOGGER_H_
#define _ROCM_SMI_LOGGER_H_
// C++ Header File(s)
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <mutex>
// POSIX Socket Header File(s)
#include <errno.h>
// Code Specific Header Files(s)
namespace ROCmLogging {
// Direct Interface for logging into log file or console using MACRO(s)
// Each macro forwards its argument to the same-named member function of the
// Logger singleton obtained from ROCmLogging::Logger::getInstance().
// Being preprocessor macros, they are not scoped by the enclosing namespace
// and are visible to every file that includes this header.
#define LOG_ERROR(x) (ROCmLogging::Logger::getInstance()->error(x))
#define LOG_ALARM(x) (ROCmLogging::Logger::getInstance()->alarm(x))
#define LOG_ALWAYS(x) (ROCmLogging::Logger::getInstance()->always(x))
#define LOG_INFO(x) (ROCmLogging::Logger::getInstance()->info(x))
#define LOG_BUFFER(x) (ROCmLogging::Logger::getInstance()->buffer(x))
#define LOG_TRACE(x) (ROCmLogging::Logger::getInstance()->trace(x))
#define LOG_DEBUG(x) (ROCmLogging::Logger::getInstance()->debug(x))
// enum for LOG_LEVEL
// Logger::operator<<(std::string&) dispatches on the current level:
// DISABLE_LOG logs nothing, LOG_LEVEL_INFO/BUFFER/TRACE/DEBUG call
// info()/buffer()/trace()/debug() respectively, and ENABLE_LOG calls always().
// Note that numbering starts at 1, not 0.
typedef enum LOG_LEVEL {
DISABLE_LOG = 1,
LOG_LEVEL_INFO = 2,
LOG_LEVEL_BUFFER = 3,
LOG_LEVEL_TRACE = 4,
LOG_LEVEL_DEBUG = 5,
ENABLE_LOG = 6,
} LogLevel;
// enum for LOG_TYPE
// Value type accepted by Logger::updateLogType(). Numbering starts at 1.
// NOTE(review): presumably selects the output sink(s) -- none, console, file,
// or both -- confirm against the Logger implementation.
typedef enum LOG_TYPE {
NO_LOG = 1,
CONSOLE = 2,
FILE_LOG = 3,
BOTH_FILE_AND_CONSOLE = 4
} LogType;
// Singleton logger. Obtain the instance through getInstance(); the
// constructor and destructor are protected.
//
// The stream-style operator<< overloads route a message to one of the
// per-level member functions according to the current m_LogLevel. The
// per-level functions themselves are declared here and defined elsewhere.
class Logger {
 public:
  static Logger* getInstance() throw();

  // Logs `s` through the member function that matches the current log level:
  // info(), buffer(), trace(), debug(), or always() for ENABLE_LOG.
  // DISABLE_LOG and unrecognised levels log nothing.
  // Returns the singleton so that calls can be chained.
  Logger& operator<<(std::string &s) {
    switch (this->m_LogLevel) {
      case DISABLE_LOG:
        break;
      case LOG_LEVEL_INFO:
        info(s);
        break;
      case LOG_LEVEL_BUFFER:
        buffer(s);
        break;
      case LOG_LEVEL_TRACE:
        trace(s);
        break;
      case LOG_LEVEL_DEBUG:
        debug(s);
        break;
      case ENABLE_LOG:
        always(s);
        break;
      default:
        break;
    }
    return *getInstance();
  }

  // C-string overload. A null pointer is ignored (nothing is logged) because
  // constructing std::string from nullptr is undefined behaviour. Otherwise
  // the text is copied into a named std::string so that the std::string&
  // overload above is called directly.
  Logger &operator<<(const char* s) {
    if (s == nullptr) {
      return *getInstance();
    }
    std::string str(s);
    return operator<<(str);
  }

  // Generic overload: formats `v` with std::ostringstream and logs the
  // resulting text through the std::string& overload.
  template <class T> Logger &operator<<(const T &v) {
    std::ostringstream s;
    s << v;
    std::string str = s.str();
    return operator<<(str);
  }

  // Interface for Error Log
  void error(const char* text) throw();
  void error(std::string& text) throw();
  void error(std::ostringstream& stream) throw();

  // Interface for Alarm Log
  void alarm(const char* text) throw();
  void alarm(std::string& text) throw();
  void alarm(std::ostringstream& stream) throw();

  // Interface for Always Log
  void always(const char* text) throw();
  void always(std::string& text) throw();
  void always(std::ostringstream& stream) throw();

  // Interface for Buffer Log
  void buffer(const char* text) throw();
  void buffer(std::string& text) throw();
  void buffer(std::ostringstream& stream) throw();

  // Interface for Info Log
  void info(const char* text) throw();
  void info(std::string& text) throw();
  void info(std::ostringstream& stream) throw();

  // Interface for Trace log
  void trace(const char* text) throw();
  void trace(std::string& text) throw();
  void trace(std::ostringstream& stream) throw();

  // Interface for Debug log
  void debug(const char* text) throw();
  void debug(std::string& text) throw();
  void debug(std::ostringstream& stream) throw();

  // Error and Alarm logs must always be enabled.
  // Hence, there is no interface to control error and alarm logs.

  // Interfaces to control log levels
  void updateLogLevel(LogLevel logLevel);
  void enableAllLogLevels();  // Enable all log levels
  void disableLog();  // Disable all log levels, except error and alarm

  // Interfaces to control log Types
  void updateLogType(LogType logType);
  void enableConsoleLogging();
  void enableFileLogging();
  std::string getLogSettings();
  bool isLoggerEnabled();

 protected:
  Logger();
  ~Logger();

  // Wrapper function for lock/unlock
  // For Extensible feature, lock and unlock should be in protected
  void lock();
  void unlock();
  std::string getCurrentTime();

 private:
  static Logger* m_Instance;
  std::ofstream m_File;
  bool m_loggingIsOn = false;
  LogLevel m_LogLevel;
  LogType m_LogType;
  std::mutex m_Mutex;
  // Constructed with std::defer_lock, i.e. associated with m_Mutex but not
  // locked at construction.
  std::unique_lock<std::mutex> m_Lock{m_Mutex, std::defer_lock};

  void logIntoFile(std::string& data);
  void logOnConsole(std::string& data);
  // Private, empty copy assignment: keeps outside code from assigning.
  void operator=(const Logger&) {}
  void initialize_resources();
  void destroy_resources();
};
} // namespace ROCmLogging
#endif // End of _ROCM_SMI_LOGGER_H_
+162
View File
@@ -0,0 +1,162 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017-2025, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#ifndef INCLUDE_ROCM_SMI_ROCM_SMI_MAIN_H_
#define INCLUDE_ROCM_SMI_ROCM_SMI_MAIN_H_
#include <cstdint>
#include <functional>
#include <limits>
#include <map>
#include <memory>
#include <mutex> // NOLINT
#include <set>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "rocm_smi/rocm_smi_io_link.h"
#include "rocm_smi/rocm_smi_kfd.h"
#include "rocm_smi/rocm_smi_device.h"
#include "rocm_smi/rocm_smi_monitor.h"
#include "rocm_smi/rocm_smi_power_mon.h"
#include "rocm_smi/rocm_smi_common.h"
namespace amd {
namespace smi {
// Top-level state object of the library: owns the discovered devices, KFD
// nodes, monitors and IO links, plus the reference counts and mutexes that
// guard initialization and KFD event notification.
// Accessed through the static getInstance().
class RocmSMI {
 public:
  explicit RocmSMI(uint64_t flags);
  ~RocmSMI(void);

  static RocmSMI& getInstance(uint64_t flags = 0);

  void Initialize(uint64_t flags);
  void Cleanup(void);

  // Mutable access to the list of known devices.
  std::vector<std::shared_ptr<amd::smi::Device>>&
      devices() {return devices_;}

  uint32_t DiscoverAmdgpuDevices(void);
  int DiscoverAMDPowerMonitors(bool force_update = false);

  // Will execute "func" for every Device object known about, or until func
  // returns non-zero;
  uint32_t IterateSMIDevices(
      std::function<uint32_t(std::shared_ptr<Device>&, void *)> func, void *);

  void set_init_options(uint64_t options) {init_options_ = options;}
  uint64_t init_options() const {return init_options_;}
  // Non-zero when the RSMI_INIT_FLAG_THRAD_ONLY_MUTEX bit is set in the
  // init options (the result is the masked value, not a normalized bool).
  uint64_t is_thread_only_mutex() const {
    return init_options_ & RSMI_INIT_FLAG_THRAD_ONLY_MUTEX;
  }

  uint32_t euid() const {return euid_;}

  std::map<uint64_t, std::shared_ptr<KFDNode>> & kfd_node_map(void) {
    return kfd_node_map_;}

  int kfd_notif_evt_fh(void) const {return kfd_notif_evt_fh_;}
  void set_kfd_notif_evt_fh(int fd) {kfd_notif_evt_fh_ = fd;}

  std::mutex *kfd_notif_evt_fh_mutex(void) {return &kfd_notif_evt_fh_mutex_;}
  std::mutex *bootstrap_mutex(void) {return &bootstrap_mutex_;}

  // Reference-count helpers. They do no locking themselves; see the notes on
  // the corresponding data members for which mutex callers must hold.
  uint32_t ref_count(void) const {return ref_count_;}
  uint32_t ref_count_inc(void) {return ++ref_count_;}
  uint32_t ref_count_dec(void) {return --ref_count_;}

  uint32_t kfd_notif_evt_fh_refcnt(void) const {
    return kfd_notif_evt_fh_refcnt_;}
  uint32_t kfd_notif_evt_fh_refcnt_inc(void) {
    return ++kfd_notif_evt_fh_refcnt_;}
  uint32_t kfd_notif_evt_fh_refcnt_dec(void) {
    return --kfd_notif_evt_fh_refcnt_;}

  int get_io_link_weight(uint32_t node_from, uint32_t node_to,
                         uint64_t *weight);
  int get_node_index(uint32_t dv_ind, uint32_t *node_ind);

  const RocmSMI_env_vars& getEnv(void);
  std::string getRSMIEnvVarInfo(void);
  void debugRSMIEnvVarInfo();
  bool isLoggingOn(void);
  uint32_t getLogSetting(void);

 private:
  std::vector<std::shared_ptr<Device>> devices_;
  std::map<uint64_t, std::shared_ptr<KFDNode>> kfd_node_map_;
  std::vector<std::shared_ptr<Monitor>> monitors_;
  std::vector<std::shared_ptr<PowerMon>> power_mons_;
  std::set<std::string> amd_monitor_types_;
  std::map<std::pair<uint32_t, uint32_t>, std::shared_ptr<IOLink>>
      io_link_map_;
  std::map<uint32_t, uint32_t> dev_ind_to_node_ind_map_;

  void AddToDeviceList(std::string dev_name, uint64_t bdfid = 0);

  // Per-device record passed to AddToDeviceList2(). Numeric fields default to
  // the maximum value of their type and strings default to empty.
  // Declared as a named struct rather than `typedef struct { ... } name;`:
  // an unnamed class that gets its name from a typedef may not have default
  // member initializers (C++20 P1766R1; clang warns in all language modes).
  struct rsmi_device_enumeration_t {
    uint32_t card_index = std::numeric_limits<uint32_t>::max();
    std::string dev_name = "";
    std::string drm_render_path = "";
    std::string drm_card_path = "";
    uint32_t drm_render_minor = std::numeric_limits<uint32_t>::max();
    uint64_t bdfid = std::numeric_limits<uint64_t>::max();
  };
  rsmi_status_t AddToDeviceList2(rsmi_device_enumeration_t device);

  void GetEnvVariables(void);
  std::shared_ptr<Monitor> FindMonitor(std::string monitor_path);

  RocmSMI_env_vars env_vars_;
  uint64_t init_options_;
  uint32_t euid_;

  int kfd_notif_evt_fh_;
  std::mutex kfd_notif_evt_fh_mutex_;
  uint32_t kfd_notif_evt_fh_refcnt_;  // Access to this should be protected
                                      // by kfd_notif_evt_fh_mutex_
  std::mutex bootstrap_mutex_;
  uint32_t ref_count_;  // Access to this should be protected
                        // by bootstrap_mutex_
};
} // namespace smi
} // namespace amd
#endif // INCLUDE_ROCM_SMI_ROCM_SMI_MAIN_H_
+176
View File
@@ -0,0 +1,176 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017-2025, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#ifndef INCLUDE_ROCM_SMI_ROCM_SMI_MONITOR_H_
#define INCLUDE_ROCM_SMI_ROCM_SMI_MONITOR_H_
#include <string>
#include <cstdint>
#include <map>
#include "rocm_smi/rocm_smi_common.h"
#include "rocm_smi/rocm_smi.h"
namespace amd {
namespace smi {
// Selects which monitor attribute Monitor::readMonitor()/writeMonitor()
// operate on. Enumerators take implicit consecutive values starting at 0
// (kMonName); only kMonInvalid has an explicit value. Every enumerator has a
// matching entry in monitorTypesToString.
enum MonitorTypes {
kMonName,
kMonTemp, // Temperature in millidegrees
kMonFanSpeed,
kMonMaxFanSpeed,
kMonFanRPMs,
kMonFanCntrlEnable,
kMonPowerCap,
kMonPowerCapDefault,
kMonPowerCapMax,
kMonPowerCapMin,
kMonPowerAve,
kMonPowerInput,
kMonPowerLabel,
kMonTempMax,
kMonTempMin,
kMonTempMaxHyst,
kMonTempMinHyst,
kMonTempCritical,
kMonTempCriticalHyst,
kMonTempEmergency,
kMonTempEmergencyHyst,
kMonTempCritMin,
kMonTempCritMinHyst,
kMonTempOffset,
kMonTempLowest,
kMonTempHighest,
kMonTempLabel,
kMonVolt,
kMonVoltMax,
kMonVoltMinCrit,
kMonVoltMin,
kMonVoltMaxCrit,
kMonVoltAverage,
kMonVoltLowest,
kMonVoltHighest,
kMonVoltLabel,
// Sentinel value; not a real monitor attribute.
kMonInvalid = 0xFFFFFFFF,
};
const std::map<MonitorTypes, std::string> monitorTypesToString{
{MonitorTypes::kMonName, "MonitorTypes::kMonName"},
{MonitorTypes::kMonTemp, "MonitorTypes::kMonTemp"},
{MonitorTypes::kMonFanSpeed, "MonitorTypes::kMonFanSpeed"},
{MonitorTypes::kMonMaxFanSpeed, "MonitorTypes::kMonMaxFanSpeed"},
{MonitorTypes::kMonFanRPMs, "MonitorTypes::kMonFanRPMs"},
{MonitorTypes::kMonFanCntrlEnable, "MonitorTypes::kMonFanCntrlEnable"},
{MonitorTypes::kMonPowerCap, "MonitorTypes::kMonPowerCap"},
{MonitorTypes::kMonPowerCapDefault, "MonitorTypes::kMonPowerCapDefault"},
{MonitorTypes::kMonPowerCapMax, "MonitorTypes::kMonPowerCapMax"},
{MonitorTypes::kMonPowerCapMin, "MonitorTypes::kMonPowerCapMin"},
{MonitorTypes::kMonPowerAve, "MonitorTypes::kMonPowerAve"},
{MonitorTypes::kMonPowerInput, "MonitorTypes::kMonPowerInput"},
{MonitorTypes::kMonPowerLabel, "MonitorTypes::kMonPowerLabel"},
{MonitorTypes::kMonTempMax, "MonitorTypes::kMonTempMax"},
{MonitorTypes::kMonTempMin, "MonitorTypes::kMonTempMin"},
{MonitorTypes::kMonTempMaxHyst, "MonitorTypes::kMonTempMaxHyst"},
{MonitorTypes::kMonTempMinHyst, "MonitorTypes::kMonTempMinHyst"},
{MonitorTypes::kMonTempCritical, "MonitorTypes::kMonTempCritical"},
{MonitorTypes::kMonTempCriticalHyst, "MonitorTypes::kMonTempCriticalHyst"},
{MonitorTypes::kMonTempEmergency, "MonitorTypes::kMonTempEmergency"},
{MonitorTypes::kMonTempEmergencyHyst,
"MonitorTypes::kMonTempEmergencyHyst"},
{MonitorTypes::kMonTempCritMin, "MonitorTypes::kMonTempCritMin"},
{MonitorTypes::kMonTempCritMinHyst, "MonitorTypes::kMonTempCritMinHyst"},
{MonitorTypes::kMonTempOffset, "MonitorTypes::kMonTempOffset"},
{MonitorTypes::kMonTempLowest, "MonitorTypes::kMonTempLowest"},
{MonitorTypes::kMonTempHighest, "MonitorTypes::kMonTempHighest"},
{MonitorTypes::kMonTempLabel, "MonitorTypes::kMonTempLabel"},
{MonitorTypes::kMonVolt, "MonitorTypes::kMonVolt"},
{MonitorTypes::kMonVoltMax, "MonitorTypes::kMonVoltMax"},
{MonitorTypes::kMonVoltMinCrit, "MonitorTypes::kMonVoltMinCrit"},
{MonitorTypes::kMonVoltMin, "MonitorTypes::kMonVoltMin"},
{MonitorTypes::kMonVoltMaxCrit, "MonitorTypes::kMonVoltMaxCrit"},
{MonitorTypes::kMonVoltAverage, "MonitorTypes::kMonVoltAverage"},
{MonitorTypes::kMonVoltLowest, "MonitorTypes::kMonVoltLowest"},
{MonitorTypes::kMonVoltHighest, "MonitorTypes::kMonVoltHighest"},
{MonitorTypes::kMonVoltLabel, "MonitorTypes::kMonVoltLabel"},
{MonitorTypes::kMonInvalid, "MonitorTypes::kMonInvalid"},
};
// Monitor wraps one monitor directory path (path_) and offers typed access to
// the per-sensor entries below it, addressed by a MonitorTypes value plus a
// sensor index. It also keeps lookup tables in both directions between RSMI
// temperature/voltage sensor types and sensor indices.
// Only declarations are visible here; the member functions are defined
// elsewhere.
class Monitor {
public:
// `path` is stored as the monitor root; `e` is a non-owning pointer to the
// environment settings (presumably debug overrides -- confirm in the .cc).
explicit Monitor(std::string path, RocmSMI_env_vars const *e);
~Monitor(void);
// Returns a copy of the monitor root path.
const std::string path(void) const {return path_;}
// Read/write the entry selected by (type, sensor_ind). The int return is
// presumably 0 on success and an errno-style code otherwise -- TODO confirm.
int readMonitor(MonitorTypes type, uint32_t sensor_ind, std::string *val);
int writeMonitor(MonitorTypes type, uint32_t sensor_ind, std::string val);
// Temperature sensors: build the label maps, then translate between the RSMI
// enum and the sensor index.
int32_t setTempSensorLabelMap(void);
uint32_t getTempSensorIndex(rsmi_temperature_type_t type);
rsmi_temperature_type_t getTempSensorEnum(uint64_t ind);
// Voltage sensors: same trio of operations as for temperature.
int32_t setVoltSensorLabelMap(void);
uint32_t getVoltSensorIndex(rsmi_voltage_type_t type);
rsmi_voltage_type_t getVoltSensorEnum(uint64_t ind);
// Fills the caller-provided map; the exact contents are decided by the
// out-of-line definition.
void fillSupportedFuncs(SupportedFuncMap *supported_funcs);
private:
// Builds the path of the entry for (type, sensor_id).
std::string MakeMonitorPath(MonitorTypes type, uint32_t sensor_id);
std::string path_;
const RocmSMI_env_vars *env_;
// RSMI sensor type -> sensor index.
std::map<rsmi_temperature_type_t, uint32_t> temp_type_index_map_;
std::map<rsmi_voltage_type_t, uint32_t> volt_type_index_map_;
// This map uses a 64b index instead of 32b (unlike temp_type_index_map_)
// for flexibility and simplicity. Currently, some parts of the
// implementation store both the RSMI api index and the file index into a
// single value. 32 bits is enough to store both, but we are using 64
// bits for simpler integration with existing implementation, which uses
// a 64b value. Also, if we need to encode anything else, 64b will give
// us more room to do so, without excessive changes.
std::map<uint64_t, rsmi_temperature_type_t> index_temp_type_map_;
std::map<uint64_t, rsmi_voltage_type_t> index_volt_type_map_;
};
} // namespace smi
} // namespace amd
#endif // INCLUDE_ROCM_SMI_ROCM_SMI_MONITOR_H_
@@ -0,0 +1,80 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2025, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#ifndef INCLUDE_ROCM_SMI_ROCM_SMI_POWER_MON_H_
#define INCLUDE_ROCM_SMI_ROCM_SMI_POWER_MON_H_
#include <string>
#include <cstdint>
#include "rocm_smi/rocm_smi_common.h"
namespace amd {
namespace smi {
// Selects which power value PowerMon::readPowerValue() should return.
// Only one kind is defined at present.
enum PowerMonTypes {
kPowerMaxGPUPower,
};
// PowerMon ties a power-monitor path to a device index and reads power values
// from it. Only declarations are visible here; the constructor, destructor and
// readPowerValue() are defined elsewhere.
class PowerMon {
public:
// `path` is stored in path_; `e` is a non-owning pointer to the environment
// settings.
explicit PowerMon(std::string path, RocmSMI_env_vars const *e);
~PowerMon(void);
// Returns a copy of the stored path.
const std::string path(void) const {return path_;}
// Device index accessors. NOTE(review): dev_index_ has no in-class
// initializer; confirm the constructor sets it before dev_index() is used.
uint32_t dev_index(void) const {return dev_index_;}
void set_dev_index(uint32_t ind) {dev_index_ = ind;}
// Reads the value selected by `type` into *power. The int return is
// presumably 0 on success -- TODO confirm against the definition.
int readPowerValue(PowerMonTypes type, uint64_t *power);
private:
std::string path_;
const RocmSMI_env_vars *env_;
uint32_t dev_index_;
};
} // namespace smi
} // namespace amd
#endif // INCLUDE_ROCM_SMI_ROCM_SMI_POWER_MON_H_
@@ -0,0 +1,160 @@
/*
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017-2025, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#ifndef INCLUDE_ROCM_SMI_ROCM_SMI_PROPERTIES_H_
#define INCLUDE_ROCM_SMI_ROCM_SMI_PROPERTIES_H_
#include "rocm_smi/rocm_smi_common.h"
#include "rocm_smi/rocm_smi.h"
#include <cstdint>
#include <map>
namespace amd {
namespace smi {
//
// Property reinforcement check list
//
// Fixed-width integer aliases used by the enums, structs and functions
// declared in this header: 32-bit property/device/verb ids, 16-bit ASIC and
// revision ids, and an 8-bit operation-mode value.
using AMDGpuPropertyId_t = uint32_t;
using AMDGpuDevIdx_t = uint32_t;
using AMDGpuVerbId_t = uint32_t;
using AMDGpuAsicId_t = uint16_t;
using AMDGpuAsicRevId_t = uint16_t;
using AMDGpuOpModeType_t = uint8_t;
// Identifies the operation ("verb") a property rule or query refers to.
// Values are assigned sequentially starting at kNone = 0, so inserting or
// reordering enumerators changes the numeric value of every later entry.
enum class AMDGpuVerbTypes_t : AMDGpuVerbId_t
{
kNone = 0,
kSetGpuPciBandwidth,
kSetPowerCap,
kSetGpuPowerProfile,
kSetGpuClkRange,
kSetGpuOdClkInfo,
kSetGpuOdVoltInfo,
kSetGpuPerfLevelV1,
kSetGpuPerfLevel,
kGetGpuPowerProfilePresets,
kResetGpu,
kSetGpuPerfDeterminismMode,
kSetGpuFanSpeed,
kResetGpuFan,
kSetClkFreq,
kSetGpuOverdriveLevelV1,
kSetGpuOverdriveLevel,
kGetGpuFanRpms,
kGetGpuFanSpeed,
kGetGpuFanSpeedMax,
kGetGpuVoltMetric,
kGetGpuOverDriveLevel,
kGetGpuOdVoltInfo,
kGetGpuOdVoltCurveRegions,
};
// Maps a verb to a string (presumably its printable name -- confirm at the
// definition of the table).
using AMDGpuVerbList_t = std::map<AMDGpuVerbTypes_t, std::string>;
// Category offsets for property ids. Each category owns a distinct bit
// (0x1000, 0x2000, 0x4000, 0x8000, 0x10000), so the values can be combined or
// tested with the bitwise operators declared below.
enum class AMDGpuPropertyTypesOffset_t : AMDGpuPropertyId_t
{
kNone = 0,
kDevInfoTypes = (0x1000 << 0),
kMonitorTypes = (0x1000 << 1),
kPerfTypes = (0x1000 << 2),
kClkTypes = (0x1000 << 3),
kVoltMetricTypes = (0x1000 << 4),
};
// Underlying integer type of the offset enum (AMDGpuPropertyId_t).
using AMDGpuPropertyOffsetType = std::underlying_type<AMDGpuPropertyTypesOffset_t>::type;
// Maps an offset category to a string.
using AMDGpuPropertyTypesOffsetList_t = std::map<AMDGpuPropertyTypesOffset_t, std::string>;
// Bitwise OR / AND for the scoped enum; declared here, defined elsewhere.
AMDGpuPropertyTypesOffset_t operator| (AMDGpuPropertyTypesOffset_t lhs, AMDGpuPropertyTypesOffset_t rhs);
AMDGpuPropertyTypesOffset_t operator& (AMDGpuPropertyTypesOffset_t lhs, AMDGpuPropertyTypesOffset_t rhs);
// Operation modes a property rule applies to, one bit per enumerator.
// NOTE(review): kBoth is its own bit (0x4), not kBareMetal | kSrIov (0x3);
// confirm that users of operator& account for this.
enum class AMDGpuPropertyOpModeTypes_t : AMDGpuOpModeType_t
{
kBareMetal = (0x1 << 0),
kSrIov = (0x1 << 1),
kBoth = (0x1 << 2),
};
// Underlying integer type of the op-mode enum (AMDGpuOpModeType_t).
using AMDGpuPropertyOpModeType = std::underlying_type<AMDGpuPropertyOpModeTypes_t>::type;
// Maps an op mode to a string.
using AMDGpuOpModeList_t = std::map<AMDGpuPropertyOpModeTypes_t, std::string>;
// Bitwise OR / AND for the scoped enum; declared here, defined elsewhere.
AMDGpuPropertyOpModeTypes_t operator| (AMDGpuPropertyOpModeTypes_t lhs, AMDGpuPropertyOpModeTypes_t rhs);
AMDGpuPropertyOpModeTypes_t operator& (AMDGpuPropertyOpModeTypes_t lhs, AMDGpuPropertyOpModeTypes_t rhs);
// One rule in the property table: for a given PCI revision, property id, verb
// and operation mode, records whether the feature should be available.
struct AMDGpuProperties_t
{
AMDGpuAsicRevId_t m_pci_rev_id;
AMDGpuPropertyId_t m_property;
AMDGpuVerbTypes_t m_verb_id;
AMDGpuPropertyOpModeTypes_t m_opmode;
bool m_should_be_available;
};
// Rules keyed by ASIC id; a multimap, so one ASIC id may carry many rules.
using AMDGpuPropertyList_t = std::multimap<AMDGpuAsicId_t, AMDGpuProperties_t>;
// Describes a lookup against the property table: which ASIC and revision,
// which device index, and which property/verb is being asked about.
struct AMDGpuPropertyQuery_t
{
AMDGpuAsicId_t m_asic_id;
AMDGpuAsicRevId_t m_pci_rev_id;
AMDGpuDevIdx_t m_dev_idx;
AMDGpuPropertyId_t m_property;
AMDGpuVerbTypes_t m_verb_id;
};
//
// Free functions operating on the property table; all are defined elsewhere.
//
// make_unique_property_id / unmake_unique_property_id convert between a
// (category offset, property id) pair and a single id. Presumably the offset
// is folded into the id and stripped again -- confirm in the definitions.
AMDGpuPropertyId_t make_unique_property_id(AMDGpuPropertyTypesOffset_t type_offset, AMDGpuPropertyId_t property_id);
AMDGpuPropertyId_t unmake_unique_property_id(AMDGpuPropertyId_t property_id);
// Takes the device index, the verb and the status the caller actually
// observed, and returns a status. NOTE(review): the second parameter is named
// dev_info_type although its type is AMDGpuVerbTypes_t.
rsmi_status_t validate_property_reinforcement_query(uint32_t dv_ind,
AMDGpuVerbTypes_t dev_info_type,
rsmi_status_t actual_error_code);
// Diagnostic dump of the table; the destination is decided by the definition.
void dump_amdgpu_property_reinforcement_list();
} // namespace smi
} // namespace amd
#endif // INCLUDE_ROCM_SMI_ROCM_SMI_PROPERTIES_H_
+686
View File
@@ -0,0 +1,686 @@
/*
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2018-2025, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#ifndef INCLUDE_ROCM_SMI_ROCM_SMI_UTILS_H_
#define INCLUDE_ROCM_SMI_ROCM_SMI_UTILS_H_
#include <pthread.h>
#include <algorithm>
#include <cstdint>
#include <iomanip>
#include <iosfwd>
#include <iostream>
#include <iterator>
#include <limits>
#include <ostream>
#include <queue>
#include <sstream>
#include <string>
#include <tuple>
#include <type_traits>
#include <vector>
#include <utility>
#include "rocm_smi/rocm_smi_device.h"
// debug_print(fmt, ...): printf-style diagnostic output.
//  - With NDEBUG defined it expands to an empty statement, so the arguments
//    are not evaluated.
//  - Otherwise it forwards to fprintf(stderr, ...). `##__VA_ARGS__` is the
//    GNU extension that drops the trailing comma when no arguments follow fmt.
// The do { } while (false) wrapper makes the macro behave as one statement.
// NOTE(review): fprintf needs <cstdio>, which this header does not include
// directly.
#ifdef NDEBUG
#define debug_print(fmt, ...) \
do { \
} while (false)
#else
#define debug_print(fmt, ...) \
do { \
fprintf(stderr, fmt, ##__VA_ARGS__); \
} while (false)
#endif
namespace amd {
namespace smi {
// ---------------------------------------------------------------------------
// Utility function declarations. All of these are defined elsewhere; the
// notes below describe only what the signatures show, and anything marked
// "presumably" should be confirmed against the definitions.
// ---------------------------------------------------------------------------

// Returns a mutex pointer for device index dv_ind.
pthread_mutex_t *GetMutex(uint32_t dv_ind);

// --- File system helpers ---------------------------------------------------
// Functions returning int presumably use 0 for success and an errno-style
// value otherwise, with results delivered through the pointer parameters.
int SameFile(const std::string fileA, const std::string fileB);
bool FileExists(char const *filename);
std::vector<std::string> globFilesExist(const std::string& filePattern);
int isRegularFile(std::string fname, bool *is_reg);
int isReadOnlyForAll(const std::string& fname, bool *is_read_only);
int ReadSysfsStr(std::string path, std::string *retStr);
int WriteSysfsStr(std::string path, std::string val);
bool IsInteger(const std::string & n_str);
// Runs `command`; returns a (bool, string) pair. stdOut defaults to true.
std::pair<bool, std::string> executeCommand(std::string command,
bool stdOut = true);

// --- Temporary state files -------------------------------------------------
// NOTE(review): storeTmpFile takes (parameterName, stateName) while
// readTmpFile takes (stateName, parameterName); the order is swapped, so
// positional calls are easy to get wrong.
rsmi_status_t storeTmpFile(uint32_t dv_ind, std::string parameterName,
std::string stateName, std::string storageData);
std::vector<std::string> getListOfAppTmpFiles();
bool containsString(std::string originalString, std::string substring,
bool displayComparisons = false);
std::tuple<bool, std::string> readTmpFile(
uint32_t dv_ind,
std::string stateName,
std::string parameterName);
void displayAppTmpFilesContent(void);

// --- Debug / diagnostics ---------------------------------------------------
std::string debugVectorContent(std::vector<std::string> v);
std::string displayAllDevicePaths(std::vector<std::shared_ptr<Device>> v);
rsmi_status_t handleException();

// --- Device value access ---------------------------------------------------
// Results are written through val_vec / p_binary_data; b_size is the size of
// the caller-provided buffer.
rsmi_status_t
GetDevValueVec(amd::smi::DevInfoTypes type,
uint32_t dv_ind, std::vector<std::string> *val_vec);
rsmi_status_t
GetDevBinaryBlob(amd::smi::DevInfoTypes type,
uint32_t dv_ind, std::size_t b_size, void* p_binary_data);
rsmi_status_t ErrnoToRsmiStatus(int err);
std::string getRSMIStatusString(rsmi_status_t ret, bool fullStatus = true);

// --- System information ----------------------------------------------------
// Returns a bool followed by thirteen strings; the meaning of each position is
// defined by the implementation.
std::tuple<bool, std::string, std::string, std::string, std::string,
std::string, std::string, std::string, std::string,
std::string, std::string, std::string, std::string, std::string>
getSystemDetails(void);
void logSystemDetails(void);
// Writes the textual form of bdf_id into the output parameter (spelled
// bfd_str in the signature).
rsmi_status_t getBDFString(uint64_t bdf_id, std::string& bfd_str);
void logHexDump(const char *desc, const void *addr, const size_t len,
size_t perLine);
bool isSystemBigEndian();
std::string getBuildType();
std::string getMyLibPath();
std::string getFileCreationDate(std::string path);
int subDirectoryCountInPath(const std::string path);
std::queue<std::string> getAllDeviceGfxVers();

// --- Enum / struct to string -----------------------------------------------
std::string monitor_type_string(amd::smi::MonitorTypes type);
std::string power_type_string(RSMI_POWER_TYPE type);
std::string splitString(std::string str, char delim);
std::string print_rsmi_od_volt_freq_data_t(rsmi_od_volt_freq_data_t *odv);
std::string print_rsmi_od_volt_freq_regions(uint32_t num_regions,
rsmi_freq_volt_region_t *regions);
bool is_sudo_user();
rsmi_status_t rsmi_get_gfx_target_version(uint32_t dv_ind,
std::string *gfx_version);

// --- String helpers --------------------------------------------------------
// Each takes its input by value or const reference and returns a new string;
// the argument is never modified in place.
std::string leftTrim(const std::string &s);
std::string rightTrim(const std::string &s);
std::string trim(const std::string &s);
std::string trimAllWhiteSpace(const std::string &s);
std::string removeWhitespace(const std::string &s);
std::string removeNewLines(const std::string &s);
std::string removeString(const std::string origStr,
const std::string &removeMe);

// --- Miscellaneous ---------------------------------------------------------
void system_wait(int milli_seconds);
int countDigit(uint64_t n);
std::string find_file_in_folder(const std::string& folder,
const std::string& regex);
uint64_t get_multiplier_from_char(char units_char);
// Formats an integer as zero-padded lowercase hexadecimal.
//
//   i                value to print
//   showHexNotation  when true the result is prefixed with "0x"
//   overloadBitSize  0 pads to the natural width of T (two digits per byte);
//                    otherwise pads to (overloadBitSize / 8) * 2 digits
//
// 8-bit types are widened before streaming so they print as numbers rather
// than characters. Other signed types are streamed as long long, other
// unsigned types as unsigned long long.
template <typename T>
std::string print_int_as_hex(T i, bool showHexNotation = true,
                             int overloadBitSize = 0) {
  std::stringstream out;

  // The prefix goes first so the field width below applies to the digits only.
  if (showHexNotation) {
    out << "0x";
  }

  if (overloadBitSize != 0) {
    // 8 bits per 1 byte, two hex digits per byte
    const int digit_count = (overloadBitSize / 8) * 2;
    out << std::hex << std::setw(digit_count) << std::setfill('0');
  } else {
    out << std::hex << std::setw(sizeof(T) * 2) << std::setfill('0');
  }

  if (std::is_same<std::uint8_t, T>::value) {
    out << static_cast<unsigned int>(i | 0);
  } else if (std::is_same<std::int8_t, T>::value) {
    // Go through uint8_t so negative values show their 8-bit pattern.
    const auto low_byte = static_cast<uint8_t>(i | 0);
    out << static_cast<int>(low_byte);
  } else if (!std::is_signed<T>::value) {
    out << static_cast<unsigned long long int>(i | 0);
  } else {
    out << static_cast<long long int>(i | 0);
  }

  out << std::dec;
  return out.str();
}
// Returns the decimal text of `i` after conversion to unsigned long long.
// Negative inputs therefore print as their wrapped 64-bit unsigned value.
template <typename T>
std::string print_unsigned_int(T i) {
  const auto widened = static_cast<unsigned long long int>(i | 0);
  std::stringstream out;
  out << widened;
  return out.str();
}
// Builds a one-line summary of `i`: its hexadecimal form, its unsigned decimal
// form, and the byte and bit size of T. When `heading` is non-empty the
// summary is preceded by a newline and "<heading> = ".
template <typename T>
std::string print_unsigned_hex_and_int(T i, std::string heading="") {
  std::stringstream out;
  if (!heading.empty()) {
    out << "\n" << heading << " = ";
  }
  out << "Hex (MSB): " << print_int_as_hex(i) << ", ";
  out << "Unsigned int: " << print_unsigned_int(i) << ", ";
  out << "Byte Size: " << sizeof(T) << ", ";
  out << "Bits: " << sizeof(T) * 8;  // 8 bits per 1 byte
  return out.str();
}
// Thin non-owning wrapper around a pthread mutex. It holds a reference only,
// so the wrapped mutex must outlive the wrapper.
struct pthread_wrap {
 public:
  explicit pthread_wrap(pthread_mutex_t &p_mut) : mutex_(p_mut) {}

  // Blocking lock; the pthread return code is ignored.
  void Acquire() {
    pthread_mutex_lock(&mutex_);
  }

  // Non-blocking lock attempt; returns the pthread_mutex_trylock() result.
  int AcquireNB() {
    const int rc = pthread_mutex_trylock(&mutex_);
    return rc;
  }

  // Unlock; the pthread return code is ignored.
  void Release() {
    pthread_mutex_unlock(&mutex_);
  }

 private:
  pthread_mutex_t& mutex_;
};
// Scoped lock over a pthread_wrap.
//
// With blocking == true the constructor waits for the mutex. With
// blocking == false it makes a single trylock attempt; if that reports EBUSY
// the guard holds nothing and mutex_not_acquired() returns true.
//
// The destructor releases the mutex only when this guard actually holds it.
// Unlocking after a failed trylock would release a mutex owned by someone
// else, which POSIX leaves undefined for default mutexes.
struct ScopedPthread {
  explicit ScopedPthread(pthread_wrap& mutex, bool blocking = true) :  //NOLINT
      pthrd_ref_(mutex), mutex_not_acquired_(false) {
    if (blocking) {
      pthrd_ref_.Acquire();
    } else {
      int ret = pthrd_ref_.AcquireNB();
      // Only EBUSY counts as "not acquired"; other return values keep the
      // historical behaviour of being treated as acquired.
      if (ret == EBUSY) {
        mutex_not_acquired_ = true;
      }
    }
  }

  ~ScopedPthread() {
    if (!mutex_not_acquired_) {
      pthrd_ref_.Release();
    }
  }

  // True when a non-blocking acquire found the mutex busy.
  bool mutex_not_acquired() {return mutex_not_acquired_;}

 private:
  // Declared but never defined: the guard is not copyable.
  ScopedPthread(const ScopedPthread&);

  pthread_wrap& pthrd_ref_;
  bool mutex_not_acquired_;  // Use for AcquireNB (not for Acquire())
};
// Token pasting in two steps: PASTE expands its arguments first (so macros
// such as __COUNTER__ are replaced by their value) and PASTE2 then joins the
// resulting tokens.
#define PASTE2(x, y) x##y
#define PASTE(x, y) PASTE2(x, y)
// GCC/Clang spelling of "always inline", under the MSVC-style name.
#define __forceinline __inline__ __attribute__((always_inline))

// Runs a callable when the guard goes out of scope, unless Dismiss() was
// called first.
//
// Copying and assigning transfer responsibility: the destination takes over
// the source's callable and dismissed state, and the source is marked
// dismissed, so the callable runs at most once across both objects.
//
// Note: operator= copy-assigns the callable, so it is only usable when
// `lambda` is assignable (for example std::function or a function pointer).
// Closure types are not assignable.
template <typename lambda>
class ScopeGuard {
 public:
  explicit __forceinline ScopeGuard(const lambda& release)
      : release_(release), dismiss_(false) {}

  // Takes over from rhs. The members are initialised directly because
  // `lambda` need not be default-constructible or assignable. rhs is
  // dismissed through the mutable flag so that a const source still hands
  // over responsibility.
  ScopeGuard(const ScopeGuard& rhs)
      : release_(rhs.release_), dismiss_(rhs.dismiss_) {
    rhs.dismiss_ = true;
  }

  __forceinline ~ScopeGuard() {
    if (!dismiss_) release_();
  }

  // Takes over from rhs. The callable previously held by *this is dropped
  // without being run.
  __forceinline ScopeGuard& operator=(ScopeGuard& rhs) {
    dismiss_ = rhs.dismiss_;
    release_ = rhs.release_;
    rhs.dismiss_ = true;
    return *this;
  }

  // Cancels the guard: the callable will not run on destruction.
  __forceinline void Dismiss() { dismiss_ = true; }

 private:
  lambda release_;
  // mutable so the copy constructor can dismiss a const source.
  mutable bool dismiss_;
};
// Factory that deduces the callable type and returns a ScopeGuard for it.
// The guard is returned as a temporary; since C++17 that temporary initialises
// the caller's object directly, so the guard's copy constructor is not used.
template <typename lambda>
static __forceinline ScopeGuard<lambda> MakeScopeGuard(lambda rel) {
return ScopeGuard<lambda>(rel);
}
// Scope-guard convenience macros.
//
// MAKE_SCOPE_GUARD_HELPER(lname, sname, ...): stores the callable given in
// __VA_ARGS__ in a local named `lname` and declares a ScopeGuard named
// `sname` over it. It expands to two statements, so it must not be used as
// the sole body of an unbraced if/else/loop.
//
// MAKE_SCOPE_GUARD(...): anonymous guard; both names are made unique with
// __COUNTER__.
//
// MAKE_NAMED_SCOPE_GUARD(name, ...): same, but the guard is called `name` so
// the caller can invoke name.Dismiss().
#define MAKE_SCOPE_GUARD_HELPER(lname, sname, ...) \
auto lname = __VA_ARGS__; \
amd::smi::ScopeGuard<decltype(lname)> sname(lname);
#define MAKE_SCOPE_GUARD(...) \
MAKE_SCOPE_GUARD_HELPER(PASTE(scopeGuardLambda, __COUNTER__), \
PASTE(scopeGuard, __COUNTER__), __VA_ARGS__)
#define MAKE_NAMED_SCOPE_GUARD(name, ...) \
MAKE_SCOPE_GUARD_HELPER(PASTE(scopeGuardLambda, __COUNTER__), name, \
__VA_ARGS__)
// A macro to disallow the copy and move constructor and operator= functions.
// Place it inside the class body; it declares all four special members as
// deleted. The expansion ends with a semicolon, so none is needed after the
// macro invocation.
#define DISALLOW_COPY_AND_ASSIGN(TypeName) \
TypeName(const TypeName&) = delete; \
TypeName(TypeName&&) = delete; \
void operator=(const TypeName&) = delete; \
void operator=(TypeName&&) = delete;
/// @brief: Scoped lock holder for any type that provides Acquire() and
/// Release(). The lock is taken on construction and given back on
/// destruction, unless Release() was already called by hand.
template <class LockType>
class ScopedAcquire {
 public:
  /// @brief: When constructing, acquire the lock.
  /// @param: lock(Input), pointer to an existing lock.
  explicit ScopedAcquire(LockType* lock) : lock_(lock), doRelease(true) {
    lock_->Acquire();
  }

  /// @brief: when destructing, release the lock if it is still held.
  ~ScopedAcquire() {
    if (!doRelease) {
      return;
    }
    lock_->Release();
  }

  /// @brief: Release the lock early. Avoid using when possible.
  void Release() {
    lock_->Release();
    doRelease = false;
  }

 private:
  LockType* lock_;
  /// @brief: true while the destructor still has to release the lock.
  bool doRelease;

  /// @brief: Disable copiable and assignable ability.
  DISALLOW_COPY_AND_ASSIGN(ScopedAcquire)
};
// Best-effort check for whether we are running inside a VM guest:
// in a VM environment, /proc/cpuinfo reports the "hypervisor" flag by default.
bool is_vm_guest();
//
// Where a marker is expected within a line when TagTextContents_t parses
// text. kNONE turns the corresponding lookup off. Of the others, only kFIRST
// and kLAST are acted on by the title lookup.
//
enum class TagSplitterPositional_t
{
kFIRST,
kBETWEEN,
kLAST,
kNONE,
};
template <typename PrimaryKeyType = std::string, typename PrimaryDataType = std::string,
typename SecondaryKeyType = PrimaryKeyType, typename SecondaryDataType = PrimaryDataType>
class TagTextContents_t
{
public:
using TextLines_t = std::vector<std::string>;
using PrimaryList_t = std::vector<PrimaryDataType>;
using SecondaryList_t = std::vector<SecondaryDataType>;
using PrimaryKeyTbl_t = std::map<PrimaryKeyType, PrimaryList_t>;
using SecondaryKeyTbl_t = std::map<SecondaryKeyType, SecondaryList_t>;
using StructuredKeysTbl_t = std::map<PrimaryDataType, std::map<SecondaryKeyType, SecondaryDataType>>;
//
TagTextContents_t() = default;
TagTextContents_t(const TagTextContents_t&) = delete;
TagTextContents_t(TagTextContents_t&&) = delete;
TagTextContents_t& operator=(const TagTextContents_t&) = delete;
TagTextContents_t& operator=(TagTextContents_t&&) = delete;
explicit TagTextContents_t(const TextLines_t& text_content)
: m_text_content(text_content) {}
TagTextContents_t& set_text_content(const TextLines_t& text_content)
{
m_text_content = text_content;
}
TagTextContents_t& set_title_terminator(const std::string& title_mark,
TagSplitterPositional_t title_mark_position) {
m_title_mark = title_mark;
m_title_mark_position = title_mark_position;
return *this;
}
TagTextContents_t& set_key_data_splitter(const std::string& line_splitter_mark,
TagSplitterPositional_t line_mark_position) {
m_line_splitter_mark = line_splitter_mark;
m_line_mark_position = line_mark_position;
return *this;
}
TagTextContents_t& structure_content() {
// Sanitizes the content.
if (!m_text_content.empty()) {
std::for_each(m_text_content.begin(), m_text_content.end(), trim);
section_title_lookup();
section_data_lookup();
}
return *this;
}
decltype(auto) get_title_size() {
return m_primary.size();
}
decltype(auto) get_structured_subkeys_size(const PrimaryKeyType& prim_key) {
return m_structured[prim_key].size();
}
decltype(auto) contains_title_key(const PrimaryKeyType& key) {
return (m_primary.find(key) != m_primary.end());
}
decltype(auto) contains_structured_key(const PrimaryKeyType& prim_key,
const SecondaryKeyType& sec_key) {
if (auto first_key_itr = m_structured.find(prim_key);
first_key_itr != m_structured.end()) {
if (auto sec_key_itr = first_key_itr->second.find(sec_key);
sec_key_itr != first_key_itr->second.end()) {
return true;
}
}
return false;
}
decltype(auto) get_structured_value_by_keys(const PrimaryKeyType& prim_key,
const SecondaryKeyType& sec_key,
bool is_value_id = true) {
if (auto first_key_itr = m_structured.find(prim_key);
first_key_itr != m_structured.end()) {
if (auto sec_key_itr = first_key_itr->second.find(sec_key);
sec_key_itr != first_key_itr->second.end()) {
SecondaryDataType key_value{};
if (is_value_id) {
key_value = SecondaryDataType(sec_key_itr->first) + " ";
}
key_value += sec_key_itr->second;
return key_value;
}
}
return SecondaryDataType{};
}
decltype(auto) get_structured_data_subkey_by_position(const PrimaryKeyType& prim_key,
uint32_t key_position) {
auto key_counter = uint32_t(0);
SecondaryKeyType data_key{};
if (key_position < (get_structured_subkeys_size(prim_key))) {
for (const auto& [sec_key, sec_value] : m_structured[prim_key]) {
if (key_counter == key_position) {
data_key = static_cast<SecondaryKeyType>(sec_key);
return data_key;
}
++key_counter;
}
}
return data_key;
}
decltype(auto) get_structured_data_subkey_first(const PrimaryKeyType& prim_key) {
return (get_structured_value_by_keys(prim_key,
get_structured_data_subkey_by_position(prim_key, 0)));
}
decltype(auto) get_structured_data_subkey_last(const PrimaryKeyType& prim_key) {
return (get_structured_value_by_keys(prim_key, get_structured_data_subkey_by_position(prim_key,
(get_structured_subkeys_size(prim_key) - 1))));
}
void reset() {
m_text_content.clear();
m_primary.clear();
m_structured.clear();
m_title_mark.clear();
m_line_splitter_mark.clear();
m_title_mark_position = TagSplitterPositional_t::kNONE;
m_line_mark_position = TagSplitterPositional_t::kNONE;
}
decltype(auto) dump_structured_content() {
std::ostringstream ostrstream;
ostrstream << __PRETTY_FUNCTION__ << "| ======= start =======" << "\n";
ostrstream << "** Primary Table **" << "\n";
for (const auto& [key, values] : m_primary) {
ostrstream << "key: " << key << " values: " << values.size() << "\n";
for (const auto& value : values) {
ostrstream << "\t value: " << value << "\n";
}
}
ostrstream << "\n ** Structured Table **" << "\n";
for (const auto& [prim_key, prim_values] : m_structured) {
ostrstream << "key: " << prim_key << "\n";
for (const auto& [sec_key, sec_value] : prim_values) {
ostrstream << "\t key: " << sec_key << " -> " << sec_value << "\n";
}
}
ostrstream << "\n\n";
return ostrstream.str();
}
private:
TextLines_t m_text_content;
PrimaryKeyTbl_t m_primary;
StructuredKeysTbl_t m_structured;
std::string m_title_mark;
std::string m_line_splitter_mark;
TagSplitterPositional_t m_title_mark_position;
TagSplitterPositional_t m_line_mark_position;
//
// Note: Organizes table with Title as a Key, and a list of values.
//
decltype(auto) section_title_lookup() {
if (m_title_mark.empty() ||
m_title_mark_position == TagSplitterPositional_t::kNONE) {
return;
}
//
// Note:
// - top_title_line: Left pointer for the sliding window
// - bottom_title_line: Right pointer for the sliding window
//
auto top_title_line = uint32_t(std::numeric_limits<uint32_t>::max());
auto bottom_title_line = uint32_t(std::numeric_limits<uint32_t>::max());
auto line_counter = uint32_t(0);
//
// Note: This whole interval/window where the section/title starts, and where it ends.
//
auto update_primary_tbl = [&](const uint32_t& from_line, const uint32_t& to_line) {
auto key = static_cast<PrimaryKeyType>(m_text_content[from_line]);
for (auto line_num(from_line + 1); line_num < to_line; ++line_num) {
if ((line_num < m_text_content.size()) && !m_text_content[line_num].empty()) {
m_primary[key].push_back(m_text_content[line_num]);
}
}
};
auto adjust_sliding_window = [&](const uint32_t& title_line) {
// First time top_title_line gets adjusted.
if (top_title_line == uint32_t(std::numeric_limits<uint32_t>::max())) {
top_title_line = title_line;
bottom_title_line = top_title_line;
return;
}
if (title_line > bottom_title_line) {
bottom_title_line = title_line;
update_primary_tbl(top_title_line, bottom_title_line);
top_title_line = bottom_title_line;
}
};
for (const auto& line : m_text_content) {
auto was_title_found{false};
switch (m_title_mark_position) {
case TagSplitterPositional_t::kFIRST:
// Section/Title Mark was found at the first position
if (line.find_first_of(m_title_mark.c_str()) == 0) {
was_title_found = true;
}
break;
case TagSplitterPositional_t::kLAST:
// Section/Title Mark was found at the last position
if ((line.find_last_of(m_title_mark.c_str()) + 1) == line.size()) {
was_title_found = true;
}
break;
default:
break;
}
if (was_title_found) {
adjust_sliding_window(line_counter);
}
++line_counter;
}
// Any remaining elements? If so, the data belongs to the last found section title
if (line_counter > bottom_title_line) {
update_primary_tbl(bottom_title_line, line_counter);
}
}
decltype(auto) section_data_lookup() {
if (m_line_splitter_mark.empty() ||
m_line_mark_position == TagSplitterPositional_t::kNONE) {
return;
}
//
// Note: Organizes table with Title as a Key, a Key/ID for values and values.
// It takes into consideration the initial constraints were all good and
// that the primary table has been populated.
auto sec_key = std::string();
auto sec_data = std::string();
auto auto_key = uint32_t(0);
for (const auto& [prim_key, prim_values] : m_primary) {
for (const auto& value : prim_values) {
if (auto mark_pos = value.find_first_of(m_line_splitter_mark.c_str());
mark_pos != std::string::npos) {
sec_key = trim(value.substr(0, mark_pos + 1));
sec_data = trim(value.substr((mark_pos + 1), value.size()));
}
// In case there is no 'key' based on the data token marker, generate one.
else {
sec_key = std::to_string(auto_key) + m_line_splitter_mark;
sec_data = trim(value.substr(0, value.size()));
++auto_key;
}
if (!sec_key.empty()) {
m_structured[prim_key].insert(std::make_pair(sec_key, sec_data));
}
}
}
}
};
// The all-std::string instantiation of TagTextContents_t: titles, keys and
// data are all plain strings.
using TextFileTagContents_t = TagTextContents_t<std::string, std::string,
std::string, std::string>;
//
// Note: Output iterator that inserts a delimiter between elements.
//
// Each assigned value is written to the stream, preceded by the delimiter
// for every value except the first. In addition, every
// kMAX_VALUES_PER_LINE-th value is preceded by a newline; in that case the
// newline follows the delimiter.
//
// The stream is held by pointer and is not owned; it must outlive the
// iterator.
//
template<typename DelimiterType, typename CharType = char,
         typename TraitsType = std::char_traits<CharType>>
class ostream_joiner {
 public:
  using Char_t = CharType;
  using Traits_t = TraitsType;
  using Ostream_t = std::basic_ostream<Char_t, Traits_t>;
  using iterator_category = std::output_iterator_tag;
  using value_type = void;
  using difference_type = void;
  using pointer = void;
  using reference = void;

  ostream_joiner(Ostream_t* outstream,
                 const DelimiterType& delimiter) noexcept
                 (std::is_nothrow_copy_constructible_v<DelimiterType>)
      : m_outstream(outstream), m_delimiter(delimiter) {}

  ostream_joiner(Ostream_t* outstream, DelimiterType&& delimiter) noexcept
                 (std::is_nothrow_move_constructible_v<DelimiterType>)
      : m_outstream(outstream), m_delimiter(std::move(delimiter)) {}

  // Writes `value`, with the delimiter and/or newline described above.
  template<typename ValueType> ostream_joiner& operator=(const ValueType& value) {
    if (!m_is_first) {
      *m_outstream << m_delimiter;
    }

    this->m_is_first = false;
    this->m_value_count++;
    if ((m_value_count % kMAX_VALUES_PER_LINE) == 0) {
      *m_outstream << "\n" << value;
      this->m_value_count = 0;
    } else {
      *m_outstream << value;
    }

    return *this;
  }

  // Dereference and increment are no-ops, as usual for output iterators.
  ostream_joiner& operator*() noexcept { return *this; }
  ostream_joiner& operator++() noexcept { return *this; }
  ostream_joiner& operator++(int) noexcept { return *this; }

 private:
  Ostream_t* m_outstream;
  DelimiterType m_delimiter;
  bool m_is_first = true;
  uint32_t m_value_count = 0;
  // A class-level constant rather than a per-object const member. A non-static
  // const member would delete the implicit copy assignment operator, and an
  // output iterator has to be copy-assignable.
  static constexpr uint32_t kMAX_VALUES_PER_LINE = 9;
};
/// Object generator for ostream_joiner.
/// Deduces the character, traits and delimiter types from the arguments. The
/// delimiter type is decayed, so it is stored by value (a string literal is
/// stored as a pointer to its characters).
template<typename CharType, typename TraitsType, typename DelimiterType>
inline ostream_joiner<std::decay_t<DelimiterType>, CharType, TraitsType>
make_ostream_joiner(std::basic_ostream<CharType, TraitsType>* outstream,
                    DelimiterType&& delimiter) {
  using Joiner_t = ostream_joiner<std::decay_t<DelimiterType>, CharType, TraitsType>;
  return Joiner_t(outstream, std::forward<DelimiterType>(delimiter));
}
} // namespace smi
} // namespace amd
#endif // INCLUDE_ROCM_SMI_ROCM_SMI_UTILS_H_
+2
View File
@@ -0,0 +1,2 @@
exclude = ['^file://.*', '.*localhost.*']
exclude_path = ["./build"]
+125
View File
@@ -0,0 +1,125 @@
#
# Build script for the OAM library and its example program.
#
# NOTE(review): this file does not call cmake_minimum_required() and reads
# PROJECT_SOURCE_DIR, so it presumably runs under a parent project that sets
# the minimum CMake version - confirm.
#
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
message(" CMake OAM (Library) ")
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
## Verbose output: show the full commands run by the generated makefiles.
set(CMAKE_VERBOSE_MAKEFILE on)
# Print the build configuration banner.
message("")
message("Build Configuration:")
# message("-----------BuildType: " ${CMAKE_BUILD_TYPE})
# message("------------Compiler: " ${CMAKE_CXX_COMPILER})
# message("-------------Version: " ${CMAKE_CXX_COMPILER_VERSION})
message("--------Proj Src Dir: " ${PROJECT_SOURCE_DIR})
# message("--------Proj Bld Dir: " ${PROJECT_BINARY_DIR})
# message("--------Proj Lib Dir: " ${PROJECT_BINARY_DIR}/lib)
# message("--------Proj Exe Dir: " ${PROJECT_BINARY_DIR}/bin)
# message("--------RSMI Lib Dir: " ${RSMI_LIB_DIR})
# message("--------RSMI Inc Dir: " ${OAM_INC_DIR})
# message("")
# Root of the OAM sources and the names derived from the library name.
set(OAM_ROOT "${PROJECT_SOURCE_DIR}/oam")
set(OAM_NAME "oam")
set(OAM_COMPONENT "lib${OAM_NAME}")
set(OAM_TARGET "${OAM_NAME}")
################# Determine the library version #########################
# Prefix of the git tags that carry the shared-object version.
set(SO_VERSION_GIT_TAG_PREFIX "oam_so_ver")
# NOTE(review): PKG_VERSION_STR is printed before get_version_from_tag() runs
# below, so it must already have been set outside this file - confirm.
message("Package version: ${PKG_VERSION_STR}")
# get_version_from_tag() is defined outside this file; "1.0.0.0" is passed as
# the default version. It is expected to set the VERSION_* variables used
# below.
get_version_from_tag("1.0.0.0" ${SO_VERSION_GIT_TAG_PREFIX} GIT)
# Use MAJOR.MINOR.PATCH when ROCM_PATCH_VERSION is set to a true value,
# otherwise MAJOR.MINOR.
if ( ${ROCM_PATCH_VERSION} )
set ( VERSION_PATCH ${ROCM_PATCH_VERSION})
set(SO_VERSION_STRING "${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}")
else()
set(SO_VERSION_STRING "${VERSION_MAJOR}.${VERSION_MINOR}")
endif ()
# Values substituted into the generated config header; patch and build are
# fixed at "0" here regardless of VERSION_PATCH.
set(${OAM_NAME}_VERSION_MAJOR "${VERSION_MAJOR}")
set(${OAM_NAME}_VERSION_MINOR "${VERSION_MINOR}")
set(${OAM_NAME}_VERSION_PATCH "0")
set(${OAM_NAME}_VERSION_BUILD "0")
message("SOVERSION: ${SO_VERSION_STRING}")
# Create a configure file to get version info from within library.
# NOTE(review): the output is written into the source tree
# (${OAM_ROOT}/include/oam), not the binary directory.
configure_file(
"${OAM_ROOT}/src/${OAM_TARGET}Config.in"
"${OAM_ROOT}/include/oam/${OAM_TARGET}Config.h")
# Directory names, relative to this CMakeLists.txt.
set(OAM_SRC_DIR "src")
set(OAM_INC_DIR "include")
set(OAM_DOCS_DIR "docs")

# Library sources: CMN_SRC_LIST (defined outside this file) plus the OAM
# implementation. Include directories: COMMON_INC_DIR plus the local one.
set(OAM_SRC_LIST ${CMN_SRC_LIST} "${OAM_SRC_DIR}/amd_oam.cc")
set(OAM_INC_LIST ${COMMON_INC_DIR} "${OAM_INC_DIR}")

# Example program that exercises the library through its ops table.
set(OAM_EXAMPLE_EXE "oam_ex")
add_executable(${OAM_EXAMPLE_EXE} "example/oam_example.c")
target_include_directories(${OAM_EXAMPLE_EXE} PRIVATE ${OAM_INC_LIST})
# Use the keyword signature; the keyword-less form has legacy semantics and
# cannot be mixed with keyword calls on the same target.
target_link_libraries(${OAM_EXAMPLE_EXE} PRIVATE ${OAM_TARGET})

# The OAM library itself.
# NOTE(review): CMN_SRC_LIST is listed here and is also part of OAM_SRC_LIST,
# and the *_INC_LIST entries are directories passed as sources; left unchanged.
add_library(${OAM_TARGET} ${CMN_SRC_LIST} ${OAM_SRC_LIST}
            ${CMN_INC_LIST} ${OAM_INC_LIST})
target_link_libraries(${OAM_TARGET} PRIVATE pthread rt dl)
target_include_directories(${OAM_TARGET} PRIVATE
  ${CMAKE_CURRENT_SOURCE_DIR}/include ${COMMON_PROJ_ROOT}/common/shared_mutex)
## Set the VERSION and SOVERSION values
set_property(TARGET ${OAM_TARGET} PROPERTY
  SOVERSION "${VERSION_MAJOR}")
set_property(TARGET ${OAM_TARGET} PROPERTY
  VERSION "${SO_VERSION_STRING}")

## If the library is a release, strip the target library.
## Stripping is only done for shared libraries (.so).
if("${CMAKE_BUILD_TYPE}" STREQUAL "Release")
  if(BUILD_SHARED_LIBS)
    # $<TARGET_FILE:...> resolves to the library file actually produced, so
    # the command does not depend on the working directory, the output
    # directory, or the hard-coded "lib<name>.so" spelling.
    add_custom_command(
      TARGET ${OAM_TARGET}
      POST_BUILD
      COMMAND ${CMAKE_STRIP} "$<TARGET_FILE:${OAM_TARGET}>"
      VERBATIM)
  endif()
endif()
# Public include directories of the library.
# In the build tree consumers see the DRM include directories and the local
# include/ directory. In the install tree they see the install include
# directory, under which the headers are installed in an "oam" subdirectory
# and are included as "oam/<header>".
target_include_directories(${OAM_TARGET}
  PUBLIC
    "$<BUILD_INTERFACE:${DRM_INCLUDE_DIRS}>"
    "$<BUILD_INTERFACE:${AMDGPU_DRM_INCLUDE_DIRS}>"
    "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
    "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>")
## Install rules.
# Library as part of the "dev" component; also added to the rocm_smiTargets
# export set.
install(
  TARGETS ${OAM_TARGET}
  EXPORT rocm_smiTargets
  LIBRARY
    DESTINATION ${CMAKE_INSTALL_LIBDIR}
    COMPONENT dev
  ARCHIVE
    DESTINATION ${CMAKE_INSTALL_LIBDIR}
    COMPONENT dev)

# Shared library again, as part of the "asan" component.
install(
  TARGETS ${OAM_TARGET}
  LIBRARY
    DESTINATION ${CMAKE_INSTALL_LIBDIR}
    COMPONENT asan)

# Public headers, installed under <includedir>/oam.
install(
  FILES
    ${COMMON_SRC_ROOT}/oam/include/oam/oam_mapi.h
    ${COMMON_SRC_ROOT}/oam/include/oam/amd_oam.h
  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/oam
  COMPONENT dev)
# Generate Doxygen documentation.
# The Doxyfile is generated into the current binary directory, which is the
# file the "doc" target passes to doxygen. A relative output path would be
# placed in a subdirectory of the binary directory instead.
# NOTE(review): the template path resolves to
# <this dir>/${OAM_DOCS_DIR}/docs/rsmi_doxygen.cfg - confirm it exists.
if(DOXYGEN_FOUND)
  configure_file(
    "${CMAKE_CURRENT_SOURCE_DIR}/${OAM_DOCS_DIR}/docs/rsmi_doxygen.cfg"
    "${CMAKE_CURRENT_BINARY_DIR}/Doxyfile"
    @ONLY)
  add_custom_target(doc
    ${DOXYGEN_EXECUTABLE} "${CMAKE_CURRENT_BINARY_DIR}/Doxyfile"
    WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
    COMMENT "Generating AMD OAM API documentation with Doxygen" VERBATIM)
endif()
@@ -0,0 +1,128 @@
#include <stdio.h>
#include "oam/oam_mapi.h"
#include "oam/amd_oam.h"
/*
 * Operations table binding the generic OAM interface to the AMD
 * implementation. Entries are listed in the order in which the members are
 * declared in oam_ops_t; members not named here are zero-initialized (NULL).
 */
const oam_ops_t amd_oam_ops = {
  .init                  = amdoam_init,
  .free                  = amdoam_free,
  .get_error_description = amdoam_get_error_description,
  // .get_mapi_version = amdoam_get_mapi_version,
  .discover_devices      = amdoam_discover_devices,
  .get_dev_properties    = amdoam_get_dev_properties,
  .get_pci_properties    = amdoam_get_pci_properties,
  .get_sensors_count     = amdoam_get_sensors_count,
  .get_sensors_info      = amdoam_get_sensors_info,
};
/*
 * Query and print all sensors of one type for a device.
 *
 * device_id   - device to query
 * type        - sensor type to read
 * num_sensors - number of sensors of that type (may be 0)
 * unit        - unit string printed after each value
 *
 * Returns 0 on success and -1 when the result buffer cannot be allocated.
 * A failing query is reported but not treated as fatal, so the remaining
 * sensor types can still be printed by the caller.
 */
static int get_sensor_info(uint32_t device_id, oam_sensor_type_t type,
                           uint32_t num_sensors, char unit[]) {
  uint32_t j;
  int ret;
  oam_sensor_info_t *sensor_info;

  /* calloc(0, ...) may legitimately return NULL; "no sensors" is not an
   * allocation failure, so skip the query entirely in that case. */
  if (num_sensors > 0) {
    sensor_info = calloc(num_sensors, sizeof(oam_sensor_info_t));
    if (!sensor_info) {
      printf("Allocating sensor_info failed\n");
      return -1;
    }

    ret = amd_oam_ops.get_sensors_info(device_id, type, num_sensors,
                                       sensor_info);
    if (ret)
      printf("\tget_sensors_info returned %d; values may be incomplete\n",
             ret);

    for (j = 0; j < num_sensors; j++) {
      printf("\tSensor Name : %s \n", sensor_info[j].sensor_name);
      printf("\tSensor Type : %d \n", sensor_info[j].sensor_type);
      /* value is int64_t; cast so the format is correct on every ABI */
      printf("\tSensor Value : %lld %s\n",
             (long long)sensor_info[j].value, unit);
    }
    free(sensor_info);
  }

  printf("\t**************************************\n");
  return 0;
}
int main()
{
uint32_t dev_cnt = 0;
oam_mapi_version_t version;
oam_dev_properties_t *devs_prop;
int i;
oam_pci_info_t pci_info;
oam_sensor_count_t sensor_count;
const char *string;
if (amd_oam_ops.init()) {
printf("init failed\n");
return -1;
}
// amd_oam_ops.get_mapi_version(&version);
if (!amd_oam_ops.discover_devices(&dev_cnt))
printf("%u AMD devices are discovered\n", dev_cnt);
if (!dev_cnt) {
printf("No devices are found.\n");
return amd_oam_ops.free();
}
devs_prop = calloc(dev_cnt, sizeof(oam_dev_properties_t));
if (!devs_prop) {
printf("Allocating dev_prop failed\n");
return amd_oam_ops.free();
}
amd_oam_ops.get_dev_properties(dev_cnt, devs_prop);
for (i = 0; i < dev_cnt; i++) {
printf("Device %d:\n", i);
printf("\tdevice id %d\n", devs_prop[i].device_id);
printf("\tdevice_vendor %s\n", devs_prop[i].device_vendor);
printf("\tdevice_name %s\n", devs_prop[i].device_name);
printf("\tsku_name %s\n", devs_prop[i].sku_name);
printf("\tboard_name %s\n", devs_prop[i].board_name);
printf("\tboard_revision %s\n", devs_prop[i].board_revision);
printf("\tboard_serial_number %s\n",
devs_prop[i].board_serial_number);
if (!amd_oam_ops.get_pci_properties(
devs_prop[i].device_id, &pci_info)) {
printf("\tPCI domain : 0x%d \n", pci_info.domain);
printf("\tPCI bus : 0x%d \n", pci_info.bus);
printf("\tPCI device : 0x%d \n", pci_info.device);
printf("\tPCI function : 0x%d \n", pci_info.function);
}
printf("\t**************************************\n");
if (amd_oam_ops.get_sensors_count(
devs_prop[i].device_id, &sensor_count))
continue;
printf("\tNumber of Power Sensors : %d \n",
sensor_count.num_power_sensors);
if (get_sensor_info(devs_prop[i].device_id,OAM_SENSOR_TYPE_POWER,
sensor_count.num_power_sensors, "uW"))
goto failure;
printf("\tNumber of Voltage Sensors : %d \n",
sensor_count.num_voltage_sensors);
if (get_sensor_info(devs_prop[i].device_id, OAM_SENSOR_TYPE_VOLTAGE,
sensor_count.num_voltage_sensors, "mV"))
goto failure;
printf("\tNumber of Current Sensors : %d \n",
sensor_count.num_current_sensors);
if (get_sensor_info(devs_prop[i].device_id, OAM_SENSOR_TYPE_CURRENT,
sensor_count.num_current_sensors, "A"))
goto failure;
printf("\tNumber of Temperature Sensors : %d \n",
sensor_count.num_temperature_sensors);
if (get_sensor_info(devs_prop[i].device_id, OAM_SENSOR_TYPE_TEMP,
sensor_count.num_temperature_sensors, "mC"))
goto failure;
printf("\tNumber of Fan Sensors : %d \n", sensor_count.num_fans);
if (get_sensor_info(devs_prop[i].device_id, OAM_SENSOR_TYPE_FAN_SPEED,
sensor_count.num_fans, "rpm"))
goto failure;
}
amd_oam_ops.get_error_description(1, &string);
printf("error code 1: %s\n", string);
failure:
free(devs_prop);
amd_oam_ops.free();
return 0;
}
+69
View File
@@ -0,0 +1,69 @@
/*
* MIT License
*
* Copyright (c) 2020 Open Compute Project
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#ifndef OAM_INCLUDE_OAM_AMD_OAM_H_
#define OAM_INCLUDE_OAM_AMD_OAM_H_
/*
 * Headers are included before the linkage block is opened: a header must not
 * be included from inside a declaration such as extern "C" { ... }.
 */
#ifdef __cplusplus
#include <cstdint>
#else
#include <stdint.h>
#endif  // __cplusplus
#include "oam/oam_mapi.h"
#ifdef __cplusplus
extern "C" {
#endif  // __cplusplus
/*
 * Status codes of the AMD OAM library.
 *
 * The block from INVALID_ARGS to INIT_ERROR is copied from the RSMI error
 * codes; the entries after it are specific to this library. The API
 * functions return 0 on success and the negated code on failure.
 */
typedef enum {
  AMDOAM_STATUS_SUCCESS = 0x0,              /* Operation succeeded */
  /* copy RSMI errors */
  AMDOAM_STATUS_INVALID_ARGS = 0x1,         /* Invalid arguments */
  AMDOAM_STATUS_NOT_SUPPORTED = 0x2,        /* Feature not supported */
  AMDOAM_STATUS_FILE_ERROR = 0x3,           /* Problem accessing a file */
  AMDOAM_STATUS_PERMISSION = 0x4,           /* Permission denied */
  AMDOAM_STATUS_OUT_OF_RESOURCES = 0x5,     /* Not enough memory or other
                                               resource */
  AMDOAM_STATUS_INTERNAL_EXCEPTION = 0x6,   /* An internal exception was
                                               caught */
  AMDOAM_STATUS_INPUT_OUT_OF_BOUNDS = 0x7,  /* Input is out of allowable or
                                               safe range */
  AMDOAM_STATUS_INIT_ERROR = 0x8,           /* Not initialized or init
                                               failed */
  /* end of RSMI error code */
  AMDOAM_STATUS_ERROR = 0x9,      // Generic error return if not otherwise specified
  AMDOAM_STATUS_NOT_FOUND = 0xA,  /* An item was searched for but not found */
} amdoam_status_t;
/*
 * All functions below return AMDOAM_STATUS_SUCCESS (0) on success and a
 * negated amdoam_status_t value on failure.
 */

/* Initialize the library. Must be called before the other functions. */
int amdoam_init(void);

/* Shut the library down. */
int amdoam_free(void);

// int amdoam_get_mapi_version(oam_mapi_version_t *version);

/* Store the number of monitored devices in *device_count. */
int amdoam_discover_devices(uint32_t *device_count);

/*
 * Fill devices[0 .. device_count-1] with the properties of the devices with
 * the corresponding indices. Properties that cannot be read are left
 * untouched, so the caller should pass a zero-initialized array.
 */
int amdoam_get_dev_properties(uint32_t device_count,
                              oam_dev_properties_t *devices);

/* Store the PCI domain, bus, device and function of a device in *pci_info. */
int amdoam_get_pci_properties(uint32_t device_id, oam_pci_info_t *pci_info);

/* Store the number of sensors of each kind of a device in *sensor_count. */
int amdoam_get_sensors_count(uint32_t device_id,
                             oam_sensor_count_t *sensor_count);

/*
 * Store a pointer to a static description of a status code in *description.
 * Fails with AMDOAM_STATUS_NOT_FOUND for codes that have no description.
 */
int amdoam_get_error_description(int code, const char **description);

/*
 * Read the sensors of the given type of a device into sensor_info, an array
 * the caller provides with room for num_sensors entries.
 */
int amdoam_get_sensors_info(uint32_t device_id, oam_sensor_type_t type,
                            uint32_t num_sensors,
                            oam_sensor_info_t sensor_info[]);
#ifdef __cplusplus
}
#endif // __cplusplus
#endif // OAM_INCLUDE_OAM_AMD_OAM_H_
+635
View File
@@ -0,0 +1,635 @@
/*
* MIT License
*
* Copyright (c) 2020 Open Compute Project
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#ifndef OAM_INCLUDE_OAM_OAM_MAPI_H_
#define OAM_INCLUDE_OAM_OAM_MAPI_H_
/**
* \file oam_mapi.h
* \brief OAM management and monitoring library API definitions
*/
/* System headers are included before the linkage block is opened. */
#include <stdint.h>
#include <stdlib.h>
#include <limits.h>
#if defined(__cplusplus)
extern "C" {
#endif
/**
* \struct oam_mapi_version_t
* \brief OAM library API version
* \details TBD
* All the libraries versions are expected to be backward compatible.
* The major version increment indicates a new API has been added.
* Minor version increment indicates an interface change.
*/
typedef struct oam_mapi_version {
uint32_t major;
uint32_t minor;
} oam_mapi_version_t;
/**
* \struct oam_net_dev_id_t
* \brief Network identifier for the device
* \details Immutable network identifier for the device.
* This is unique across the entire network.
*/
typedef struct oam_net_dev_id {
/*!< unique network identifier for the device */
int network_id;
} oam_net_dev_id_t;
/*
* various lengths for device properties
*/
#define DEVICE_VENDOR_LEN 128
#define DEVICE_NAME_LEN 128
#define DEVICE_SKU_LEN 128
#define BOARD_NAME_LEN 128
#define BOARD_REVISION_LEN 128
#define BOARD_SERIAL_NUM_LEN 128
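/**
* \struct oam_dev_properties_t
* \brief Identification properties of a device
* \details Local device identifier together with the vendor, device, SKU and
* board identification strings of one device.
*/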
typedef struct oam_dev_properties {
/*!< Immutable local identifier for the device */
uint32_t device_id;
/*!< vendor name */
char device_vendor[DEVICE_VENDOR_LEN];
/*!< Device name */
char device_name[DEVICE_NAME_LEN];
/*!< SKU name */
char sku_name[DEVICE_SKU_LEN];
/*!< Board name */
char board_name[BOARD_NAME_LEN];
/*!< Board revision */
char board_revision[BOARD_REVISION_LEN];
/*!<
* Board Serial Number or UUID any other identifier, which can be used
* to identify devices uniquely and physically.
*/
char board_serial_number[BOARD_SERIAL_NUM_LEN];
} oam_dev_properties_t;
/**
* \struct oam_sensor_count_t
* \brief Sensor counts of a device
* \details Number of sensors of each kind (temperature, power, voltage,
* current and fan) present on a device.
*/
typedef struct oam_sensor_count {
uint32_t num_temperature_sensors;
uint32_t num_power_sensors;
uint32_t num_voltage_sensors;
uint32_t num_current_sensors;
uint32_t num_fans;
} oam_sensor_count_t;
/**
* \enum oam_sensor_type_t
* \brief Sensor types
* \details This enumerated type defines available sensors types.
*/
typedef enum oam_sensor_type {
OAM_SENSOR_TYPE_POWER = 0,
OAM_SENSOR_TYPE_VOLTAGE,
OAM_SENSOR_TYPE_CURRENT,
OAM_SENSOR_TYPE_TEMP,
OAM_SENSOR_TYPE_FAN_SPEED,
OAM_SENSOR_TYPE_UNKNOWN
} oam_sensor_type_t;
/**
* \enum oam_power_sensor_scale_t
* \brief scale for power measurements
* \details This enumerated type defines available scales for power measurements
*/
typedef enum oam_power_sensor_scale {
OAM_POWER_SCALE_uW = 0,
OAM_POWER_SCALE_mW,
OAM_POWER_SCALE_W,
} oam_power_sensor_scale_t;
/**
* \enum oam_voltage_sensor_scale_t
* \brief scale for voltage measurements
* \details This enumerated type defines available scales for voltage measurements
*/
typedef enum oam_voltage_sensor_scale {
OAM_VOLTAGE_SCALE_uV = 0,
OAM_VOLTAGE_SCALE_mV,
OAM_VOLTAGE_SCALE_V,
} oam_voltage_sensor_scale_t;
/**
* \enum oam_current_sensor_scale_t
* \brief scale for current measurements
* \details This enumerated type defines available scales for current measurements
*/
typedef enum oam_current_sensor_scale {
OAM_CURRENT_SCALE_uA = 0,
OAM_CURRENT_SCALE_mA,
OAM_CURRENT_SCALE_A,
} oam_current_sensor_scale_t;
/**
* \enum oam_temp_sensor_scale_t
* \brief scale for temp measurements
* \details This enumerated type defines available scales for temp measurements
*/
typedef enum oam_temp_sensor_scale {
OAM_TEMP_SCALE_C = 0,
OAM_TEMP_SCALE_F
} oam_temp_sensor_scale_t;
/**
* \enum oam_fan_sensor_scale_t
* \brief scale for fan speed measurements
* \details This enumerated type defines available scales for fan speed measurements
*/
typedef enum oam_fan_sensor_scale {
OAM_FAN_SPEED_Hz = 0,
OAM_FAN_SPEED_KHz,
OAM_FAN_SPEED_MHz
} oam_fan_sensor_scale_t;
typedef union oam_sensor_scale {
oam_power_sensor_scale_t power_scale;
oam_voltage_sensor_scale_t volate_scale;
oam_current_sensor_scale_t current_scale;
oam_temp_sensor_scale_t temp_scale;
oam_fan_sensor_scale_t fan_scale;
} oam_sensor_scale_t;
/**
* \struct oam_dev_handle_t
* \brief Device handle
* \details Device handle obtained using open call
* The same handle is used by all the APIs which are used to perform
* specific operation on that device.
*/
typedef struct oam_dev_handle {
void *handle;
} oam_dev_handle_t;
/**
* \enum oam_dev_mode_t
* \brief Device open modes
* \details This enumerated type defines modes in which the device can be opened
* For some operations e.g. health check user should open the device
* in exclusive mode, so that if there are many applications using the same
* device there are no side effects.
*/
typedef enum oam_dev_mode {
OAM_DEV_MODE_EXCLUSIVE = 0,
OAM_DEV_MODE_NONEXLUSIVE = 1,
OAM_DEV_MODE_UNKNOWN = 0xFF
} oam_dev_mode_t;
/**
* \def OAM_SENSOR_NAME_MAX
* \brief length of sensor name
*/
#define OAM_SENSOR_NAME_MAX 256
/**
* \struct oam_sensor_info_t
* \brief Sensor information
* \details Structure to store various info of sensors.
*/
typedef struct oam_sensor_info {
char sensor_name[OAM_SENSOR_NAME_MAX];
oam_sensor_type_t sensor_type;
oam_sensor_scale_t scale;
int64_t value;
} oam_sensor_info_t;
/**
* \struct oam_dev_error_count_t
* \brief Device error information
* \details Various types of errors reported by device.
*/
typedef struct oam_dev_error_count {
uint32_t total_error_count;
uint32_t fatal_error_count;
uint32_t unknown_error_count;
uint32_t ecc_error_count;
} oam_dev_error_count_t;
/**
* \struct oam_firmware_version_t
* \brief Firmware version information
* \details Structure to store various firmware versions of OAM module
*/
typedef struct oam_firmware_version {
oam_mapi_version_t device_boot_fw_version;
oam_mapi_version_t device_fw_version;
oam_mapi_version_t board_boot_fw_version;
oam_mapi_version_t board_fw_version;
} oam_firmware_version_t;
/**
* \struct oam_pci_info_t
* \brief PCI information for the device
* \details Structure to store PCI (Domain, BDF) information of the device
*/
typedef struct oam_pci_info {
uint16_t domain;
uint8_t bus;
uint8_t device;
uint8_t function;
} oam_pci_info_t;
/**
* \enum oam_net_port_state_t
* \brief Network port state
* \details This enumerated type defines various states of the network port
*/
typedef enum oam_net_port_state {
OAM_NET_PORT_DISABLED = 0,
OAM_NET_PORT_ENABLED = 1
} oam_net_port_state_t;
/**
* \enum oam_net_port_status_t
* \brief Network port status
* \details This enumerated type defines various status of the network port
*/
typedef enum oam_net_port_status {
OAM_NET_PORT_UP = 0,
OAM_NET_PORT_DOWN = 1,
} oam_net_port_status_t;
/**
* \enum oam_net_port_id_t
* \brief Network port identifiers
* \details This enumerated type defines various identifiers for network ports
*/
typedef enum oam_net_port_id {
OAM_NET_PORT0 = 0,
OAM_NET_PORT1 = 1,
OAM_NET_PORT2 = 2,
OAM_NET_PORT_MAX = 0xFFFF
} oam_net_port_id_t;
/**
* \enum oam_firmware_modes_t
* \brief Supported mode to update firmware on device
* \details This enumerated type defines various modes which are supported by
* the device to update firmware.
*/
typedef enum oam_firmware_modes {
OAM_DOWNLOAD_ONLY = 0,
OAM_DOWNLOAD_ACTIVATE = 1
} oam_firmware_modes_t;
/**
* \def OAM_NET_PORT_NAME
* \brief length of network port name
*/
#define OAM_NET_PORT_NAME 256
/**
* \struct oam_net_port_desc
* \brief Network port description
* \details Structure to store additional details about the network port
*/
typedef struct oam_net_port_desc {
char name[OAM_NET_PORT_NAME];
} oam_net_port_desc_t;
/**
* \def OAM_DEV_HOST_NAME
* \brief length of host name
*/
#define OAM_DEV_HOST_NAME 256
/**
* \struct oam_net_dev_info_t
* \brief Information about the device on a network
* \details Structure to store additional details about the network device
* on a particular network.
*/
typedef struct oam_net_dev_info {
oam_net_dev_id_t net_dev_id;
char host_name[OAM_DEV_HOST_NAME];
oam_pci_info_t pci_info;
} oam_net_dev_info_t;
/**
* \struct oam_neighbour_info_t
* \brief Information about device neighbours
* \details Structure to store information about device neighbours on the
* network
*/
typedef struct oam_neighbour_info {
oam_net_port_id_t device_port;
oam_net_dev_info_t device_info;
} oam_neighbour_info_t;
/**
* \enum oam_dev_tpc_id_t
* \brief TPC identifiers
* \details This enumerated type defines various identifiers for TPCs
*/
typedef enum oam_dev_tpc_id {
OAM_DEV_TPC0,
OAM_DEV_TPC1,
OAM_DEV_TPC2,
OAM_DEV_TPC_MAX
} oam_dev_tpc_id_t;
/**
* \def OAM_TPC_NAME
* \brief length of TPC name
*/
#define OAM_TPC_NAME 256
/**
* \struct oam_tpc_desc_t
* \brief TPC description
* \details Structure to store information about TPC e.g. name corresponding
* to the id etc.
*/
typedef struct oam_tpc_desc {
  /* Sized by the macro defined for it, not by a repeated literal. */
  char name[OAM_TPC_NAME];
} oam_tpc_desc_t;
/**
* \struct oam_dev_tpc_stats_t
* \brief TPC statistical information
* \details Structure to store information about TPC statistical information
* e.g. TPC utilization
*/
typedef struct oam_dev_tpc_stats {
double util;
} oam_dev_tpc_stats_t;
/**
* \enum oam_dev_mem_id_t
* \brief Device memory identifiers
* \details This enumerated type defines various identifiers for device memories
*/
typedef enum oam_dev_mem_id {
OAM_DEV_MEM0,
OAM_DEV_MEM1,
OAM_DEV_MEM2,
OAM_DEV_MEM_MAX
} oam_dev_mem_id_t;
/**
* \struct oam_mem_desc_t
* \brief Device memory description
* \details Structure to store additional details about a device memory
*/
typedef struct oam_mem_desc {
char name[256];
} oam_mem_desc_t;
/**
* \struct oam_dev_mem_stats_t
* \brief Device memory statistical information
* \details Structure to store various statistical information about device
* memory.
*/
typedef struct oam_dev_mem_stats {
uint32_t total_mem;
uint32_t allocated_mem;
uint32_t free_mem;
} oam_dev_mem_stats_t;
/**
* \struct oam_net_port_pkt_stats_t
* \brief Device network port statistical information
* \details Structure to store various statistical information about the network
* packets on a given port.
*/
typedef struct oam_net_port_pkt_stats {
uint64_t rx_count;
uint64_t tx_count;
uint64_t rx_errors;
uint64_t tx_errors;
} oam_net_port_pkt_stats_t;
/**
* \struct oam_ops_t
* \brief OAM Device operations
* \details Structure provides list of APIs which needs to be
* supported by the OAM library.
*/
typedef struct oam_ops {
/*!<
* to initialise library instance and perform version compatibility
* check
*/
int (*init)(void);
int (*free)(void);
/*!<
* To get error description from the error code
*/
int (*get_error_description)(int error_code, const char **error_description);
/*!<
* To retrieve the OAM Management interface version
*/
int (*get_mapi_version)(oam_mapi_version_t *version);
/*!<
* To retrieve the number of devices present/discovered by the library
*/
int (*discover_devices)(uint32_t *device_count);
/*!<
* To retrieve device properties for each discovered devices
*/
int (*get_dev_properties)(uint32_t device_count,
oam_dev_properties_t *devices);
/*!<
* To retrieve PCI properties of the device
*/
int (*get_pci_properties)(uint32_t device_id, oam_pci_info_t *pci_info);
/*!<
* To query the number of various sensors present
*/
int (*get_sensors_count)(uint32_t device_id,
oam_sensor_count_t *sensor_count);
/*!<
* Open the device and obtain handle
*/
int (*open_device)(uint32_t *dev_id, oam_dev_mode_t mode,
oam_dev_handle_t *handle);
int (*close_device)(oam_dev_handle_t *handle);
/*!<
* To read various sensor values for a given sensor type
*/
int (*get_sensors_info)(uint32_t device_id,
oam_sensor_type_t type,
uint32_t num_sensors,
oam_sensor_info_t sensor_info[]);
/*!<
* To read current error count of the device
*/
int (*get_device_error_count)(oam_dev_handle_t *handle,
oam_dev_error_count_t *count);
/*!<
* To update firmware on the device
* fw_image contains a null terminated string which specifies complete
* path where the firmware image is located
*/
int (*download_firmware)(uint32_t *device_id, char *fw_image,
oam_firmware_modes_t mode);
/*!<
* To query firmware versions
*/
int (*get_firmware_version)(uint32_t *device_id,
oam_firmware_version_t *version);
/*!<
* to get network id from device id
*/
int (*get_net_dev_id)(uint32_t *device_id, oam_net_dev_id_t *net_device);
/*!<
* Network management APIs.
*/
/*!<
* discover network.
*/
int (*discover_network)(int *net_dev_count);
int (*get_dev_net_properties)(oam_net_dev_info_t *net_dev_info);
int (*get_neighbour_count)(uint32_t *device,
oam_net_port_id_t local_port_id,
uint32_t *neighbor_count);
int (*get_neighbours_info)(uint32_t *device,
oam_net_port_id_t local_port_id,
uint32_t *neighbors_count,
oam_neighbour_info_t *neighbours_info);
int (*configure_network)(oam_net_dev_id_t *net_devices,
uint32_t *net_device_count,
char *network_name);
int (*destroy_network)(char *network_name);
int (*query_network)(char *network_name, oam_net_dev_info_t *devices,
uint32_t *device_count);
int (*get_network_count)(uint32_t *network_count);
int (*list_networks)(char *network_names[]);
/*!<
* Various statistics related to blocks
*/
/*!<
* To query number of ports
*/
int (*get_net_port_count)(oam_dev_handle_t *handle, uint32_t *count,
oam_net_port_id_t *port_ids);
int (*get_net_port_desc)(oam_dev_handle_t *handle, oam_net_port_id_t *port,
oam_net_port_desc_t *desc);
int (*get_net_port_state)(oam_dev_handle_t *handle, oam_net_port_id_t *port,
oam_net_port_state_t *state);
int (*check_net_port_status)(oam_dev_handle_t *handle,
oam_net_port_id_t *port,
oam_net_port_status_t *status);
int (*get_net_port_pkt_stats)(oam_dev_handle_t *handle,
oam_net_port_id_t *port,
uint32_t duration_sec,
oam_net_port_pkt_stats_t *stats);
int (*query_net_port_bandwidth)(oam_dev_handle_t *handle,
oam_net_port_id_t *port,
uint32_t duration_sec,
double *bandwidth);
int (*get_tpc_count)(oam_dev_handle_t *handle, uint32_t *count,
oam_dev_tpc_id_t *tpc_ids);
int (*get_tpc_desc)(oam_dev_handle_t *handle, oam_dev_tpc_id_t *tpc_id,
oam_tpc_desc_t *desc);
int (*get_tpc_stats)(oam_dev_handle_t *handle,
oam_dev_tpc_id_t *port,
oam_dev_tpc_stats_t *stats,
uint32_t duration_sec);
int (*get_mem_count)(oam_dev_handle_t *handle, uint32_t *count,
oam_dev_mem_id_t *mem_ids);
int (*get_mem_desc)(oam_dev_handle_t *handle, oam_dev_mem_id_t *tpc_id,
oam_mem_desc_t *desc);
int (*get_mem_stats)(oam_dev_handle_t *handle, oam_dev_mem_id_t *mem_id,
oam_dev_mem_stats_t *stats);
/*!<
* To check the health of the individual components, libraries
* generates test workload to check if the block is functioning properly
* or not. So no other workload should be running while calling these
* APIs
*/
int (*check_tpc_health)(uint32_t *device_id, oam_dev_tpc_id_t *tpc_id);
int (*check_net_port_health)(uint32_t *device_id, oam_net_port_id_t *port);
int (*check_mem_health)(uint32_t *device_id, oam_dev_mem_id_t *port);
/*
* Following needs more attention, will work on in next
int (*get_fan_speed)(oam_dev_t *oam);
int (*set_fan_speed)(oam_dev_t *oam, int speed);
int (*get_power_cap)(oam_dev_t *oam);
int (*set_power_cap)(oam_dev_t *oam, int power);
int (*get_telemetry)(oam_dev_t *oam);
*/
} oam_ops_t;
#ifdef __cplusplus
}
#endif
#endif // OAM_INCLUDE_OAM_OAM_MAPI_H_
+395
View File
@@ -0,0 +1,395 @@
/*
* MIT License
*
* Copyright (c) 2020 Open Compute Project
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include <assert.h>
#include <dirent.h>
#include <sstream>
#include <cstring>
#include <iostream>
#include <regex> // NOLINT
#include <map>
#include "rocm_smi/rocm_smi_common.h"
#include "rocm_smi/rocm_smi_main.h"
#include "rocm_smi/rocm_smi_device.h"
#include "rocm_smi/rocm_smi_utils.h"
#include "rocm_smi/rocm_smi_exception.h"
#include "rocm_smi/rocm_smi_counters.h"
#include "rocm_smi/rocm_smi_kfd.h"
#include "rocm_smi/rocm_smi.h"
#include "oam/oam_mapi.h"
#include "oam/amd_oam.h"
// Human-readable descriptions of the status codes, keyed by the (positive)
// amdoam_status_t value. There is no entry for AMDOAM_STATUS_SUCCESS.
static const std::map<int, const char *> err_map = {
{ AMDOAM_STATUS_INVALID_ARGS, "Invalid arguments" },
{ AMDOAM_STATUS_NOT_SUPPORTED, "Feature not supported" },
{ AMDOAM_STATUS_FILE_ERROR, "Problem accessing a file" },
{ AMDOAM_STATUS_PERMISSION, "Permission denied" },
{ AMDOAM_STATUS_OUT_OF_RESOURCES, "Not enough memory or other resource" },
{ AMDOAM_STATUS_INTERNAL_EXCEPTION, "An internal exception was caught" },
{ AMDOAM_STATUS_INPUT_OUT_OF_BOUNDS,
"The provided input is out of allowable or safe range" },
{ AMDOAM_STATUS_INIT_ERROR, "AMDOAM is not initialized or init failed" },
{ AMDOAM_STATUS_ERROR, "Generic error" },
{ AMDOAM_STATUS_NOT_FOUND, "An item was searched for but not found" }
};
// TRY opens a try block and CATCH closes it with a catch-all handler that
// returns the error code produced by handleRSMIException(). They must be
// used as a pair inside a function returning int.
#define TRY try {
#define CATCH } catch (...) {return handleRSMIException();}
static bool rsmi_initialized;
// Converts an RSMI status into the (negative) error code returned by the OAM
// API. Statuses up to RSMI_STATUS_INIT_ERROR keep their numeric value,
// negated; anything above that is reported as the generic error.
static int rsmi_status_to_amdoam_errorcode(rsmi_status_t status) {
  const bool has_direct_equivalent = !(status > RSMI_STATUS_INIT_ERROR);
  if (has_direct_equivalent) {
    return -1 * static_cast<int>(status);
  }
  return -AMDOAM_STATUS_ERROR;
}
static int handleRSMIException() {
rsmi_status_t ret = amd::smi::handleException();
return rsmi_status_to_amdoam_errorcode(ret);
}
// Looks up the description of a status code.
//
// code        - an amdoam_status_t value, or its negation as returned by the
//               functions of this API
// description - receives a pointer to a static string; must not be null
//
// Returns AMDOAM_STATUS_SUCCESS, -AMDOAM_STATUS_INVALID_ARGS when
// description is null, or -AMDOAM_STATUS_NOT_FOUND when the code has no
// description (this includes AMDOAM_STATUS_SUCCESS).
int amdoam_get_error_description(int code, const char **description) {
  if (description == nullptr)
    return -AMDOAM_STATUS_INVALID_ARGS;

  // The API reports failures as negated status values, so accept those as
  // well. Only values within the range of known codes are negated, which
  // also rules out overflow when negating.
  int key = code;
  if (code < 0 && !err_map.empty() && code >= -err_map.rbegin()->first)
    key = -code;

  auto search = err_map.find(key);
  if (search == err_map.end())
    return -AMDOAM_STATUS_NOT_FOUND;

  *description = search->second;
  return AMDOAM_STATUS_SUCCESS;
}
int amdoam_init(void) {
TRY
rsmi_status_t status = rsmi_init(0);
if (status != RSMI_STATUS_SUCCESS)
return rsmi_status_to_amdoam_errorcode(status);
rsmi_initialized = true;
return AMDOAM_STATUS_SUCCESS;
CATCH
}
int amdoam_free(void) {
rsmi_status_t status = rsmi_shut_down();
if (status != RSMI_STATUS_SUCCESS)
return rsmi_status_to_amdoam_errorcode(status);
return AMDOAM_STATUS_SUCCESS;
}
int amdoam_discover_devices(uint32_t *device_count) {
rsmi_status_t status;
if (device_count == nullptr) {
return -AMDOAM_STATUS_INVALID_ARGS;
}
status = rsmi_num_monitor_devices(device_count);
if (status != RSMI_STATUS_SUCCESS)
return rsmi_status_to_amdoam_errorcode(status);
return AMDOAM_STATUS_SUCCESS;
}
int amdoam_get_pci_properties(uint32_t device_id, oam_pci_info_t *pci_info) {
uint64_t bdfid;
TRY
if (pci_info == nullptr) {
return -AMDOAM_STATUS_INVALID_ARGS;
}
rsmi_status_t status = rsmi_dev_pci_id_get(device_id, &bdfid);
if (status != RSMI_STATUS_SUCCESS)
return rsmi_status_to_amdoam_errorcode(status);
pci_info->domain = (uint16_t)(bdfid >> 32) & 0xffff;
pci_info->bus = (bdfid >> 8) & 0xff;
pci_info->device = (bdfid >> 3) & 0x1f;
pci_info->function = bdfid & 0x7;
CATCH
return AMDOAM_STATUS_SUCCESS;
}
// Fills devices[0 .. num_devices-1] with the properties of the devices with
// the corresponding indices.
//
// The SKU and board names are taken from the VBIOS version string:
// characters 4..9 form the SKU name and characters 0..11 the board name.
// board_revision is not filled in.
//
// Returns AMDOAM_STATUS_SUCCESS, -AMDOAM_STATUS_INVALID_ARGS when devices is
// null, -AMDOAM_STATUS_INIT_ERROR when the library was not initialized, or
// the error code of a caught exception.
int amdoam_get_dev_properties(uint32_t num_devices,
                              oam_dev_properties_t *devices) {
  const size_t buf_size = 32;
  const size_t sku_offset = 4;   // SKU name starts here in the VBIOS string
  const size_t sku_len = 6;
  const size_t board_len = 12;
  char buf[buf_size] = "";

  TRY
  if (devices == nullptr)
    return -AMDOAM_STATUS_INVALID_ARGS;
  if (!rsmi_initialized)
    return -AMDOAM_STATUS_INIT_ERROR;

  for (uint32_t dev_inx = 0; dev_inx < num_devices; ++dev_inx) {
    oam_dev_properties_t *dev = &devices[dev_inx];
    dev->device_id = dev_inx;

    /* If fails to get any following properties, it's not treated as a deal
     * breaker. Variable not filled means that property is not available on
     * this device or AMD doesn't support that property.
     */
    rsmi_dev_vendor_name_get(dev_inx, dev->device_vendor, DEVICE_VENDOR_LEN);
    rsmi_dev_name_get(dev_inx, dev->device_name, DEVICE_NAME_LEN);

    // Start from an empty buffer so a failed query cannot leave the previous
    // device's VBIOS string behind, and make sure it is terminated.
    std::memset(buf, 0, buf_size);
    rsmi_dev_vbios_version_get(dev_inx, buf, buf_size);
    buf[buf_size - 1] = '\0';

    const size_t vbios_len = std::strlen(buf);
    if (vbios_len > 0) {
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstringop-truncation"
      // strncpy does not terminate when the source fills the whole count,
      // so terminate explicitly instead of relying on a zeroed destination.
      std::strncpy(dev->board_name, buf, board_len);
      dev->board_name[board_len] = '\0';

      if (vbios_len > sku_offset) {
        std::strncpy(dev->sku_name, &buf[sku_offset], sku_len);
        dev->sku_name[sku_len] = '\0';
      } else {
        // String too short to contain a SKU part.
        dev->sku_name[0] = '\0';
      }
#pragma GCC diagnostic pop
    }

    rsmi_dev_serial_number_get(dev_inx, dev->board_serial_number,
                               BOARD_SERIAL_NUM_LEN);
  }
  CATCH
  return AMDOAM_STATUS_SUCCESS;
}
/**
 * Count the sensors of one kind exposed in a hwmon directory.
 *
 * Every directory entry whose name contains `<fn_reg><N>` (at a word
 * boundary) contributes its index N; the result is the largest N seen. For
 * the "in" prefix the index is incremented first, i.e. those entries are
 * treated as numbered from 0.
 *
 * @param hwmon_path  Directory to scan.
 * @param fn_reg      File-name prefix of the sensor kind (e.g. "temp", "in").
 * @return Largest (adjusted) index found, or 0 when nothing matches or the
 *         directory cannot be opened (release builds; debug builds assert).
 * @throws std::out_of_range (from std::stoi) if an index does not fit in int;
 *         the directory handle is closed before the exception propagates.
 */
static uint32_t
get_num_sensors(std::string hwmon_path, std::string fn_reg) {
  uint32_t sensor_max = 0;
  const std::regex name_re("\\b" + fn_reg + "([0-9]+)([^ ]*)");
  const bool zero_based = (fn_reg == "in");

  auto hwmon_dir = opendir(hwmon_path.c_str());
  assert(hwmon_dir != nullptr);
  if (hwmon_dir == nullptr) {
    // assert() compiles away under NDEBUG; never hand a null DIR* to readdir.
    return sensor_max;
  }

  try {
    for (auto dentry = readdir(hwmon_dir); dentry != nullptr;
         dentry = readdir(hwmon_dir)) {
      const std::string fn = dentry->d_name;
      std::smatch m;
      if (!std::regex_search(fn, m, name_re)) {
        continue;
      }

      // Capture group 1 is the digit run directly following the prefix.
      int32_t index = std::stoi(m[1].str());
      assert(index >= 0);
      if (zero_based) {
        ++index;
      }
      if (static_cast<uint32_t>(index) > sensor_max) {
        sensor_max = static_cast<uint32_t>(index);
      }
    }
  } catch (...) {
    // Do not leak the directory handle when index parsing throws.
    closedir(hwmon_dir);
    throw;
  }

  closedir(hwmon_dir);
  return sensor_max;
}
/**
 * Report how many sensors of each kind a device exposes.
 *
 * The counts are derived by scanning the device's hwmon directory with
 * get_num_sensors() for the prefixes "temp", "fan", "in", "power" and
 * "current".
 *
 * @param device_id     Index of the device to inspect.
 * @param sensor_count  Output record; must not be null.
 * @return AMDOAM_STATUS_SUCCESS on success, -AMDOAM_STATUS_INVALID_ARGS when
 *         @p sensor_count is null. GET_DEV_FROM_INDX may return early with
 *         its own code for a bad index.
 */
int amdoam_get_sensors_count(uint32_t device_id,
                             oam_sensor_count_t *sensor_count) {
  uint32_t dv_ind = device_id;  // name required by GET_DEV_FROM_INDX
  TRY
  if (!sensor_count) {
    return -AMDOAM_STATUS_INVALID_ARGS;
  }

  GET_DEV_FROM_INDX
  assert(dev->monitor() != nullptr);

  const std::string hwmon_dir = dev->monitor()->path();
  sensor_count->num_temperature_sensors = get_num_sensors(hwmon_dir, "temp");
  sensor_count->num_fans = get_num_sensors(hwmon_dir, "fan");
  sensor_count->num_voltage_sensors = get_num_sensors(hwmon_dir, "in");
  sensor_count->num_power_sensors = get_num_sensors(hwmon_dir, "power");
  sensor_count->num_current_sensors = get_num_sensors(hwmon_dir, "current");
  CATCH
  return AMDOAM_STATUS_SUCCESS;
}
int amdoam_get_sensors_info(uint32_t device_id, oam_sensor_type_t type,
uint32_t num_sensors, oam_sensor_info_t sensor_info[]) {
uint32_t dv_ind = device_id;
std::string val_str;
uint32_t i;
rsmi_status_t status;
TRY
if ((sensor_info == nullptr) || (type >= OAM_SENSOR_TYPE_UNKNOWN))
return -AMDOAM_STATUS_INVALID_ARGS;
GET_DEV_FROM_INDX
assert(dev->monitor() != nullptr);
switch (type) {
case OAM_SENSOR_TYPE_POWER:
for (i = 0; i < num_sensors; i++) {
snprintf(sensor_info[i].sensor_name, OAM_SENSOR_NAME_MAX,
"POWER_SENSOR_%u", i+1);
sensor_info[i].sensor_type = type;
status = rsmi_dev_power_ave_get(device_id, i,
reinterpret_cast<uint64_t*>(&sensor_info[i].value));
if (status != RSMI_STATUS_SUCCESS)
return rsmi_status_to_amdoam_errorcode(status);
}
break;
case OAM_SENSOR_TYPE_VOLTAGE:
for (i = 0; i < num_sensors; i++) {
snprintf(sensor_info[i].sensor_name, OAM_SENSOR_NAME_MAX,
"VOLTAGE_SENSOR_%u", i);
sensor_info[i].sensor_type = type;
status = rsmi_dev_volt_metric_get(device_id, RSMI_VOLT_TYPE_VDDGFX,
RSMI_VOLT_CURRENT, &sensor_info[i].value);
if (status != RSMI_STATUS_SUCCESS)
return rsmi_status_to_amdoam_errorcode(status);
}
break;
case OAM_SENSOR_TYPE_TEMP:
for (i = 0; i < num_sensors; i++) {
snprintf(sensor_info[i].sensor_name, OAM_SENSOR_NAME_MAX,
"TEMP_SENSOR_%u", i+1);
sensor_info[i].sensor_type = type;
status = rsmi_dev_temp_metric_get(device_id, i, RSMI_TEMP_CURRENT,
&sensor_info[i].value);
if (status != RSMI_STATUS_SUCCESS)
return rsmi_status_to_amdoam_errorcode(status);
}
break;
case OAM_SENSOR_TYPE_FAN_SPEED:
for (i = 0; i < num_sensors; i++) {
snprintf(sensor_info[i].sensor_name, OAM_SENSOR_NAME_MAX,
"FAN_SENSOR_%u", i+1);
sensor_info[i].sensor_type = type;
status = rsmi_dev_fan_speed_get(device_id, i, &sensor_info[i].value);
if (status != RSMI_STATUS_SUCCESS)
return rsmi_status_to_amdoam_errorcode(status);
}
break;
default:
return -AMDOAM_STATUS_NOT_SUPPORTED;
}
CATCH
return AMDOAM_STATUS_SUCCESS;
}
// TODO(x): This function doesn't work for OAM. It's just a version
// of rsmi_dev_ecc_count_get(), which has similar functionality.
// The purpose here is just to drive refactoring; e.g., making macros
// available and previously static functions global.
//
// Placeholder behaviour: reads the "ue:" and "ce:" counters of the GFX block
// of device 0 and stores their sum in count->total_error_count.
//
// handle  Currently ignored.
// count   Output record; validated by CHK_SUPPORT_VAR.
// Returns an RSMI status value converted to int (RSMI_STATUS_SUCCESS on
// success, RSMI_STATUS_NOT_SUPPORTED when the counter file cannot be read or
// the block has no mapping, RSMI_STATUS_UNEXPECTED_SIZE when the file does
// not contain exactly two lines).
int
get_device_error_count(oam_dev_handle_t *handle,
                       oam_dev_error_count_t *count) {
  std::vector<std::string> val_vec;
  rsmi_status_t ret;

  TRY
  // TODO(x): replace with final code...
  // Below, we are just returning errors for RSMI_GPU_BLOCK_GFX as a
  // placeholder
  (void)handle;  // Just ignore for now
  rsmi_gpu_block_t block = RSMI_GPU_BLOCK_GFX;

  // The macro CHK_SUPPORT_VAR assumes the existence of a device index variable
  // "dv_ind". Presumably, the device index will come from the "handle"
  // pointer. Since I don't know how that will be implemented, for now we
  // will just make up a device index:
  uint32_t dv_ind = 0;

  CHK_SUPPORT_VAR(count, block)

  amd::smi::DevInfoTypes type;
  switch (block) {
    case RSMI_GPU_BLOCK_UMC:
      type = amd::smi::kDevErrCntUMC;
      break;

    case RSMI_GPU_BLOCK_SDMA:
      type = amd::smi::kDevErrCntSDMA;
      break;

    case RSMI_GPU_BLOCK_GFX:
      type = amd::smi::kDevErrCntGFX;
      break;

    case RSMI_GPU_BLOCK_MMHUB:
      type = amd::smi::kDevErrCntMMHUB;
      break;

    case RSMI_GPU_BLOCK_PCIE_BIF:
      type = amd::smi::kDevErrCntPCIEBIF;
      break;

    case RSMI_GPU_BLOCK_HDP:
      type = amd::smi::kDevErrCntHDP;
      break;

    case RSMI_GPU_BLOCK_XGMI_WAFL:
      type = amd::smi::kDevErrCntXGMIWAFL;
      break;

    default:
      return RSMI_STATUS_NOT_SUPPORTED;
  }

  DEVICE_MUTEX

  ret = GetDevValueVec(type, dv_ind, &val_vec);

  if (ret == RSMI_STATUS_FILE_ERROR) {
    return RSMI_STATUS_NOT_SUPPORTED;
  }
  if (ret != RSMI_STATUS_SUCCESS) {
    return static_cast<int>(ret);
  }

  assert(val_vec.size() == 2);
  if (val_vec.size() != 2) {
    // assert() compiles away under NDEBUG; never index past the vector.
    return static_cast<int>(RSMI_STATUS_UNEXPECTED_SIZE);
  }

  std::string junk;
  uint64_t ue_count = 0;
  uint64_t ce_count = 0;

  std::istringstream fs1(val_vec[0]);
  fs1 >> junk;
  assert(junk == "ue:");
  fs1 >> ue_count;

  std::istringstream fs2(val_vec[1]);
  fs2 >> junk;
  assert(junk == "ce:");
  fs2 >> ce_count;

  // Both counters used to be written to the same field, so the "ce" value
  // silently replaced the "ue" value. Report the sum of the two instead.
  count->total_error_count =
      static_cast<decltype(count->total_error_count)>(ue_count + ce_count);

  return static_cast<int>(ret);
  CATCH
}
+56
View File
@@ -0,0 +1,56 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2025, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#ifndef INCLUDE_ROCM_SMI_ROCM_SMI64CONFIG_H_
#define INCLUDE_ROCM_SMI_ROCM_SMI64CONFIG_H_
// This file is generated on build.
// Each @name@ token below is a placeholder; presumably the build substitutes
// it with the corresponding version value when this header is generated.

// Numeric version components (expand to bare tokens, not strings).
#define rocm_smi_VERSION_MAJOR @rocm_smi_VERSION_MAJOR@
#define rocm_smi_VERSION_MINOR @rocm_smi_VERSION_MINOR@
#define rocm_smi_VERSION_PATCH @rocm_smi_VERSION_PATCH@
// Build identifier; unlike the components above it expands to a string literal.
#define rocm_smi_VERSION_BUILD "@rocm_smi_VERSION_BUILD@"
#endif  // INCLUDE_ROCM_SMI_ROCM_SMI64CONFIG_H_
+6
View File
@@ -0,0 +1,6 @@
# Choose CPACK_PACKAGE_FILE_NAME according to the active CPack generator.
# Other generators keep whatever file name is already set.
if(CPACK_GENERATOR MATCHES "DEB")
  # DEB: <name>_<version>_amd64
  set(CPACK_PACKAGE_FILE_NAME
      "${CPACK_PACKAGE_NAME}_${CPACK_PACKAGE_VERSION}_amd64")
elseif(CPACK_GENERATOR MATCHES "RPM")
  # RPM: <name>-<version>.x86_64
  set(CPACK_PACKAGE_FILE_NAME
      "${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION}.x86_64")
endif()
@@ -0,0 +1,459 @@
# Radeon Open Compute (ROCm) - System Management Interface - Command Line Tool
This tool acts as a command line interface for manipulating
and monitoring the amdgpu kernel, and is intended to replace
and deprecate the existing rocm_smi.py CLI tool located at
https://github.com/ROCm/ROC-smi.
This tool uses Ctypes to call the rocm_smi_lib API.
Recommended: At least one AMD GPU with ROCm driver installed
Required: ROCm SMI library installed (librocm_smi64)
## Installation
Follow installation procedure for rocm_smi_lib. Refer to [https://github.com/RadeonOpenCompute/rocm_smi_lib](https://github.com/RadeonOpenCompute/rocm_smi_lib) for the installation guide.
LD_LIBRARY_PATH should be set to the folder containing librocm_smi64.
## Version
The SMI reports two versions: the ROCM-SMI version and the ROCM-SMI-LIB version.
- ROCM-SMI version is the CLI/tool version number with commit ID appended after + sign.
- ROCM-SMI-LIB version is the library package version number.
```
ROCM-SMI version: 2.0.0+8e78352
ROCM-SMI-LIB version: 6.1.0
```
## Usage
For detailed and up to date usage information, we recommend consulting the help:
/opt/rocm/bin/rocm-smi -h
For convenience, the following is the output from the -h flag:
```
/opt/rocm/bin/rocm-smi -h
usage: rocm-smi [-h] [-V] [-d DEVICE [DEVICE ...]] [--alldevices] [--showhw] [-a] [-i] [-v] [-e [EVENT [EVENT ...]]]
[--showdriverversion] [--showtempgraph] [--showfwinfo [BLOCK [BLOCK ...]]] [--showmclkrange]
[--showmemvendor] [--showsclkrange] [--showproductname] [--showserial] [--showuniqueid]
[--showvoltagerange] [--showbus] [--showpagesinfo] [--showpendingpages] [--showretiredpages]
[--showunreservablepages] [-f] [-P] [-t] [-u] [--showmemuse] [--showvoltage] [-b] [-c] [-g] [-l] [-M]
[-m] [-o] [-p] [-S] [-s] [--showmeminfo TYPE [TYPE ...]] [--showpids [VERBOSE]]
[--showpidgpus [SHOWPIDGPUS [SHOWPIDGPUS ...]]] [--showreplaycount]
[--showrasinfo [SHOWRASINFO [SHOWRASINFO ...]]] [--showvc] [--showxgmierr] [--showtopo]
[--showtopoaccess] [--showtopoweight] [--showtopohops] [--showtopotype] [--showtoponuma]
[--showenergycounter] [--shownodesbw] [--showcomputepartition] [--showmemorypartition] [-r]
[--resetfans] [--resetprofile] [--resetpoweroverdrive] [--resetxgmierr] [--resetperfdeterminism]
[--setclock TYPE LEVEL] [--setsclk LEVEL [LEVEL ...]]
[--setmclk LEVEL [LEVEL ...]] [--setpcie LEVEL [LEVEL ...]] [--setslevel SCLKLEVEL SCLK SVOLT]
[--setmlevel MCLKLEVEL MCLK MVOLT] [--setvc POINT SCLK SVOLT] [--setsrange SCLKMIN SCLKMAX]
[--setextremum min|max sclk|mclk CLK] [--setmrange MCLKMIN MCLKMAX] [--setfan LEVEL]
[--setperflevel LEVEL] [--setoverdrive %] [--setmemoverdrive %] [--setpoweroverdrive WATTS]
[--setprofile SETPROFILE] [--setperfdeterminism SCLK]
[--setcomputepartition {CPX,SPX,DPX,TPX,QPX,cpx,spx,dpx,tpx,qpx}]
[--setmemorypartition {NPS1,NPS2,NPS4,NPS8,nps1,nps2,nps4,nps8}] [--rasenable BLOCK ERRTYPE]
[--rasdisable BLOCK ERRTYPE] [--rasinject BLOCK] [--gpureset] [--load FILE | --save FILE]
[--autorespond RESPONSE] [--loglevel LEVEL] [--json] [--csv]
AMD ROCm System Management Interface | ROCM-SMI version: 2.0.0+8e78352
optional arguments:
-h, --help show this help message and exit
--gpureset Reset specified GPU (One GPU must be specified)
--load FILE Load Clock, Fan, Performance and Profile settings
from FILE
--save FILE Save Clock, Fan, Performance and Profile settings to
FILE
-V, --version Show version information
-d DEVICE [DEVICE ...], --device DEVICE [DEVICE ...] Execute command on specified device
Display Options:
--alldevices
--showhw Show Hardware details
-a, --showallinfo Show Temperature, Fan and Clock values
Topology:
-i, --showid Show DEVICE ID
-v, --showvbios Show VBIOS version
-e [EVENT [EVENT ...]], --showevents [EVENT [EVENT ...]] Show event list
--showdriverversion Show kernel driver version
--showtempgraph Show Temperature Graph
--showfwinfo [BLOCK [BLOCK ...]] Show FW information
--showmclkrange Show mclk range
--showmemvendor Show GPU memory vendor
--showsclkrange Show sclk range
--showproductname Show SKU/Vendor name
--showserial Show GPU's Serial Number
--showuniqueid Show GPU's Unique ID
--showvoltagerange Show voltage range
--showbus Show PCI bus number
Pages information:
--showpagesinfo Show retired, pending and unreservable pages
--showpendingpages Show pending retired pages
--showretiredpages Show retired pages
--showunreservablepages Show unreservable pages
Hardware-related information:
-f, --showfan Show current fan speed
-P, --showpower Show current Average or Socket Graphics Package Power
Consumption
-t, --showtemp Show current temperature
-u, --showuse Show current GPU use
--showmemuse Show current GPU memory used
--showvoltage Show current GPU voltage
Software-related/controlled information:
-b, --showbw Show estimated PCIe use
-c, --showclocks Show current clock frequencies
-g, --showgpuclocks Show current GPU clock frequencies
-l, --showprofile Show Compute Profile attributes
-M, --showmaxpower Show maximum graphics package power this GPU will
consume
-m, --showmemoverdrive Show current GPU Memory Clock OverDrive level
-o, --showoverdrive Show current GPU Clock OverDrive level
-p, --showperflevel Show current DPM Performance Level
-S, --showclkvolt Show supported GPU and Memory Clocks and Voltages
-s, --showclkfrq Show supported GPU and Memory Clock
--showmeminfo TYPE [TYPE ...] Show Memory usage information for given block(s) TYPE
--showpids [VERBOSE] Show current running KFD PIDs (pass details to
VERBOSE for detailed information)
--showpidgpus [SHOWPIDGPUS [SHOWPIDGPUS ...]] Show GPUs used by specified KFD PIDs (all if no arg
given)
--showreplaycount Show PCIe Replay Count
--showrasinfo [SHOWRASINFO [SHOWRASINFO ...]] Show RAS enablement information and error counts for
the specified block(s) (all if no arg given)
--showvc Show voltage curve
--showxgmierr Show XGMI error information since last read
--showtopo Show hardware topology information
--showtopoaccess Shows the link accessibility between GPUs
--showtopoweight Shows the relative weight between GPUs
--showtopohops Shows the number of hops between GPUs
--showtopotype Shows the link type between GPUs
--showtoponuma Shows the numa nodes
--showenergycounter Energy accumulator that stores amount of energy
consumed
--shownodesbw Shows the numa nodes
--showcomputepartition Shows current compute partitioning
--showmemorypartition Shows current memory partition
Set options:
--setclock TYPE LEVEL Set Clock Frequency Level(s) for specified clock
(requires manual Perf level)
--setsclk LEVEL [LEVEL ...] Set GPU Clock Frequency Level(s) (requires manual
Perf level)
--setmclk LEVEL [LEVEL ...] Set GPU Memory Clock Frequency Level(s) (requires
manual Perf level)
--setpcie LEVEL [LEVEL ...] Set PCIE Clock Frequency Level(s) (requires manual
Perf level)
--setslevel SCLKLEVEL SCLK SVOLT Change GPU Clock frequency (MHz) and Voltage (mV) for
a specific Level
--setmlevel MCLKLEVEL MCLK MVOLT Change GPU Memory clock frequency (MHz) and Voltage
for (mV) a specific Level
--setvc POINT SCLK SVOLT Change SCLK Voltage Curve (MHz mV) for a specific
point
--setsrange SCLKMIN SCLKMAX Set min and max SCLK speed
--setextremum min|max sclk|mclk CLK Set min/max of SCLK/MCLK speed
--setmrange MCLKMIN MCLKMAX Set min and max MCLK speed
--setfan LEVEL Set GPU Fan Speed (Level or %)
--setperflevel LEVEL Set Performance Level
--setoverdrive % Set GPU OverDrive level (requires manual|high Perf
level)
--setmemoverdrive % Set GPU Memory Overclock OverDrive level (requires
manual|high Perf level)
--setpoweroverdrive WATTS Set the maximum GPU power using Power OverDrive in
Watts
--setprofile SETPROFILE Specify Power Profile level (#) or a quoted string of
CUSTOM Profile attributes "# # # #..." (requires
manual Perf level)
--setperfdeterminism SCLK Set clock frequency limit to get minimal performance
variation
--setcomputepartition {CPX,SPX,DPX,TPX,QPX,cpx,spx,dpx,tpx,qpx} Set compute partition
--setmemorypartition {NPS1,NPS2,NPS4,NPS8,nps1,nps2,nps4,nps8} Set memory partition
--rasenable BLOCK ERRTYPE Enable RAS for specified block and error type
--rasdisable BLOCK ERRTYPE Disable RAS for specified block and error type
--rasinject BLOCK Inject RAS poison for specified block (ONLY WORKS ON
UNSECURED BOARDS)
Reset options:
-r, --resetclocks Reset clocks and OverDrive to default
--resetfans Reset fans to automatic (driver) control
--resetprofile Reset Power Profile back to default
--resetpoweroverdrive Set the maximum GPU power back to the device default
state
--resetxgmierr Reset XGMI error count
--resetperfdeterminism Disable performance determinism
Auto-response options:
--autorespond RESPONSE Response to automatically provide for all prompts
(NOT RECOMMENDED)
Output options:
--loglevel LEVEL How much output will be printed for what program is
doing, one of debug/info/warning/error/critical
--json Print output in JSON format
--csv Print output in CSV format
```
## Detailed Option Descriptions
`--setextremum <min/max> <sclk or mclk> <value in MHz to set to>`
Provided ASIC support, users can now set a maximum or minimum sclk or mclk value through our Python CLI tool (`rocm-smi --setextremum max sclk 1500`). See example below.
```shell
$ sudo /opt/rocm/bin/rocm-smi --setextremum max sclk 2100
============================ ROCm System Management Interface ============================
******WARNING******
Operating your AMD GPU outside of official AMD specifications or outside of
factory settings, including but not limited to the conducting of overclocking,
over-volting or under-volting (including use of this interface software,
even if such software has been directly or indirectly provided by AMD or otherwise
affiliated in any way with AMD), may cause damage to your AMD GPU, system components
and/or result in system failure, as well as cause other problems.
DAMAGES CAUSED BY USE OF YOUR AMD GPU OUTSIDE OF OFFICIAL AMD SPECIFICATIONS OR
OUTSIDE OF FACTORY SETTINGS ARE NOT COVERED UNDER ANY AMD PRODUCT WARRANTY AND
MAY NOT BE COVERED BY YOUR BOARD OR SYSTEM MANUFACTURER'S WARRANTY.
Use this utility with caution.
Do you accept these terms? [y/N] y
================================ Set Valid sclk Extremum =================================
GPU[0] : Successfully set max sclk to 2100(MHz)
GPU[1] : Successfully set max sclk to 2100(MHz)
GPU[2] : Successfully set max sclk to 2100(MHz)
GPU[3] : Successfully set max sclk to 2100(MHz)
================================== End of ROCm SMI Log ===================================
```
--setsclk/--setmclk # [# # ...]:
This allows you to set a mask for the levels. For example, if a GPU has 8 clock levels,
you can set a mask to use levels 0, 5, 6 and 7 with --setsclk 0 5 6 7 . This will only
use the base level, and the top 3 clock levels. This will allow you to keep the GPU at
base level when there is no GPU load, and the top 3 levels when the GPU load increases.
NOTES:
The clock levels will change dynamically based on GPU load based on the default
Compute and Graphics profiles. The thresholds and delays for a custom mask cannot
be controlled through the SMI tool
This flag automatically sets the Performance Level to "manual" as the mask is not
applied when the Performance level is set to auto
--setfan LEVEL:
This sets the fan speed to a value ranging from 0 to maxlevel, or from 0%-100%
If the level ends with a %, the fan speed is calculated as pct*maxlevel/100
(maxlevel is usually 255, but is determined by the ASIC)
NOTE: While the hardware is usually capable of overriding this value when required, it is
recommended to not set the fan level lower than the default value for extended periods
of time
--setperflevel LEVEL:
This lets you use the pre-defined Performance Level values for clocks and power profile, which can include:
auto (Automatically change values based on GPU workload)
low (Keep values low, regardless of workload)
high (Keep values high, regardless of workload)
manual (Only use values defined by --setsclk and --setmclk)
--setoverdrive/--setmemoverdrive #:
***DEPRECATED IN NEWER KERNEL VERSIONS (use --setslevel/--setmlevel instead)***
This sets the percentage above maximum for the max Performance Level.
For example, --setoverdrive 20 will increase the top sclk level by 20%, similarly
--setmemoverdrive 20 will increase the top mclk level by 20%. Thus if the maximum
clock level is 1000MHz, then --setoverdrive 20 will increase the maximum clock to 1200MHz
NOTES:
This option can be used in conjunction with the --setsclk/--setmclk mask
Operating the GPU outside of specifications can cause irreparable damage to your hardware
Observe the warning displayed when using this option
This flag automatically sets the clock to the highest level, as only the highest level is
increased by the OverDrive value
--setpoweroverdrive/--resetpoweroverdrive #:
This allows users to change the maximum power available to a GPU package.
The input value is in Watts. This limit is enforced by the hardware, and
some cards allow users to set it to a higher value than the default that
ships with the GPU. This Power OverDrive mode allows the GPU to run at
higher frequencies for longer periods of time, though this may mean the
GPU uses more power than it is allowed to use per power supply
specifications. Each GPU has a model-specific maximum Power OverDrive that
it will accept; attempting to set a higher limit than that will cause this
command to fail.
NOTES:
Operating the GPU outside of specifications can cause irreparable damage to your hardware
Observe the warning displayed when using this option
--setprofile SETPROFILE:
The Compute Profile accepts 1 or n parameters, either the Profile to select (see --showprofile for a list
of preset Power Profiles) or a quoted string of values for the CUSTOM profile.
NOTE: These values can vary based on the ASIC, and may include:
| Setting | Description |
|---------------------|----------------------------------------------------|
| SCLK_PROFILE_ENABLE | Whether or not to apply the 3 following SCLK settings (0=disable,1=enable) |
| | **NOTE: This is a hidden field. If set to 0, the following 3 values are displayed as '-'** |
| SCLK_UP_HYST | Delay before sclk is increased (in milliseconds) |
| SCLK_DOWN_HYST | Delay before sclk is decreased (in milliseconds) |
| SCLK_ACTIVE_LEVEL | Workload required before sclk levels change (in %) |
| MCLK_PROFILE_ENABLE | Whether or not to apply the 3 following MCLK settings (0=disable,1=enable) |
| | **NOTE: This is a hidden field. If set to 0, the following 3 values are displayed as '-'** |
| MCLK_UP_HYST | Delay before mclk is increased (in milliseconds) |
| MCLK_DOWN_HYST | Delay before mclk is decreased (in milliseconds) |
| MCLK_ACTIVE_LEVEL | Workload required before mclk levels change (in %) |
Other settings:
| Setting | Description |
|------------------|---------------------------------------------------------------------------|
| BUSY_SET_POINT | Threshold for raw activity level before levels change |
| FPS | Frames Per Second |
| USE_RLC_BUSY | When set to 1, DPM is switched up as long as RLC busy message is received |
| MIN_ACTIVE_LEVEL | Workload required before levels change (in %) |
NOTES:
When a compute queue is detected, the COMPUTE Power Profile values will be automatically
applied to the system, provided that the Perf Level is set to "auto"
The CUSTOM Power Profile is only applied when the Performance Level is set to "manual"
so using this flag will automatically set the performance level to "manual"
It is not possible to modify the non-CUSTOM Profiles. These are hard-coded by the kernel
-P, --showpower:
Show average or instantaneous socket graphics package power consumption
"Graphics Package" refers to the GPU plus any HBM (High-Bandwidth memory) modules, if present
-M, --showmaxpower:
Show the maximum Graphics Package power that the GPU will attempt to consume.
This limit is enforced by the hardware.
--loglevel:
This will allow the user to set a logging level for the SMI's actions. Currently this is
only implemented for sysfs writes, but can easily be expanded upon in the future to log
other things from the SMI
--showmeminfo:
This allows the user to see the amount of used and total memory for a given block (vram,
vis_vram, gtt). It returns the number of bytes used and total number of bytes for each block
'all' can be passed as a field to return all blocks, otherwise a quoted-string is used for
multiple values (e.g. "vram vis_vram")
vram refers to the Video RAM, or graphics memory, on the specified device
vis_vram refers to Visible VRAM, which is the CPU-accessible video memory on the device
gtt refers to the Graphics Translation Table
-b, --showbw:
This shows an approximation of the number of bytes received and sent by the GPU over
the last second through the PCIe bus. Note that this will not work for APUs since data for
the GPU portion of the APU goes through the memory fabric and does not 'enter/exit'
the chip via the PCIe interface, thus no accesses are generated, and the performance
counters can't count accesses that are not generated.
NOTE: It is not possible to easily grab the size of every packet that is transmitted
in real time, so the kernel estimates the bandwidth by taking the maximum payload size (mps),
which is the max size that a PCIe packet can be, and multiplies it by the number of packets
received and sent. This means that the SMI will report the maximum estimated bandwidth;
the actual usage could be (and likely will be) less
--showrasinfo:
This shows the RAS information for a given block. This includes enablement of the block
(currently GFX, SDMA and UMC are the only supported blocks) and the number of errors
ue - Uncorrectable errors
ce - Correctable errors
## Clock Type Descriptions
| Clock type | Description |
| ---------- | --- |
| DCEFCLK | DCE (Display) |
| FCLK | Data fabric (VG20 and later) - Data flow from XGMI, Memory, PCIe |
| SCLK | GFXCLK (Graphics core) |
| | **Note - SOCCLK split from SCLK as of Vega10. Pre-Vega10 they were both controlled by SCLK** |
| MCLK | GPU Memory (VRAM) |
| PCLK | PCIe bus |
| | **Note - This gives 2 speeds, PCIe Gen1 x1 and the highest available based on the hardware** |
| SOCCLK | System clock (VG10 and later) - Data Fabric (DF), MM HUB, AT HUB, SYSTEM HUB, OSS, DFD |
| | **Note - DF split from SOCCLK as of Vega20. Pre-Vega20 they were both controlled by SOCCLK** |
--gpureset:
This flag will attempt to reset the GPU for a specified device. This will invoke the GPU reset through
the kernel debugfs file amdgpu_gpu_recover. Note that GPU reset will not always work, depending on the
manner in which the GPU is hung.
--showdriverversion:
This flag will print out the AMDGPU module version for amdgpu-pro or ROCm kernels. For other kernels,
it will simply print out the name of the kernel (`uname -r`)
--showserial:
This flag will print out the serial number for the graphics card
NOTE: This is currently only supported on Vega20 server cards that support it. Consumer cards and
cards older than Vega20 will not support this feature.
--showproductname:
This uses the pci.ids file to print out more information regarding the GPUs on the system.
'update-pciids' may need to be executed on the machine to get the latest PCI ID snapshot,
as certain newer GPUs will not be present in the stock pci.ids file, and the file may even
be absent on certain OS installation types
--showpagesinfo | --showretiredpages | --showpendingpages | --showunreservablepages:
These flags display the different "bad pages" as reported by the kernel. The three
types of pages are:
Retired pages (reserved pages) - These pages are reserved and are unable to be used
Pending pages - These pages are pending for reservation, and will be reserved/retired
Unreservable pages - These pages are not reservable for some reason
--showmemuse | --showuse | --showmeminfo:
--showuse and --showmemuse are used to indicate how busy the respective blocks are. For
example, for --showuse (gpu_busy_percent sysfs file), the SMU samples every ms or so to see
if any GPU block (RLC, MEC, PFP, CP) is busy. If so, that's 1 (or high). If not, that's 0 (low).
If we have 5 high and 5 low samples, that means 50% utilization (50% GPU busy, or 50% GPU use).
The windows and sampling vary from generation to generation, but that is how GPU and VRAM use
is calculated in a generic sense.
--showmeminfo (and VRAM% in concise output) will show the amount of VRAM used (visible, total, GTT),
as well as the total available for those partitions. The percentage shown there indicates the
amount of used memory in terms of current allocations
## OverDrive settings
Enabling OverDrive requires both a card that supports OverDrive and a driver parameter that enables its use.
Because OverDrive features can damage your card, most workstation and server GPUs cannot use OverDrive.
Consumer GPUs that can use OverDrive must enable this feature by setting bit 14 in the amdgpu driver's
ppfeaturemask module parameter
For OverDrive functionality, the OverDrive bit (bit 14) must be enabled (by default, the
OverDrive bit is disabled on the ROCK and upstream kernels). This can be done by setting
amdgpu.ppfeaturemask accordingly in the kernel parameters, or by changing the default value
inside amdgpu_drv.c (if building your own kernel).
As an example, if the ppfeaturemask is set to 0xffffbfff (11111111111111111011111111111111),
then enabling the OverDrive bit would make it 0xffffffff (11111111111111111111111111111111).
These are the flags that require OverDrive functionality to be enabled for the flag to work:
--showclkvolt
--showvoltagerange
--showvc
--showsclkrange
--showmclkrange
--setslevel
--setmlevel
--setoverdrive
--setpoweroverdrive
--resetpoweroverdrive
--setvc
--setsrange
--setmrange
## Disclaimer
The information contained herein is for informational purposes only, and is subject to change without notice. While every precaution has been taken in the preparation of this document, it may contain technical inaccuracies, omissions and typographical errors, and AMD is under no obligation to update or otherwise correct this information. Advanced Micro Devices, Inc. makes no representations or warranties with respect to the accuracy or completeness of the contents of this document, and assumes no liability of any kind, including the implied warranties of noninfringement, merchantability or fitness for particular purposes, with respect to the operation or use of AMD hardware, software or other products described herein.
AMD, the AMD Arrow logo, and combinations thereof are trademarks of Advanced Micro Devices, Inc. Other product names used in this publication are for identification purposes only and may be trademarks of their respective companies.
Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All rights reserved.
+16
View File
@@ -0,0 +1,16 @@
#!/usr/bin/env bash
# Render ./README.md with ronn (man page generation).
# Aborts with exit status 1 and install instructions when ronn is missing.

set -euo pipefail

command -v ronn > /dev/null || {
    {
        echo "ERROR: no ronn found!"
        echo "Please follow installation instructions here:"
        echo "https://github.com/apjanke/ronn-ng"
    } >&2
    exit 1
}

# Trace the actual conversion command.
set -x

ronn ./README.md
+1
View File
@@ -0,0 +1 @@
rocm_smi.py
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,756 @@
#!/usr/bin/env python3
"""ROCm_SMI_LIB CLI Tool Python Bindings"""
# NOTE: You MUST call rsmiBindings.initRsmiBindings() when using this library!
# TODO: Get most (or all) of these from rocm_smi.h to avoid mismatches and redundancy
from __future__ import print_function
from ctypes import *
from enum import Enum
import sys
# When the module is imported while Sphinx is loaded (documentation build),
# define local stand-ins instead of importing rsmiBindingsInit.
# NOTE(review): presumably rsmiBindingsInit provides the real path_librocm,
# initRsmiBindings and SMI_HASH -- confirm.
if 'sphinx' in sys.modules:
    path_librocm = str()
    def initRsmiBindings(silent=False):
        # Empty function for document generation
        # NOTE(review): calling this stand-in terminates the interpreter via exit().
        exit()
    # '@PKG_VERSION_HASH@' looks like a build-time placeholder -- TODO confirm.
    SMI_HASH = '@PKG_VERSION_HASH@'
else:
    from rsmiBindingsInit import *
# Device ID
dv_id = c_uint64()
# GPU ID
gpu_id = c_uint32(0)
# Policy enums
RSMI_MAX_NUM_FREQUENCIES = 33
RSMI_MAX_FAN_SPEED = 255
RSMI_NUM_VOLTAGE_CURVE_POINTS = 3
# Status codes of the RSMI bindings, kept as plain integer class attributes on a
# c_int subclass. RSMI_INITIALIZATION_ERROR is an alias of RSMI_STATUS_INIT_ERROR
# (0x8); every code listed here has a message in rsmi_status_verbose_err_out.
class rsmi_status_t(c_int):
RSMI_STATUS_SUCCESS = 0x0
RSMI_STATUS_INVALID_ARGS = 0x1
RSMI_STATUS_NOT_SUPPORTED = 0x2
RSMI_STATUS_FILE_ERROR = 0x3
RSMI_STATUS_PERMISSION = 0x4
RSMI_STATUS_OUT_OF_RESOURCES = 0x5
RSMI_STATUS_INTERNAL_EXCEPTION = 0x6
RSMI_STATUS_INPUT_OUT_OF_BOUNDS = 0x7
RSMI_STATUS_INIT_ERROR = 0x8
RSMI_INITIALIZATION_ERROR = RSMI_STATUS_INIT_ERROR
RSMI_STATUS_NOT_YET_IMPLEMENTED = 0x9
RSMI_STATUS_NOT_FOUND = 0xA
RSMI_STATUS_INSUFFICIENT_SIZE = 0xB
RSMI_STATUS_INTERRUPT = 0xC
RSMI_STATUS_UNEXPECTED_SIZE = 0xD
RSMI_STATUS_NO_DATA = 0xE
RSMI_STATUS_UNEXPECTED_DATA = 0xF
RSMI_STATUS_BUSY = 0x10
RSMI_STATUS_REFCOUNT_OVERFLOW = 0x11
RSMI_STATUS_SETTING_UNAVAILABLE = 0x12
RSMI_STATUS_AMDGPU_RESTART_ERR = 0x13
RSMI_STATUS_DRM_ERROR = 0x14
RSMI_STATUS_FAIL_LOAD_MODULE = 0x15
RSMI_STATUS_FAIL_LOAD_SYMBOL = 0x16
RSMI_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF
# Dictionary of rsmi return codes and their verbose output
rsmi_status_verbose_err_out = {
rsmi_status_t.RSMI_STATUS_SUCCESS: 'Operation was successful',
rsmi_status_t.RSMI_STATUS_INVALID_ARGS: 'Invalid arguments provided',
rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED: 'Not supported on the given system',
rsmi_status_t.RSMI_STATUS_FILE_ERROR: 'Problem accessing a file',
rsmi_status_t.RSMI_STATUS_PERMISSION: 'Permission denied',
rsmi_status_t.RSMI_STATUS_OUT_OF_RESOURCES: 'Unable to acquire memory or other resource',
rsmi_status_t.RSMI_STATUS_INTERNAL_EXCEPTION: 'An internal exception was caught',
rsmi_status_t.RSMI_STATUS_INPUT_OUT_OF_BOUNDS: 'Provided input is out of allowable or safe range',
rsmi_status_t.RSMI_INITIALIZATION_ERROR: 'Error occurred during rsmi initialization',
rsmi_status_t.RSMI_STATUS_NOT_YET_IMPLEMENTED: 'Requested function is not implemented on this setup',
rsmi_status_t.RSMI_STATUS_NOT_FOUND: 'Item searched for but not found',
rsmi_status_t.RSMI_STATUS_INSUFFICIENT_SIZE: 'Insufficient resources available',
rsmi_status_t.RSMI_STATUS_INTERRUPT: 'Interrupt occurred during execution',
rsmi_status_t.RSMI_STATUS_UNEXPECTED_SIZE: 'Unexpected amount of data read',
rsmi_status_t.RSMI_STATUS_NO_DATA: 'No data found for the given input',
rsmi_status_t.RSMI_STATUS_UNEXPECTED_DATA: 'Unexpected data received',
rsmi_status_t.RSMI_STATUS_BUSY: 'Busy - resources are preventing call the ability to execute',
rsmi_status_t.RSMI_STATUS_REFCOUNT_OVERFLOW: 'Data overflow - data exceeded INT32_MAX',
rsmi_status_t.RSMI_STATUS_SETTING_UNAVAILABLE: 'Requested setting is unavailable for current device',
rsmi_status_t.RSMI_STATUS_AMDGPU_RESTART_ERR: 'Could not successfully restart the amdgpu driver',
rsmi_status_t.RSMI_STATUS_DRM_ERROR: 'Error when calling libdrm',
rsmi_status_t.RSMI_STATUS_FAIL_LOAD_MODULE: 'Failed to load a library',
rsmi_status_t.RSMI_STATUS_FAIL_LOAD_SYMBOL: 'Failed to load a library symbol',
rsmi_status_t.RSMI_STATUS_UNKNOWN_ERROR: 'Unknown error occured'
}
class rsmi_init_flags_t(c_int):
RSMI_INIT_FLAG_ALL_GPUS = 0x1
class rsmi_dev_perf_level_t(c_int):
RSMI_DEV_PERF_LEVEL_AUTO = 0
RSMI_DEV_PERF_LEVEL_FIRST = RSMI_DEV_PERF_LEVEL_AUTO
RSMI_DEV_PERF_LEVEL_LOW = 1
RSMI_DEV_PERF_LEVEL_HIGH = 2
RSMI_DEV_PERF_LEVEL_MANUAL = 3
RSMI_DEV_PERF_LEVEL_STABLE_STD = 4
RSMI_DEV_PERF_LEVEL_STABLE_PEAK = 5
RSMI_DEV_PERF_LEVEL_STABLE_MIN_MCLK = 6
RSMI_DEV_PERF_LEVEL_STABLE_MIN_SCLK = 7
RSMI_DEV_PERF_LEVEL_DETERMINISM = 8
RSMI_DEV_PERF_LEVEL_LAST = RSMI_DEV_PERF_LEVEL_DETERMINISM
RSMI_DEV_PERF_LEVEL_UNKNOWN = 0x100
notification_type_names = ['VM_FAULT', 'THERMAL_THROTTLE', 'GPU_PRE_RESET', 'GPU_POST_RESET', 'RING_HANG']
class rsmi_evt_notification_type_t(c_int):
RSMI_EVT_NOTIF_NONE = 0
RSMI_EVT_NOTIF_VMFAULT = 1
RSMI_EVT_NOTIF_FIRST = RSMI_EVT_NOTIF_VMFAULT
RSMI_EVT_NOTIF_THERMAL_THROTTLE = 2
RSMI_EVT_NOTIF_GPU_PRE_RESET = 3
RSMI_EVT_NOTIF_GPU_POST_RESET = 4
RSMI_EVT_NOTIF_RING_HANG = 5
RSMI_EVT_NOTIF_LAST = RSMI_EVT_NOTIF_RING_HANG
class rsmi_voltage_metric_t(c_int):
RSMI_VOLT_CURRENT = 0
RSMI_VOLT_FIRST = RSMI_VOLT_CURRENT
RSMI_VOLT_MAX = 1
RSMI_VOLT_MIN_CRIT = 2
RSMI_VOLT_MIN = 3
RSMI_VOLT_MAX_CRIT = 4
RSMI_VOLT_AVERAGE = 5
RSMI_VOLT_LOWEST = 6
RSMI_VOLT_HIGHEST = 7
RSMI_VOLT_LAST = RSMI_VOLT_HIGHEST
RSMI_VOLT_UNKNOWN = 0x100
class rsmi_voltage_type_t(c_int):
RSMI_VOLT_TYPE_FIRST = 0
RSMI_VOLT_TYPE_VDDGFX = RSMI_VOLT_TYPE_FIRST
RSMI_VOLT_TYPE_LAST = RSMI_VOLT_TYPE_VDDGFX
RSMI_VOLT_TYPE_INVALID = 0xFFFFFFFF
# The perf_level_string is correlated to rsmi_dev_perf_level_t
def perf_level_string(i):
    """Return the display name for performance level ``i``.

    The names are keyed by the integers 0 through 8, in the order of the
    rsmi_dev_perf_level_t values; any other input yields 'UNKNOWN'.
    """
    level_names = ('AUTO', 'LOW', 'HIGH', 'MANUAL', 'STABLE_STD', 'STABLE_PEAK',
                   'STABLE_MIN_MCLK', 'STABLE_MIN_SCLK', 'PERF_DETERMINISM')
    # Keep a dict lookup (not tuple indexing) so unknown or negative keys fall
    # through to the default instead of raising or wrapping around.
    names_by_level = dict(enumerate(level_names))
    return names_by_level.get(i, 'UNKNOWN')
rsmi_dev_perf_level = rsmi_dev_perf_level_t
class rsmi_sw_component_t(c_int):
RSMI_SW_COMP_FIRST = 0x0
RSMI_SW_COMP_DRIVER = RSMI_SW_COMP_FIRST
RSMI_SW_COMP_LAST = RSMI_SW_COMP_DRIVER
rsmi_event_handle_t = POINTER(c_uint)
class rsmi_event_group_t(Enum):
    """Event groups.

    Each member's value is the number of the first event of that group in
    rsmi_event_type_t (XGMI starts at 0, XGMI_DATA_OUT at 10).
    """
    RSMI_EVNT_GRP_XGMI = 0
    RSMI_EVNT_GRP_XGMI_DATA_OUT = 10
    RSMI_EVNT_GRP_INVALID = 0xFFFFFFFF


class rsmi_event_type_t(c_int):
    """Event numbers, stored as plain integers.

    The *_FIRST entries take the integer value of the matching
    rsmi_event_group_t member (via ``.value``) so that every constant in this
    class is an int, like the explicitly numbered entries next to them.
    """
    RSMI_EVNT_FIRST = rsmi_event_group_t.RSMI_EVNT_GRP_XGMI.value

    # XGMI events: 0 .. 7
    RSMI_EVNT_XGMI_FIRST = rsmi_event_group_t.RSMI_EVNT_GRP_XGMI.value
    RSMI_EVNT_XGMI_0_NOP_TX = RSMI_EVNT_XGMI_FIRST
    RSMI_EVNT_XGMI_0_REQUEST_TX = 1
    RSMI_EVNT_XGMI_0_RESPONSE_TX = 2
    RSMI_EVNT_XGMI_0_BEATS_TX = 3
    RSMI_EVNT_XGMI_1_NOP_TX = 4
    RSMI_EVNT_XGMI_1_REQUEST_TX = 5
    RSMI_EVNT_XGMI_1_RESPONSE_TX = 6
    RSMI_EVNT_XGMI_1_BEATS_TX = 7
    RSMI_EVNT_XGMI_LAST = RSMI_EVNT_XGMI_1_BEATS_TX

    # XGMI data-out events: 10 .. 15
    RSMI_EVNT_XGMI_DATA_OUT_FIRST = rsmi_event_group_t.RSMI_EVNT_GRP_XGMI_DATA_OUT.value
    RSMI_EVNT_XGMI_DATA_OUT_0 = RSMI_EVNT_XGMI_DATA_OUT_FIRST
    RSMI_EVNT_XGMI_DATA_OUT_1 = 11
    RSMI_EVNT_XGMI_DATA_OUT_2 = 12
    RSMI_EVNT_XGMI_DATA_OUT_3 = 13
    RSMI_EVNT_XGMI_DATA_OUT_4 = 14
    RSMI_EVNT_XGMI_DATA_OUT_5 = 15
    RSMI_EVNT_XGMI_DATA_OUT_LAST = RSMI_EVNT_XGMI_DATA_OUT_5

    # No trailing comma: `NAME = value,` would bind the tuple (15,), not 15.
    RSMI_EVNT_LAST = RSMI_EVNT_XGMI_DATA_OUT_LAST
class rsmi_counter_command_t(c_int):
RSMI_CNTR_CMD_START = 0
RSMI_CNTR_CMD_STOP = 1
class rsmi_counter_value_t(Structure):
_fields_ = [('value', c_uint64),
('time_enabled', c_uint64),
('time_running', c_uint64)]
class rsmi_clk_type_t(c_int):
RSMI_CLK_TYPE_SYS = 0x0
RSMI_CLK_TYPE_FIRST = RSMI_CLK_TYPE_SYS
RSMI_CLK_TYPE_DF = 0x1
RSMI_CLK_TYPE_DCEF = 0x2
RSMI_CLK_TYPE_SOC = 0x3
RSMI_CLK_TYPE_MEM = 0x4
RSMI_CLK_TYPE_LAST = RSMI_CLK_TYPE_MEM
RSMI_CLK_INVALID = 0xFFFFFFFF
# Clock names here are correlated to the rsmi_clk_type_t values above
clk_type_names = ['sclk', 'sclk', 'fclk', 'dcefclk',\
'socclk', 'mclk', 'mclk', 'invalid']
rsmi_clk_type_dict = {'RSMI_CLK_TYPE_SYS': 0x0, 'RSMI_CLK_TYPE_FIRST': 0x0,\
'RSMI_CLK_TYPE_DF': 0x1, 'RSMI_CLK_TYPE_DCEF': 0x2,\
'RSMI_CLK_TYPE_SOC': 0x3, 'RSMI_CLK_TYPE_MEM': 0x4,\
'RSMI_CLK_TYPE_LAST': 0X4, 'RSMI_CLK_INVALID': 0xFFFFFFFF}
rsmi_clk_names_dict = {'sclk': 0x0, 'fclk': 0x1, 'dcefclk': 0x2,\
'socclk': 0x3, 'mclk': 0x4}
rsmi_clk_type = rsmi_clk_type_t
class rsmi_temperature_metric_t(c_int):
RSMI_TEMP_CURRENT = 0x0
RSMI_TEMP_FIRST = RSMI_TEMP_CURRENT
RSMI_TEMP_MAX = 0x1
RSMI_TEMP_MIN = 0x2
RSMI_TEMP_MAX_HYST = 0x3
RSMI_TEMP_MIN_HYST = 0x4
RSMI_TEMP_CRITICAL = 0x5
RSMI_TEMP_CRITICAL_HYST = 0x6
RSMI_TEMP_EMERGENCY = 0x7
RSMI_TEMP_EMERGENCY_HYST = 0x8
RSMI_TEMP_CRIT_MIN = 0x9
RSMI_TEMP_CRIT_MIN_HYST = 0xA
RSMI_TEMP_OFFSET = 0xB
RSMI_TEMP_LOWEST = 0xC
RSMI_TEMP_HIGHEST = 0xD
RSMI_TEMP_LAST = RSMI_TEMP_HIGHEST
rsmi_temperature_metric = rsmi_temperature_metric_t
class rsmi_temperature_type_t(c_int):
RSMI_TEMP_TYPE_FIRST = 0
RSMI_TEMP_TYPE_EDGE = RSMI_TEMP_TYPE_FIRST
RSMI_TEMP_TYPE_JUNCTION = 1
RSMI_TEMP_TYPE_MEMORY = 2
RSMI_TEMP_TYPE_HBM_0 = 3
RSMI_TEMP_TYPE_HBM_1 = 4
RSMI_TEMP_TYPE_HBM_2 = 5
RSMI_TEMP_TYPE_HBM_3 = 6
RSMI_TEMP_TYPE_LAST = RSMI_TEMP_TYPE_HBM_3
# temp_type_lst list correlates to rsmi_temperature_type_t
temp_type_lst = ['edge', 'junction', 'memory', 'HBM 0', 'HBM 1', 'HBM 2', 'HBM 3']
class rsmi_power_profile_preset_masks_t(c_uint64):
RSMI_PWR_PROF_PRST_CUSTOM_MASK = 0x1
RSMI_PWR_PROF_PRST_VIDEO_MASK = 0x2
RSMI_PWR_PROF_PRST_POWER_SAVING_MASK = 0x4
RSMI_PWR_PROF_PRST_COMPUTE_MASK = 0x8
RSMI_PWR_PROF_PRST_VR_MASK = 0x10
RSMI_PWR_PROF_PRST_3D_FULL_SCR_MASK = 0x20
RSMI_PWR_PROF_PRST_BOOTUP_DEFAULT = 0x40
RSMI_PWR_PROF_PRST_LAST = RSMI_PWR_PROF_PRST_BOOTUP_DEFAULT
RSMI_PWR_PROF_PRST_INVALID = 0xFFFFFFFFFFFFFFFF
rsmi_power_profile_preset_masks = rsmi_power_profile_preset_masks_t
class rsmi_gpu_block_t(c_int):
RSMI_GPU_BLOCK_INVALID = 0x0000000000000000
RSMI_GPU_BLOCK_FIRST = 0x0000000000000001
RSMI_GPU_BLOCK_UMC = RSMI_GPU_BLOCK_FIRST
RSMI_GPU_BLOCK_SDMA = 0x0000000000000002
RSMI_GPU_BLOCK_GFX = 0x0000000000000004
RSMI_GPU_BLOCK_MMHUB = 0x0000000000000008
RSMI_GPU_BLOCK_ATHUB = 0x0000000000000010
RSMI_GPU_BLOCK_PCIE_BIF = 0x0000000000000020
RSMI_GPU_BLOCK_HDP = 0x0000000000000040
RSMI_GPU_BLOCK_XGMI_WAFL = 0x0000000000000080
RSMI_GPU_BLOCK_DF = 0x0000000000000100
RSMI_GPU_BLOCK_SMN = 0x0000000000000200
RSMI_GPU_BLOCK_SEM = 0x0000000000000400
RSMI_GPU_BLOCK_MP0 = 0x0000000000000800
RSMI_GPU_BLOCK_MP1 = 0x0000000000001000
RSMI_GPU_BLOCK_FUSE = 0x0000000000002000
RSMI_GPU_BLOCK_LAST = RSMI_GPU_BLOCK_FUSE
RSMI_GPU_BLOCK_RESERVED = 0x8000000000000000
rsmi_gpu_block = rsmi_gpu_block_t
# The following dictionary correlates with rsmi_gpu_block_t enum
rsmi_gpu_block_d = {
'UMC' : 0x0000000000000001,
'SDMA' : 0x0000000000000002,
'GFX' : 0x0000000000000004,
'MMHUB': 0x0000000000000008,
'ATHUB': 0x0000000000000010,
'PCIE_BIF': 0x0000000000000020,
'HDP': 0x0000000000000040,
'XGMI_WAFL': 0x0000000000000080,
'DF': 0x0000000000000100,
'SMN': 0x0000000000000200,
'SEM': 0x0000000000000400,
'MP0': 0x0000000000000800,
'MP1': 0x0000000000001000,
'FUSE': 0x0000000000002000
}
class rsmi_ras_err_state_t(c_int):
RSMI_RAS_ERR_STATE_NONE = 0
RSMI_RAS_ERR_STATE_DISABLED = 1
RSMI_RAS_ERR_STATE_PARITY = 2
RSMI_RAS_ERR_STATE_SING_C = 3
RSMI_RAS_ERR_STATE_MULT_UC = 4
RSMI_RAS_ERR_STATE_POISON = 5
RSMI_RAS_ERR_STATE_ENABLED = 6
RSMI_RAS_ERR_STATE_LAST = RSMI_RAS_ERR_STATE_ENABLED
RSMI_RAS_ERR_STATE_INVALID = 0xFFFFFFFF
# Error type list correlates to rsmi_ras_err_state_t
rsmi_ras_err_stale_readable = ['no errors', 'ECC disabled',
'unknown type err', 'single correctable err',
'multiple uncorrectable err',
'page isolated, treat as uncorrectable err',
'ECC enabled', 'status invalid']
rsmi_ras_err_stale_machine = ['none', 'disabled', 'unknown error',
'sing', 'mult', 'position', 'enabled']
validRasTypes = ['ue', 'ce']
validRasActions = ['disable', 'enable', 'inject']
validRasBlocks = ['fuse', 'mp1', 'mp0', 'sem', 'smn', 'df', 'xgmi_wafl', 'hdp', 'pcie_bif',
'athub', 'mmhub', 'gfx', 'sdma', 'umc']
class rsmi_memory_type_t(c_int):
RSMI_MEM_TYPE_FIRST = 0
RSMI_MEM_TYPE_VRAM = RSMI_MEM_TYPE_FIRST
RSMI_MEM_TYPE_VIS_VRAM = 1
RSMI_MEM_TYPE_GTT = 2
RSMI_MEM_TYPE_LAST = RSMI_MEM_TYPE_GTT
# memory_type_l holds the string names for rsmi_memory_type_t, indexed by enum value.
# Usage example to get corresponding names:
# memory_type_l[rsmi_memory_type_t.RSMI_MEM_TYPE_VRAM] will return string 'VRAM'
memory_type_l = ['VRAM', 'VIS_VRAM', 'GTT']
class rsmi_freq_ind_t(c_int):
RSMI_FREQ_IND_MIN = 0
RSMI_FREQ_IND_MAX = 1
RSMI_FREQ_IND_INVALID = 0xFFFFFFFF
rsmi_freq_ind = rsmi_freq_ind_t
class rsmi_fw_block_t(c_int):
RSMI_FW_BLOCK_FIRST = 0
RSMI_FW_BLOCK_ASD = RSMI_FW_BLOCK_FIRST
RSMI_FW_BLOCK_CE = 1
RSMI_FW_BLOCK_DMCU = 2
RSMI_FW_BLOCK_MC = 3
RSMI_FW_BLOCK_ME = 4
RSMI_FW_BLOCK_MEC = 5
RSMI_FW_BLOCK_MEC2 = 6
RSMI_FW_BLOCK_MES = 7
RSMI_FW_BLOCK_MES_KIQ = 8
RSMI_FW_BLOCK_PFP = 9
RSMI_FW_BLOCK_RLC = 10
RSMI_FW_BLOCK_RLC_SRLC = 11
RSMI_FW_BLOCK_RLC_SRLG = 12
RSMI_FW_BLOCK_RLC_SRLS = 13
RSMI_FW_BLOCK_SDMA = 14
RSMI_FW_BLOCK_SDMA2 = 15
RSMI_FW_BLOCK_SMC = 16
RSMI_FW_BLOCK_SOS = 17
RSMI_FW_BLOCK_TA_RAS = 18
RSMI_FW_BLOCK_TA_XGMI = 19
RSMI_FW_BLOCK_UVD = 20
RSMI_FW_BLOCK_VCE = 21
RSMI_FW_BLOCK_VCN = 22
RSMI_FW_BLOCK_LAST = RSMI_FW_BLOCK_VCN
# The following list correlates with rsmi_fw_block_t (indexed by enum value)
fw_block_names_l = ['ASD', 'CE', 'DMCU', 'MC', 'ME', 'MEC', 'MEC2', 'MES', 'MES KIQ', 'PFP',\
'RLC', 'RLC SRLC', 'RLC SRLG', 'RLC SRLS', 'SDMA', 'SDMA2',\
'SMC', 'SOS', 'TA RAS', 'TA XGMI', 'UVD', 'VCE', 'VCN']
# 64-bit bit field used by the bindings.
# NOTE(review): the name is bound to a c_uint64 *instance* (note the call), not to
# the c_uint64 type. ctypes.sizeof() accepts either, so the size computation for
# RSMI_MAX_NUM_POWER_PROFILES works, but the name cannot serve as a field type or
# argtype as written -- confirm this is intended.
rsmi_bit_field_t = c_uint64()
rsmi_bit_field = rsmi_bit_field_t
class rsmi_utilization_counter_type(c_int):
RSMI_UTILIZATION_COUNTER_FIRST = 0
RSMI_COARSE_GRAIN_GFX_ACTIVITY = RSMI_UTILIZATION_COUNTER_FIRST
RSMI_COARSE_GRAIN_MEM_ACTIVITY = 1
RSMI_UTILIZATION_COUNTER_LAST = RSMI_COARSE_GRAIN_MEM_ACTIVITY
utilization_counter_name = ['GFX Activity', 'Memory Activity']
class rsmi_utilization_counter_t(Structure):
_fields_ = [('type', c_int),
('val', c_uint64)]
class rsmi_xgmi_status_t(c_int):
RSMI_XGMI_STATUS_NO_ERRORS = 0
RSMI_XGMI_STATUS_ERROR = 1
RSMI_XGMI_STATUS_MULTIPLE_ERRORS = 2
class rsmi_memory_page_status_t(c_int):
RSMI_MEM_PAGE_STATUS_RESERVED = 0
RSMI_MEM_PAGE_STATUS_PENDING = 1
RSMI_MEM_PAGE_STATUS_UNRESERVABLE = 2
memory_page_status_l = ['reserved', 'pending', 'unreservable']
class rsmi_retired_page_record_t(Structure):
_fields_ = [('page_address', c_uint64),
('page_size', c_uint64),
('status', c_int)]
RSMI_MAX_NUM_POWER_PROFILES = (sizeof(rsmi_bit_field_t) * 8)
# ctypes layout of the power profile status record. Field order and widths define
# the binary layout and must match the C structure.
# NOTE(review): available_profiles is declared c_uint32, while
# RSMI_MAX_NUM_POWER_PROFILES is sized from the 64-bit rsmi_bit_field_t -- confirm
# the intended width of this field against rocm_smi.h.
class rsmi_power_profile_status_t(Structure):
_fields_ = [('available_profiles', c_uint32),
('current', c_uint64),
('num_profiles', c_uint32)]
rsmi_power_profile_status = rsmi_power_profile_status_t
# ctypes layout of a frequency list; `frequency` has room for
# RSMI_MAX_NUM_FREQUENCIES 64-bit entries. Field order and widths define the
# binary layout and must match the C structure.
# presumably num_supported is the number of valid entries in `frequency` and
# `current` is the index of the active one -- TODO confirm against rocm_smi.h
class rsmi_frequencies_t(Structure):
_fields_ = [('has_deep_sleep', c_bool),
('num_supported', c_int32),
('current', c_uint32),
('frequency', c_uint64 * RSMI_MAX_NUM_FREQUENCIES)]
rsmi_frequencies = rsmi_frequencies_t
class rsmi_pcie_bandwidth_t(Structure):
_fields_ = [('transfer_rate', rsmi_frequencies_t),
('lanes', c_uint32 * RSMI_MAX_NUM_FREQUENCIES)]
rsmi_pcie_bandwidth = rsmi_pcie_bandwidth_t
class rsmi_version_t(Structure):
_fields_ = [('major', c_uint32),
('minor', c_uint32),
('patch', c_uint32),
('build', c_char_p)]
rsmi_version = rsmi_version_t
class rsmi_range_t(Structure):
_fields_ = [('lower_bound', c_uint64),
('upper_bound', c_uint64)]
rsmi_range = rsmi_range_t
class rsmi_od_vddc_point_t(Structure):
_fields_ = [('frequency', c_uint64),
('voltage', c_uint64)]
rsmi_od_vddc_point = rsmi_od_vddc_point_t
class rsmi_freq_volt_region_t(Structure):
_fields_ = [('freq_range', rsmi_range_t),
('volt_range', rsmi_range_t)]
rsmi_freq_volt_region = rsmi_freq_volt_region_t
class rsmi_od_volt_curve_t(Structure):
_fields_ = [('vc_points', rsmi_od_vddc_point_t *\
RSMI_NUM_VOLTAGE_CURVE_POINTS)]
rsmi_od_volt_curve = rsmi_od_volt_curve_t
class rsmi_od_volt_freq_data_t(Structure):
_fields_ = [('curr_sclk_range', rsmi_range_t),
('curr_mclk_range', rsmi_range_t),
('sclk_freq_limits', rsmi_range_t),
('mclk_freq_limits', rsmi_range_t),
('curve', rsmi_od_volt_curve_t),
('num_regions', c_uint32)]
rsmi_od_volt_freq_data = rsmi_od_volt_freq_data_t
class rsmi_error_count_t(Structure):
_fields_ = [('correctable_err', c_uint64),
('uncorrectable_err', c_uint64)]
class rsmi_evt_notification_data_t(Structure):
_fields_ = [('dv_ind', c_uint32),
('event', rsmi_evt_notification_type_t),
('message', c_char*64)]
class rsmi_process_info_t(Structure):
_fields_ = [('process_id', c_uint32),
('pasid', c_uint32),
('vram_usage', c_uint64),
('sdma_usage', c_uint64),
('cu_occupancy', c_uint32)]
class rsmi_func_id_iter_handle(Structure):
_fields_ = [('func_id_iter', POINTER(c_uint)),
('container_ptr', POINTER(c_uint)),
('id_type', c_uint32)]
rsmi_func_id_iter_handle_t = POINTER(rsmi_func_id_iter_handle)
RSMI_DEFAULT_VARIANT = 0xFFFFFFFFFFFFFFFF
class submodule_union(Union):
_fields_ = [('memory_type', c_int), # rsmi_memory_type_t,
('temp_metric', c_int), # rsmi_temperature_metric_t,
('evnt_type', c_int), # rsmi_event_type_t,
('evnt_group', c_int), # rsmi_event_group_t,
('clk_type', c_int), # rsmi_clk_type_t,
('fw_block', c_int), # rsmi_fw_block_t,
('gpu_block_type', c_int)] # rsmi_gpu_block_t
class rsmi_func_id_value_t(Union):
_fields_ = [('id', c_uint64),
('name', c_char_p),
('submodule', submodule_union)]
class rsmi_compute_partition_type_t(c_int):
RSMI_COMPUTE_PARTITION_INVALID = 0
RSMI_COMPUTE_PARTITION_SPX = 1
RSMI_COMPUTE_PARTITION_DPX = 2
RSMI_COMPUTE_PARTITION_TPX = 3
RSMI_COMPUTE_PARTITION_QPX = 4
RSMI_COMPUTE_PARTITION_CPX = 5
rsmi_compute_partition_type_dict = {
#'RSMI_COMPUTE_PARTITION_INVALID': 0,
'SPX': 1,
'DPX': 2,
'TPX': 3,
'QPX': 4,
'CPX': 5,
}
rsmi_compute_partition_type = rsmi_compute_partition_type_t
# compute_partition_type_l holds the string names of the valid
# rsmi_compute_partition_type_t values, in enum order. It has no entry for
# RSMI_COMPUTE_PARTITION_INVALID (0), so list indices are offset by one:
# compute_partition_type_l[rsmi_compute_partition_type_t.RSMI_COMPUTE_PARTITION_CPX - 1]
# will return string 'CPX'
compute_partition_type_l = ['SPX', 'DPX', 'TPX', 'QPX', 'CPX']
class rsmi_memory_partition_type_t(c_int):
RSMI_MEMORY_PARTITION_UNKNOWN = 0
RSMI_MEMORY_PARTITION_NPS1 = 1
RSMI_MEMORY_PARTITION_NPS2 = 2
RSMI_MEMORY_PARTITION_NPS4 = 3
RSMI_MEMORY_PARTITION_NPS8 = 4
rsmi_memory_partition_type_dict = {
'NPS1': 1,
'NPS2': 2,
'NPS4': 3,
'NPS8': 4
}
rsmi_memory_partition_type = rsmi_memory_partition_type_t
# memory_partition_type_l holds the string names of the valid
# rsmi_memory_partition_type_t values, in enum order. It has no entry for
# RSMI_MEMORY_PARTITION_UNKNOWN (0), so list indices are offset by one:
# memory_partition_type_l[rsmi_memory_partition_type_t.RSMI_MEMORY_PARTITION_NPS2 - 1]
# will return string 'NPS2'
memory_partition_type_l = ['NPS1', 'NPS2', 'NPS4', 'NPS8']
class rsmi_power_label(str, Enum):
AVG_POWER = '(Avg)'
CURRENT_SOCKET_POWER = '(Socket)'
class rsmi_power_type_t(c_int):
    """Kinds of power reading, stored as plain integers.

    The values are the keys of rsmi_power_type_dict.
    """
    # No trailing commas: `NAME = 0,` would bind the one-element tuple (0,)
    # instead of the integer, and a tuple is not a key of rsmi_power_type_dict.
    RSMI_AVERAGE_POWER = 0
    RSMI_CURRENT_POWER = 1
    RSMI_INVALID_POWER = 0xFFFFFFFF


# Display label for each rsmi_power_type_t value
rsmi_power_type_dict = {
    0: 'AVERAGE',
    1: 'CURRENT SOCKET',
    0xFFFFFFFF: 'INVALID_POWER_TYPE'
}
# Header of a metrics table: a 16-bit structure size followed by two 8-bit
# revision numbers. It is the first field ('common_header') of rsmi_gpu_metrics_t.
# The class is declared empty and _fields_ is assigned afterwards; field order
# and widths define the binary layout.
class metrics_table_header_t(Structure):
pass
# metrics_table_header_t._pack_ = 1 # source:False
metrics_table_header_t._fields_ = [
('structure_size', c_uint16),
('format_revision', c_uint8),
('content_revision', c_uint8),
]
amd_metrics_table_header_t = metrics_table_header_t
# Layout of one XCP metrics record; rsmi_gpu_metrics_t embeds an array of eight
# of these as its 'xcp_stats' field (through the xcp_stats_t alias).
# presumably one record per partition -- TODO confirm against the C header.
# The class is declared empty and _fields_ is assigned afterwards; field order
# and array lengths define the binary layout.
class amdgpu_xcp_metrics_t(Structure):
pass
# amdgpu_xcp_metrics_t._pack_ = 1 # source:False
amdgpu_xcp_metrics_t._fields_ = [
('gfx_busy_inst', c_uint32 * 8),
('jpeg_busy', c_uint16 * 40),
('vcn_busy', c_uint16 * 4),
('gfx_busy_acc', c_uint64 * 8),
('gfx_below_host_limit_acc', c_uint64 * 8),
('gfx_below_host_limit_ppt_acc', c_uint64 * 8),
('gfx_below_host_limit_thm_acc', c_uint64 * 8),
('gfx_low_utilization_acc', c_uint64 * 8),
('gfx_below_host_limit_total_acc', c_uint64 * 8),
]
xcp_stats_t = amdgpu_xcp_metrics_t
class rsmi_gpu_metrics_t(Structure):
pass
# rsmi_gpu_metrics_t._pack_ = 1 # source:False
rsmi_gpu_metrics_t._fields_ = [
('common_header', amd_metrics_table_header_t),
('temperature_edge', c_uint16),
('temperature_hotspot', c_uint16),
('temperature_mem', c_uint16),
('temperature_vrgfx', c_uint16),
('temperature_vrsoc', c_uint16),
('temperature_vrmem', c_uint16),
('average_gfx_activity', c_uint16),
('average_umc_activity', c_uint16),
('average_mm_activity', c_uint16),
('average_socket_power', c_uint16),
('energy_accumulator', c_uint64),
('system_clock_counter', c_uint64),
('average_gfxclk_frequency', c_uint16),
('average_socclk_frequency', c_uint16),
('average_uclk_frequency', c_uint16),
('average_vclk0_frequency', c_uint16),
('average_dclk0_frequency', c_uint16),
('average_vclk1_frequency', c_uint16),
('average_dclk1_frequency', c_uint16),
('current_gfxclk', c_uint16),
('current_socclk', c_uint16),
('current_uclk', c_uint16),
('current_vclk0', c_uint16),
('current_dclk0', c_uint16),
('current_vclk1', c_uint16),
('current_dclk1', c_uint16),
('throttle_status', c_uint32),
('current_fan_speed', c_uint16),
('pcie_link_width', c_uint16),
('pcie_link_speed', c_uint16),
('gfx_activity_acc', c_uint32),
('mem_activity_acc', c_uint32),
('temperature_hbm', c_uint16 * 4),
('firmware_timestamp', c_uint64),
('voltage_soc', c_uint16),
('voltage_gfx', c_uint16),
('voltage_mem', c_uint16),
('indep_throttle_status', c_uint64),
('current_socket_power', c_uint16),
('vcn_activity', c_uint16 * 4),
('gfxclk_lock_status', c_uint32),
('xgmi_link_width', c_uint16),
('xgmi_link_speed', c_uint16),
('pcie_bandwidth_acc', c_uint64),
('pcie_bandwidth_inst', c_uint64),
('pcie_l0_to_recov_count_acc', c_uint64),
('pcie_replay_count_acc', c_uint64),
('pcie_replay_rover_count_acc', c_uint64),
('xgmi_read_data_acc', c_uint64 * 8),
('xgmi_write_data_acc', c_uint64 * 8),
('current_gfxclks', c_uint16 * 8),
('current_socclks', c_uint16 * 4),
('current_vclk0s', c_uint16 * 4),
('current_dclk0s', c_uint16 * 4),
('jpeg_activity', c_uint16 * 32),
('pcie_nak_sent_count_acc', c_uint32),
('pcie_nak_rcvd_count_acc', c_uint32),
('accumulation_counter', c_uint64),
('prochot_residency_acc', c_uint64),
('ppt_residency_acc', c_uint64),
('socket_thm_residency_acc', c_uint64),
('vr_thm_residency_acc', c_uint64),
('hbm_thm_residency_acc', c_uint64),
('num_partition', c_uint16),
('xcp_stats', xcp_stats_t * 8),
('pcie_lc_perf_other_end_recovery', c_uint32),
('vram_max_bandwidth', c_uint64),
('xgmi_link_status', c_uint16 * 8),
]
amdsmi_gpu_metrics_t = rsmi_gpu_metrics_t
@@ -0,0 +1,676 @@
#!/usr/bin/env python3
"""ROCm_SMI_LIB CLI Tool Python Bindings"""
# NOTE: You MUST call rsmiBindings.initRsmiBindings() when using this library!
# TODO: Get most (or all) of these from rocm_smi.h to avoid mismatches and redundancy
from __future__ import print_function
import ctypes.util
from ctypes import *
from enum import Enum
import os
# Use ROCm installation path if running from standard installation
# With File Reorg rsmiBindings.py will be installed in /opt/rocm/libexec/rocm_smi.
# relative path changed accordingly.
# if ROCM_SMI_LIB_PATH is set, we can load 'librocm_smi64.so' from that location
#
# Library load is wrapped in a function so prints can be hidden for PRINT_JSON mode.
path_librocm = str()
def initRsmiBindings(silent=False):
    """Locate and load the rocm_smi shared library and return its ctypes handle.

    Search order:
      1. the path in the ROCM_SMI_LIB_PATH environment variable, when set;
      2. otherwise the library path relative to this file's directory;
      3. when the chosen path is not a file, a walk of /opt looking for the
         library by name (the last match found wins).

    :param silent: when true, the progress messages are suppressed; the
        "unable to find/load" error messages are printed regardless.
    :return: the CDLL object for the loaded library.

    When the library cannot be loaded, an installation hint is printed and the
    interpreter is terminated through exit().
    """
    def print_silent(*args):
        # Unpack the arguments: print(args) would emit the repr of the argument
        # tuple, e.g. "('Using lib from ...',)", instead of the message itself.
        if not silent:
            print(*args)

    # NOTE(review): path_librocm below is a local variable; the module-level
    # path_librocm is not updated by this function -- confirm no caller expects it.
    rocm_smi_lib_path = os.getenv('ROCM_SMI_LIB_PATH')
    if rocm_smi_lib_path is not None:
        path_librocm = rocm_smi_lib_path
    else:
        path_librocm = os.path.dirname(os.path.realpath(__file__)) + '/../../@CMAKE_INSTALL_LIBDIR@/librocm_smi64.so.@VERSION_MAJOR@'

    if not os.path.isfile(path_librocm):
        print_silent('Unable to find %s . Trying /opt/rocm*' % path_librocm)
        for root, dirs, files in os.walk('/opt', followlinks=True):
            if 'librocm_smi64.so.@VERSION_MAJOR@' in files:
                path_librocm = os.path.join(os.path.realpath(root), 'librocm_smi64.so.@VERSION_MAJOR@')
        if os.path.isfile(path_librocm):
            print_silent('Using lib from %s' % path_librocm)
        else:
            print('Unable to find librocm_smi64.so.@VERSION_MAJOR@')

    # ----------> TODO: Support static libs as well as SO
    try:
        cdll.LoadLibrary(path_librocm)
        return CDLL(path_librocm)
    except OSError:
        print('Unable to load the rocm_smi library.\n'\
              'Set LD_LIBRARY_PATH to the folder containing librocm_smi64.so.@VERSION_MAJOR@\n'\
              '{0}Please refer to https://github.com/'\
              'RadeonOpenCompute/rocm_smi_lib for the installation guide.{1}'\
              .format('\33[33m', '\033[0m'))
        exit()
# Device ID
dv_id = c_uint64()
# GPU ID
gpu_id = c_uint32(0)
SMI_HASH = '@PKG_VERSION_HASH@'
# Policy enums
RSMI_MAX_NUM_FREQUENCIES = 33
RSMI_MAX_FAN_SPEED = 255
RSMI_NUM_VOLTAGE_CURVE_POINTS = 3
class rsmi_status_t(c_int):
RSMI_STATUS_SUCCESS = 0x0
RSMI_STATUS_INVALID_ARGS = 0x1
RSMI_STATUS_NOT_SUPPORTED = 0x2
RSMI_STATUS_FILE_ERROR = 0x3
RSMI_STATUS_PERMISSION = 0x4
RSMI_STATUS_OUT_OF_RESOURCES = 0x5
RSMI_STATUS_INTERNAL_EXCEPTION = 0x6
RSMI_STATUS_INPUT_OUT_OF_BOUNDS = 0x7
RSMI_STATUS_INIT_ERROR = 0x8
RSMI_INITIALIZATION_ERROR = RSMI_STATUS_INIT_ERROR
RSMI_STATUS_NOT_YET_IMPLEMENTED = 0x9
RSMI_STATUS_NOT_FOUND = 0xA
RSMI_STATUS_INSUFFICIENT_SIZE = 0xB
RSMI_STATUS_INTERRUPT = 0xC
RSMI_STATUS_UNEXPECTED_SIZE = 0xD
RSMI_STATUS_NO_DATA = 0xE
RSMI_STATUS_UNEXPECTED_DATA = 0xF
RSMI_STATUS_BUSY = 0x10
RSMI_STATUS_REFCOUNT_OVERFLOW = 0x11
RSMI_STATUS_SETTING_UNAVAILABLE = 0x12
RSMI_STATUS_AMDGPU_RESTART_ERR = 0x13
RSMI_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF
# Dictionary of rsmi return codes and their verbose output
rsmi_status_verbose_err_out = {
rsmi_status_t.RSMI_STATUS_SUCCESS: 'Operation was successful',
rsmi_status_t.RSMI_STATUS_INVALID_ARGS: 'Invalid arguments provided',
rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED: 'Not supported on the given system',
rsmi_status_t.RSMI_STATUS_FILE_ERROR: 'Problem accessing a file',
rsmi_status_t.RSMI_STATUS_PERMISSION: 'Permission denied',
rsmi_status_t.RSMI_STATUS_OUT_OF_RESOURCES: 'Unable to acquire memory or other resource',
rsmi_status_t.RSMI_STATUS_INTERNAL_EXCEPTION: 'An internal exception was caught',
rsmi_status_t.RSMI_STATUS_INPUT_OUT_OF_BOUNDS: 'Provided input is out of allowable or safe range',
rsmi_status_t.RSMI_INITIALIZATION_ERROR: 'Error occurred during rsmi initialization',
rsmi_status_t.RSMI_STATUS_NOT_YET_IMPLEMENTED: 'Requested function is not implemented on this setup',
rsmi_status_t.RSMI_STATUS_NOT_FOUND: 'Item searched for but not found',
rsmi_status_t.RSMI_STATUS_INSUFFICIENT_SIZE: 'Insufficient resources available',
rsmi_status_t.RSMI_STATUS_INTERRUPT: 'Interrupt occurred during execution',
rsmi_status_t.RSMI_STATUS_UNEXPECTED_SIZE: 'Unexpected amount of data read',
rsmi_status_t.RSMI_STATUS_NO_DATA: 'No data found for the given input',
rsmi_status_t.RSMI_STATUS_UNEXPECTED_DATA: 'Unexpected data received',
rsmi_status_t.RSMI_STATUS_BUSY: 'Busy - resources are preventing call the ability to execute',
rsmi_status_t.RSMI_STATUS_REFCOUNT_OVERFLOW: 'Data overflow - data exceeded INT32_MAX',
rsmi_status_t.RSMI_STATUS_SETTING_UNAVAILABLE: 'Requested setting is unavailable for current device',
rsmi_status_t.RSMI_STATUS_AMDGPU_RESTART_ERR: 'Could not successfully restart the amdgpu driver',
rsmi_status_t.RSMI_STATUS_UNKNOWN_ERROR: 'Unknown error occured'
}
class rsmi_init_flags_t(c_int):
RSMI_INIT_FLAG_ALL_GPUS = 0x1
class rsmi_dev_perf_level_t(c_int):
RSMI_DEV_PERF_LEVEL_AUTO = 0
RSMI_DEV_PERF_LEVEL_FIRST = RSMI_DEV_PERF_LEVEL_AUTO
RSMI_DEV_PERF_LEVEL_LOW = 1
RSMI_DEV_PERF_LEVEL_HIGH = 2
RSMI_DEV_PERF_LEVEL_MANUAL = 3
RSMI_DEV_PERF_LEVEL_STABLE_STD = 4
RSMI_DEV_PERF_LEVEL_STABLE_PEAK = 5
RSMI_DEV_PERF_LEVEL_STABLE_MIN_MCLK = 6
RSMI_DEV_PERF_LEVEL_STABLE_MIN_SCLK = 7
RSMI_DEV_PERF_LEVEL_DETERMINISM = 8
RSMI_DEV_PERF_LEVEL_LAST = RSMI_DEV_PERF_LEVEL_DETERMINISM
RSMI_DEV_PERF_LEVEL_UNKNOWN = 0x100
notification_type_names = ['VM_FAULT', 'THERMAL_THROTTLE', 'GPU_PRE_RESET', 'GPU_POST_RESET', 'RING_HANG']
class rsmi_evt_notification_type_t(c_int):
RSMI_EVT_NOTIF_NONE = 0
RSMI_EVT_NOTIF_VMFAULT = 1
RSMI_EVT_NOTIF_FIRST = RSMI_EVT_NOTIF_VMFAULT
RSMI_EVT_NOTIF_THERMAL_THROTTLE = 2
RSMI_EVT_NOTIF_GPU_PRE_RESET = 3
RSMI_EVT_NOTIF_GPU_POST_RESET = 4
RSMI_EVT_NOTIF_RING_HANG = 5
RSMI_EVT_NOTIF_LAST = RSMI_EVT_NOTIF_RING_HANG
class rsmi_voltage_metric_t(c_int):
RSMI_VOLT_CURRENT = 0
RSMI_VOLT_FIRST = RSMI_VOLT_CURRENT
RSMI_VOLT_MAX = 1
RSMI_VOLT_MIN_CRIT = 2
RSMI_VOLT_MIN = 3
RSMI_VOLT_MAX_CRIT = 4
RSMI_VOLT_AVERAGE = 5
RSMI_VOLT_LOWEST = 6
RSMI_VOLT_HIGHEST = 7
RSMI_VOLT_LAST = RSMI_VOLT_HIGHEST
RSMI_VOLT_UNKNOWN = 0x100
class rsmi_voltage_type_t(c_int):
RSMI_VOLT_TYPE_FIRST = 0
RSMI_VOLT_TYPE_VDDGFX = RSMI_VOLT_TYPE_FIRST
RSMI_VOLT_TYPE_LAST = RSMI_VOLT_TYPE_VDDGFX
RSMI_VOLT_TYPE_INVALID = 0xFFFFFFFF
# The perf_level_string is correlated to rsmi_dev_perf_level_t
def perf_level_string(i):
    """Return the display name for performance level ``i``.

    The names are keyed by the integers 0 through 8, in the order of the
    rsmi_dev_perf_level_t values; any other input yields 'UNKNOWN'.
    """
    level_names = ('AUTO', 'LOW', 'HIGH', 'MANUAL', 'STABLE_STD', 'STABLE_PEAK',
                   'STABLE_MIN_MCLK', 'STABLE_MIN_SCLK', 'PERF_DETERMINISM')
    # Keep a dict lookup (not tuple indexing) so unknown or negative keys fall
    # through to the default instead of raising or wrapping around.
    names_by_level = dict(enumerate(level_names))
    return names_by_level.get(i, 'UNKNOWN')
rsmi_dev_perf_level = rsmi_dev_perf_level_t
class rsmi_sw_component_t(c_int):
RSMI_SW_COMP_FIRST = 0x0
RSMI_SW_COMP_DRIVER = RSMI_SW_COMP_FIRST
RSMI_SW_COMP_LAST = RSMI_SW_COMP_DRIVER
rsmi_event_handle_t = POINTER(c_uint)
class rsmi_event_group_t(Enum):
    """Event groups.

    Each member's value is the number of the first event of that group in
    rsmi_event_type_t (XGMI starts at 0, XGMI_DATA_OUT at 10).
    """
    RSMI_EVNT_GRP_XGMI = 0
    RSMI_EVNT_GRP_XGMI_DATA_OUT = 10
    RSMI_EVNT_GRP_INVALID = 0xFFFFFFFF


class rsmi_event_type_t(c_int):
    """Event numbers, stored as plain integers.

    The *_FIRST entries take the integer value of the matching
    rsmi_event_group_t member (via ``.value``) so that every constant in this
    class is an int, like the explicitly numbered entries next to them.
    """
    RSMI_EVNT_FIRST = rsmi_event_group_t.RSMI_EVNT_GRP_XGMI.value

    # XGMI events: 0 .. 7
    RSMI_EVNT_XGMI_FIRST = rsmi_event_group_t.RSMI_EVNT_GRP_XGMI.value
    RSMI_EVNT_XGMI_0_NOP_TX = RSMI_EVNT_XGMI_FIRST
    RSMI_EVNT_XGMI_0_REQUEST_TX = 1
    RSMI_EVNT_XGMI_0_RESPONSE_TX = 2
    RSMI_EVNT_XGMI_0_BEATS_TX = 3
    RSMI_EVNT_XGMI_1_NOP_TX = 4
    RSMI_EVNT_XGMI_1_REQUEST_TX = 5
    RSMI_EVNT_XGMI_1_RESPONSE_TX = 6
    RSMI_EVNT_XGMI_1_BEATS_TX = 7
    RSMI_EVNT_XGMI_LAST = RSMI_EVNT_XGMI_1_BEATS_TX

    # XGMI data-out events: 10 .. 15
    RSMI_EVNT_XGMI_DATA_OUT_FIRST = rsmi_event_group_t.RSMI_EVNT_GRP_XGMI_DATA_OUT.value
    RSMI_EVNT_XGMI_DATA_OUT_0 = RSMI_EVNT_XGMI_DATA_OUT_FIRST
    RSMI_EVNT_XGMI_DATA_OUT_1 = 11
    RSMI_EVNT_XGMI_DATA_OUT_2 = 12
    RSMI_EVNT_XGMI_DATA_OUT_3 = 13
    RSMI_EVNT_XGMI_DATA_OUT_4 = 14
    RSMI_EVNT_XGMI_DATA_OUT_5 = 15
    RSMI_EVNT_XGMI_DATA_OUT_LAST = RSMI_EVNT_XGMI_DATA_OUT_5

    # No trailing comma: `NAME = value,` would bind the tuple (15,), not 15.
    RSMI_EVNT_LAST = RSMI_EVNT_XGMI_DATA_OUT_LAST
class rsmi_counter_command_t(c_int):
RSMI_CNTR_CMD_START = 0
RSMI_CNTR_CMD_STOP = 1
class rsmi_counter_value_t(Structure):
_fields_ = [('value', c_uint64),
('time_enabled', c_uint64),
('time_running', c_uint64)]
class rsmi_clk_type_t(c_int):
RSMI_CLK_TYPE_SYS = 0x0
RSMI_CLK_TYPE_FIRST = RSMI_CLK_TYPE_SYS
RSMI_CLK_TYPE_DF = 0x1
RSMI_CLK_TYPE_DCEF = 0x2
RSMI_CLK_TYPE_SOC = 0x3
RSMI_CLK_TYPE_MEM = 0x4
RSMI_CLK_TYPE_LAST = RSMI_CLK_TYPE_MEM
RSMI_CLK_INVALID = 0xFFFFFFFF
# Clock names here are correlated to the rsmi_clk_type_t values above
clk_type_names = ['sclk', 'sclk', 'fclk', 'dcefclk',\
'socclk', 'mclk', 'mclk', 'invalid']
rsmi_clk_type_dict = {'RSMI_CLK_TYPE_SYS': 0x0, 'RSMI_CLK_TYPE_FIRST': 0x0,\
'RSMI_CLK_TYPE_DF': 0x1, 'RSMI_CLK_TYPE_DCEF': 0x2,\
'RSMI_CLK_TYPE_SOC': 0x3, 'RSMI_CLK_TYPE_MEM': 0x4,\
'RSMI_CLK_TYPE_LAST': 0X4, 'RSMI_CLK_INVALID': 0xFFFFFFFF}
rsmi_clk_names_dict = {'sclk': 0x0, 'fclk': 0x1, 'dcefclk': 0x2,\
'socclk': 0x3, 'mclk': 0x4}
rsmi_clk_type = rsmi_clk_type_t
class rsmi_temperature_metric_t(c_int):
RSMI_TEMP_CURRENT = 0x0
RSMI_TEMP_FIRST = RSMI_TEMP_CURRENT
RSMI_TEMP_MAX = 0x1
RSMI_TEMP_MIN = 0x2
RSMI_TEMP_MAX_HYST = 0x3
RSMI_TEMP_MIN_HYST = 0x4
RSMI_TEMP_CRITICAL = 0x5
RSMI_TEMP_CRITICAL_HYST = 0x6
RSMI_TEMP_EMERGENCY = 0x7
RSMI_TEMP_EMERGENCY_HYST = 0x8
RSMI_TEMP_CRIT_MIN = 0x9
RSMI_TEMP_CRIT_MIN_HYST = 0xA
RSMI_TEMP_OFFSET = 0xB
RSMI_TEMP_LOWEST = 0xC
RSMI_TEMP_HIGHEST = 0xD
RSMI_TEMP_LAST = RSMI_TEMP_HIGHEST
rsmi_temperature_metric = rsmi_temperature_metric_t
class rsmi_temperature_type_t(c_int):
RSMI_TEMP_TYPE_FIRST = 0
RSMI_TEMP_TYPE_EDGE = RSMI_TEMP_TYPE_FIRST
RSMI_TEMP_TYPE_JUNCTION = 1
RSMI_TEMP_TYPE_MEMORY = 2
RSMI_TEMP_TYPE_HBM_0 = 3
RSMI_TEMP_TYPE_HBM_1 = 4
RSMI_TEMP_TYPE_HBM_2 = 5
RSMI_TEMP_TYPE_HBM_3 = 6
RSMI_TEMP_TYPE_LAST = RSMI_TEMP_TYPE_HBM_3
# temp_type_lst list correlates to rsmi_temperature_type_t
temp_type_lst = ['edge', 'junction', 'memory', 'HBM 0', 'HBM 1', 'HBM 2', 'HBM 3']
class rsmi_power_profile_preset_masks_t(c_uint64):
RSMI_PWR_PROF_PRST_CUSTOM_MASK = 0x1
RSMI_PWR_PROF_PRST_VIDEO_MASK = 0x2
RSMI_PWR_PROF_PRST_POWER_SAVING_MASK = 0x4
RSMI_PWR_PROF_PRST_COMPUTE_MASK = 0x8
RSMI_PWR_PROF_PRST_VR_MASK = 0x10
RSMI_PWR_PROF_PRST_3D_FULL_SCR_MASK = 0x20
RSMI_PWR_PROF_PRST_BOOTUP_DEFAULT = 0x40
RSMI_PWR_PROF_PRST_LAST = RSMI_PWR_PROF_PRST_BOOTUP_DEFAULT
RSMI_PWR_PROF_PRST_INVALID = 0xFFFFFFFFFFFFFFFF
rsmi_power_profile_preset_masks = rsmi_power_profile_preset_masks_t
class rsmi_gpu_block_t(c_int):
RSMI_GPU_BLOCK_INVALID = 0x0000000000000000
RSMI_GPU_BLOCK_FIRST = 0x0000000000000001
RSMI_GPU_BLOCK_UMC = RSMI_GPU_BLOCK_FIRST
RSMI_GPU_BLOCK_SDMA = 0x0000000000000002
RSMI_GPU_BLOCK_GFX = 0x0000000000000004
RSMI_GPU_BLOCK_MMHUB = 0x0000000000000008
RSMI_GPU_BLOCK_ATHUB = 0x0000000000000010
RSMI_GPU_BLOCK_PCIE_BIF = 0x0000000000000020
RSMI_GPU_BLOCK_HDP = 0x0000000000000040
RSMI_GPU_BLOCK_XGMI_WAFL = 0x0000000000000080
RSMI_GPU_BLOCK_DF = 0x0000000000000100
RSMI_GPU_BLOCK_SMN = 0x0000000000000200
RSMI_GPU_BLOCK_SEM = 0x0000000000000400
RSMI_GPU_BLOCK_MP0 = 0x0000000000000800
RSMI_GPU_BLOCK_MP1 = 0x0000000000001000
RSMI_GPU_BLOCK_FUSE = 0x0000000000002000
RSMI_GPU_BLOCK_LAST = RSMI_GPU_BLOCK_FUSE
RSMI_GPU_BLOCK_RESERVED = 0x8000000000000000
rsmi_gpu_block = rsmi_gpu_block_t
# The following dictionary correlates with rsmi_gpu_block_t enum
rsmi_gpu_block_d = {
'UMC' : 0x0000000000000001,
'SDMA' : 0x0000000000000002,
'GFX' : 0x0000000000000004,
'MMHUB': 0x0000000000000008,
'ATHUB': 0x0000000000000010,
'PCIE_BIF': 0x0000000000000020,
'HDP': 0x0000000000000040,
'XGMI_WAFL': 0x0000000000000080,
'DF': 0x0000000000000100,
'SMN': 0x0000000000000200,
'SEM': 0x0000000000000400,
'MP0': 0x0000000000000800,
'MP1': 0x0000000000001000,
'FUSE': 0x0000000000002000
}
class rsmi_ras_err_state_t(c_int):
RSMI_RAS_ERR_STATE_NONE = 0
RSMI_RAS_ERR_STATE_DISABLED = 1
RSMI_RAS_ERR_STATE_PARITY = 2
RSMI_RAS_ERR_STATE_SING_C = 3
RSMI_RAS_ERR_STATE_MULT_UC = 4
RSMI_RAS_ERR_STATE_POISON = 5
RSMI_RAS_ERR_STATE_ENABLED = 6
RSMI_RAS_ERR_STATE_LAST = RSMI_RAS_ERR_STATE_ENABLED
RSMI_RAS_ERR_STATE_INVALID = 0xFFFFFFFF
# Error type lists correlate to rsmi_ras_err_state_t: both are indexed by the
# enum value (0 = NONE ... 6 = ENABLED).
# rsmi_ras_err_stale_readable holds the human-readable text. It has one extra
# trailing entry ('status invalid', index 7); RSMI_RAS_ERR_STATE_INVALID itself
# is 0xFFFFFFFF, so that entry cannot be reached by indexing with the enum value.
rsmi_ras_err_stale_readable = ['no errors', 'ECC disabled',
                               'unknown type err', 'single correctable err',
                               'multiple uncorrectable err',
                               'page isolated, treat as uncorrectable err',
                               'ECC enabled', 'status invalid']
# rsmi_ras_err_stale_machine holds the short machine-oriented names.
# NOTE(review): index 5 corresponds to RSMI_RAS_ERR_STATE_POISON but the string
# is 'position' - presumably a typo for 'poison'; confirm with consumers of this
# output before changing the runtime string.
rsmi_ras_err_stale_machine = ['none', 'disabled', 'unknown error',
                              'sing', 'mult', 'position', 'enabled']
validRasTypes = ['ue', 'ce']
validRasActions = ['disable', 'enable', 'inject']
validRasBlocks = ['fuse', 'mp1', 'mp0', 'sem', 'smn', 'df', 'xgmi_wafl', 'hdp', 'pcie_bif',
'athub', 'mmhub', 'gfx', 'sdma', 'umc']
class rsmi_memory_type_t(c_int):
RSMI_MEM_TYPE_FIRST = 0
RSMI_MEM_TYPE_VRAM = RSMI_MEM_TYPE_FIRST
RSMI_MEM_TYPE_VIS_VRAM = 1
RSMI_MEM_TYPE_GTT = 2
RSMI_MEM_TYPE_LAST = RSMI_MEM_TYPE_GTT
# memory_type_l holds the display names for rsmi_memory_type_t, indexed by enum value.
# Usage example to get the corresponding name:
# memory_type_l[rsmi_memory_type_t.RSMI_MEM_TYPE_VRAM] will return string 'VRAM'
memory_type_l = ['VRAM', 'VIS_VRAM', 'GTT']
class rsmi_freq_ind_t(c_int):
RSMI_FREQ_IND_MIN = 0
RSMI_FREQ_IND_MAX = 1
RSMI_FREQ_IND_INVALID = 0xFFFFFFFF
rsmi_freq_ind = rsmi_freq_ind_t
class rsmi_fw_block_t(c_int):
RSMI_FW_BLOCK_FIRST = 0
RSMI_FW_BLOCK_ASD = RSMI_FW_BLOCK_FIRST
RSMI_FW_BLOCK_CE = 1
RSMI_FW_BLOCK_DMCU = 2
RSMI_FW_BLOCK_MC = 3
RSMI_FW_BLOCK_ME = 4
RSMI_FW_BLOCK_MEC = 5
RSMI_FW_BLOCK_MEC2 = 6
RSMI_FW_BLOCK_MES = 7
RSMI_FW_BLOCK_MES_KIQ = 8
RSMI_FW_BLOCK_PFP = 9
RSMI_FW_BLOCK_RLC = 10
RSMI_FW_BLOCK_RLC_SRLC = 11
RSMI_FW_BLOCK_RLC_SRLG = 12
RSMI_FW_BLOCK_RLC_SRLS = 13
RSMI_FW_BLOCK_SDMA = 14
RSMI_FW_BLOCK_SDMA2 = 15
RSMI_FW_BLOCK_SMC = 16
RSMI_FW_BLOCK_SOS = 17
RSMI_FW_BLOCK_TA_RAS = 18
RSMI_FW_BLOCK_TA_XGMI = 19
RSMI_FW_BLOCK_UVD = 20
RSMI_FW_BLOCK_VCE = 21
RSMI_FW_BLOCK_VCN = 22
RSMI_FW_BLOCK_LAST = RSMI_FW_BLOCK_VCN
# The following list correlates with rsmi_fw_block_t (indexed by enum value)
fw_block_names_l = ['ASD', 'CE', 'DMCU', 'MC', 'ME', 'MEC', 'MEC2', 'MES', 'MES KIQ', 'PFP',\
'RLC', 'RLC SRLC', 'RLC SRLG', 'RLC SRLS', 'SDMA', 'SDMA2',\
'SMC', 'SOS', 'TA RAS', 'TA XGMI', 'UVD', 'VCE', 'VCN']
rsmi_bit_field_t = c_uint64()
rsmi_bit_field = rsmi_bit_field_t
class rsmi_utilization_counter_type(c_int):
RSMI_UTILIZATION_COUNTER_FIRST = 0
RSMI_COARSE_GRAIN_GFX_ACTIVITY = RSMI_UTILIZATION_COUNTER_FIRST
RSMI_COARSE_GRAIN_MEM_ACTIVITY = 1
RSMI_UTILIZATION_COUNTER_LAST = RSMI_COARSE_GRAIN_MEM_ACTIVITY
utilization_counter_name = ['GFX Activity', 'Memory Activity']
class rsmi_utilization_counter_t(Structure):
_fields_ = [('type', c_int),
('val', c_uint64)]
class rsmi_xgmi_status_t(c_int):
RSMI_XGMI_STATUS_NO_ERRORS = 0
RSMI_XGMI_STATUS_ERROR = 1
RSMI_XGMI_STATUS_MULTIPLE_ERRORS = 2
class rsmi_memory_page_status_t(c_int):
RSMI_MEM_PAGE_STATUS_RESERVED = 0
RSMI_MEM_PAGE_STATUS_PENDING = 1
RSMI_MEM_PAGE_STATUS_UNRESERVABLE = 2
memory_page_status_l = ['reserved', 'pending', 'unreservable']
class rsmi_retired_page_record_t(Structure):
_fields_ = [('page_address', c_uint64),
('page_size', c_uint64),
('status', c_int)]
RSMI_MAX_NUM_POWER_PROFILES = (sizeof(rsmi_bit_field_t) * 8)
class rsmi_power_profile_status_t(Structure):
    # ctypes layout of the power-profile status record:
    #   available_profiles - bit mask of the profiles that can be selected
    #   current            - mask of the profile currently in effect
    #   num_profiles       - number of profiles reported
    # NOTE(review): RSMI_MAX_NUM_POWER_PROFILES is derived from the 64-bit
    # rsmi_bit_field_t, yet 'available_profiles' is declared as c_uint32 here.
    # Presumably the C struct uses the 64-bit bit field - confirm against
    # rocm_smi.h before relying on mask bits above bit 31.
    _fields_ = [('available_profiles', c_uint32),
                ('current', c_uint64),
                ('num_profiles', c_uint32)]
rsmi_power_profile_status = rsmi_power_profile_status_t
class rsmi_frequencies_t(Structure):
_fields_ = [('has_deep_sleep', c_bool),
('num_supported', c_int32),
('current', c_uint32),
('frequency', c_uint64 * RSMI_MAX_NUM_FREQUENCIES)]
rsmi_frequencies = rsmi_frequencies_t
class rsmi_pcie_bandwidth_t(Structure):
_fields_ = [('transfer_rate', rsmi_frequencies_t),
('lanes', c_uint32 * RSMI_MAX_NUM_FREQUENCIES)]
rsmi_pcie_bandwidth = rsmi_pcie_bandwidth_t
class rsmi_version_t(Structure):
_fields_ = [('major', c_uint32),
('minor', c_uint32),
('patch', c_uint32),
('build', c_char_p)]
rsmi_version = rsmi_version_t
class rsmi_range_t(Structure):
_fields_ = [('lower_bound', c_uint64),
('upper_bound', c_uint64)]
rsmi_range = rsmi_range_t
class rsmi_od_vddc_point_t(Structure):
_fields_ = [('frequency', c_uint64),
('voltage', c_uint64)]
rsmi_od_vddc_point = rsmi_od_vddc_point_t
class rsmi_freq_volt_region_t(Structure):
_fields_ = [('freq_range', rsmi_range_t),
('volt_range', rsmi_range_t)]
rsmi_freq_volt_region = rsmi_freq_volt_region_t
class rsmi_od_volt_curve_t(Structure):
_fields_ = [('vc_points', rsmi_od_vddc_point_t *\
RSMI_NUM_VOLTAGE_CURVE_POINTS)]
rsmi_od_volt_curve = rsmi_od_volt_curve_t
class rsmi_od_volt_freq_data_t(Structure):
_fields_ = [('curr_sclk_range', rsmi_range_t),
('curr_mclk_range', rsmi_range_t),
('sclk_freq_limits', rsmi_range_t),
('mclk_freq_limits', rsmi_range_t),
('curve', rsmi_od_volt_curve_t),
('num_regions', c_uint32)]
rsmi_od_volt_freq_data = rsmi_od_volt_freq_data_t
class rsmi_error_count_t(Structure):
_fields_ = [('correctable_err', c_uint64),
('uncorrectable_err', c_uint64)]
class rsmi_evt_notification_data_t(Structure):
_fields_ = [('dv_ind', c_uint32),
('event', rsmi_evt_notification_type_t),
('message', c_char*64)]
class rsmi_process_info_t(Structure):
_fields_ = [('process_id', c_uint32),
('pasid', c_uint32),
('vram_usage', c_uint64),
('sdma_usage', c_uint64),
('cu_occupancy', c_uint32)]
class rsmi_func_id_iter_handle(Structure):
_fields_ = [('func_id_iter', POINTER(c_uint)),
('container_ptr', POINTER(c_uint)),
('id_type', c_uint32)]
rsmi_func_id_iter_handle_t = POINTER(rsmi_func_id_iter_handle)
RSMI_DEFAULT_VARIANT = 0xFFFFFFFFFFFFFFFF
class submodule_union(Union):
_fields_ = [('memory_type', c_int), # rsmi_memory_type_t,
('temp_metric', c_int), # rsmi_temperature_metric_t,
('evnt_type', c_int), # rsmi_event_type_t,
('evnt_group', c_int), # rsmi_event_group_t,
('clk_type', c_int), # rsmi_clk_type_t,
('fw_block', c_int), # rsmi_fw_block_t,
('gpu_block_type', c_int)] # rsmi_gpu_block_t
class rsmi_func_id_value_t(Union):
_fields_ = [('id', c_uint64),
('name', c_char_p),
('submodule', submodule_union)]
class rsmi_compute_partition_type_t(c_int):
RSMI_COMPUTE_PARTITION_INVALID = 0
RSMI_COMPUTE_PARTITION_CPX = 1
RSMI_COMPUTE_PARTITION_SPX = 2
RSMI_COMPUTE_PARTITION_DPX = 3
RSMI_COMPUTE_PARTITION_TPX = 4
RSMI_COMPUTE_PARTITION_QPX = 5
rsmi_compute_partition_type_dict = {
#'RSMI_COMPUTE_PARTITION_INVALID': 0,
'CPX': 1,
'SPX': 2,
'DPX': 3,
'TPX': 4,
'QPX': 5
}
rsmi_compute_partition_type = rsmi_compute_partition_type_t
# compute_partition_type_l holds the string names for rsmi_compute_partition_type_t,
# excluding RSMI_COMPUTE_PARTITION_INVALID (0), so the list index is the enum value minus 1.
# Usage example to get the corresponding name:
# compute_partition_type_l[rsmi_compute_partition_type_t.RSMI_COMPUTE_PARTITION_CPX - 1]
# will return string 'CPX'
compute_partition_type_l = ['CPX', 'SPX', 'DPX', 'TPX', 'QPX']
class rsmi_memory_partition_type_t(c_int):
RSMI_MEMORY_PARTITION_UNKNOWN = 0
RSMI_MEMORY_PARTITION_NPS1 = 1
RSMI_MEMORY_PARTITION_NPS2 = 2
RSMI_MEMORY_PARTITION_NPS4 = 3
RSMI_MEMORY_PARTITION_NPS8 = 4
rsmi_memory_partition_type_dict = {
'NPS1': 1,
'NPS2': 2,
'NPS4': 3,
'NPS8': 4
}
rsmi_memory_partition_type = rsmi_memory_partition_type_t
# memory_partition_type_l holds the string names for rsmi_memory_partition_type_t,
# excluding RSMI_MEMORY_PARTITION_UNKNOWN (0), so the list index is the enum value minus 1.
# Usage example to get the corresponding name:
# memory_partition_type_l[rsmi_memory_partition_type_t.RSMI_MEMORY_PARTITION_NPS2 - 1]
# will return string 'NPS2'
memory_partition_type_l = ['NPS1', 'NPS2', 'NPS4', 'NPS8']
class rsmi_power_label(str, Enum):
AVG_POWER = '(Avg)'
CURRENT_SOCKET_POWER = '(Socket)'
class rsmi_power_type_t(c_int):
    """Kinds of power reading; the values match the keys of rsmi_power_type_dict."""
    # These must be plain integers. A trailing comma after the value would turn
    # the attribute into a 1-tuple such as (0,), which no longer compares equal
    # to the integer returned by the library.
    RSMI_AVERAGE_POWER = 0
    RSMI_CURRENT_POWER = 1
    RSMI_INVALID_POWER = 0xFFFFFFFF
rsmi_power_type_dict = {
0: 'AVERAGE',
1: 'CURRENT SOCKET',
0xFFFFFFFF: 'INVALID_POWER_TYPE'
}
@@ -0,0 +1,53 @@
#!/usr/bin/env python3
"""ROCm_SMI_LIB CLI Tool Python Bindings"""
# NOTE: You MUST call rsmiBindings.initRsmiBindings() when using this library!
# TODO: Get most (or all) of these from rocm_smi.h to avoid mismatches and redundancy
from __future__ import print_function
import ctypes.util
from ctypes import *
from enum import Enum
import os
# Use ROCm installation path if running from standard installation
# With File Reorg rsmiBindings.py and rsmiBindingsInit.py will be installed in
# /opt/rocm/libexec/rocm_smi. relative path changed accordingly.
# if ROCM_SMI_LIB_PATH is set, we can load 'librocm_smi64.so' from that location
#
# Library load is wrapped in a function so prints can be hidden for PRINT_JSON mode.
path_librocm = str()
def initRsmiBindings(silent=False):
    """Locate librocm_smi64, load it and return the ctypes library handle.

    Lookup order:
      1. the path given by the ROCM_SMI_LIB_PATH environment variable, if set;
      2. otherwise a path relative to this file inside the install tree;
      3. if that path is not a file, every directory below /opt is walked and
         the last directory containing the versioned library name is used.

    :param silent: when True, the informational search messages are suppressed
                   (used so that machine-readable output is not polluted).
                   The final "Unable to find"/"Unable to load" messages are
                   always printed.
    :return: ctypes.CDLL handle for the loaded library.

    If the library cannot be loaded the process is terminated via exit().
    """
    def print_silent(*args):
        # Forward the arguments individually; print(args) would emit the tuple
        # repr, e.g. ('Using lib from ...',), instead of the message itself.
        if not silent:
            print(*args)

    # NOTE: path_librocm below is local to this function; the module-level
    # variable of the same name is not modified by this call.
    rocm_smi_lib_path = os.getenv('ROCM_SMI_LIB_PATH')
    if rocm_smi_lib_path is not None:
        path_librocm = rocm_smi_lib_path
    else:
        path_librocm = os.path.dirname(os.path.realpath(__file__)) + '/../../@CMAKE_INSTALL_LIBDIR@/librocm_smi64.so.@VERSION_MAJOR@'

    if not os.path.isfile(path_librocm):
        print_silent('Unable to find %s . Trying /opt/rocm*' % path_librocm)
        # No early exit: when several installs provide the library, the last
        # one visited by os.walk wins.
        for root, dirs, files in os.walk('/opt', followlinks=True):
            if 'librocm_smi64.so.@VERSION_MAJOR@' in files:
                path_librocm = os.path.join(os.path.realpath(root), 'librocm_smi64.so.@VERSION_MAJOR@')
        if os.path.isfile(path_librocm):
            print_silent('Using lib from %s' % path_librocm)
        else:
            print('Unable to find librocm_smi64.so.@VERSION_MAJOR@')

    # ----------> TODO: Support static libs as well as SO

    try:
        cdll.LoadLibrary(path_librocm)
        return CDLL(path_librocm)
    except OSError:
        print('Unable to load the rocm_smi library.\n'\
              'Set LD_LIBRARY_PATH to the folder containing librocm_smi64.so.@VERSION_MAJOR@\n'\
              '{0}Please refer to https://github.com/'\
              'RadeonOpenCompute/rocm_smi_lib for the installation guide.{1}'\
              .format('\33[33m', '\033[0m'))
        exit()
SMI_HASH = '@PKG_VERSION_HASH@'
@@ -0,0 +1,27 @@
# - Config file for the rocm_smi package
# It defines the following variables
# ROCM_SMI_INCLUDE_DIRS - include directories for rocm_smi
#   (also available as ROCM_SMI_INCLUDE_DIR and rocm_smi_INCLUDE_DIR)
# ROCM_SMI_LIB_DIRS - directory containing the rocm_smi library
#   (also available as ROCM_SMI_LIB_DIR and rocm_smi_LIB_DIR)
# ROCM_SMI_LIBRARIES - libraries to link against
#   (also available as ROCM_SMI_LIBRARY)
# Compute paths
# @PACKAGE_INIT@ is expanded by configure_package_config_file() and provides
# set_and_check() and check_required_components() used below.
@PACKAGE_INIT@
get_filename_component(ROCM_SMI_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
# set_and_check() fails the find_package() call if the path does not exist.
set_and_check( rocm_smi_INCLUDE_DIR "@PACKAGE_INCLUDE_INSTALL_DIR@" )
set_and_check( ROCM_SMI_INCLUDE_DIR "@PACKAGE_INCLUDE_INSTALL_DIR@" )
set_and_check( ROCM_SMI_INCLUDE_DIRS "@PACKAGE_INCLUDE_INSTALL_DIR@" )
set_and_check( rocm_smi_LIB_DIR "@PACKAGE_LIB_INSTALL_DIR@" )
set_and_check( ROCM_SMI_LIB_DIR "@PACKAGE_LIB_INSTALL_DIR@" )
set_and_check( ROCM_SMI_LIB_DIRS "@PACKAGE_LIB_INSTALL_DIR@" )
# Our library dependencies (contains definitions for IMPORTED targets)
# NOTE(review): the guard tests for a target named 'rocm_smi', while the library
# variables below name 'rocm_smi64' - confirm which target name is intended.
if(NOT TARGET rocm_smi AND NOT rocm_smi_BINARY_DIR)
include("${ROCM_SMI_CMAKE_DIR}/rocm_smiTargets.cmake")
endif()
# These are IMPORTED targets created by rocm_smiTargets.cmake
# TODO: Need to check if OAM libraries are needed here!
set(ROCM_SMI_LIBRARIES rocm_smi64)
set(ROCM_SMI_LIBRARY rocm_smi64)
check_required_components(rocm_smi)
+151
View File
@@ -0,0 +1,151 @@
#
# Minimum version of cmake required
#
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
message(" CMake ROCm SMI (Library) ")
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
## Verbose output.
set(CMAKE_VERBOSE_MAKEFILE on)
# Required Defines first:
message("")
message("Build Configuration:")
# message("-----------BuildType: " ${CMAKE_BUILD_TYPE})
# message("------------Compiler: " ${CMAKE_CXX_COMPILER})
# message("-------------Version: " ${CMAKE_CXX_COMPILER_VERSION})
message("--------Proj Src Dir: " ${PROJECT_SOURCE_DIR})
# message("--------Proj Bld Dir: " ${PROJECT_BINARY_DIR})
# message("--------Proj Lib Dir: " ${PROJECT_BINARY_DIR}/lib)
# message("--------Proj Exe Dir: " ${PROJECT_BINARY_DIR}/bin)
# message("--------RSMI Lib Dir: " ${RSMI_LIB_DIR})
# message("--------RSMI Inc Dir: " ${RSMI_INC_DIR})
# message("")
set(ROCM_SMI "rocm_smi")
set(ROCM_SMI_COMPONENT "lib${ROCM_SMI}")
set(ROCM_SMI_TARGET "${ROCM_SMI}64")
## Include common cmake modules
include(utils)
################# Determine the library version #########################
set(SO_VERSION_GIT_TAG_PREFIX "rsmi_so_ver")
# VERSION_* variables should be set by get_version_from_tag
message("Package version: ${PKG_VERSION_STR}")
# Debian package specific variables
# Set a default value for the package version
get_version_from_tag("1.0.0.0" ${SO_VERSION_GIT_TAG_PREFIX} GIT)
# VERSION_* variables should be set by get_version_from_tag
if ( ${ROCM_PATCH_VERSION} )
set ( VERSION_PATCH ${ROCM_PATCH_VERSION})
set(SO_VERSION_STRING "${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}")
else()
set(SO_VERSION_STRING "${VERSION_MAJOR}.${VERSION_MINOR}")
endif ()
set(${ROCM_SMI}_VERSION_MAJOR "${CPACK_PACKAGE_VERSION_MAJOR}")
set(${ROCM_SMI}_VERSION_MINOR "${CPACK_PACKAGE_VERSION_MINOR}")
set(${ROCM_SMI}_VERSION_PATCH "${CPACK_PACKAGE_VERSION_PATCH}")
set(${ROCM_SMI}_VERSION_BUILD "0")
set(${ROCM_SMI}_VERSION_HASH "${PKG_VERSION_HASH}")
message("SOVERSION: ${SO_VERSION_STRING}")
# Configure rsmiBindingsInit.py.in with SO major version:
configure_file(
"${COMMON_SRC_ROOT}/python_smi_tools/rsmiBindingsInit.py.in"
"${COMMON_SRC_ROOT}/python_smi_tools/rsmiBindingsInit.py")
# Create a configure file to get version info from within library
configure_file(
"${PROJECT_SOURCE_DIR}/src/${ROCM_SMI_TARGET}Config.in"
"${PROJECT_SOURCE_DIR}/include/rocm_smi/${ROCM_SMI_TARGET}Config.h")
set(RSMI_SRC_DIR "src")
set(RSMI_INC_DIR "include")
set(RSMI_DOCS_DIR "docs")
# Add any rocm_smi_lib specific source files here
set(SMI_SRC_LIST ${CMN_SRC_LIST})
# Add any rocm_smi_lib specific headers here
set(SMI_INC_LIST "")
# Example program built against the library target defined below.
set(SMI_EXAMPLE_EXE "rocm_smi_ex")
add_executable(${SMI_EXAMPLE_EXE} "example/rocm_smi_example.cc")
# Use the keyword signature: the keyword-less form has legacy semantics and
# cannot be mixed with keyword calls on the same target.
target_link_libraries(${SMI_EXAMPLE_EXE} PRIVATE ${ROCM_SMI_TARGET})
add_library(${ROCM_SMI_TARGET} ${CMN_SRC_LIST} ${SMI_SRC_LIST}
${CMN_INC_LIST} ${SMI_INC_LIST})
target_link_libraries(${ROCM_SMI_TARGET} PRIVATE pthread rt dl)
target_include_directories(${ROCM_SMI_TARGET} PRIVATE
${CMAKE_CURRENT_SOURCE_DIR} ${COMMON_PROJ_ROOT}/common/shared_mutex)
# use the target_include_directories() command to specify the include directories for the target
target_include_directories(${ROCM_SMI_TARGET}
PUBLIC
"$<BUILD_INTERFACE:${DRM_INCLUDE_DIRS}>"
"$<BUILD_INTERFACE:${AMDGPU_DRM_INCLUDE_DIRS}>"
"$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
"$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>"
)
target_include_directories(${ROCM_SMI_TARGET} INTERFACE ${DRM_INCLUDE_DIRS})
## Set the VERSION and SOVERSION values
set_property(TARGET ${ROCM_SMI_TARGET} PROPERTY
SOVERSION "${VERSION_MAJOR}")
set_property(TARGET ${ROCM_SMI_TARGET} PROPERTY
VERSION "${SO_VERSION_STRING}")
## If the library is a release build, strip the shared library after linking.
## $<TARGET_FILE:...> expands to the full path of the library file that was
## actually built, so the command does not depend on the working directory or
## on a hand-assembled "lib<name>.so.<version>" file name.
if("${CMAKE_BUILD_TYPE}" STREQUAL "Release")
  if(BUILD_SHARED_LIBS) # stripping only for .so
    add_custom_command(
      TARGET ${ROCM_SMI_TARGET}
      POST_BUILD
      COMMAND ${CMAKE_STRIP} $<TARGET_FILE:${ROCM_SMI_TARGET}>
      VERBATIM)
  endif()
endif()
#file reorganization changes
#rocm_smi.py moved to libexec/rocm_smi. so creating rocm-smi symlink
file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin)
add_custom_target(link-rocm-smi ALL
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
COMMAND ${CMAKE_COMMAND} -E create_symlink
../${CMAKE_INSTALL_LIBEXECDIR}/${ROCM_SMI}/rocm_smi.py ${CMAKE_CURRENT_BINARY_DIR}/bin/rocm-smi)
## Add the install directives for the runtime library.
install(TARGETS ${ROCM_SMI_TARGET}
EXPORT rocm_smiTargets
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT dev
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT dev)
install(TARGETS ${ROCM_SMI_TARGET}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
COMPONENT asan)
install(FILES ${COMMON_SRC_ROOT}/include/rocm_smi/rocm_smi.h
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rocm_smi
COMPONENT dev)
install(FILES ${COMMON_SRC_ROOT}/include/rocm_smi/${ROCM_SMI_TARGET}Config.h
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rocm_smi
COMPONENT dev)
install(FILES ${COMMON_SRC_ROOT}/include/rocm_smi/kfd_ioctl.h
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rocm_smi
COMPONENT dev)
install(PROGRAMS ${COMMON_SRC_ROOT}/python_smi_tools/rsmiBindingsInit.py
DESTINATION ${CMAKE_INSTALL_LIBEXECDIR}/${ROCM_SMI}
COMPONENT dev)
install(PROGRAMS ${COMMON_SRC_ROOT}/python_smi_tools/rsmiBindings.py
DESTINATION ${CMAKE_INSTALL_LIBEXECDIR}/${ROCM_SMI}
COMPONENT dev)
install(PROGRAMS ${COMMON_SRC_ROOT}/python_smi_tools/rocm_smi.py
DESTINATION ${CMAKE_INSTALL_LIBEXECDIR}/${ROCM_SMI}
COMPONENT dev)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/bin/rocm-smi
DESTINATION ${CMAKE_INSTALL_BINDIR}
COMPONENT dev)
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+6651
View File
File diff suppressed because it is too large Load Diff
+57
View File
@@ -0,0 +1,57 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2025, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#ifndef INCLUDE_ROCM_SMI_ROCM_SMI64CONFIG_H_
#define INCLUDE_ROCM_SMI_ROCM_SMI64CONFIG_H_
// This file is generated on build.
#define rocm_smi_VERSION_MAJOR @rocm_smi_VERSION_MAJOR@
#define rocm_smi_VERSION_MINOR @rocm_smi_VERSION_MINOR@
#define rocm_smi_VERSION_PATCH @rocm_smi_VERSION_PATCH@
#define rocm_smi_VERSION_BUILD "@rocm_smi_VERSION_BUILD@"
#define rocm_smi_VERSION_HASH "@rocm_smi_VERSION_HASH@"
#endif // INCLUDE_ROCM_SMI_ROCM_SMI64CONFIG_H_
+438
View File
@@ -0,0 +1,438 @@
/*
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2025, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#include <asm/unistd.h>
#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <unistd.h>

#include <algorithm>
#include <cassert>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <unordered_set>

#include "rocm_smi/rocm_smi.h"
#include "rocm_smi/rocm_smi_counters.h"
#include "rocm_smi/rocm_smi_utils.h"
#include "rocm_smi/rocm_smi_exception.h"
#include "rocm_smi/rocm_smi_main.h"
#include "rocm_smi/rocm_smi_device.h"
namespace amd {
namespace smi {
namespace evt {
// Root sysfs directory under which the perf event-source devices are looked up.
static const char *kPathDeviceEventRoot = "/sys/bus/event_source/devices";
// Event group names
// The '#' in each name is a placeholder that is substituted with the device
// number before the path is used.
static const char *kEvGrpDataFabricFName = "amdgpu_df_#";
static const char *kEvGrpAmdGpuFName = "amdgpu_#";
// Data Fabric event file names
static const char *kDFEvtCake0FtiReqAllocFName = "cake0_ftiinstat_reqalloc";
static const char *kDFEvtCake0FtiRspAllocFName = "cake0_ftiinstat_rspalloc";
static const char *kDFEvtCake0PcsOutTxDataFName = "cake0_pcsout_txdata";
static const char *kDFEvtCake0PcsOutTxMetaFName = "cake0_pcsout_txmeta";
static const char *kDFEvtCake1FtiReqAllocFName = "cake1_ftiinstat_reqalloc";
static const char *kDFEvtCake1FtiRspAllocFName = "cake1_ftiinstat_rspalloc";
static const char *kDFEvtCake1PcsOutTxDataFName = "cake1_pcsout_txdata";
static const char *kDFEvtCake1PcsOutTxMetaFName = "cake1_pcsout_txmeta";
// XGMI Data Outbound event file names
static const char *kXGMIDOutBound0FName = "xgmi_link0_data_outbound";
static const char *kXGMIDOutBound1FName = "xgmi_link1_data_outbound";
static const char *kXGMIDOutBound2FName = "xgmi_link2_data_outbound";
static const char *kXGMIDOutBound3FName = "xgmi_link3_data_outbound";
static const char *kXGMIDOutBound4FName = "xgmi_link4_data_outbound";
static const char *kXGMIDOutBound5FName = "xgmi_link5_data_outbound";
// Maps each event type to the name of its file in the group's "events"
// directory. Looked up with .at(), so an event type missing from this table
// raises std::out_of_range.
static const std::map<rsmi_event_type_t, const char *> kEventFNameMap = {
{RSMI_EVNT_XGMI_0_NOP_TX, kDFEvtCake0PcsOutTxMetaFName},
{RSMI_EVNT_XGMI_0_REQUEST_TX, kDFEvtCake0FtiReqAllocFName},
{RSMI_EVNT_XGMI_0_RESPONSE_TX, kDFEvtCake0FtiRspAllocFName},
{RSMI_EVNT_XGMI_0_BEATS_TX, kDFEvtCake0PcsOutTxDataFName},
{RSMI_EVNT_XGMI_1_NOP_TX, kDFEvtCake1PcsOutTxMetaFName},
{RSMI_EVNT_XGMI_1_REQUEST_TX, kDFEvtCake1FtiReqAllocFName},
{RSMI_EVNT_XGMI_1_RESPONSE_TX, kDFEvtCake1FtiRspAllocFName},
{RSMI_EVNT_XGMI_1_BEATS_TX, kDFEvtCake1PcsOutTxDataFName},
{RSMI_EVNT_XGMI_DATA_OUT_0, kXGMIDOutBound0FName},
{RSMI_EVNT_XGMI_DATA_OUT_1, kXGMIDOutBound1FName},
{RSMI_EVNT_XGMI_DATA_OUT_2, kXGMIDOutBound2FName},
{RSMI_EVNT_XGMI_DATA_OUT_3, kXGMIDOutBound3FName},
{RSMI_EVNT_XGMI_DATA_OUT_4, kXGMIDOutBound4FName},
{RSMI_EVNT_XGMI_DATA_OUT_5, kXGMIDOutBound5FName},
};
// Maps each event group to the name of its event-source directory.
// The RSMI_EVNT_GRP_INVALID entry uses a placeholder name; it is also visited
// when this map is iterated to probe for supported groups.
static const std::map<rsmi_event_group_t, const char *> kEvtGrpFNameMap = {
{RSMI_EVNT_GRP_XGMI, kEvGrpDataFabricFName},
{RSMI_EVNT_GRP_XGMI_DATA_OUT, kEvGrpAmdGpuFName},
{RSMI_EVNT_GRP_INVALID, "bogus"},
};
// Returns the event group that the event type |evnt| belongs to, or
// RSMI_EVNT_GRP_INVALID when it falls in none of the known ranges.
static rsmi_event_group_t EvtGrpFromEvtID(rsmi_event_type_t evnt) {
// Expands to a range test against RSMI_EVNT_<EVGRP_SHORT>_FIRST/_LAST that
// returns EVGRP_ENUM on a match. The macro is not #undef'd, so it stays
// defined for the remainder of this translation unit.
#define EVNT_GRP_RANGE_CHK(EVGRP_SHORT, EVGRP_ENUM) \
if (evnt >= RSMI_EVNT_##EVGRP_SHORT##_FIRST && \
evnt <= RSMI_EVNT_##EVGRP_SHORT##_LAST) { \
return EVGRP_ENUM; \
}
EVNT_GRP_RANGE_CHK(XGMI, RSMI_EVNT_GRP_XGMI);
EVNT_GRP_RANGE_CHK(XGMI_DATA_OUT, RSMI_EVNT_GRP_XGMI_DATA_OUT);
return RSMI_EVNT_GRP_INVALID;
}
// Note below that dev_num is not the same as the usual dv_ind.
// dev_num is the number of the device (e.g., 1 for card1) whereas dv_ind
// is usually the index into the vector of devices
//
// For every entry of kEvtGrpFNameMap, builds the event-source directory path
// for device |dev_num| and, when that path exists and is a directory, inserts
// the group into |*supported_grps|. Paths that cannot be stat'ed are skipped.
void
GetSupportedEventGroups(uint32_t dev_num, dev_evt_grp_set_t *supported_grps) {
  assert(supported_grps != nullptr);

  const std::string grp_path_base = std::string(kPathDeviceEventRoot) + '/';
  // Substitute the full decimal device number for the '#' placeholder.
  // A single-character substitution ('0' + dev_num) only works for 0..9 and
  // produces a non-digit character for device numbers of 10 and above.
  const std::string dev_num_str = std::to_string(dev_num);
  struct stat file_stat;

  for (const auto &g : kEvtGrpFNameMap) {
    std::string grp_path = grp_path_base + g.second;
    for (std::string::size_type pos = grp_path.find('#');
         pos != std::string::npos;
         pos = grp_path.find('#', pos + dev_num_str.size())) {
      grp_path.replace(pos, 1, dev_num_str);
    }

    const int32_t ret = stat(grp_path.c_str(), &file_stat);
    if (ret) {
      // A missing directory simply means the group is not supported.
      assert(errno == ENOENT);
      continue;
    }

    if (S_ISDIR(file_stat.st_mode)) {
      supported_grps->insert(g.first);
    }
  }
}
// /sys/bus/event_source/devices/<hw block>_<instance>/type
//
// Builds the event-source root path for |event| on the device at index
// |dev_ind| in the RocmSMI device list. The perf file descriptor is left
// unopened (fd_ == -1).
Event::Event(rsmi_event_type_t event, uint32_t dev_ind) :
    event_type_(event), fd_(-1), prev_cntr_val_(0) {
  rsmi_event_group_t grp = EvtGrpFromEvtID(event);
  assert(grp != RSMI_EVNT_GRP_INVALID);  // This should have failed before now

  evt_path_root_ = kPathDeviceEventRoot;
  evt_path_root_ += '/';
  evt_path_root_ += kEvtGrpFNameMap.at(grp);

  amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance();
  assert(dev_ind < smi.devices().size());
  std::shared_ptr<amd::smi::Device> dev = smi.devices()[dev_ind];
  assert(dev != nullptr);

  dev_ind_ = dev_ind;
  dev_file_ind_ = dev->index();

  // Substitute the full decimal device file index for the '#' placeholder.
  // A single-character substitution ('0' + index) only works for 0..9 and
  // produces a non-digit character for indices of 10 and above.
  const std::string dev_file_ind_str = std::to_string(dev_file_ind_);
  for (std::string::size_type pos = evt_path_root_.find('#');
       pos != std::string::npos;
       pos = evt_path_root_.find('#', pos + dev_file_ind_str.size())) {
    evt_path_root_.replace(pos, 1, dev_file_ind_str);
  }
}
// Closes the perf file descriptor if one was opened; a failed close is only
// reported on stderr.
Event::~Event(void) {
  if (fd_ == -1) {
    return;  // Nothing was ever opened.
  }
  if (close(fd_) == -1) {
    perror("Failed to close file descriptor.");
  }
}
// Parses a format description of the form "<name>:<start>-<end>" from |fstr|
// and stores the start bit and the field width (end - start + 1) in |*val|.
//
// Throws amd::smi::rsmi_exception(RSMI_STATUS_UNEXPECTED_SIZE) when the text
// cannot be parsed, or when the range is reversed or does not fit in 8 bits.
static void
parse_field_config(std::string fstr, evnt_info_t *val) {
  assert(val != nullptr);

  std::stringstream ss(fstr);
  std::string config_ln;
  // Initialised so that a failed extraction (which leaves its target
  // untouched once the stream is in a failed/EOF state) never results in a
  // read of an indeterminate value.
  uint32_t start_bit = 0;
  uint32_t end_bit = 0;
  char jnk = '\0';

  getline(ss, config_ln, ':');  // Skip everything up to and including ':'
  ss >> start_bit;
  ss >> jnk;
  assert(jnk == '-');
  ss >> end_bit;

  // With assertions compiled out, malformed text must still be rejected
  // rather than silently interpreted as bit 0 / width 1.
  if (ss.fail() || jnk != '-') {
    throw amd::smi::rsmi_exception(RSMI_STATUS_UNEXPECTED_SIZE, __FUNCTION__);
  }

  if (start_bit > end_bit ||
      start_bit > 0xFF ||
      end_bit > 0xFF ||
      ((end_bit - start_bit + 1) > 0xFF)) {
    throw amd::smi::rsmi_exception(RSMI_STATUS_UNEXPECTED_SIZE, __FUNCTION__);
  }

  val->start_bit = static_cast<uint8_t>(start_bit);
  val->field_size = static_cast<uint8_t>(end_bit - start_bit + 1);
}
// Reads the format file at |*config_path| and parses its bit-range description
// into |*val|. Returns 0 on success, otherwise the error code from
// ReadSysfsStr(). parse_field_config() may throw on malformed content.
static int32_t
get_event_bitfield_info(std::string *config_path, evnt_info_t *val) {
  std::string contents;

  const int32_t status = ReadSysfsStr(*config_path, &contents);
  if (status) {
    return status;
  }

  parse_field_config(contents, val);
  return 0;
}
// Reads this event's description file (<root>/events/<event name>), splits it
// into comma-separated "<field>=<hex value>" assignments and, for every field,
// looks up its bit range in <root>/format/<field>. One evnt_info_t per field is
// appended to event_info_.
// Returns 0 on success or the first non-zero error code from the sysfs reads.
int32_t
Event::get_event_file_info(void) {
int32_t err;
std::string fn = evt_path_root_;
std::string fstr;
fn += "/events/";
// .at() throws std::out_of_range if event_type_ has no entry in the table.
fn += kEventFNameMap.at(event_type_);
err = ReadSysfsStr(fn, &fstr);
if (err) {
return err;
}
// parse_perf_attr(fstr, &event_id_.event_field_vals);
std::stringstream ss(fstr);
std::stringstream fs;
std::string field_assgn;
std::string field_name;
evnt_info_t ev_info;
// Loop while unread characters remain in the description.
while (ss.rdbuf()->in_avail() != 0) {
ev_info = {};
// Take the next "<field>=<value>" assignment.
getline(ss, field_assgn, ',');
// clear() only resets the stream's state flags (EOF from the previous
// iteration); the new assignment is then appended and read back.
fs.clear();
fs << field_assgn;
getline(fs, field_name, '=');
// The value is parsed as hexadecimal.
fs >> std::hex >> ev_info.value;
// The whole assignment is expected to have been consumed.
assert(fs.rdbuf()->in_avail() == 0);
// Now, get the corresponding bitfield
std::string config_path = evt_path_root_;
config_path += "/format/";
config_path += field_name;
err = get_event_bitfield_info(&config_path, &ev_info);
if (err) {
return err;
}
event_info_.push_back(ev_info);
}
return 0;
}
// Reads the numeric type id from <root>/type into |*ev_type|.
// Returns 0 on success, EINVAL for a null output pointer, or errno when the
// file cannot be opened.
int32_t
Event::get_event_type(uint32_t *ev_type) {
  assert(ev_type != nullptr);
  if (ev_type == nullptr) {
    return EINVAL;
  }

  std::string type_path = evt_path_root_;
  type_path += "/type";

  std::ifstream type_file(type_path);
  if (!type_file.is_open()) {
    return errno;
  }

  type_file >> *ev_type;
  type_file.close();
  return 0;
}
static uint64_t
get_perf_attr_config(std::vector<evnt_info_t> *ev_info) {
uint64_t ret_val = 0;
assert(ev_info != nullptr);
for (const evnt_info_t& ev : *ev_info) {
ret_val |= ev.value << ev.start_bit;
}
return ret_val;
}
// Fills in attr_ from the event's sysfs description and opens a perf event
// file descriptor for it, storing the descriptor in fd_.
// Returns 0 on success, the error from the sysfs helpers, or errno when the
// perf_event_open system call fails.
int32_t
amd::smi::evt::Event::openPerfHandle(void) {
int32_t ret;
// Start from a zeroed attribute block; only the fields below are set.
memset(&attr_, 0, sizeof(struct perf_event_attr));
// Populates event_info_, which get_perf_attr_config() consumes below.
ret = get_event_file_info();
if (ret) {
return ret;
}
ret = get_event_type(&attr_.type);
if (ret) {
return ret;
}
attr_.size = sizeof(struct perf_event_attr);
attr_.config = get_perf_attr_config(&event_info_);
attr_.sample_type = PERF_SAMPLE_IDENTIFIER;
// Ask read() to also return the enabled and running times with the count.
attr_.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
PERF_FORMAT_TOTAL_TIME_RUNNING;
// The counter is created disabled; startCounter() enables it.
attr_.disabled = 1;
attr_.inherit = 1;
// Arguments after &attr_ are pid = -1, cpu = 0, group_fd = -1, flags.
int64_t p_ret = syscall(__NR_perf_event_open, &attr_,
-1, 0, -1, PERF_FLAG_FD_NO_GROUP);
if (p_ret < 0) {
return errno;
}
fd_ = static_cast<int>(p_ret);
return 0;
}
// Enables the counter, opening the perf handle first when none is open yet.
// Returns 0 on success, the error from openPerfHandle(), or errno when the
// enable ioctl fails.
int32_t
amd::smi::evt::Event::startCounter(void) {
  if (fd_ == -1) {
    const int32_t open_status = openPerfHandle();
    if (open_status != 0) {
      return open_status;
    }
  }

  const int32_t ioctl_status = ioctl(fd_, PERF_EVENT_IOC_ENABLE, NULL);
  if (ioctl_status == -1) {
    return errno;
  }

  assert(ioctl_status == 0);  // We're expecting the ioctl call to return -1 or 0
  return 0;
}
// Disables the counter. Returns 0 on success, EBADF when no perf handle is
// open, or errno when the disable ioctl fails.
int32_t
amd::smi::evt::Event::stopCounter(void) {
  if (fd_ == -1) {
    return EBADF;
  }

  const int32_t ioctl_status = ioctl(fd_, PERF_EVENT_IOC_DISABLE, NULL);
  if (ioctl_status == -1) {
    return errno;
  }

  assert(ioctl_status == 0);  // We're expecting the ioctl call to return -1 or 0
  return 0;
}
// Reads up to |n| bytes from |fd| into |buf|, retrying short reads and reads
// interrupted by a signal (EINTR).
// Returns |n| when the buffer was filled, the number of bytes read so far when
// end-of-file is reached first, or -errno on any other read error.
static ssize_t
readn(int fd, void *buf, size_t n) {
  uint8_t *cursor = reinterpret_cast<uint8_t *>(buf);
  size_t remaining = n;

  while (remaining != 0) {
    const ssize_t got = read(fd, cursor, remaining);

    if (got == 0) { /* reach EOF */
      return static_cast<ssize_t>(n - remaining);
    }

    if (got < 0) {
      if (errno != EINTR) {
        return -errno;
      }
      /* read got interrupted */
      continue;
    }

    remaining -= static_cast<size_t>(got);
    cursor += got;
  }

  return static_cast<ssize_t>(n);
}
// Reads the current counter sample from the perf descriptor and reports, in
// |*val|, the count accumulated since the previous call together with the
// enabled and running times from the sample.
// Returns 0 on success, the positive errno value when the read fails, or EIO
// when fewer bytes than one full sample were available.
uint32_t
amd::smi::evt::Event::getValue(rsmi_counter_value_t *val) {
  assert(val != nullptr);

  perf_read_format_t sample;
  const ssize_t nread = readn(fd_, &sample, sizeof(perf_read_format_t));

  if (nread < 0) {
    return static_cast<uint32_t>(-nread);
  }
  if (static_cast<size_t>(nread) != sizeof(perf_read_format_t)) {
    return EIO;
  }

  // Report the delta and remember the raw value for the next call.
  val->value = sample.value - prev_cntr_val_;
  prev_cntr_val_ = sample.value;

  val->time_enabled = sample.enabled_time;
  val->time_running = sample.run_time;
  return 0;
}
} // namespace evt
} // namespace smi
} // namespace amd
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+405
View File
@@ -0,0 +1,405 @@
/*
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2025, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#include <dirent.h>
#include <sys/stat.h>
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
#include <unordered_set>
#include "rocm_smi/rocm_smi_utils.h"
#include "rocm_smi/rocm_smi_io_link.h"
namespace amd {
namespace smi {
// Root directory under which one numbered sub-directory per KFD topology
// node is enumerated by the discovery routines in this file.
static const char *kKFDNodesPathRoot = "/sys/class/kfd/kfd/topology/nodes";
// Per-node link sub-directory names. LinkPathRoot() indexes this array with
// a LINK_DIRECTORY_TYPE value, so the order here must match that enum.
static const char *kKFDLinkPath[] = {"io_links", "p2p_links"};
// IO Link Property strings
// Keys looked up in a link's "properties" file by IOLink::Initialize().
// The commented-out entries are not read anywhere in this file.
static const char *kIOLinkPropTYPEStr = "type";
// static const char *kIOLinkPropVERSION_MAJORStr = "version_major";
// static const char *kIOLinkPropVERSION_MINORStr = "version_minor";
static const char *kIOLinkPropNODE_FROMStr = "node_from";
static const char *kIOLinkPropNODE_TOStr = "node_to";
static const char *kIOLinkPropWEIGHTStr = "weight";
// static const char *kIOLinkPropMIN_LATENCYStr = "min_latency";
// static const char *kIOLinkPropMAX_LATENCYStr = "max_latency";
static const char *kIOLinkPropMIN_BANDWIDTHStr = "min_bandwidth";
static const char *kIOLinkPropMAX_BANDWIDTHStr = "max_bandwidth";
// static const char *kIOLinkPropRECOMMENDED_TRANSFER_SIZEStr =
// "recommended_transfer_size";
// static const char *kIOLinkPropFLAGSStr = "flags";
// Returns true iff `s` is non-empty and every character is an ASCII decimal
// digit ('0'..'9').
static bool is_number(const std::string &s) {
  // Compare against the digit range directly: handing a plain (possibly
  // negative) char to ::isdigit is undefined behaviour.
  return !s.empty() &&
         std::all_of(s.begin(), s.end(),
                     [](char c) { return c >= '0' && c <= '9'; });
}
// Builds "<nodes root>/<node_indx>/<link dir name>" for the requested link
// directory kind. Returns an empty string when `directory` does not index
// an entry of kKFDLinkPath.
static std::string LinkPathRoot(uint32_t node_indx,
                                LINK_DIRECTORY_TYPE directory) {
  const auto dir_name_count = sizeof(kKFDLinkPath) / sizeof(kKFDLinkPath[0]);
  if (!(directory < dir_name_count)) {
    return "";
  }
  return std::string(kKFDNodesPathRoot) + '/' + std::to_string(node_indx) +
         '/' + kKFDLinkPath[directory];
}
// Returns the directory of one specific link: LinkPathRoot(...) followed by
// "/<link_indx>".
static std::string LinkPath(uint32_t node_indx, uint32_t link_indx,
                            LINK_DIRECTORY_TYPE directory) {
  return LinkPathRoot(node_indx, directory) + '/' + std::to_string(link_indx);
}
// Opens the "properties" file of the given link on `fs`.
//
// Returns 0 on success, EINVAL if `fs` is null, the error reported by
// isRegularFile(), ENOENT if the path is not a regular file, or a non-zero
// errno-style code if the stream could not be opened.
static int OpenLinkProperties(uint32_t node_indx, uint32_t link_indx,
                              std::ifstream *fs,
                              LINK_DIRECTORY_TYPE directory) {
  assert(fs != nullptr);
  if (fs == nullptr) {
    return EINVAL;
  }

  const std::string f_path =
      LinkPath(node_indx, link_indx, directory) + "/properties";

  bool reg_file = false;
  const int ret = isRegularFile(f_path, &reg_file);
  if (ret != 0) {
    return ret;
  }
  if (!reg_file) {
    return ENOENT;
  }

  // Clear errno first so a stale value is not reported, and never return 0
  // for a stream that failed to open: ifstream::open is not guaranteed to
  // set errno.
  errno = 0;
  fs->open(f_path);
  if (!fs->is_open()) {
    return (errno != 0) ? errno : EIO;
  }
  return 0;
}
// Reads every line of a link's "properties" file and appends it to
// `retVec`, then drops trailing lines that contain only whitespace.
//
// Returns 0 on success, EINVAL if `retVec` is null, or the error returned
// by OpenLinkProperties().
static int ReadLinkProperties(uint32_t node_indx, uint32_t link_indx,
                              std::vector<std::string> *retVec,
                              LINK_DIRECTORY_TYPE directory) {
  assert(retVec != nullptr);
  if (retVec == nullptr) {
    return EINVAL;
  }

  std::ifstream fs;
  const int ret = OpenLinkProperties(node_indx, link_indx, &fs, directory);
  if (ret) {
    return ret;
  }

  std::string line;
  while (std::getline(fs, line)) {
    retVec->push_back(line);
  }

  // Remove any *trailing* empty (whitespace) lines. The emptiness check must
  // be part of the loop condition: if every line is blank the vector drains
  // completely and calling back() on it would be undefined behaviour.
  while (!retVec->empty() &&
         retVec->back().find_first_not_of(" \t\n\v\f\r") == std::string::npos) {
    retVec->pop_back();
  }

  // `fs` is closed by its destructor.
  return 0;
}
static int DiscoverLinks(std::map<std::pair<uint32_t, uint32_t>,
std::shared_ptr<IOLink>> *links,
LINK_DIRECTORY_TYPE directory) {
assert(links != nullptr);
if (links == nullptr) {
return EINVAL;
}
assert(links->empty());
links->clear();
auto kfd_node_dir = opendir(kKFDNodesPathRoot);
if (kfd_node_dir == nullptr) {
std::string err_msg = "Failed to open KFD nodes directory ";
err_msg += kKFDNodesPathRoot;
err_msg += ".";
perror(err_msg.c_str());
return 1;
}
auto dentry_kfd = readdir(kfd_node_dir);
while (dentry_kfd != nullptr) {
if (dentry_kfd->d_name[0] == '.') {
dentry_kfd = readdir(kfd_node_dir);
continue;
}
if (!is_number(dentry_kfd->d_name)) {
dentry_kfd = readdir(kfd_node_dir);
continue;
}
uint32_t node_indx = static_cast<uint32_t>(std::stoi(dentry_kfd->d_name));
std::shared_ptr<IOLink> link;
uint32_t link_indx;
std::string link_path_root = LinkPathRoot(node_indx, directory);
auto io_link_dir = opendir(link_path_root.c_str());
assert(io_link_dir != nullptr);
auto dentry_io_link = readdir(io_link_dir);
while (dentry_io_link != nullptr) {
if (dentry_io_link->d_name[0] == '.') {
dentry_io_link = readdir(io_link_dir);
continue;
}
if (!is_number(dentry_io_link->d_name)) {
dentry_io_link = readdir(io_link_dir);
continue;
}
link_indx = static_cast<uint32_t>(std::stoi(dentry_io_link->d_name));
link = std::make_shared<IOLink>(node_indx, link_indx,
directory);
link->Initialize();
(*links)[std::make_pair(link->node_from(), link->node_to())] = link;
dentry_io_link = readdir(io_link_dir);
}
if (closedir(io_link_dir)) {
std::string err_msg = "Failed to close KFD nodes directory ";
err_msg += kKFDNodesPathRoot;
err_msg += ".";
perror(err_msg.c_str());
return 1;
}
dentry_kfd = readdir(kfd_node_dir);
}
if (closedir(kfd_node_dir)) {
return 1;
}
return 0;
}
int DiscoverIOLinks(std::map<std::pair<uint32_t, uint32_t>,
std::shared_ptr<IOLink>> *links) {
return DiscoverLinks(links, IO_LINK_DIRECTORY);
}
int DiscoverP2PLinks(std::map<std::pair<uint32_t, uint32_t>,
std::shared_ptr<IOLink>> *links) {
return DiscoverLinks(links, P2P_LINK_DIRECTORY);
}
static int DiscoverLinksPerNode(uint32_t node_indx, std::map<uint32_t,
std::shared_ptr<IOLink>> *links,
LINK_DIRECTORY_TYPE directory) {
assert(links != nullptr);
if (links == nullptr) {
return EINVAL;
}
assert(links->empty());
links->clear();
std::shared_ptr<IOLink> link;
uint32_t link_indx;
std::string link_path_root = LinkPathRoot(node_indx, directory);
auto io_link_dir = opendir(link_path_root.c_str());
assert(io_link_dir != nullptr);
auto dentry = readdir(io_link_dir);
while (dentry != nullptr) {
if (dentry->d_name[0] == '.') {
dentry = readdir(io_link_dir);
continue;
}
if (!is_number(dentry->d_name)) {
dentry = readdir(io_link_dir);
continue;
}
link_indx = static_cast<uint32_t>(std::stoi(dentry->d_name));
link = std::make_shared<IOLink>(node_indx, link_indx,
directory);
link->Initialize();
(*links)[link->node_to()] = link;
dentry = readdir(io_link_dir);
}
if (closedir(io_link_dir)) {
return 1;
}
return 0;
}
int DiscoverIOLinksPerNode(uint32_t node_indx, std::map<uint32_t,
std::shared_ptr<IOLink>> *links) {
return DiscoverLinksPerNode(node_indx, links, IO_LINK_DIRECTORY);
}
int DiscoverP2PLinksPerNode(uint32_t node_indx, std::map<uint32_t,
std::shared_ptr<IOLink>> *links) {
return DiscoverLinksPerNode(node_indx, links, P2P_LINK_DIRECTORY);
}
// Compiler-generated destructor, defined out of line in this file.
IOLink::~IOLink() = default;
// Loads this link's "properties" file into properties_ as key -> unsigned
// integer pairs, one pair per line ("<key> <value>").
//
// Does nothing (returns 0) if properties_ is already populated. Returns 0 on
// success or the error returned by ReadLinkProperties().
int IOLink::ReadProperties(void) {
  assert(properties_.empty());
  if (!properties_.empty()) {
    return 0;
  }

  std::vector<std::string> propVec;
  const int ret = ReadLinkProperties(node_indx_, link_indx_, &propVec,
                                     link_dir_type_);
  if (ret) {
    return ret;
  }

  for (const auto &line : propVec) {
    std::istringstream fs(line);
    std::string key_str;
    // A line without a key (e.g. blank) carries no property. Previously such
    // a line stored an uninitialized or stale value under a stale/empty key.
    if (!(fs >> key_str)) {
      continue;
    }
    // Assume all properties are unsigned integers for now. A value that
    // fails to parse leaves 0, matching stream extraction semantics.
    uint64_t val_int = 0;
    fs >> val_int;
    properties_[key_str] = val_int;
  }
  return 0;
}
// Reads the link's properties and caches type, node_from, node_to, weight,
// min_bandwidth and max_bandwidth in the corresponding members.
//
// Returns 0 on success, or the first non-zero code returned by
// ReadProperties() / get_property_value().
int
IOLink::Initialize(void) {
  int ret = ReadProperties();
  if (ret) {return ret;}

  // type_, node_from_ and node_to_ are fetched through a 64-bit temporary
  // and converted to the member's own type. The previous code cast the
  // member address to uint64_t*, which stores 8 bytes through a pointer to
  // an object that is not a uint64_t.
  uint64_t value = 0;

  ret = get_property_value(kIOLinkPropTYPEStr, &value);
  if (ret) {return ret;}
  type_ = static_cast<decltype(type_)>(value);

  ret = get_property_value(kIOLinkPropNODE_FROMStr, &value);
  if (ret) {return ret;}
  node_from_ = static_cast<decltype(node_from_)>(value);

  ret = get_property_value(kIOLinkPropNODE_TOStr, &value);
  if (ret) {return ret;}
  node_to_ = static_cast<decltype(node_to_)>(value);

  ret = get_property_value(kIOLinkPropWEIGHTStr, &weight_);
  if (ret) {return ret;}

  ret = get_property_value(kIOLinkPropMIN_BANDWIDTHStr, &min_bandwidth_);
  if (ret) {return ret;}

  ret = get_property_value(kIOLinkPropMAX_BANDWIDTHStr, &max_bandwidth_);
  return ret;
}
// Copies the cached value of `property` into `*value`.
//
// Returns 0 on success, or EINVAL if `value` is null or the property is not
// present in properties_.
int
IOLink::get_property_value(std::string property, uint64_t *value) {
  assert(value != nullptr);
  if (value == nullptr) {
    return EINVAL;
  }

  const auto entry = properties_.find(property);
  if (entry == properties_.end()) {
    return EINVAL;
  }

  *value = entry->second;
  return 0;
}
} // namespace smi
} // namespace amd
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,74 @@
/*
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <iostream>
#include "rocm_smi/rocm_smi_lib_loader.h"
namespace amd {
namespace smi {
// Starts with no library handle.
// NOTE(review): library_loaded_ is not initialized here -- presumably it has
// an in-class initializer; confirm in the header.
ROCmSmiLibraryLoader::ROCmSmiLibraryLoader()
    : libHandler_(nullptr) {}
// Opens the shared library `filename` and keeps its handle in libHandler_.
// Any library previously loaded through this object is unloaded first.
//
// Returns RSMI_STATUS_SUCCESS on success, or RSMI_STATUS_FAIL_LOAD_MODULE if
// `filename` is null or dlopen() fails (the reason is printed to std::cerr).
rsmi_status_t ROCmSmiLibraryLoader::load(const char* filename) {
  if (filename == nullptr) {
    return RSMI_STATUS_FAIL_LOAD_MODULE;
  }
  if (libHandler_ || library_loaded_) {
    unload();
  }

  std::lock_guard<std::mutex> guard(library_mutex_);

  // dlopen() of a library that is already resident returns the existing
  // handle and takes a reference on it, so no separate RTLD_NOLOAD probe is
  // needed. The previous probe threw away the handle it obtained, which
  // leaked that reference and left libHandler_ null while reporting success.
  libHandler_ = dlopen(filename, RTLD_LAZY);
  if (!libHandler_) {
    // dlerror() may return null; streaming a null char* is undefined.
    const char* error = dlerror();
    std::cerr << "Fail to open " << filename <<": "
              << (error != nullptr ? error : "unknown error")
              << std::endl;
    return RSMI_STATUS_FAIL_LOAD_MODULE;
  }

  library_loaded_ = true;
  return RSMI_STATUS_SUCCESS;
}
// Closes the library handle held by this object, if any, and clears the
// loaded flag. Always returns RSMI_STATUS_SUCCESS.
rsmi_status_t ROCmSmiLibraryLoader::unload() {
  std::lock_guard<std::mutex> guard(library_mutex_);

  if (!libHandler_) {
    return RSMI_STATUS_SUCCESS;
  }

  dlclose(libHandler_);
  libHandler_ = nullptr;
  library_loaded_ = false;
  return RSMI_STATUS_SUCCESS;
}
// Releases the library handle (if any) when the loader is destroyed.
ROCmSmiLibraryLoader::~ROCmSmiLibraryLoader() {
  this->unload();
}
} // namespace smi
} // namespace amd
@@ -0,0 +1,530 @@
/*
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2025, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
* Detail Description:
* Implements a complete logging mechanism supporting multiple logging
* targets, such as file-based logging and console-based logging. It also
* supports different log types.
*
* Thread Safe logging mechanism. Compatible with G++ (Linux platform)
*
* Supported Log Type: ERROR, ALARM, ALWAYS, INFO, BUFFER, TRACE, DEBUG
* No control for ERROR, ALARM and ALWAYS messages. These types of messages
* are always captured -- IF logging is enabled.
*
* WARNING: Logging is controlled by users environment variable - RSMI_LOGGING.
* Enabling RSMI_LOGGING, by export RSMI_LOGGING=<any value>. No logs will
* be printed, unless RSMI_LOGGING is enabled.
*
* The BUFFER log type should be used when logging raw buffers or raw
* messages. Both a direct interface and a C++ singleton interface are
* provided; use whichever interface fits your needs.
*/
// C++ Header File(s)
#include <cstdlib>
#include <chrono>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <sstream>
// Code Specific Header Files(s)
#include "rocm_smi/rocm_smi_logger.h"
#include "rocm_smi/rocm_smi_main.h"
// Storage for the singleton pointer; created on first use by getInstance().
ROCmLogging::Logger *ROCmLogging::Logger::m_Instance = nullptr;
// Log file name
// WARNING: File name should be changed here and
// pre/post install/remove/upgrade scripts. Changing
// in one place will cause a mismatch in these scripts,
// files may not have proper permissions, and logrotate
// would not function properly.
#define LOGPATH "/var/log/rocm_smi_lib/"
#define LOGBASE_FNAME "ROCm-SMI-lib"
#define LOGEXTENSION ".log"
// Full log file path, formed by concatenating the three literals above.
const char *logFileName = LOGPATH LOGBASE_FNAME LOGEXTENSION;
// Reads the logging configuration and opens the log file when logging is on.
ROCmLogging::Logger::Logger() {
  this->initialize_resources();
}
// Releases logging resources, but only if logging was switched on.
ROCmLogging::Logger::~Logger() {
  if (!m_loggingIsOn) {
    return;
  }
  destroy_resources();
}
// Returns the process-wide Logger, creating it on the first call.
// NOTE(review): the null check and creation are not guarded by a lock in
// this function -- confirm first use cannot race across threads.
ROCmLogging::Logger* ROCmLogging::Logger::getInstance() throw() {
  if (m_Instance != nullptr) {
    return m_Instance;
  }
  m_Instance = new ROCmLogging::Logger();
  return m_Instance;
}
void ROCmLogging::Logger::lock() {
m_Lock.lock();
}
void ROCmLogging::Logger::unlock() {
m_Lock.unlock();
}
// Appends a timestamped line to the log file while holding the logger lock.
// If the file is not open, one re-initialization is attempted; if that also
// fails, a warning and the message are written to the console instead.
void ROCmLogging::Logger::logIntoFile(std::string& data) {
  lock();

  bool file_ready = m_File.is_open();
  if (!file_ready) {
    initialize_resources();
    file_ready = m_File.is_open();
  }

  if (file_ready) {
    m_File << getCurrentTime() << " " << data << std::endl;
  } else {
    std::cout << "WARNING: re-initializing resources was unsuccessful."
              <<" Unable to print the following message." << std::endl;
    logOnConsole(data);
  }

  unlock();
}
// Writes a timestamped line to standard output.
void ROCmLogging::Logger::logOnConsole(std::string& data) {
  const std::string stamp = getCurrentTime();
  std::cout << stamp << " " << data << std::endl;
}
// Returns the current local time as "YYYY-MM-DD HH:MM:SS.uuuuuu", where the
// fractional part is the microseconds within the current second, zero-padded
// to six digits.
std::string ROCmLogging::Logger::getCurrentTime(void) {
  // get current time
  const auto now = std::chrono::system_clock::now();

  // microseconds within the current second
  // (remainder after division into seconds)
  const auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                    now.time_since_epoch()) % 1000000;

  // convert to std::time_t in order to convert to std::tm (broken-down time)
  const std::time_t timer = std::chrono::system_clock::to_time_t(now);

  // localtime_r writes into a caller-owned buffer. std::localtime returns a
  // pointer to shared static storage, which is unsafe when several threads
  // log at once, and may return null.
  std::tm bt{};
  localtime_r(&timer, &bt);

  std::ostringstream oss;
  oss << std::put_time(&bt, "%F %T");
  // Pad to 6 digits: with a narrower width, e.g. 1234 us and 123400 us would
  // render as ".1234" and ".123400" and could not be told apart as a
  // fraction of a second.
  oss << '.' << std::setfill('0') << std::setw(6) << us.count();
  return oss.str();
}
// Interface for Error Log
// Emits "[ERROR]: <text>" to the configured sink(s). Nothing is emitted when
// logging is off; no log-level check applies to ERROR messages.
void ROCmLogging::Logger::error(const char* text) throw() {
  // Logging is toggled through RSMI_LOGGING and is off by default.
  if (!m_loggingIsOn) {
    return;
  }

  std::string data("[ERROR]: ");
  data += text;

  // ERROR must always be captured
  switch (m_LogType) {
    case FILE_LOG:
      logIntoFile(data);
      break;
    case CONSOLE:
      logOnConsole(data);
      break;
    case BOTH_FILE_AND_CONSOLE:
      logOnConsole(data);
      logIntoFile(data);
      break;
    default:
      break;
  }
}
void ROCmLogging::Logger::error(std::string& text) throw() {
error(text.data());
}
void ROCmLogging::Logger::error(std::ostringstream& stream) throw() {
std::string text = stream.str();
error(text.data());
stream.str("");
}
// Interface for Alarm Log
// Emits "[ALARM]: <text>" to the configured sink(s). Nothing is emitted when
// logging is off; no log-level check applies to ALARM messages.
void ROCmLogging::Logger::alarm(const char* text) throw() {
  // Logging is toggled through RSMI_LOGGING and is off by default.
  if (!m_loggingIsOn) {
    return;
  }

  std::string data("[ALARM]: ");
  data += text;

  // ALARM must always be captured
  switch (m_LogType) {
    case FILE_LOG:
      logIntoFile(data);
      break;
    case CONSOLE:
      logOnConsole(data);
      break;
    case BOTH_FILE_AND_CONSOLE:
      logOnConsole(data);
      logIntoFile(data);
      break;
    default:
      break;
  }
}
// std::string convenience overload; forwards to alarm(const char*).
void ROCmLogging::Logger::alarm(std::string& text) throw() {
  alarm(text.c_str());
}
void ROCmLogging::Logger::alarm(std::ostringstream& stream) throw() {
std::string text = stream.str();
alarm(text.data());
stream.str("");
}
// Interface for Always Log
// Emits "[ALWAYS]: <text>" to the configured sink(s). Nothing is emitted when
// logging is off; no log-level check applies to ALWAYS messages.
void ROCmLogging::Logger::always(const char* text) throw() {
  // Logging is toggled through RSMI_LOGGING and is off by default.
  if (!m_loggingIsOn) {
    return;
  }

  std::string data("[ALWAYS]: ");
  data += text;

  // No level check for ALWAYS logs
  switch (m_LogType) {
    case FILE_LOG:
      logIntoFile(data);
      break;
    case CONSOLE:
      logOnConsole(data);
      break;
    case BOTH_FILE_AND_CONSOLE:
      logOnConsole(data);
      logIntoFile(data);
      break;
    default:
      break;
  }
}
// std::string convenience overload; forwards to always(const char*).
void ROCmLogging::Logger::always(std::string& text) throw() {
  always(text.c_str());
}
void ROCmLogging::Logger::always(std::ostringstream& stream) throw() {
std::string text = stream.str();
always(text.data());
stream.str("");
}
// Interface for Buffer Log
void ROCmLogging::Logger::buffer(const char* text) throw() {
// Buffer is the special case. So don't add log level
// and timestamp in the buffer message. Just log the raw bytes.
if ((m_LogType == FILE_LOG) && (m_LogLevel >= LOG_LEVEL_BUFFER)) {
lock();
if (!m_File.is_open()) {
initialize_resources();
if (!m_File.is_open()) {
std::cout << "WARNING: re-initializing resources was unsuccessful."
<<" Unable to print the following message." << std::endl;
std::string txtStr(text);
std::cout << txtStr << std::endl;
unlock();
return;
}
}
m_File << text << std::endl;
unlock();
} else if ((m_LogType == CONSOLE) && (m_LogLevel >= LOG_LEVEL_BUFFER)) {
std::cout << text << std::endl;
}
}
// std::string convenience overload; forwards to buffer(const char*).
void ROCmLogging::Logger::buffer(std::string& text) throw() {
  buffer(text.c_str());
}
void ROCmLogging::Logger::buffer(std::ostringstream& stream) throw() {
std::string text = stream.str();
buffer(text.data());
stream.str("");
}
// Interface for Info Log
// Emits "[INFO]: <text>" to the configured sink(s) when logging is on and
// the log level is at least LOG_LEVEL_INFO.
void ROCmLogging::Logger::info(const char* text) throw() {
  // Logging is toggled through RSMI_LOGGING and is off by default.
  if (!m_loggingIsOn) {
    return;
  }

  std::string data("[INFO]: ");
  data += text;

  if (!(m_LogLevel >= LOG_LEVEL_INFO)) {
    return;
  }

  switch (m_LogType) {
    case FILE_LOG:
      logIntoFile(data);
      break;
    case CONSOLE:
      logOnConsole(data);
      break;
    case BOTH_FILE_AND_CONSOLE:
      logOnConsole(data);
      logIntoFile(data);
      break;
    default:
      break;
  }
}
// std::string convenience overload; forwards to info(const char*).
void ROCmLogging::Logger::info(std::string& text) throw() {
  info(text.c_str());
}
void ROCmLogging::Logger::info(std::ostringstream& stream) throw() {
std::string text = stream.str();
info(text.data());
stream.str("");
}
// Interface for Trace Log
// Emits "[TRACE]: <text>" to the configured sink(s) when logging is on and
// the log level is at least LOG_LEVEL_TRACE.
void ROCmLogging::Logger::trace(const char* text) throw() {
  // Logging is toggled through RSMI_LOGGING and is off by default.
  if (!m_loggingIsOn) {
    return;
  }

  std::string data("[TRACE]: ");
  data += text;

  if (!(m_LogLevel >= LOG_LEVEL_TRACE)) {
    return;
  }

  switch (m_LogType) {
    case FILE_LOG:
      logIntoFile(data);
      break;
    case CONSOLE:
      logOnConsole(data);
      break;
    case BOTH_FILE_AND_CONSOLE:
      logOnConsole(data);
      logIntoFile(data);
      break;
    default:
      break;
  }
}
// std::string convenience overload; forwards to trace(const char*).
void ROCmLogging::Logger::trace(std::string& text) throw() {
  trace(text.c_str());
}
void ROCmLogging::Logger::trace(std::ostringstream& stream) throw() {
std::string text = stream.str();
trace(text.data());
stream.str("");
}
// Interface for Debug Log
// Emits "[DEBUG]: <text>" to the configured sink(s) when logging is on and
// the log level is at least LOG_LEVEL_DEBUG.
void ROCmLogging::Logger::debug(const char* text) throw() {
  // Logging is toggled through RSMI_LOGGING and is off by default.
  if (!m_loggingIsOn) {
    return;
  }

  std::string data("[DEBUG]: ");
  data += text;

  if (!(m_LogLevel >= LOG_LEVEL_DEBUG)) {
    return;
  }

  switch (m_LogType) {
    case FILE_LOG:
      logIntoFile(data);
      break;
    case CONSOLE:
      logOnConsole(data);
      break;
    case BOTH_FILE_AND_CONSOLE:
      logOnConsole(data);
      logIntoFile(data);
      break;
    default:
      break;
  }
}
// std::string convenience overload; forwards to debug(const char*).
void ROCmLogging::Logger::debug(std::string& text) throw() {
  debug(text.c_str());
}
void ROCmLogging::Logger::debug(std::ostringstream& stream) throw() {
std::string text = stream.str();
debug(text.data());
stream.str("");
}
// Interfaces to control log levels
// Replaces the current log level with `logLevel`.
void ROCmLogging::Logger::updateLogLevel(LogLevel logLevel) {
  this->m_LogLevel = logLevel;
}
void ROCmLogging::Logger::enableAllLogLevels() {
m_LogLevel = ENABLE_LOG;
}
// Disable all log levels, except error and alarm
void ROCmLogging::Logger::disableLog() {
m_LogLevel = DISABLE_LOG;
}
// Interfaces to control log Types
// Replaces the current output sink selection with `logType`.
void ROCmLogging::Logger::updateLogType(LogType logType) {
  this->m_LogType = logType;
}
void ROCmLogging::Logger::enableConsoleLogging() {
m_LogType = CONSOLE;
}
void ROCmLogging::Logger::enableFileLogging() {
m_LogType = FILE_LOG;
}
// Returns a one-line, comma-separated summary of the logger state: whether
// the log file is open, the current LogType and the current LogLevel.
std::string ROCmLogging::Logger::getLogSettings() {
  std::string logSettings = "OpenStatus = File (" + std::string(logFileName);
  logSettings += m_File.is_open() ? ") is open" : ") is not open";

  const char *type_desc = "LogType = <undefined>";
  switch (m_LogType) {
    case NO_LOG:
      type_desc = "LogType = NO_LOG";
      break;
    case FILE_LOG:
      type_desc = "LogType = FILE_LOG";
      break;
    case CONSOLE:
      type_desc = "LogType = CONSOLE";
      break;
    case BOTH_FILE_AND_CONSOLE:
      type_desc = "LogType = BOTH_FILE_AND_CONSOLE";
      break;
    default:
      break;
  }
  logSettings += ", ";
  logSettings += type_desc;

  const char *level_desc = "LogLevel = <undefined>";
  switch (m_LogLevel) {
    case DISABLE_LOG:
      level_desc = "LogLevel = DISABLE_LOG";
      break;
    case LOG_LEVEL_INFO:
      level_desc = "LogLevel = LOG_LEVEL_INFO";
      break;
    case LOG_LEVEL_BUFFER:
      level_desc = "LogLevel = LOG_LEVEL_BUFFER";
      break;
    case LOG_LEVEL_TRACE:
      level_desc = "LogLevel = LOG_LEVEL_TRACE";
      break;
    case LOG_LEVEL_DEBUG:
      level_desc = "LogLevel = LOG_LEVEL_DEBUG";
      break;
    case ENABLE_LOG:
      level_desc = "LogLevel = ENABLE_LOG";
      break;
    default:
      break;
  }
  logSettings += ", ";
  logSettings += level_desc;

  return logSettings;
}
// Returns current reported enabled logging state. State is controlled by
// user's environment variable RSMI_LOGGING.
bool ROCmLogging::Logger::isLoggerEnabled() {
return m_loggingIsOn;
}
// Refreshes the logging on/off state from RocmSMI and, when logging is on,
// opens the log file in append mode, sets the level to LOG_LEVEL_TRACE,
// selects the output sink from the RocmSMI log setting, warns on the
// console if the file could not be opened, and finally makes the log file
// readable and writable by user, group and others.
void ROCmLogging::Logger::initialize_resources() {
  // Logging is off unless RSMI_LOGGING enables it.
  m_loggingIsOn = amd::smi::RocmSMI::getInstance().isLoggingOn();
  if (!m_loggingIsOn) {
    return;
  }

  m_File.open(logFileName, std::ios::out | std::ios::app);
  m_LogLevel = LOG_LEVEL_TRACE;

  // RSMI_LOGGING = 1, output to logs only
  // RSMI_LOGGING = 2, output to console only
  // RSMI_LOGGING = 3, output to logs and console
  // anything else (including 0) selects NO_LOG
  const auto log_setting = amd::smi::RocmSMI::getInstance().getLogSetting();
  if (log_setting == 1) {
    m_LogType = FILE_LOG;
  } else if (log_setting == 2) {
    m_LogType = CONSOLE;
  } else if (log_setting == 3) {
    m_LogType = BOTH_FILE_AND_CONSOLE;
  } else {
    m_LogType = NO_LOG;
  }

  const bool opened = m_File.is_open();
  if (!opened) {
    std::cout << "WARNING: Issue opening log file (" << logFileName
              << ") to write." << std::endl;
  }
  if (m_File.fail()) {
    std::cout << "WARNING: Failed opening log file." << std::endl;
  }

  chmod(logFileName, S_IRUSR|S_IRGRP|S_IROTH|S_IWUSR|S_IWGRP|S_IWOTH);
}
void ROCmLogging::Logger::destroy_resources() {
m_File.close();
}
File diff suppressed because it is too large Load Diff
+663
View File
@@ -0,0 +1,663 @@
/*
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017-2025, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#include <dirent.h>
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <map>
#include <regex> // NOLINT
#include <string>
#include <vector>
#include "rocm_smi/rocm_smi_monitor.h"
#include "rocm_smi/rocm_smi_utils.h"
#include "rocm_smi/rocm_smi_exception.h"
#include "rocm_smi/rocm_smi_logger.h"
namespace amd {
namespace smi {
// Pairs a MonitorTypes value with a name string.
// NOTE(review): no use of this struct is visible in this part of the file.
struct MonitorNameEntry {
MonitorTypes type;
const char *name;
};
// File-name templates for monitor attributes, referenced by the tables below.
// NOTE(review): '#' is presumably a placeholder replaced by the sensor index
// when the path is built -- confirm in Monitor::MakeMonitorPath().
static const char *kMonTempFName = "temp#_input";
static const char *kMonFanSpeedFName = "pwm#";
static const char *kMonMaxFanSpeedFName = "pwm#_max";
static const char *kMonFanRPMsName = "fan#_input";
static const char *kMonFanControlEnableName = "pwm#_enable";
static const char *kMonNameFName = "name";
static const char *kMonPowerCapDefaultName = "power#_cap_default";
static const char *kMonPowerCapName = "power#_cap";
static const char *kMonPowerCapMaxName = "power#_cap_max";
static const char *kMonPowerCapMinName = "power#_cap_min";
static const char *kMonPowerAveName = "power#_average";
static const char *kMonPowerInputName = "power#_input";
static const char *kMonPowerLabelName = "power#_label";
static const char *kMonTempMaxName = "temp#_max";
static const char *kMonTempMinName = "temp#_min";
static const char *kMonTempMaxHystName = "temp#_max_hyst";
static const char *kMonTempMinHystName = "temp#_min_hyst";
static const char *kMonTempCriticalName = "temp#_crit";
static const char *kMonTempCriticalHystName = "temp#_crit_hyst";
static const char *kMonTempEmergencyName = "temp#_emergency";
static const char *kMonTempEmergencyHystName = "temp#_emergency_hyst";
static const char *kMonTempCritMinName = "temp#_lcrit";
static const char *kMonTempCritMinHystName = "temp#_lcrit_hyst";
static const char *kMonTempOffsetName = "temp#_offset";
static const char *kMonTempLowestName = "temp#_lowest";
static const char *kMonTempHighestName = "temp#_highest";
static const char *kMonTempLabelName = "temp#_label";
static const char *kMonVoltFName = "in#_input";
static const char *kMonVoltMinName = "in#_min";
static const char *kMonVoltMinCritName = "in#_lcrit";
static const char *kMonVoltMaxName = "in#_max";
static const char *kMonVoltMaxCritName = "in#_crit";
static const char *kMonVoltAverageName = "in#_average";
static const char *kMonVoltLowestName = "in#_lowest";
static const char *kMonVoltHighestName = "in#_highest";
static const char *kMonVoltLabelName = "in#_label";
// Sensor label strings used as keys of the two sensor-name maps below.
static const char *kTempSensorTypeMemoryName = "mem";
static const char *kTempSensorTypeJunctionName = "junction";
static const char *kTempSensorTypeEdgeName = "edge";
// Despite the "TempSensorType" prefix, these two are used as voltage sensor
// labels (see kVoltSensorNameMap).
static const char *kTempSensorTypeVddgfxName = "vddgfx";
static const char *kTempSensorTypeVddboardName = "vddboard";
// Maps a temperature sensor label string to its rsmi_temperature_type_t.
static const std::map<std::string, rsmi_temperature_type_t>
kTempSensorNameMap = {
{kTempSensorTypeMemoryName, RSMI_TEMP_TYPE_MEMORY},
{kTempSensorTypeJunctionName, RSMI_TEMP_TYPE_JUNCTION},
{kTempSensorTypeEdgeName, RSMI_TEMP_TYPE_EDGE},
};
// Maps a voltage sensor label string to its rsmi_voltage_type_t.
static const std::map<std::string, rsmi_voltage_type_t>
kVoltSensorNameMap = {
{kTempSensorTypeVddgfxName, RSMI_VOLT_TYPE_VDDGFX},
{kTempSensorTypeVddboardName, RSMI_VOLT_TYPE_VDDBOARD},
};
// Maps each MonitorTypes value to the file-name template of the attribute
// that backs it.
static const std::map<MonitorTypes, const char *> kMonitorNameMap = {
{kMonName, kMonNameFName},
{kMonTemp, kMonTempFName},
{kMonFanSpeed, kMonFanSpeedFName},
{kMonFanCntrlEnable, kMonFanControlEnableName},
{kMonMaxFanSpeed, kMonMaxFanSpeedFName},
{kMonFanRPMs, kMonFanRPMsName},
{kMonPowerCap, kMonPowerCapName},
{kMonPowerCapDefault, kMonPowerCapDefaultName},
{kMonPowerCapMax, kMonPowerCapMaxName},
{kMonPowerCapMin, kMonPowerCapMinName},
{kMonPowerAve, kMonPowerAveName},
{kMonPowerInput, kMonPowerInputName},
{kMonPowerLabel, kMonPowerLabelName},
{kMonTempMax, kMonTempMaxName},
{kMonTempMin, kMonTempMinName},
{kMonTempMaxHyst, kMonTempMaxHystName},
{kMonTempMinHyst, kMonTempMinHystName},
{kMonTempCritical, kMonTempCriticalName},
{kMonTempCriticalHyst, kMonTempCriticalHystName},
{kMonTempEmergency, kMonTempEmergencyName},
{kMonTempEmergencyHyst, kMonTempEmergencyHystName},
{kMonTempCritMin, kMonTempCritMinName},
{kMonTempCritMinHyst, kMonTempCritMinHystName},
{kMonTempOffset, kMonTempOffsetName},
{kMonTempLowest, kMonTempLowestName},
{kMonTempHighest, kMonTempHighestName},
{kMonTempLabel, kMonTempLabelName},
{kMonVolt, kMonVoltFName},
{kMonVoltMin, kMonVoltMinName},
{kMonVoltMinCrit, kMonVoltMinCritName},
{kMonVoltMax, kMonVoltMaxName},
{kMonVoltMaxCrit, kMonVoltMaxCritName},
{kMonVoltAverage, kMonVoltAverageName},
{kMonVoltLowest, kMonVoltLowestName},
{kMonVoltHighest, kMonVoltHighestName},
{kMonVoltLabel, kMonVoltLabelName},
};
// Maps a MonitorTypes value to the matching RSMI temperature or voltage
// metric enumerator, stored as a uint64_t. kMonInvalid maps to
// RSMI_DEFAULT_VARIANT.
// NOTE(review): unlike the other tables in this file this map is not const.
static std::map<MonitorTypes, uint64_t> kMonInfoVarTypeToRSMIVariant = {
// rsmi_temperature_metric_t
{kMonTemp, RSMI_TEMP_CURRENT},
{kMonTempMax, RSMI_TEMP_MAX},
{kMonTempMin, RSMI_TEMP_MIN},
{kMonTempMaxHyst, RSMI_TEMP_MAX_HYST},
{kMonTempMinHyst, RSMI_TEMP_MIN_HYST},
{kMonTempCritical, RSMI_TEMP_CRITICAL},
{kMonTempCriticalHyst, RSMI_TEMP_CRITICAL_HYST},
{kMonTempEmergency, RSMI_TEMP_EMERGENCY},
{kMonTempEmergencyHyst, RSMI_TEMP_EMERGENCY_HYST},
{kMonTempCritMin, RSMI_TEMP_CRIT_MIN},
{kMonTempCritMinHyst, RSMI_TEMP_CRIT_MIN_HYST},
{kMonTempOffset, RSMI_TEMP_OFFSET},
{kMonTempLowest, RSMI_TEMP_LOWEST},
{kMonTempHighest, RSMI_TEMP_HIGHEST},
{kMonInvalid, RSMI_DEFAULT_VARIANT},
// rsmi_voltage_metric_t
{kMonVolt, RSMI_VOLT_CURRENT},
{kMonVoltMin, RSMI_VOLT_MIN},
{kMonVoltMinCrit, RSMI_VOLT_MIN_CRIT},
{kMonVoltMax, RSMI_VOLT_MAX},
{kMonVoltMaxCrit, RSMI_VOLT_MAX_CRIT},
{kMonVoltAverage, RSMI_VOLT_AVERAGE},
{kMonVoltLowest, RSMI_VOLT_LOWEST},
{kMonVoltHighest, RSMI_VOLT_HIGHEST},
};
// Dependency record for one API function (see kMonFuncDependsMap):
// mandatory_depends lists attribute file-name templates, and variants lists
// the MonitorTypes values associated with the function.
typedef struct {
std::vector<const char *> mandatory_depends;
std::vector<MonitorTypes> variants;
} monitor_depends_t;
// Table of monitor-backed API functions and the monitor files each one
// depends on. fillSupportedFuncs() walks this table to decide which functions
// (and which variants/sensors of them) are available.
//
// NOTE(review): the key type is `const char *`, so the map orders entries by
// pointer value, not by string content. That is fine for the iteration done in
// this file, but a find()/at() with a different pointer to an equal string
// would presumably not match.
static const std::map<const char *, monitor_depends_t> kMonFuncDependsMap = {
{"rsmi_dev_power_ave_get", { .mandatory_depends = {kMonPowerAveName},
.variants = {kMonInvalid},
}
},
{"rsmi_dev_power_cap_get", { .mandatory_depends = {kMonPowerCapName},
.variants = {kMonInvalid},
}
},
{"rsmi_dev_power_cap_default_get", { .mandatory_depends =
{kMonPowerCapDefaultName},
.variants = {kMonInvalid},
}
},
{"rsmi_dev_power_cap_range_get", { .mandatory_depends =
{kMonPowerCapMaxName,
kMonPowerCapMinName},
.variants = {kMonInvalid},
}
},
{"rsmi_dev_power_cap_set", { .mandatory_depends =
{kMonPowerCapMaxName,
kMonPowerCapMinName,
kMonPowerCapName},
.variants = {kMonInvalid},
}
},
{"rsmi_dev_fan_rpms_get", { .mandatory_depends = {kMonFanRPMsName},
.variants = {kMonInvalid},
}
},
{"rsmi_dev_fan_speed_get", { .mandatory_depends = {kMonFanSpeedFName},
.variants = {kMonInvalid},
}
},
{"rsmi_dev_fan_speed_max_get", { .mandatory_depends =
{kMonMaxFanSpeedFName},
.variants = {kMonInvalid},
}
},
// Temperature metrics: the label file is mandatory, each metric is a variant.
{"rsmi_dev_temp_metric_get", { .mandatory_depends =
{kMonTempLabelName},
.variants = {kMonTemp,
kMonTempMax,
kMonTempMin,
kMonTempMaxHyst,
kMonTempMinHyst,
kMonTempCritical,
kMonTempCriticalHyst,
kMonTempEmergency,
kMonTempEmergencyHyst,
kMonTempCritMin,
kMonTempCritMinHyst,
kMonTempOffset,
kMonTempLowest,
kMonTempHighest,
},
}
},
{"rsmi_dev_fan_reset", { .mandatory_depends =
{kMonFanControlEnableName},
.variants = {kMonInvalid},
}
},
{"rsmi_dev_fan_speed_set", { .mandatory_depends =
{kMonMaxFanSpeedFName,
kMonFanControlEnableName,
kMonFanSpeedFName},
.variants = {kMonInvalid},
}
},
// Voltage metrics: the label file is mandatory, each metric is a variant.
{"rsmi_dev_volt_metric_get", { .mandatory_depends =
{kMonVoltLabelName},
.variants = {kMonVolt,
kMonVoltMin,
kMonVoltMinCrit,
kMonVoltMax,
kMonVoltMaxCrit,
kMonVoltAverage,
kMonVoltLowest,
kMonVoltHighest,
},
}
},
};
// Creates a Monitor whose files are looked up under `path` (see
// MakeMonitorPath). `e` is stored as the environment-settings pointer.
Monitor::Monitor(std::string path, RocmSMI_env_vars const *e) :
path_(path), env_(e) {
#ifndef DEBUG
// When DEBUG is not defined the environment pointer is discarded again, so
// env_ is always null in such builds.
env_ = nullptr;
#endif
}
// Nothing to release explicitly; the compiler-generated destructor is used.
Monitor::~Monitor(void) = default;
// Builds the full path of the monitor file for `type` and `sensor_id`.
//
// The file-name template comes from kMonitorNameMap; every '#' in it is a
// placeholder for the sensor index. The index is rendered as a full decimal
// number, so indices of 10 and above produce e.g. "...12..." rather than the
// non-digit character that a single-char substitution ('0' + sensor_id)
// would yield.
//
// Propagates whatever kMonitorNameMap.at() throws for an unknown `type`.
std::string
Monitor::MakeMonitorPath(MonitorTypes type, uint32_t sensor_id) {
  const std::string sensor_str = std::to_string(sensor_id);
  std::string fn = kMonitorNameMap.at(type);

  // Resume the search after the inserted digits so they are never rescanned.
  for (std::string::size_type pos = fn.find('#');
       pos != std::string::npos;
       pos = fn.find('#', pos + sensor_str.size())) {
    fn.replace(pos, 1, sensor_str);
  }

  std::string tempPath = path_;
  tempPath += "/";
  tempPath += fn;
  return tempPath;
}
// Writes `val` to the monitor file selected by `type` and `sensor_id` and
// returns the result of WriteSysfsStr().
int Monitor::writeMonitor(MonitorTypes type, uint32_t sensor_id,
std::string val) {
std::string sysfs_path = MakeMonitorPath(type, sensor_id);
// Macro defined outside this file; it is given the path and the string
// about to be written (presumably for debug tracing - confirm).
DBG_FILE_ERROR(sysfs_path, &val)
return WriteSysfsStr(sysfs_path, val);
}
// This string version should work for all valid monitor types
//
// Reads the monitor file selected by `type` and `sensor_id` into `*val` and
// returns the result of ReadSysfsStr() (callers in this file treat non-zero
// as failure). `val` must not be null.
//
// The outcome is logged at INFO level. The log line states whether the read
// succeeded; callers such as the label-map builders probe for files that may
// legitimately be absent, so reporting every attempt as "Success" would be
// misleading.
int Monitor::readMonitor(MonitorTypes type, uint32_t sensor_id,
                         std::string *val) {
  assert(val != nullptr);

  std::string sysfs_path = MakeMonitorPath(type, sensor_id);
  DBG_FILE_ERROR(sysfs_path, (std::string *)nullptr)
  int ret = ReadSysfsStr(sysfs_path, val);

  std::ostringstream ss;
  ss << __PRETTY_FUNCTION__
     << (ret == 0 ? " | Success | Read hwmon file: "
                  : " | Fail | Could not read hwmon file: ")
     << sysfs_path
     << " | Type: " << monitorTypesToString.at(type)
     << " | Sensor id: " << std::to_string(sensor_id)
     << " | Data: " << *val
     << " | Returning: " << std::to_string(ret) << " |";
  LOG_INFO(ss);
  return ret;
}
int32_t
Monitor::setTempSensorLabelMap(void) {
std::ostringstream ss;
ss << __PRETTY_FUNCTION__ << " | ======= start =======";
LOG_TRACE(ss);
std::string type_str;
int ret;
if (!temp_type_index_map_.empty()) {
return 0; // We've already filled in the map
}
auto add_temp_sensor_entry = [&](uint32_t file_index) {
ret = readMonitor(kMonTempLabel, file_index, &type_str);
rsmi_temperature_type_t t_type;
// If readMonitor fails, there is no label file for the file_index.
// In that case, map the type to file index 0, which is not supported
// and will fail appropriately later when we check for support.
if (ret) {
index_temp_type_map_.insert({file_index, RSMI_TEMP_TYPE_INVALID});
} else {
t_type = kTempSensorNameMap.at(type_str);
temp_type_index_map_[t_type] = file_index;
index_temp_type_map_.insert({file_index, t_type});
}
return 0;
};
for (uint32_t t = RSMI_TEMP_TYPE_FIRST; t <= RSMI_TEMP_TYPE_LAST; ++t) {
temp_type_index_map_.insert(
{static_cast<rsmi_temperature_type_t>(t), RSMI_TEMP_TYPE_INVALID});
}
for (uint32_t i = 1; i <= RSMI_TEMP_TYPE_LAST + 1; ++i) {
ret = add_temp_sensor_entry(i);
if (ret) {
return ret;
}
}
return 0;
}
int32_t
Monitor::setVoltSensorLabelMap(void) {
std::string type_str;
int ret;
if (!volt_type_index_map_.empty()) {
return 0; // We've already filled in the map
}
auto add_volt_sensor_entry = [&](uint32_t file_index) {
ret = readMonitor(kMonVoltLabel, file_index, &type_str);
rsmi_voltage_type_t t_type;
// If readMonitor fails, there is no label file for the file_index.
// In that case, map the type to file index 0, which is not supported
// and will fail appropriately later when we check for support.
if (ret) {
index_volt_type_map_.insert({file_index, RSMI_VOLT_TYPE_INVALID});
} else {
t_type = kVoltSensorNameMap.at(type_str);
volt_type_index_map_[t_type] = file_index;
index_volt_type_map_.insert({file_index, t_type});
}
return 0;
};
for (uint32_t i = 0; i < RSMI_VOLT_TYPE_LAST + 1; ++i) {
// VDDGFX -> 0, VDDNB -> 1, VDDBOARD -> 2
// Here the VDDNB will be skipped as it is not defined in the enum and not supported by AMD.
auto file_index = i;
if (i >= RSMI_VOLT_TYPE_VDDBOARD) {
file_index = i + 1;
}
ret = add_volt_sensor_entry(file_index);
if (ret) {
return ret;
}
}
return 0;
}
// Collects the sensor indices for which a monitor file exists in `dir_path`.
//
// `fn_reg_ex` is a file-name template in which '#' marks the position of the
// sensor index (e.g. "temp#_input"). Every directory entry whose name matches
// the template contributes the number found at that position to `*sensors`
// (which is cleared first; the result is in directory order, unsorted).
//
// Returns:
//    0   success
//   -1   the template contains no '#', i.e. the file is not sensor-specific
//   -2   a sensor index could not be converted to a number
//   -3   the template did not form a valid regular expression
//   >0   errno from opendir()/closedir()
static int get_supported_sensors(std::string dir_path, std::string fn_reg_ex,
                                 std::vector<uint64_t> *sensors) {
  assert(sensors != nullptr);
  sensors->clear();

  // Decide whether the template is sensor-specific before touching the file
  // system, so this path holds no resource that would need releasing.
  std::string::size_type pos = fn_reg_ex.find('#');
  if (pos == std::string::npos) {
    return -1;
  }
  fn_reg_ex.replace(pos, 1, "([0-9]+)");
  fn_reg_ex = "\\b" + fn_reg_ex + "\\b";

  // A missing or unreadable directory is reported to the caller instead of
  // being guarded by an assert only (which disappears in release builds and
  // would leave a null DIR* being handed to readdir()).
  auto hwmon_dir = opendir(dir_path.c_str());
  if (hwmon_dir == nullptr) {
    return errno;
  }

  try {
    std::regex re(fn_reg_ex);
    std::smatch match;
    std::string fn;

    for (auto dentry = readdir(hwmon_dir); dentry != nullptr;
         dentry = readdir(hwmon_dir)) {
      fn = dentry->d_name;
      if (!std::regex_search(fn, match, re)) {
        continue;
      }
      assert(match.size() == 2);  // 1 for whole match + 1 for sub-match

      std::string val_str(match.str(1));
      char *endptr = nullptr;
      errno = 0;
      uint64_t mon_val = strtoul(val_str.c_str(), &endptr, 10);
      assert(*endptr == '\0');  // the sub-match consists of digits only
      if (errno) {
        // Out-of-range index: handled as a regular error in every build
        // type rather than aborting debug builds.
        closedir(hwmon_dir);
        return -2;
      }
      sensors->push_back(mon_val);
    }
  } catch (std::regex_error& e) {
    // Release the directory handle on this path too.
    closedir(hwmon_dir);
    std::cout << "Regular expression error:" << std::endl;
    std::cout << e.what() << std::endl;
    std::cout << "Regex error code: " << e.code() << std::endl;
    return -3;
  }

  if (closedir(hwmon_dir)) {
    return errno;
  }
  return 0;
}
// Looks up the file index recorded for temperature type `type`.
// The lookup uses at(), so a type with no entry raises whatever the
// container's at() raises (presumably std::out_of_range - confirm the
// container type in the class declaration).
uint32_t
Monitor::getTempSensorIndex(rsmi_temperature_type_t type) {
  const auto &file_index = temp_type_index_map_.at(type);
  return file_index;
}
// Looks up the temperature type recorded for label file index `ind`.
// The lookup uses at(), so an index with no entry raises whatever the
// container's at() raises (presumably std::out_of_range - confirm).
rsmi_temperature_type_t
Monitor::getTempSensorEnum(uint64_t ind) {
  const auto &sensor_type = index_temp_type_map_.at(ind);
  return sensor_type;
}
// Looks up the file index recorded for voltage type `type`.
// The lookup uses at(), so a type with no entry raises whatever the
// container's at() raises (presumably std::out_of_range - confirm).
uint32_t
Monitor::getVoltSensorIndex(rsmi_voltage_type_t type) {
  const auto &file_index = volt_type_index_map_.at(type);
  return file_index;
}
// Looks up the voltage type recorded for label file index `ind`.
// The lookup uses at(), so an index with no entry raises whatever the
// container's at() raises (presumably std::out_of_range - confirm).
rsmi_voltage_type_t
Monitor::getVoltSensorEnum(uint64_t ind) {
  const auto &sensor_type = index_volt_type_map_.at(ind);
  return sensor_type;
}
// Returns the values present in both `*v1` and `*v2`, in ascending order.
// Side effect: both input vectors are sorted in place, because
// std::set_intersection requires sorted ranges. Neither pointer may be null.
static std::vector<uint64_t> get_intersection(std::vector<uint64_t> *v1,
                                              std::vector<uint64_t> *v2) {
  assert(v1 != nullptr);
  assert(v2 != nullptr);

  std::vector<uint64_t> &lhs = *v1;
  std::vector<uint64_t> &rhs = *v2;
  std::sort(lhs.begin(), lhs.end());
  std::sort(rhs.begin(), rhs.end());

  std::vector<uint64_t> common;
  std::set_intersection(lhs.begin(), lhs.end(), rhs.begin(), rhs.end(),
                        std::back_inserter(common));
  return common;
}
// Use this enum to encode the monitor type into the monitor ID.
// We can later use this to convert to rsmi-api sensor types; for example,
// rsmi_temperature_type_t, which is what the caller will expect. Add
// new types as needed.
typedef enum {
eDefaultMonitor = 0,
eTempMonitor,
eVoltMonitor,
} monitor_types;
// Classifies an API function name: the temperature and voltage metric getters
// get their dedicated monitor type, every other name is a default monitor.
static monitor_types getFuncType(std::string f_name) {
  if (f_name == "rsmi_dev_volt_metric_get") {
    return eVoltMonitor;
  }
  if (f_name == "rsmi_dev_temp_metric_get") {
    return eTempMonitor;
  }
  return eDefaultMonitor;
}
// Determines which monitor-backed API functions this device supports and
// records them in `*supported_funcs` (must not be null).
//
// For every entry of kMonFuncDependsMap:
//   1. All mandatory dependency files must exist. For sensor-specific files
//      (name contains '#') the set of sensor indices common to all of them is
//      computed ("intersect").
//   2. For each variant, the sensors that have the variant's file and are
//      also in "intersect" are the supported monitors. Each id is tagged
//      with type information shifted to MONITOR_TYPE_BIT_POSITION.
//   3. The function is recorded when at least one variant has monitors.
//
// Throws amd::smi::rsmi_exception(RSMI_STATUS_INTERNAL_EXCEPTION) when a
// monitor file name cannot be parsed; the message names the offending file.
void Monitor::fillSupportedFuncs(SupportedFuncMap *supported_funcs) {
  std::map<const char *, monitor_depends_t>::const_iterator it =
      kMonFuncDependsMap.begin();
  std::string mon_root = path_;
  bool mand_depends_met;
  std::shared_ptr<VariantMap> supported_variants;
  std::vector<uint64_t> sensors_i;
  std::vector<uint64_t> intersect;
  int ret;
  monitor_types m_type;

  assert(supported_funcs != nullptr);

  // Builds the exception text from the file name that actually failed, so
  // the message is never empty or left over from an earlier dependency.
  auto parse_failure_msg = [](const std::string &file_name) {
    return "Failed to parse monitor file name: " + file_name;
  };

  while (it != kMonFuncDependsMap.end()) {
    // First, see if all the mandatory dependencies are there
    std::vector<const char *>::const_iterator dep =
        it->second.mandatory_depends.begin();

    m_type = getFuncType(it->first);
    mand_depends_met = true;

    // Initialize "intersect". A monitor is considered supported if all of its
    // dependency monitors with the same sensor index are present. So we
    // initialize "intersect" with the set of sensors that exist for the first
    // mandatory monitor, and take intersection of that with the subsequent
    // dependency monitors. The main assumption here is that
    // variant_<sensor_i>'s sensor-based dependencies have the same index i;
    // in other words, variant_i is not dependent on a sensor j, j != i

    // Initialize intersect with the available monitors for the first
    // mandatory dependency.
    ret = get_supported_sensors(mon_root + "/", *dep, &intersect);
    if (ret == -1) {
      // In this case, the dependency is not sensor-specific, so just
      // see if the file exists.
      std::string dep_path = mon_root + "/" + *dep;
      if (!FileExists(dep_path.c_str())) {
        mand_depends_met = false;
      }
    } else if (ret <= -2) {
      throw amd::smi::rsmi_exception(RSMI_STATUS_INTERNAL_EXCEPTION,
                                     parse_failure_msg(*dep));
    }

    dep++;
    while (mand_depends_met && dep != it->second.mandatory_depends.end()) {
      ret = get_supported_sensors(mon_root + "/", *dep, &sensors_i);
      if (ret == 0) {
        intersect = get_intersection(&sensors_i, &intersect);
      } else if (ret == -1) {
        // In this case, the dependency is not sensor-specific, so just
        // see if the file exists.
        std::string dep_path = mon_root + "/" + *dep;
        if (!FileExists(dep_path.c_str())) {
          mand_depends_met = false;
          break;
        }
      } else if (ret <= -2) {
        throw amd::smi::rsmi_exception(RSMI_STATUS_INTERNAL_EXCEPTION,
                                       parse_failure_msg(*dep));
      }
      dep++;
    }

    if (!mand_depends_met) {
      it++;
      continue;
    }

    // "intersect" holds the set of sensors for the mandatory dependencies
    // that exist.
    std::vector<MonitorTypes>::const_iterator var =
        it->second.variants.begin();
    supported_variants = std::make_shared<VariantMap>();

    for (; var != it->second.variants.end(); var++) {
      // Start from an empty list for every variant. If the sensor lookup
      // below does not succeed, the ids of the previous variant (which are
      // already tagged with type bits) must not be picked up and tagged a
      // second time.
      std::vector<uint64_t> supported_monitors;

      if (*var != kMonInvalid) {
        ret = get_supported_sensors(mon_root + "/",
                                    kMonitorNameMap.at(*var), &sensors_i);
        if (ret == 0) {
          supported_monitors = get_intersection(&sensors_i, &intersect);
        } else if (ret <= -2) {
          throw amd::smi::rsmi_exception(
              RSMI_STATUS_INTERNAL_EXCEPTION,
              parse_failure_msg(kMonitorNameMap.at(*var)));
        }
      } else {
        supported_monitors = intersect;
      }

      if (!supported_monitors.empty()) {
        for (uint64_t &supported_monitor : supported_monitors) {
          if (m_type == eDefaultMonitor) {
            assert(supported_monitor > 0);
            supported_monitor |=
                (supported_monitor - 1) << MONITOR_TYPE_BIT_POSITION;
          } else if (m_type == eTempMonitor) {
            // Temp sensor file names are 1-based
            assert(supported_monitor > 0);
            supported_monitor |=
                static_cast<uint64_t>(getTempSensorEnum(supported_monitor))
                << MONITOR_TYPE_BIT_POSITION;
          } else if (m_type == eVoltMonitor) {
            // Voltage sensor file names are 0-based
            supported_monitor |=
                static_cast<uint64_t>(getVoltSensorEnum(supported_monitor))
                << MONITOR_TYPE_BIT_POSITION;
          } else {
            assert(false);  // Unexpected monitor type
          }
        }
        (*supported_variants)[kMonInfoVarTypeToRSMIVariant.at(*var)] =
            std::make_shared<SubVariant>(supported_monitors);
      }
    }

    if (it->second.variants.empty()) {
      (*supported_funcs)[it->first] = nullptr;
      supported_variants = nullptr;  // Invoke destructor
    } else if (!(*supported_variants).empty()) {
      (*supported_funcs)[it->first] = supported_variants;
    }

    it++;
  }
}
} // namespace smi
} // namespace amd
+154
View File
@@ -0,0 +1,154 @@
/*
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#include <cassert>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include "rocm_smi/rocm_smi_utils.h"
#include "rocm_smi/rocm_smi_common.h"
#include "rocm_smi/rocm_smi_exception.h"
namespace amd {
namespace smi {
// Name of the file, relative to the PowerMon root path, that
// PowerMon::readPowerValue() reads for power information.
static const char *kPowerMonPMName = "amdgpu_pm_info";
// Using this map in case we add other files from dri directory to parse.
// Maps each PowerMonTypes value to the file that contains it.
static const std::map<PowerMonTypes, const char *> kMonitorNameMap = {
{kPowerMaxGPUPower, kPowerMonPMName},
};
// Creates a PowerMon that reads its files from under `path`; `e` is stored
// as the environment-settings pointer.
PowerMon::PowerMon(std::string path, RocmSMI_env_vars const *e) :
path_(path), env_(e) {
}
// Nothing to release explicitly; the compiler-generated destructor is used.
PowerMon::~PowerMon(void) = default;
// Extracts the value identified by `type` from the text `s` (the contents of
// the power-info file) and stores it in `*val`.
//
// For kPowerMaxGPUPower the first line containing "(max GPU)" is located; it
// is expected to start with a number followed by the unit "W". The number is
// converted from W to mW.
//
// Returns 0 on success, EINVAL for an unknown `type`, EPERM when no line
// carries the marker.
// Throws amd::smi::rsmi_exception(RSMI_STATUS_UNEXPECTED_DATA) when the unit
// is not "W" or the number cannot be represented as an unsigned mW value
// (negative, NaN, or too large).
static int parse_power_str(std::string s, PowerMonTypes type, uint64_t *val) {
  assert(val != nullptr);

  std::string search_str;
  switch (type) {
    case kPowerMaxGPUPower:
      search_str = "(max GPU)";
      break;

    default:
      assert(false);  // Invalid search Power type requested
      return EINVAL;
  }

  // Find the line that carries the marker.
  std::stringstream ss(s);
  std::string ln;
  bool found = false;
  while (std::getline(ss, ln)) {
    if (ln.rfind(search_str) != std::string::npos) {
      found = true;
      break;
    }
  }
  if (!found) {
    return EPERM;
  }

  std::stringstream l_ss;
  l_ss << ln;

  double num_units = 0.0;
  std::string sz;

  switch (type) {
    case kPowerMaxGPUPower:
      // If the number cannot be extracted the stream fails and `sz` stays
      // empty, which the unit check below rejects.
      l_ss >> num_units;
      l_ss >> sz;
      assert(sz == "W");  // We only expect Watts at this time
      if (sz != "W") {
        throw amd::smi::rsmi_exception(RSMI_STATUS_UNEXPECTED_DATA,
                                       __FUNCTION__);
      }
      // Converting a negative or NaN double to uint64_t is undefined
      // behaviour; `!(x >= 0)` rejects both before the cast.
      if (!(num_units >= 0.0) ||
          num_units > static_cast<long double>(0xFFFFFFFFFFFFFFFF)/1000) {
        throw amd::smi::rsmi_exception(RSMI_STATUS_UNEXPECTED_DATA,
                                       __FUNCTION__);
      }
      *val = static_cast<uint64_t>(num_units * 1000);  // Convert W to mW
      break;

    default:
      assert(false);  // Invalid search Power type requested
      return EINVAL;
  }

  return 0;
}
// Reads the file associated with `type` (see kMonitorNameMap) from under the
// PowerMon root path and parses the requested value into `*power`.
// Returns the non-zero ReadSysfsStr() result if the file cannot be read,
// otherwise the result of parse_power_str(). `power` must not be null.
int PowerMon::readPowerValue(PowerMonTypes type, uint64_t *power) {
  assert(power != nullptr);

  auto sysfs_file = path_;
  sysfs_file += "/";
  sysfs_file += kMonitorNameMap.at(type);

  DBG_FILE_ERROR(sysfs_file, (std::string *)nullptr)

  std::string contents;
  const int read_status = ReadSysfsStr(sysfs_file, &contents);
  if (read_status != 0) {
    return read_status;
  }
  return parse_power_str(contents, type, power);
}
} // namespace smi
} // namespace amd
@@ -0,0 +1,567 @@
/*
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017-2025, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#include "rocm_smi/rocm_smi_properties.h"
#include "rocm_smi/rocm_smi_common.h"
#include "rocm_smi/rocm_smi.h"
#include "rocm_smi/rocm_smi_main.h"
#include "rocm_smi/rocm_smi_device.h"
#include "rocm_smi/rocm_smi_logger.h"
#include <algorithm>
#include <cassert>
#include <sstream>
//
// Property reinforcement check list
//
// NOTE: This is a *temporary solution* until we get a better approach, likely
// a driver API that can give us the capabilities of a GPU in question.
//
namespace amd {
namespace smi {
// Human-readable description for each operating mode; used when dumping the
// property reinforcement list.
const AMDGpuOpModeList_t amdgpu_opmode_check_list {
{AMDGpuPropertyOpModeTypes_t::kBareMetal, "Bare Metal"},
{AMDGpuPropertyOpModeTypes_t::kSrIov, "SR-IOV"},
{AMDGpuPropertyOpModeTypes_t::kBoth, "Both"},
};
// Human-readable description for each property type offset (the tag that
// make_unique_property_id() combines with a property id).
const AMDGpuPropertyTypesOffsetList_t amdgpu_typeoffset_check_list {
{AMDGpuPropertyTypesOffset_t::kNone, "None"},
{AMDGpuPropertyTypesOffset_t::kDevInfoTypes, "Device Info Type"},
{AMDGpuPropertyTypesOffset_t::kMonitorTypes, "Monitor Type"},
{AMDGpuPropertyTypesOffset_t::kPerfTypes, "Performance Type"},
{AMDGpuPropertyTypesOffset_t::kClkTypes, "Clock Type"},
{AMDGpuPropertyTypesOffset_t::kVoltMetricTypes, "Volt Metric Type"},
};
// Combines a property id with the tag of the enum family it belongs to by
// OR-ing the two values. This keeps ids from different families apart even
// when their enumerators share the same numeric value.
AMDGpuPropertyId_t make_unique_property_id(AMDGpuPropertyTypesOffset_t type_offset, AMDGpuPropertyId_t property_id) {
  const auto family_tag = static_cast<AMDGpuPropertyOffsetType>(type_offset);
  return (family_tag | (property_id));
}
// Reverses make_unique_property_id(): clears the bits of every family tag
// (DevInfo, Monitor, Perf, Clk, VoltMetric) from `property_id` and returns
// what is left.
AMDGpuPropertyId_t unmake_unique_property_id(AMDGpuPropertyId_t property_id) {
  // Raw bit pattern of a single family tag.
  const auto bits_of = [](AMDGpuPropertyTypesOffset_t tag) {
    return static_cast<AMDGpuPropertyOffsetType>(tag);
  };

  const auto all_family_tags =
      bits_of(AMDGpuPropertyTypesOffset_t::kDevInfoTypes) |
      bits_of(AMDGpuPropertyTypesOffset_t::kMonitorTypes) |
      bits_of(AMDGpuPropertyTypesOffset_t::kPerfTypes) |
      bits_of(AMDGpuPropertyTypesOffset_t::kClkTypes) |
      bits_of(AMDGpuPropertyTypesOffset_t::kVoltMetricTypes);

  auto untagged_id = (static_cast<AMDGpuPropertyOffsetType>(property_id) & ~(all_family_tags));
  return untagged_id;
}
// Bitwise OR for property type offsets. Identical operands are returned
// unchanged; otherwise the underlying values are OR-ed.
AMDGpuPropertyTypesOffset_t operator| (AMDGpuPropertyTypesOffset_t lhs, AMDGpuPropertyTypesOffset_t rhs)
{
  if (lhs != rhs) {
    const auto lhs_bits = static_cast<AMDGpuPropertyOffsetType>(lhs);
    const auto rhs_bits = static_cast<AMDGpuPropertyOffsetType>(rhs);
    return AMDGpuPropertyTypesOffset_t(lhs_bits | rhs_bits);
  }
  return lhs;
}
// Bitwise AND for property type offsets. Identical operands are returned
// unchanged; otherwise the underlying values are AND-ed.
AMDGpuPropertyTypesOffset_t operator& (AMDGpuPropertyTypesOffset_t lhs, AMDGpuPropertyTypesOffset_t rhs)
{
  if (lhs != rhs) {
    const auto lhs_bits = static_cast<AMDGpuPropertyOffsetType>(lhs);
    const auto rhs_bits = static_cast<AMDGpuPropertyOffsetType>(rhs);
    return AMDGpuPropertyTypesOffset_t(lhs_bits & rhs_bits);
  }
  return lhs;
}
// Bitwise OR for operating modes. Identical operands are returned unchanged;
// otherwise the underlying values are OR-ed.
AMDGpuPropertyOpModeTypes_t operator| (AMDGpuPropertyOpModeTypes_t lhs, AMDGpuPropertyOpModeTypes_t rhs)
{
  if (lhs != rhs) {
    const auto lhs_bits = static_cast<AMDGpuPropertyOpModeType>(lhs);
    const auto rhs_bits = static_cast<AMDGpuPropertyOpModeType>(rhs);
    return AMDGpuPropertyOpModeTypes_t(lhs_bits | rhs_bits);
  }
  return lhs;
}
// Bitwise AND for operating modes. Identical operands are returned unchanged;
// otherwise the underlying values are AND-ed.
AMDGpuPropertyOpModeTypes_t operator& (AMDGpuPropertyOpModeTypes_t lhs, AMDGpuPropertyOpModeTypes_t rhs)
{
  if (lhs != rhs) {
    const auto lhs_bits = static_cast<AMDGpuPropertyOpModeType>(lhs);
    const auto rhs_bits = static_cast<AMDGpuPropertyOpModeType>(rhs);
    return AMDGpuPropertyOpModeTypes_t(lhs_bits & rhs_bits);
  }
  return lhs;
}
//
// Note: Due to the fact that we have different enum elements with the same
// number, keying a hash by the number is not an option; ie:
// - DevInfoTypes::kDevVendorID = 7
// - MonitorTypes::kMonPowerCapDefault = 7
// So, we are keying it by a unique key, based on their info types
//
// Description of every verb (API operation) that can appear in the property
// reinforcement list. The text is the name of the corresponding amdsmi API
// and is used in trace output.
const AMDGpuVerbList_t amdgpu_verb_check_list {
    { AMDGpuVerbTypes_t::kNone, "None" },
    { AMDGpuVerbTypes_t::kSetGpuPciBandwidth, "amdsmi_set_gpu_pci_bandwidth" },
    { AMDGpuVerbTypes_t::kSetPowerCap, "amdsmi_set_power_cap" },
    { AMDGpuVerbTypes_t::kSetGpuPowerProfile, "amdsmi_set_gpu_power_profile" },
    { AMDGpuVerbTypes_t::kSetGpuClkRange, "amdsmi_set_gpu_clk_range" },
    { AMDGpuVerbTypes_t::kSetGpuOdClkInfo, "amdsmi_set_gpu_od_clk_info" },
    { AMDGpuVerbTypes_t::kSetGpuOdVoltInfo, "amdsmi_set_gpu_od_volt_info" },
    { AMDGpuVerbTypes_t::kSetGpuPerfLevelV1, "amdsmi_set_gpu_perf_level_v1" },
    { AMDGpuVerbTypes_t::kSetGpuPerfLevel, "amdsmi_set_gpu_perf_level" },
    { AMDGpuVerbTypes_t::kGetGpuPowerProfilePresets, "amdsmi_get_gpu_power_profile_presets" },
    { AMDGpuVerbTypes_t::kResetGpu, "amdsmi_reset_gpu" },
    { AMDGpuVerbTypes_t::kSetGpuPerfDeterminismMode, "amdsmi_set_gpu_perf_determinism_mode" },
    { AMDGpuVerbTypes_t::kSetGpuFanSpeed, "amdsmi_set_gpu_fan_speed" },
    { AMDGpuVerbTypes_t::kResetGpuFan, "amdsmi_reset_gpu_fan" },
    { AMDGpuVerbTypes_t::kSetClkFreq, "amdsmi_set_clk_freq" },
    { AMDGpuVerbTypes_t::kSetGpuOverdriveLevelV1, "amdsmi_set_gpu_overdrive_level_v1" },
    { AMDGpuVerbTypes_t::kSetGpuOverdriveLevel, "amdsmi_set_gpu_overdrive_level" },
    { AMDGpuVerbTypes_t::kGetGpuFanRpms, "amdsmi_get_gpu_fan_rpms" },
    { AMDGpuVerbTypes_t::kGetGpuFanSpeed, "amdsmi_get_gpu_fan_speed" },
    { AMDGpuVerbTypes_t::kGetGpuFanSpeedMax, "amdsmi_get_gpu_fan_speed_max" },
    // The voltage-metric verb is described by the voltage API, not by the
    // temperature one.
    { AMDGpuVerbTypes_t::kGetGpuVoltMetric, "amdsmi_get_gpu_volt_metric" },
    { AMDGpuVerbTypes_t::kGetGpuOverDriveLevel, "amdsmi_get_gpu_overdrive_level" },
    { AMDGpuVerbTypes_t::kGetGpuOdVoltInfo, "amdsmi_get_gpu_od_volt_info" },
    { AMDGpuVerbTypes_t::kGetGpuOdVoltCurveRegions, "amdsmi_get_gpu_od_volt_curve_regions" }
};
// Wildcard ids: used in the reinforcement list and in queries for rules that
// apply to every ASIC id / every PCI revision id.
const uint16_t kDevIDAll(0xFFFF);
const uint16_t kDevRevIDAll(0xFFFF);
// Rule table consulted by the property reinforcement query. Each rule is
// keyed by an ASIC id; several rules may share the same key.
const AMDGpuPropertyList_t amdgpu_property_reinforcement_list {
//
// {"Asic ID", {"Asic Rev. ID", "Unique Property ID", "Verb ID", "Property Op.Mode", "Availability Flag"}}
// DevInfoTypes::kDevPCIEClk = rsmi_dev_pci_bandwidth_get; rsmi_dev_pci_bandwidth_set
// MonitorTypes::kMonPowerCapDefault = rsmi_dev_power_cap_default_get;
// DevInfoTypes::kDevPowerProfileMode =
// rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_MANUAL = rsmi_dev_clk_range_set;
//
// AMD All Families
{kDevIDAll, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes,
MonitorTypes::kMonFanCntrlEnable),
AMDGpuVerbTypes_t::kResetGpuFan,
AMDGpuPropertyOpModeTypes_t::kBoth, false }
},
// AMD Instinct MI210
{0x740F, {0x02,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes,
DevInfoTypes::kDevPowerProfileMode),
AMDGpuVerbTypes_t::kSetGpuPowerProfile,
AMDGpuPropertyOpModeTypes_t::kBareMetal, false }
},
// AMD MIxxx
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes,
DevInfoTypes::kDevPCIEClk),
AMDGpuVerbTypes_t::kSetGpuPciBandwidth,
AMDGpuPropertyOpModeTypes_t::kSrIov, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes,
MonitorTypes::kMonPowerCapDefault),
AMDGpuVerbTypes_t::kSetPowerCap,
AMDGpuPropertyOpModeTypes_t::kSrIov, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes,
DevInfoTypes::kDevPowerProfileMode),
AMDGpuVerbTypes_t::kSetGpuPowerProfile,
AMDGpuPropertyOpModeTypes_t::kBoth, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes,
rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_MANUAL),
AMDGpuVerbTypes_t::kSetGpuClkRange,
AMDGpuPropertyOpModeTypes_t::kSrIov, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes,
rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_MANUAL),
AMDGpuVerbTypes_t::kSetGpuOdClkInfo,
AMDGpuPropertyOpModeTypes_t::kBoth, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes,
rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_MANUAL),
AMDGpuVerbTypes_t::kSetGpuOdVoltInfo,
AMDGpuPropertyOpModeTypes_t::kBoth, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes,
rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_AUTO),
AMDGpuVerbTypes_t::kSetGpuPerfLevelV1,
AMDGpuPropertyOpModeTypes_t::kSrIov, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes,
rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_MANUAL),
AMDGpuVerbTypes_t::kSetGpuPerfLevel,
AMDGpuPropertyOpModeTypes_t::kSrIov, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes,
DevInfoTypes::kDevPowerProfileMode),
AMDGpuVerbTypes_t::kGetGpuPowerProfilePresets,
AMDGpuPropertyOpModeTypes_t::kBoth, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes,
rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_DETERMINISM),
AMDGpuVerbTypes_t::kSetGpuPerfDeterminismMode,
AMDGpuPropertyOpModeTypes_t::kSrIov, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes,
MonitorTypes::kMonFanSpeed),
AMDGpuVerbTypes_t::kSetGpuFanSpeed,
AMDGpuPropertyOpModeTypes_t::kBoth, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes,
MonitorTypes::kMonFanCntrlEnable),
AMDGpuVerbTypes_t::kResetGpuFan,
AMDGpuPropertyOpModeTypes_t::kBoth, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kClkTypes,
rsmi_clk_type::RSMI_CLK_TYPE_FIRST),
AMDGpuVerbTypes_t::kSetClkFreq,
AMDGpuPropertyOpModeTypes_t::kBoth, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes,
DevInfoTypes::kDevOverDriveLevel),
AMDGpuVerbTypes_t::kSetGpuOverdriveLevel,
AMDGpuPropertyOpModeTypes_t::kBoth, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes,
DevInfoTypes::kDevOverDriveLevel),
AMDGpuVerbTypes_t::kSetGpuOverdriveLevelV1,
AMDGpuPropertyOpModeTypes_t::kBoth, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes,
MonitorTypes::kMonFanRPMs),
AMDGpuVerbTypes_t::kGetGpuFanRpms,
AMDGpuPropertyOpModeTypes_t::kBareMetal, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes,
MonitorTypes::kMonFanSpeed),
AMDGpuVerbTypes_t::kGetGpuFanSpeed,
AMDGpuPropertyOpModeTypes_t::kBareMetal, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes,
MonitorTypes::kMonMaxFanSpeed),
AMDGpuVerbTypes_t::kGetGpuFanSpeedMax,
AMDGpuPropertyOpModeTypes_t::kBareMetal, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kVoltMetricTypes,
rsmi_voltage_metric_t::RSMI_VOLT_CURRENT),
AMDGpuVerbTypes_t::kGetGpuVoltMetric,
AMDGpuPropertyOpModeTypes_t::kBareMetal, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes,
DevInfoTypes::kDevOverDriveLevel),
AMDGpuVerbTypes_t::kGetGpuOverDriveLevel,
AMDGpuPropertyOpModeTypes_t::kBareMetal, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes,
DevInfoTypes::kDevPowerODVoltage),
AMDGpuVerbTypes_t::kGetGpuOdVoltInfo,
AMDGpuPropertyOpModeTypes_t::kBareMetal, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes,
DevInfoTypes::kDevPowerODVoltage),
AMDGpuVerbTypes_t::kGetGpuOdVoltCurveRegions,
AMDGpuPropertyOpModeTypes_t::kBareMetal, false }
}
};
// Re-interprets a failed API call in the light of the property reinforcement
// table.
//
// dv_ind             device index the call was made for
// verb_type          the API operation that was attempted
// actual_error_code  status the operation returned
//
// A successful status is returned unchanged. Otherwise the device is asked
// whether the table has a rule for `verb_type`, and the answer is mapped by
// the handler lambda below.
rsmi_status_t validate_property_reinforcement_query(uint32_t dv_ind, AMDGpuVerbTypes_t verb_type, rsmi_status_t actual_error_code)
{
std::ostringstream osstream;
osstream << __PRETTY_FUNCTION__ << "| ======= start =======";
osstream << __PRETTY_FUNCTION__ << " actual error code: " << actual_error_code << "\n";
LOG_TRACE(osstream);
// Nothing to reinterpret when the call succeeded.
if (actual_error_code == rsmi_status_t::RSMI_STATUS_SUCCESS) {
return actual_error_code;
}
//
// For property reinforcement query, the possible return values are:
// RSMI_STATUS_SUCCESS:
// - Property found in the reinforcement table, and it *should exist*
// RSMI_STATUS_NOT_SUPPORTED:
// - Property found in the reinforcement table, and it *should not* exist
// RSMI_STATUS_NO_DATA:
// - Could not find the correct dev_id and dev_revision info to build the filter
// RSMI_STATUS_UNKNOWN_ERROR:
// - The results are initialized with that. If that is returned,
// likely the reinforcement table does not contain any entries/rules for the
// dev_id in question.
//
// Maps the query result to the status returned to the caller:
//   UNKNOWN_ERROR / NO_DATA  -> RSMI_STATUS_NOT_FOUND
//   NOT_SUPPORTED / SUCCESS  -> passed through
//   anything else            -> the original actual_error_code
auto amdgpu_property_query_result_hdlr = [&](const rsmi_status_t query_result) {
switch (query_result) {
case (rsmi_status_t::RSMI_STATUS_UNKNOWN_ERROR):
case (rsmi_status_t::RSMI_STATUS_NO_DATA):
return rsmi_status_t::RSMI_STATUS_NOT_FOUND;
break;
case (rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED):
case (rsmi_status_t::RSMI_STATUS_SUCCESS):
return query_result;
break;
default:
return actual_error_code;
break;
}
};
///
// Macro defined outside this file; presumably it resolves dv_ind to the
// `dev` object used below (and may return early on a bad index) - confirm.
GET_DEV_FROM_INDX
// NOTE(review): everything appended to osstream from here on is never handed
// to LOG_TRACE in this function, so these messages do not appear to be
// emitted - confirm whether a LOG_TRACE call before the return is missing.
osstream << __PRETTY_FUNCTION__ << "| ======= about to run property query ======="
<< " [query filters: ]"
<< " device: " << dv_ind
<< " property/verb: " << static_cast<AMDGpuVerbId_t>(verb_type) << amdgpu_verb_check_list.at(verb_type);
auto reinforcement_query_result = dev->check_amdgpu_property_reinforcement_query(dv_ind, verb_type);
osstream << __PRETTY_FUNCTION__ << "| ======= result from property query ======="
<< " query result: " << reinforcement_query_result;
reinforcement_query_result = amdgpu_property_query_result_hdlr(reinforcement_query_result);
osstream << __PRETTY_FUNCTION__ << "| ======= result from property query ======="
<< " query result: " << reinforcement_query_result;
return reinforcement_query_result;
}
// Writes the contents of amdgpu_property_reinforcement_list to the trace log
// (one record per rule), or a note that the list is empty.
//
// The collected text is passed to LOG_TRACE on both paths; returning from the
// populated-list path without logging would discard the whole dump.
void dump_amdgpu_property_reinforcement_list()
{
  std::ostringstream osstream;
  osstream << __PRETTY_FUNCTION__ << "| ======= start =======";

  if (amdgpu_property_reinforcement_list.empty()) {
    osstream << __PRETTY_FUNCTION__ << "amdgpu_property_reinforcement_list is empty";
    LOG_TRACE(osstream);
    return;
  }

  for (const auto& property : amdgpu_property_reinforcement_list) {
    osstream << __PRETTY_FUNCTION__
             << " Asic ID: " << property.first
             << " Asic Rev.ID: " << property.second.m_pci_rev_id
             << " Property ID: " << property.second.m_property
             << " Verb ID : " << static_cast<AMDGpuVerbId_t>(property.second.m_verb_id)
             << " Verb Desc: " << amdgpu_verb_check_list.at(property.second.m_verb_id)
             << " OpMode: " << static_cast<AMDGpuOpModeType_t>(property.second.m_opmode)
             << " OpMode Desc: " << amdgpu_opmode_check_list.at(property.second.m_opmode)
             << " Flag Avail.: " << property.second.m_should_be_available;
  }
  osstream << __PRETTY_FUNCTION__ << "| ======= end =======";
  LOG_TRACE(osstream);
}
/// Looks up `verb_type` for device `dev_idx` in the property reinforcement table.
///
/// The lookup is done in two steps:
///   1. A wildcard query (kDevIDAll / kDevRevIDAll) is run first. Any result
///      other than RSMI_STATUS_UNKNOWN_ERROR is returned as-is.
///   2. Otherwise the device's asic id and revision id are fetched through
///      rsmi_dev_id_get() / rsmi_dev_revision_get() and the query is run
///      again with those concrete ids.
///
/// @param dev_idx    index of the device being queried
/// @param verb_type  verb to search for in the reinforcement table
/// @return the status produced by run_amdgpu_property_reinforcement_query(),
///         or RSMI_STATUS_NO_DATA when the asic/revision ids could not be read.
rsmi_status_t Device::check_amdgpu_property_reinforcement_query(uint32_t dev_idx, AMDGpuVerbTypes_t verb_type)
{
  std::ostringstream osstream;

  // Base query: only the device index and the verb are known up front.
  AMDGpuPropertyQuery_t query{};
  query.m_asic_id = 0;
  query.m_pci_rev_id = 0;
  query.m_dev_idx = dev_idx;
  query.m_property = 0;
  query.m_verb_id = verb_type;

  osstream << __PRETTY_FUNCTION__ << "| ======= start =======";
  LOG_TRACE(osstream);

  // Step 1: entries that apply to every asic and every revision.
  auto wildcard_query = query;
  wildcard_query.m_asic_id = kDevIDAll;
  wildcard_query.m_pci_rev_id = kDevRevIDAll;
  const auto wildcard_result = run_amdgpu_property_reinforcement_query(wildcard_query);
  if (wildcard_result != rsmi_status_t::RSMI_STATUS_UNKNOWN_ERROR) {
    return wildcard_result;
  }

  // Step 2: the query has no asic/revision filter yet, so retrieve both from
  // the device. The revision is only requested once the asic id was obtained.
  auto id_lookup_status(rsmi_status_t::RSMI_STATUS_SUCCESS);
  if (query.m_asic_id == 0) {
    id_lookup_status = rsmi_dev_id_get(dev_idx, &query.m_asic_id);
    if (id_lookup_status == rsmi_status_t::RSMI_STATUS_SUCCESS) {
      id_lookup_status = rsmi_dev_revision_get(dev_idx, &query.m_pci_rev_id);
    }
  }
  bool is_proper_query = (id_lookup_status == rsmi_status_t::RSMI_STATUS_SUCCESS);

  if (is_proper_query) {
    return run_amdgpu_property_reinforcement_query(query);
  }

  // The filters could not be completed: report it and give up.
  auto rsmi_status(rsmi_status_t::RSMI_STATUS_NO_DATA);
  osstream << __PRETTY_FUNCTION__ << "| ======= end ======="
           << ", Missing Query Filters were not successfully retrieved: "
           << " [query filters: ]"
           << " device: " << dev_idx
           << " asic id: " << query.m_asic_id
           << " revision id: " << query.m_pci_rev_id
           << " property: " << query.m_property
           << " verb: " << static_cast<AMDGpuVerbId_t>(query.m_verb_id)
           << " proper_query: " << is_proper_query
           << " error: " << rsmi_status;
  LOG_TRACE(osstream);
  return rsmi_status;
}
/// Searches amdgpu_property_reinforcement_list for an entry matching the query.
///
/// An entry matches when all of the following hold:
///   - its key equals the query's asic id;
///   - its revision id equals the query's revision id, or is kDevRevIDAll;
///   - its property equals the query's (non-zero) property, or its verb equals
///     the query's verb (when that verb is not AMDGpuVerbTypes_t::kNone).
///
/// @param amdgpu_property_query  filters (asic id, revision id, property, verb)
/// @return RSMI_STATUS_SUCCESS when the first matching entry is flagged as
///         "should be available", RSMI_STATUS_NOT_SUPPORTED when it is flagged
///         as not available, and RSMI_STATUS_UNKNOWN_ERROR when no entry matches.
rsmi_status_t Device::run_amdgpu_property_reinforcement_query(const AMDGpuPropertyQuery_t& amdgpu_property_query)
{
  std::ostringstream osstream;
  auto rsmi_status(rsmi_status_t::RSMI_STATUS_UNKNOWN_ERROR);
  const auto& query = amdgpu_property_query;

  osstream << __PRETTY_FUNCTION__ << "| ======= start =======" << "\n";
  LOG_TRACE(osstream);

  const bool asic_in_table =
      (amdgpu_property_reinforcement_list.find(query.m_asic_id) != amdgpu_property_reinforcement_list.end());

  if (asic_in_table) {
    osstream << __PRETTY_FUNCTION__ << " asic id found in table: " << query.m_asic_id << "\n";

    // Walk every value stored under this asic id.
    const auto range_begin = amdgpu_property_reinforcement_list.lower_bound(query.m_asic_id);
    const auto range_end = amdgpu_property_reinforcement_list.upper_bound(query.m_asic_id);
    for (auto entry = range_begin; entry != range_end; ++entry) {
      // Still the same key?
      if (entry->first != query.m_asic_id) {
        continue;
      }
      osstream << __PRETTY_FUNCTION__ << " asic id found: " << entry->first << "\n";

      // The entry's revision must be the requested one, or the "all revisions" marker.
      const auto& props = entry->second;
      const bool revision_matches = (props.m_pci_rev_id == query.m_pci_rev_id) ||
                                    (props.m_pci_rev_id == kDevRevIDAll);
      if (!revision_matches) {
        continue;
      }
      osstream << __PRETTY_FUNCTION__ << " asic rev.id found: " << props.m_pci_rev_id << "\n";

      // The entry must carry the property, or the verb, we are looking for.
      const bool property_matches = (query.m_property != 0) &&
                                    (props.m_property == query.m_property);
      const bool verb_matches = (query.m_verb_id != AMDGpuVerbTypes_t::kNone) &&
                                (props.m_verb_id == query.m_verb_id);
      if (!(property_matches || verb_matches)) {
        continue;
      }

      osstream << __PRETTY_FUNCTION__
               << " property found: " << props.m_property
               << " verb found: " << static_cast<AMDGpuVerbId_t>(props.m_verb_id)
               << " " << amdgpu_verb_check_list.at(query.m_verb_id)
               << " should_be_available: " << props.m_should_be_available << "\n";
      osstream << __PRETTY_FUNCTION__ << "| ======= validating ======="
               << ", Property found in the table for this device and flagged as *Not Available* : "
               << " [query filters: ]"
               << " device: " << query.m_dev_idx
               << " asic id: " << query.m_asic_id
               << " revision id: " << query.m_pci_rev_id
               << " reinf.tbl.rev. id: " << props.m_pci_rev_id;

      // First match wins: the table's flag decides whether the property is
      // reported as available (SUCCESS) or forced unavailable (NOT_SUPPORTED).
      rsmi_status = props.m_should_be_available ? rsmi_status_t::RSMI_STATUS_SUCCESS
                                                : rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED;
      osstream << __PRETTY_FUNCTION__
               << " should_be_available: " << props.m_should_be_available
               << " result: " << rsmi_status << "\n";
      LOG_TRACE(osstream);
      return rsmi_status;
    }
  }

  // Nothing in the table matched; rsmi_status still holds its initial value.
  osstream << __PRETTY_FUNCTION__ << "| ======= end ======="
           << "Done searching for the Property in reinforcement table for this device: "
           << " device: " << query.m_dev_idx
           << " asic id: " << query.m_asic_id
           << " revision id: " << query.m_pci_rev_id
           << " property id: " << query.m_property
           << " error: " << rsmi_status;
  LOG_TRACE(osstream);
  return rsmi_status;
}
} // namespace smi
} // namespace amd
File diff suppressed because it is too large Load Diff
+81
View File
@@ -0,0 +1,81 @@
# Banner printed at configure time so the test-suite section is easy to spot
# in the CMake output.
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
message(" CMake ROCm SMI test ")
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
# The test suite is not built on Windows: print a notice and stop processing
# this file (return() leaves the current CMake file without raising an error).
if(WIN32)
message("rsmi library test suite is not supported on Windows platform")
return()
endif()
# Required Defines first:
# INSTALL_GTEST is declared OFF here, before googletest is fetched further down.
# NOTE(review): presumably this is read by googletest's own CMake files to
# disable its install rules -- confirm against the pinned googletest version.
option(INSTALL_GTEST "Install GTest (only useful if GTest is not already installed)" OFF)
# Summary of the configuration in use. Each message() concatenates its literal
# label with the unquoted variable expansion that follows it.
# NOTE(review): BUILD_TYPE is not a CMake built-in variable; it is presumably
# set by an enclosing CMakeLists.txt (CMAKE_BUILD_TYPE may be what is meant) --
# confirm, otherwise the first line prints an empty value.
message("")
message("Build Configuration:")
message("-----------BuildType: " ${BUILD_TYPE})
message("------------Compiler: " ${CMAKE_CXX_COMPILER})
message("-------------Version: " ${CMAKE_CXX_COMPILER_VERSION})
message("--------Proj Src Dir: " ${PROJECT_SOURCE_DIR})
message("--------Proj Bld Dir: " ${PROJECT_BINARY_DIR})
message("--------Proj Lib Dir: " ${PROJECT_BINARY_DIR}/lib)
message("--------Proj Exe Dir: " ${PROJECT_BINARY_DIR}/bin)
message("")
# Root of the test sources (the directory containing this file) and the name
# of the test executable target.
set(SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
set(RSMITST rsmitst)

# Run-time library search path of the installed test binary.
#   $ORIGIN/../../../lib  the library directory three levels above the install
#                         location, e.g.
#                         /opt/rocm/share/rocm-smi/rsmitst_tests/../../../ = /opt/rocm
#   $ORIGIN               the test directory itself, where libgtest.so is installed
# The `\$` keeps CMake from expanding ORIGIN so that the literal $ORIGIN token
# ends up in the RPATH.
set(RSMITST_RPATH "\$ORIGIN/../../../lib" "\$ORIGIN")

# Append the test entries to whatever install RPATH is already configured.
set(CMAKE_INSTALL_RPATH ${CMAKE_INSTALL_RPATH} ${RSMITST_RPATH})

# googletest is downloaded and built as part of this build, pinned to a
# release tag.
include(FetchContent)
FetchContent_Declare(googletest
    GIT_REPOSITORY https://github.com/google/googletest.git
    GIT_TAG v1.14.0)
FetchContent_MakeAvailable(googletest)
# Collect the test sources: the functional test cases plus the test driver
# sources that live next to this file.
# NOTE: aux_source_directory() only scans at configure time, so a newly added
# source file is picked up only after CMake is re-run.
aux_source_directory(${SRC_DIR}/functional functionalSources)
# Source files for building rsmitst
aux_source_directory(${SRC_DIR} rsmitstSources)

# Build rules
add_executable(${RSMITST} ${rsmitstSources} ${functionalSources})

# Header file include path. PRIVATE: nothing links against the test
# executable, so there are no usage requirements to propagate.
target_include_directories(${RSMITST} PRIVATE ${SRC_DIR}/..)

# Use CMake's portable thread and dynamic-loader abstractions rather than
# hard-coded library names.
find_package(Threads REQUIRED)

# NOTE(review): `c` and `stdc++` are kept to preserve the original link line;
# the C++ compiler driver normally adds both on its own -- confirm whether
# they are still required.
target_link_libraries(
    ${RSMITST}
    PRIVATE ${ROCM_SMI_TARGET}
            GTest::gtest_main
            c
            stdc++
            Threads::Threads
            ${CMAKE_DL_LIBS})

# Install the test binary and its exclusion list side by side.
install(TARGETS ${RSMITST}
        DESTINATION ${SHARE_INSTALL_PREFIX}/rsmitst_tests
        COMPONENT ${TESTS_COMPONENT})
install(FILES rsmitst.exclude
        DESTINATION ${SHARE_INSTALL_PREFIX}/rsmitst_tests
        COMPONENT ${TESTS_COMPONENT})

# Install the googletest libraries next to the tests; the $ORIGIN entry of the
# install RPATH lets the test binary find them there.
install(TARGETS gtest gtest_main
        DESTINATION ${SHARE_INSTALL_PREFIX}/rsmitst_tests
        COMPONENT ${TESTS_COMPONENT})

Some files were not shown because too many files have changed in this diff Show More